author:    Ingo Molnar <mingo@elte.hu>  2009-04-08 11:02:50 -0400
committer: Ingo Molnar <mingo@elte.hu>  2009-04-08 11:02:57 -0400
commit:    ff96e612cba32510e263e17b213235fe5746397e
tree:      a8df57d76b10e0901a4fb76cd2987eb9826a560a /fs
parent:    cd84a42f315e50edd454c27a3da3951ccd3d735a
parent:    577c9c456f0e1371cbade38eaf91ae8e8a308555
Merge commit 'v2.6.30-rc1' into core/urgent
Merge reason: need latest upstream to queue up dependent fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 34
-rw-r--r--  fs/Makefile | 6
-rw-r--r--  fs/adfs/super.c | 16
-rw-r--r--  fs/affs/super.c | 4
-rw-r--r--  fs/afs/Kconfig | 8
-rw-r--r--  fs/afs/Makefile | 3
-rw-r--r--  fs/afs/cache.c | 503
-rw-r--r--  fs/afs/cache.h | 15
-rw-r--r--  fs/afs/cell.c | 16
-rw-r--r--  fs/afs/file.c | 220
-rw-r--r--  fs/afs/inode.c | 31
-rw-r--r--  fs/afs/internal.h | 53
-rw-r--r--  fs/afs/main.c | 27
-rw-r--r--  fs/afs/mntpt.c | 4
-rw-r--r--  fs/afs/vlocation.c | 25
-rw-r--r--  fs/afs/volume.c | 14
-rw-r--r--  fs/afs/write.c | 21
-rw-r--r--  fs/befs/debug.c | 1
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/binfmt_elf.c | 22
-rw-r--r--  fs/binfmt_elf_fdpic.c | 25
-rw-r--r--  fs/binfmt_som.c | 7
-rw-r--r--  fs/bio.c | 3
-rw-r--r--  fs/block_dev.c | 1
-rw-r--r--  fs/btrfs/acl.c | 2
-rw-r--r--  fs/btrfs/async-thread.c | 7
-rw-r--r--  fs/btrfs/ctree.c | 312
-rw-r--r--  fs/btrfs/ctree.h | 84
-rw-r--r--  fs/btrfs/delayed-ref.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 8
-rw-r--r--  fs/btrfs/extent-tree.c | 398
-rw-r--r--  fs/btrfs/extent_io.c | 16
-rw-r--r--  fs/btrfs/extent_map.c | 1
-rw-r--r--  fs/btrfs/free-space-cache.c | 530
-rw-r--r--  fs/btrfs/free-space-cache.h | 44
-rw-r--r--  fs/btrfs/inode.c | 5
-rw-r--r--  fs/btrfs/ioctl.c | 2
-rw-r--r--  fs/btrfs/locking.c | 4
-rw-r--r--  fs/btrfs/super.c | 54
-rw-r--r--  fs/btrfs/transaction.c | 7
-rw-r--r--  fs/btrfs/tree-log.c | 12
-rw-r--r--  fs/btrfs/volumes.c | 41
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/buffer.c | 34
-rw-r--r--  fs/cachefiles/Kconfig | 39
-rw-r--r--  fs/cachefiles/Makefile | 18
-rw-r--r--  fs/cachefiles/bind.c | 286
-rw-r--r--  fs/cachefiles/daemon.c | 755
-rw-r--r--  fs/cachefiles/interface.c | 449
-rw-r--r--  fs/cachefiles/internal.h | 360
-rw-r--r--  fs/cachefiles/key.c | 159
-rw-r--r--  fs/cachefiles/main.c | 106
-rw-r--r--  fs/cachefiles/namei.c | 771
-rw-r--r--  fs/cachefiles/proc.c | 134
-rw-r--r--  fs/cachefiles/rdwr.c | 879
-rw-r--r--  fs/cachefiles/security.c | 116
-rw-r--r--  fs/cachefiles/xattr.c | 291
-rw-r--r--  fs/cifs/dir.c | 4
-rw-r--r--  fs/cifs/inode.c | 4
-rw-r--r--  fs/compat.c | 105
-rw-r--r--  fs/compat_ioctl.c | 2
-rw-r--r--  fs/cramfs/inode.c | 39
-rw-r--r--  fs/cramfs/uncompress.c | 2
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/debugfs/inode.c | 16
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/efs/super.c | 20
-rw-r--r--  fs/exec.c | 35
-rw-r--r--  fs/exofs/BUGS | 3
-rw-r--r--  fs/exofs/Kbuild | 16
-rw-r--r--  fs/exofs/Kconfig | 13
-rw-r--r--  fs/exofs/common.h | 184
-rw-r--r--  fs/exofs/dir.c | 672
-rw-r--r--  fs/exofs/exofs.h | 180
-rw-r--r--  fs/exofs/file.c | 87
-rw-r--r--  fs/exofs/inode.c | 1303
-rw-r--r--  fs/exofs/namei.c | 342
-rw-r--r--  fs/exofs/osd.c | 153
-rw-r--r--  fs/exofs/super.c | 584
-rw-r--r--  fs/exofs/symlink.c | 57
-rw-r--r--  fs/ext2/acl.c | 2
-rw-r--r--  fs/ext3/Kconfig | 19
-rw-r--r--  fs/ext3/acl.c | 2
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/file.c | 6
-rw-r--r--  fs/ext3/inode.c | 142
-rw-r--r--  fs/ext3/ioctl.c | 59
-rw-r--r--  fs/ext3/namei.c | 35
-rw-r--r--  fs/ext3/super.c | 8
-rw-r--r--  fs/ext4/Kconfig | 2
-rw-r--r--  fs/ext4/acl.c | 2
-rw-r--r--  fs/fat/inode.c | 8
-rw-r--r--  fs/file_table.c | 1
-rw-r--r--  fs/fs-writeback.c | 31
-rw-r--r--  fs/fs_struct.c | 177
-rw-r--r--  fs/fscache/Kconfig | 56
-rw-r--r--  fs/fscache/Makefile | 19
-rw-r--r--  fs/fscache/cache.c | 415
-rw-r--r--  fs/fscache/cookie.c | 500
-rw-r--r--  fs/fscache/fsdef.c | 144
-rw-r--r--  fs/fscache/histogram.c | 109
-rw-r--r--  fs/fscache/internal.h | 380
-rw-r--r--  fs/fscache/main.c | 124
-rw-r--r--  fs/fscache/netfs.c | 103
-rw-r--r--  fs/fscache/object.c | 810
-rw-r--r--  fs/fscache/operation.c | 459
-rw-r--r--  fs/fscache/page.c | 816
-rw-r--r--  fs/fscache/proc.c | 68
-rw-r--r--  fs/fscache/stats.c | 212
-rw-r--r--  fs/fuse/dir.c | 1
-rw-r--r--  fs/fuse/file.c | 54
-rw-r--r--  fs/generic_acl.c | 2
-rw-r--r--  fs/gfs2/acl.c | 2
-rw-r--r--  fs/hfs/super.c | 3
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hfsplus/super.c | 3
-rw-r--r--  fs/hpfs/super.c | 5
-rw-r--r--  fs/hppfs/hppfs.c | 7
-rw-r--r--  fs/internal.h | 8
-rw-r--r--  fs/isofs/inode.c | 3
-rw-r--r--  fs/jbd/commit.c | 28
-rw-r--r--  fs/jbd/journal.c | 34
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jbd2/commit.c | 13
-rw-r--r--  fs/jffs2/acl.c | 6
-rw-r--r--  fs/jffs2/malloc.c | 6
-rw-r--r--  fs/jfs/acl.c | 2
-rw-r--r--  fs/libfs.c | 16
-rw-r--r--  fs/lockd/svclock.c | 13
-rw-r--r--  fs/minix/inode.c | 11
-rw-r--r--  fs/mpage.c | 13
-rw-r--r--  fs/namei.c | 14
-rw-r--r--  fs/namespace.c | 61
-rw-r--r--  fs/nfs/Kconfig | 8
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/client.c | 14
-rw-r--r--  fs/nfs/file.c | 40
-rw-r--r--  fs/nfs/fscache-index.c | 337
-rw-r--r--  fs/nfs/fscache.c | 523
-rw-r--r--  fs/nfs/fscache.h | 220
-rw-r--r--  fs/nfs/inode.c | 14
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/iostat.h | 18
-rw-r--r--  fs/nfs/nfs3proc.c | 6
-rw-r--r--  fs/nfs/nfs4proc.c | 2
-rw-r--r--  fs/nfs/read.c | 27
-rw-r--r--  fs/nfs/super.c | 47
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs3proc.c | 10
-rw-r--r--  fs/nfsd/nfs4callback.c | 47
-rw-r--r--  fs/nfsd/nfs4proc.c | 246
-rw-r--r--  fs/nfsd/nfs4recover.c | 74
-rw-r--r--  fs/nfsd/nfs4state.c | 1196
-rw-r--r--  fs/nfsd/nfs4xdr.c | 633
-rw-r--r--  fs/nfsd/nfsctl.c | 38
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 95
-rw-r--r--  fs/nfsd/vfs.c | 37
-rw-r--r--  fs/nilfs2/Makefile | 5
-rw-r--r--  fs/nilfs2/alloc.c | 504
-rw-r--r--  fs/nilfs2/alloc.h | 72
-rw-r--r--  fs/nilfs2/bmap.c | 783
-rw-r--r--  fs/nilfs2/bmap.h | 244
-rw-r--r--  fs/nilfs2/bmap_union.h | 42
-rw-r--r--  fs/nilfs2/btnode.c | 316
-rw-r--r--  fs/nilfs2/btnode.h | 58
-rw-r--r--  fs/nilfs2/btree.c | 2269
-rw-r--r--  fs/nilfs2/btree.h | 117
-rw-r--r--  fs/nilfs2/cpfile.c | 925
-rw-r--r--  fs/nilfs2/cpfile.h | 45
-rw-r--r--  fs/nilfs2/dat.c | 430
-rw-r--r--  fs/nilfs2/dat.h | 52
-rw-r--r--  fs/nilfs2/dir.c | 711
-rw-r--r--  fs/nilfs2/direct.c | 436
-rw-r--r--  fs/nilfs2/direct.h | 78
-rw-r--r--  fs/nilfs2/file.c | 160
-rw-r--r--  fs/nilfs2/gcdat.c | 84
-rw-r--r--  fs/nilfs2/gcinode.c | 288
-rw-r--r--  fs/nilfs2/ifile.c | 150
-rw-r--r--  fs/nilfs2/ifile.h | 53
-rw-r--r--  fs/nilfs2/inode.c | 785
-rw-r--r--  fs/nilfs2/ioctl.c | 654
-rw-r--r--  fs/nilfs2/mdt.c | 563
-rw-r--r--  fs/nilfs2/mdt.h | 125
-rw-r--r--  fs/nilfs2/namei.c | 474
-rw-r--r--  fs/nilfs2/nilfs.h | 318
-rw-r--r--  fs/nilfs2/page.c | 540
-rw-r--r--  fs/nilfs2/page.h | 76
-rw-r--r--  fs/nilfs2/recovery.c | 929
-rw-r--r--  fs/nilfs2/sb.h | 102
-rw-r--r--  fs/nilfs2/segbuf.c | 439
-rw-r--r--  fs/nilfs2/segbuf.h | 201
-rw-r--r--  fs/nilfs2/seglist.h | 85
-rw-r--r--  fs/nilfs2/segment.c | 2977
-rw-r--r--  fs/nilfs2/segment.h | 243
-rw-r--r--  fs/nilfs2/sufile.c | 640
-rw-r--r--  fs/nilfs2/sufile.h | 54
-rw-r--r--  fs/nilfs2/super.c | 1323
-rw-r--r--  fs/nilfs2/the_nilfs.c | 637
-rw-r--r--  fs/nilfs2/the_nilfs.h | 298
-rw-r--r--  fs/ocfs2/acl.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 57
-rw-r--r--  fs/ocfs2/alloc.h | 3
-rw-r--r--  fs/ocfs2/aops.c | 23
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 96
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 3
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 9
-rw-r--r--  fs/ocfs2/dir.c | 2806
-rw-r--r--  fs/ocfs2/dir.h | 57
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 58
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 87
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 29
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 387
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 20
-rw-r--r--  fs/ocfs2/dlmglue.c | 46
-rw-r--r--  fs/ocfs2/dlmglue.h | 2
-rw-r--r--  fs/ocfs2/export.c | 84
-rw-r--r--  fs/ocfs2/file.c | 8
-rw-r--r--  fs/ocfs2/inode.c | 48
-rw-r--r--  fs/ocfs2/inode.h | 5
-rw-r--r--  fs/ocfs2/journal.c | 173
-rw-r--r--  fs/ocfs2/journal.h | 77
-rw-r--r--  fs/ocfs2/localalloc.c | 86
-rw-r--r--  fs/ocfs2/namei.c | 250
-rw-r--r--  fs/ocfs2/ocfs2.h | 76
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 136
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 254
-rw-r--r--  fs/ocfs2/suballoc.h | 4
-rw-r--r--  fs/ocfs2/super.c | 188
-rw-r--r--  fs/ocfs2/xattr.c | 8
-rw-r--r--  fs/ocfs2/xattr.h | 2
-rw-r--r--  fs/omfs/inode.c | 7
-rw-r--r--  fs/open.c | 1
-rw-r--r--  fs/partitions/check.c | 4
-rw-r--r--  fs/proc/base.c | 1
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/proc/task_nommu.c | 9
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/ramfs/inode.c | 19
-rw-r--r--  fs/read_write.c | 56
-rw-r--r--  fs/reiserfs/Kconfig | 1
-rw-r--r--  fs/reiserfs/super.c | 5
-rw-r--r--  fs/reiserfs/xattr_acl.c | 2
-rw-r--r--  fs/romfs/Kconfig | 48
-rw-r--r--  fs/romfs/Makefile | 9
-rw-r--r--  fs/romfs/inode.c | 665
-rw-r--r--  fs/romfs/internal.h | 47
-rw-r--r--  fs/romfs/mmap-nommu.c | 75
-rw-r--r--  fs/romfs/storage.c | 261
-rw-r--r--  fs/romfs/super.c | 653
-rw-r--r--  fs/splice.c | 28
-rw-r--r--  fs/squashfs/export.c | 1
-rw-r--r--  fs/squashfs/super.c | 3
-rw-r--r--  fs/super.c | 41
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/ubifs/Kconfig | 4
-rw-r--r--  fs/ubifs/budget.c | 37
-rw-r--r--  fs/ubifs/debug.c | 6
-rw-r--r--  fs/ubifs/file.c | 16
-rw-r--r--  fs/ubifs/find.c | 12
-rw-r--r--  fs/ubifs/gc.c | 428
-rw-r--r--  fs/ubifs/journal.c | 7
-rw-r--r--  fs/ubifs/key.h | 6
-rw-r--r--  fs/ubifs/log.c | 5
-rw-r--r--  fs/ubifs/lpt_commit.c | 34
-rw-r--r--  fs/ubifs/recovery.c | 70
-rw-r--r--  fs/ubifs/replay.c | 2
-rw-r--r--  fs/ubifs/sb.c | 36
-rw-r--r--  fs/ubifs/shrinker.c | 6
-rw-r--r--  fs/ubifs/super.c | 37
-rw-r--r--  fs/ubifs/tnc.c | 2
-rw-r--r--  fs/ubifs/ubifs-media.h | 30
-rw-r--r--  fs/ubifs/ubifs.h | 13
-rw-r--r--  fs/udf/balloc.c | 150
-rw-r--r--  fs/udf/dir.c | 14
-rw-r--r--  fs/udf/directory.c | 38
-rw-r--r--  fs/udf/ecma_167.h | 416
-rw-r--r--  fs/udf/ialloc.c | 9
-rw-r--r--  fs/udf/inode.c | 213
-rw-r--r--  fs/udf/misc.c | 29
-rw-r--r--  fs/udf/namei.c | 86
-rw-r--r--  fs/udf/osta_udf.h | 22
-rw-r--r--  fs/udf/partition.c | 2
-rw-r--r--  fs/udf/super.c | 605
-rw-r--r--  fs/udf/truncate.c | 44
-rw-r--r--  fs/udf/udf_i.h | 6
-rw-r--r--  fs/udf/udf_sb.h | 9
-rw-r--r--  fs/udf/udfdecl.h | 57
-rw-r--r--  fs/udf/udfend.h | 28
-rw-r--r--  fs/udf/udftime.c | 6
-rw-r--r--  fs/udf/unicode.c | 62
-rw-r--r--  fs/ufs/super.c | 3
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/mutex.h | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 107
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 37
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 13
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 157
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 137
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 32
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 28
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 18
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 212
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 26
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 1
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 190
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 40
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 16
-rw-r--r--  fs/xfs/support/debug.c | 1
-rw-r--r--  fs/xfs/support/uuid.c | 71
-rw-r--r--  fs/xfs/support/uuid.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 4
-rw-r--r--  fs/xfs/xfs_alloc.c | 26
-rw-r--r--  fs/xfs/xfs_alloc.h | 6
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 58
-rw-r--r--  fs/xfs/xfs_bmap.c | 76
-rw-r--r--  fs/xfs/xfs_bmap.h | 6
-rw-r--r--  fs/xfs/xfs_btree.c | 4
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_da_btree.c | 2
-rw-r--r--  fs/xfs/xfs_da_btree.h | 9
-rw-r--r--  fs/xfs/xfs_dfrag.c | 68
-rw-r--r--  fs/xfs/xfs_dinode.h | 4
-rw-r--r--  fs/xfs/xfs_dir2.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 7
-rw-r--r--  fs/xfs/xfs_dir2_data.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 13
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 6
-rw-r--r--  fs/xfs/xfs_filestream.c | 9
-rw-r--r--  fs/xfs/xfs_fsops.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 12
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h | 22
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 9
-rw-r--r--  fs/xfs/xfs_log.c | 67
-rw-r--r--  fs/xfs/xfs_log.h | 3
-rw-r--r--  fs/xfs/xfs_log_priv.h | 3
-rw-r--r--  fs/xfs/xfs_log_recover.c | 308
-rw-r--r--  fs/xfs/xfs_mount.c | 253
-rw-r--r--  fs/xfs/xfs_mount.h | 19
-rw-r--r--  fs/xfs/xfs_qmops.c | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 10
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 8
-rw-r--r--  fs/xfs/xfs_trans.h | 24
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 4
-rw-r--r--  fs/xfs/xfs_trans_item.c | 2
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_types.h | 8
-rw-r--r--  fs/xfs/xfs_utils.c | 2
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 408
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
365 files changed, 45195 insertions, 5780 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa3..9f7270f36b2a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -66,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -168,6 +175,33 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..af6d04700d9c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o
+		stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS)		+= ext2/
@@ -113,10 +114,13 @@ obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
+obj-$(CONFIG_NILFS2_FS)		+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
+obj-$(CONFIG_CACHEFILES)	+= cachefiles/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
+obj-$(CONFIG_EXOFS_FS)		+= exofs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = dentry->d_sb->s_blocksize;
-	buf->f_blocks  = asb->s_size;
-	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = sbi->s_size;
+	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(dentry->d_sb);
+	buf->f_bfree   = adfs_map_free(sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
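
Both statfs() hunks in this merge (here and in fs/affs/super.c just below) derive f_fsid the same way: huge_encode_dev() folds the backing block device's dev_t into a u64, whose halves land in f_fsid.val[0] and val[1]. A minimal user-space sketch of just that split, with a made-up device number standing in for the real one:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* hypothetical stand-in for huge_encode_dev(sb->s_bdev->bd_dev) */
		uint64_t id = 0x0000001200345678ULL;
		uint32_t val[2];

		val[0] = (uint32_t)id;		/* low 32 bits */
		val[1] = (uint32_t)(id >> 32);	/* high 32 bits */

		printf("f_fsid = { %#x, %#x }\n", val[0], val[1]);
		return 0;
	}
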
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..5ce695e707fe 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int		 free;
+	u64		 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 		 AFFS_SB(sb)->s_reserved);
@@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks  = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree   = free;
 	buf->f_bavail  = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name		= "cell_ix",
-	.data_size	= sizeof(struct afs_cache_cell),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_cell_cache_match,
-	.update		= afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name		= "afs",
+	.version	= 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name		= "AFS.cell",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_cell_cache_get_key,
+	.get_aux	= afs_cell_cache_get_aux,
+	.check_aux	= afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name		= "AFS.vldb",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_vlocation_cache_get_key,
+	.get_aux	= afs_vlocation_cache_get_aux,
+	.check_aux	= afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name		= "AFS.volume",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name		= "AFS.vnode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= afs_vnode_cache_get_key,
+	.get_attr	= afs_vnode_cache_get_attr,
+	.get_aux	= afs_vnode_cache_get_aux,
+	.check_aux	= afs_vnode_cache_check_aux,
+	.now_uncached	= afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name		= "vldb",
-	.data_size	= sizeof(struct afs_cache_vlocation),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_vlocation_cache_match,
-	.update		= afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name		= "volume",
-	.data_size	= sizeof(struct afs_cache_vhash),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match		= afs_volume_cache_match,
-	.update		= afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
 {
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name		= "vnode",
-	.data_size	= sizeof(struct afs_cache_vnode),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match		= afs_vnode_cache_match,
-	.update		= afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
+	_leave("");
 }
-#endif
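
The rewritten cache.c is built around FS-Cache's coherency callbacks: get_key serialises an index key, get_aux captures a small auxiliary blob (for a vnode, fid.unique plus status.data_version), and check_aux later decides whether a cached object is still valid by comparing a stored blob against the live object. A user-space sketch of that round trip, using hypothetical field names that merely mirror the AFS ones:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct vnode {				/* stand-in for struct afs_vnode */
		uint32_t unique;
		uint64_t data_version;
	};

	/* serialise the aux blob, in the spirit of afs_vnode_cache_get_aux() */
	static size_t get_aux(const struct vnode *v, void *buf, size_t bufmax)
	{
		size_t dlen = sizeof(v->unique) + sizeof(v->data_version);

		if (dlen > bufmax)
			return 0;
		memcpy(buf, &v->unique, sizeof(v->unique));
		memcpy((char *)buf + sizeof(v->unique), &v->data_version,
		       sizeof(v->data_version));
		return dlen;
	}

	/* validate a stored blob against the live object, as check_aux does */
	static int still_valid(const struct vnode *v, const void *buf, size_t buflen)
	{
		unsigned char live[sizeof(uint32_t) + sizeof(uint64_t)];

		if (buflen != get_aux(v, live, sizeof(live)))
			return 0;		/* wrong size: obsolete */
		return memcmp(live, buf, buflen) == 0;
	}

	int main(void)
	{
		struct vnode v = { .unique = 7, .data_version = 42 };
		unsigned char blob[32];
		size_t n = get_aux(&v, blob, sizeof(blob));

		printf("valid=%d\n", still_valid(&v, blob, n));	/* 1 */
		v.data_version++;	/* the file changed on the server */
		printf("valid=%d\n", still_valid(&v, blob, n));	/* 0 */
		return 0;
	}
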
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
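
Two things change here beyond the cachefs-to-fscache rename: the cookie is now returned by value rather than filled in through an out-parameter, and the cell is registered against afs_cell_cache_index_def where the old code passed the vlocation index definition (apparently a copy-and-paste slip). The second argument of fscache_relinquish_cookie() (0 here) means the on-disk data is kept rather than retired. A toy sketch of the acquire/release pairing using a hypothetical stub API, not the real fscache one:

	#include <stdio.h>

	struct cookie { const char *index; };

	/* hypothetical stand-ins for the fscache cookie calls */
	static struct cookie *acquire(const char *index, void *netfs_data)
	{
		static struct cookie c;
		c.index = index;	/* never fails, per the comment above */
		(void)netfs_data;
		return &c;
	}

	static void relinquish(struct cookie *c, int retire)
	{
		printf("released %s (retire=%d)\n", c->index, retire);
	}

	int main(void)
	{
		int cell;		/* stand-in for struct afs_cell */
		struct cookie *cache = acquire("AFS.cell", &cell);

		/* ... the cell lives and the cache is usable ... */
		relinquish(cache, 0);	/* 0: keep cached data on disk */
		return 0;
	}
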
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..7a1d942ef68d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_readpage,
+	.readpages	= afs_readpages,
 	.set_page_dirty	= afs_set_page_dirty,
 	.launder_page	= afs_launder_page,
 	.releasepage	= afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
 }
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
-}
-#endif
 
 /*
  * AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-		/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 		/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-		/* no page available in cache */
-	case -ENOBUFS:
+		/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+		/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			ret = -ESTALE;
 		}
-#ifdef AFS_CACHING_SUPPORT
-		cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+		fscache_uncache_page(vnode->cache, page);
 #endif
+		BUG_ON(PageFsCache(page));
 		goto error;
 	}
 
 	SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-	if (cachefs_write_page(vnode->cache,
-			       page,
-			       afs_file_readpage_write_complete,
-			       NULL,
-			       GFP_KERNEL) != 0
-	    ) {
-		cachefs_uncache_page(vnode->cache, page);
-		unlock_page(page);
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
 	}
-#else
-	unlock_page(page);
 #endif
+	unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +216,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+		/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+		/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+		/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
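
afs_readpage() and the new afs_readpages() share one control flow: ask the cache first, treating 0 as "the cache is filling the page", ENODATA as "not cached yet, fetch and then store", and ENOBUFS as "will not be cached, just fetch". A compressed sketch of that decision ladder against a hypothetical cache API (not the real fscache calls):

	#include <errno.h>
	#include <stdio.h>

	/* hypothetical cache front-end: 0 = cache completes the page itself */
	static int cache_read(char *page)
	{
		(void)page;
		return -ENODATA;	/* pretend the page is not cached yet */
	}

	static int fetch_from_server(char *page)
	{
		page[0] = 'x';		/* pretend we filled the page */
		return 0;
	}

	static int readpage(char *page)
	{
		int ret = cache_read(page);
		int cacheable = 1;

		switch (ret) {
		case 0:			/* read submitted; cache completes it */
			return 0;
		case -ENODATA:		/* not yet cached: fetch, then store */
			break;
		case -ENOBUFS:		/* page will not be cached */
		default:
			cacheable = 0;
			break;
		}

		ret = fetch_from_server(page);
		if (ret == 0 && cacheable)
			puts("writing page back to the cache (best effort)");
		return ret;
	}

	int main(void)
	{
		char page[4096];
		return readpage(page);
	}
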
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink		= vnode->status.nlink;
 	inode->i_uid		= vnode->status.owner;
 	inode->i_gid		= 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
 	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head	grave;		/* link in master graveyard list */
 	struct list_head	update;		/* link in master update list */
 	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb;	/* volume information DB record */
 	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t		usage;
 	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	afs_volid_t		vid;		/* volume ID */
 	afs_voltype_t		type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_permits	*permits;	/* cache of permits so far obtained */
 	struct mutex		permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
446 */ 463 */
447extern struct rw_semaphore afs_proc_cells_sem; 464extern struct rw_semaphore afs_proc_cells_sem;
448extern struct list_head afs_proc_cells; 465extern struct list_head afs_proc_cells;
449#ifdef AFS_CACHING_SUPPORT
450extern struct cachefs_index_def afs_cache_cell_index_def;
451#endif
452 466
453#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) 467#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
454extern int afs_cell_init(char *); 468extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
554 * main.c 568 * main.c
555 */ 569 */
556extern struct afs_uuid afs_uuid; 570extern struct afs_uuid afs_uuid;
557#ifdef AFS_CACHING_SUPPORT
558extern struct cachefs_netfs afs_cache_netfs;
559#endif
560 571
561/* 572/*
562 * misc.c 573 * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
637/* 648/*
638 * vlclient.c 649 * vlclient.c
639 */ 650 */
640#ifdef AFS_CACHING_SUPPORT
641extern struct cachefs_index_def afs_vlocation_cache_index_def;
642#endif
643
644extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, 651extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
645 const char *, struct afs_cache_vlocation *, 652 const char *, struct afs_cache_vlocation *,
646 const struct afs_wait_mode *); 653 const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
664/* 671/*
665 * vnode.c 672 * vnode.c
666 */ 673 */
667#ifdef AFS_CACHING_SUPPORT
668extern struct cachefs_index_def afs_vnode_cache_index_def;
669#endif
670
671extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
672
673static inline struct afs_vnode *AFS_FS_I(struct inode *inode) 674static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
674{ 675{
675 return container_of(inode, struct afs_vnode, vfs_inode); 676 return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
711/* 712/*
712 * volume.c 713 * volume.c
713 */ 714 */
714#ifdef AFS_CACHING_SUPPORT
715extern struct cachefs_index_def afs_volume_cache_index_def;
716#endif
717
718#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) 715#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
719 716
720extern void afs_put_volume(struct afs_volume *); 717extern void afs_put_volume(struct afs_volume *);
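Note: the #else stubs above expand each afs_*_cache_index_def to a typed NULL so that references still type-check when FS-Cache is compiled out (the stubbed fscache calls never evaluate the argument). A minimal sketch of the call-site pattern this enables, modeled on the cell code in this series rather than quoted from it:

	#ifdef CONFIG_AFS_FSCACHE
		cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
						     &afs_cell_cache_index_def,
						     cell);
	#endif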
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
 /* AFS client file system
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-	.get_page_cookie	= afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-	.name			= "afs",
-	.version		= 0,
-	.ops			= &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
-	ret = cachefs_register_netfs(&afs_cache_netfs,
-				     &afs_cache_cell_index_def);
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
 		goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
 	afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
 	afs_vlocation_purge();
 	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
 	rcu_barrier();
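Note: with FS-Cache the netfs description that used to live in main.c moves to the new fs/afs/cache.c (see the diffstat), and FS-Cache creates the primary index itself, which is why registration now takes only the netfs descriptor. A sketch of the new-style definition and its register/unregister pairing, assuming the fscache_netfs layout this series introduces:

	struct fscache_netfs afs_cache_netfs = {
		.name		= "afs",
		.version	= 0,
	};

	/* in module init / exit: */
	ret = fscache_register_netfs(&afs_cache_netfs);
	...
	fscache_unregister_netfs(&afs_cache_netfs);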
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (PageError(page))
 		goto error;
 
-	buf = kmap(page);
+	buf = kmap_atomic(page, KM_USER0);
 	memcpy(devname, buf, size);
-	kunmap(page);
+	kunmap_atomic(buf, KM_USER0);
 	page_cache_release(page);
 	page = NULL;
 
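Note: switching kmap() to kmap_atomic() here is safe because only a memcpy() sits between map and unmap. Atomic kmaps are much cheaper, but they disable preemption for the duration, so nothing in that window may sleep. The 2.6.30-era pattern, with its explicit KM_* slot:

	buf = kmap_atomic(page, KM_USER0);	/* preemption disabled from here */
	memcpy(devname, buf, size);		/* pure CPU work; must not sleep */
	kunmap_atomic(buf, KM_USER0);		/* slot released; sleeping OK again */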
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
 	vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* update volume entry in local cache */
-	cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl);
 #endif
 
 	if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
 	spin_unlock(&vl->lock);
 	wake_up(&vl->waitq);
 
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
 	goto success;
@@ -465,7 +467,7 @@ found_in_memory:
 	spin_unlock(&vl->lock);
 
 success:
-	_leave(" = %p",vl);
+	_leave(" = %p", vl);
 	return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
 	_enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
 	afs_put_cell(vl->cell);
 	kfree(vl);
 }
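Note: taken together, the vlocation hunks above show the complete fscache cookie lifecycle for one object class. Condensed (names taken from the diff, error handling elided):

	/* acquire: index this object under its parent cell's cookie */
	vl->cache = fscache_acquire_cookie(vl->cell->cache,
					   &afs_vlocation_cache_index_def, vl);

	/* update: push changed index/auxiliary data back into the cache */
	fscache_update_cookie(vl->cache);

	/* relinquish: second argument 0 keeps the cached data around;
	 * non-zero would retire (discard) it */
	fscache_relinquish_cookie(vl->cache, 0);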
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	}
 
 	/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(vlocation->cache,
-			       &afs_vnode_cache_index_def,
-			       volume,
-			       &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume);
 #endif
-
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
 	up_write(&vlocation->cell->vl_sem);
 
 	/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
 #endif
 	afs_put_vlocation(vlocation);
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
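Note: afs_page_mkwrite() only takes effect once hooked into the file's vm_operations, which happens elsewhere in this series. A sketch of that wiring for shape only — the vm_ops table and mmap helper names here are hypothetical:

	static struct vm_operations_struct afs_file_vm_ops = {	/* hypothetical */
		.fault		= filemap_fault,
		.page_mkwrite	= afs_page_mkwrite,
	};

	static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
	{
		file_accessed(file);
		vma->vm_ops = &afs_file_vm_ops;
		return 0;
	}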
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
 befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0;	/* UNKNOWN */
 	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = BEFS_NAME_LEN;
 
 	befs_debug(sb, "<--- befs_statfs()");
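Note: the two new lines pack the 64-bit encoded device number into the two 32-bit f_fsid words — the usual idiom for giving statfs() a stable filesystem id. The split is lossless; reassembly is just the reverse shift:

	u64 id   = huge_encode_dev(sb->s_bdev->bd_dev);	/* major/minor -> u64 */
	u64 back = ((u64)buf->f_fsid.val[1] << 32) | buf->f_fsid.val[0];
	/* back == id */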
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -21,20 +19,15 @@
 #include <linux/binfmts.h>
 #include <linux/string.h>
 #include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
 #include <linux/slab.h>
-#include <linux/shm.h>
 #include <linux/personality.h>
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
-#include <linux/smp.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
-	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
 	unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_ph;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free_ph;
-	get_file(bprm->file);
-	fd_install(elf_exec_fileno = retval, bprm->file);
-
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX ||
 			    elf_ppnt->p_filesz < 2)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
 						  GFP_KERNEL);
 			if (!elf_interpreter)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
 					     elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	kfree(elf_phdata);
 
-	sys_close(elf_exec_fileno);
-
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
 	fput(interpreter);
 out_free_interp:
 	kfree(elf_interpreter);
-out_free_file:
-	sys_close(elf_exec_fileno);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..70cfc4b84ae0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 		params->elfhdr_addr = seg->addr;
 
 		/* clear any space allocated but not loaded */
-		if (phdr->p_filesz < phdr->p_memsz)
-			clear_user((void *) (seg->addr + phdr->p_filesz),
+		if (phdr->p_filesz < phdr->p_memsz) {
+			ret = clear_user((void *) (seg->addr + phdr->p_filesz),
 				   phdr->p_memsz - phdr->p_filesz);
+			if (ret)
+				return ret;
+		}
 
 		if (mm) {
 			if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset;
+	int loop, dvset, ret;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void __user *) maddr, disp);
+			ret = clear_user((void __user *) maddr, disp);
+			if (ret)
+				return ret;
 			maddr += disp;
 		}
 
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void __user *) maddr + phdr->p_filesz,
+			ret = clear_user((void __user *) maddr + phdr->p_filesz,
 				   excess1);
+			if (ret)
+				return ret;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			clear_user((void *) maddr + phdr->p_filesz, excess);
+			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+			if (ret)
+				return ret;
 		}
 #endif
 
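Note: clear_user() returns the number of bytes it failed to zero, so any non-zero result means the user mapping was bad; these hunks stop ignoring that and propagate the count as the error value. The more common idiom maps the shortfall to -EFAULT, e.g. (illustrative helper, not from this patch):

	static int zero_user_range(void __user *p, unsigned long n)
	{
		if (clear_user(p, n))	/* non-zero: some bytes left un-zeroed */
			return -EFAULT;
		return 0;
	}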
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
 static int
 load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 {
-	int som_exec_fileno;
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free;
-	get_file(bprm->file);
-	fd_install(som_exec_fileno = retval, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	retval = flush_old_exec(bprm);
 	if (retval)
diff --git a/fs/bio.c b/fs/bio.c
index a040cde7f6fd..e0c9e545bbfa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1420,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 }
 
 /*
- * split a bio - only worry about a bio with a single page
- * in it's iovec
+ * split a bio - only worry about a bio with a single page in its iovec
  */
 struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c3c6899ccf3..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 	}
 	return sync_blockdev(bdev);
 }
+EXPORT_SYMBOL(fsync_bdev);
 
 /**
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 		}
 
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
 			if (!list_empty(&worker->pending))
 				continue;
 
+			if (kthread_should_stop())
+				break;
+
 			/* still no more work?, sleep for real */
 			spin_lock_irq(&worker->lock);
 			set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
 
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 		}
 		__set_current_state(TASK_RUNNING);
 	}
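Note: the two added checks close a shutdown race. kthread_stop() wakes the thread exactly once, so a worker must re-test the stop flag after setting TASK_INTERRUPTIBLE and before calling schedule(), or it can sleep through its own stop request. The generic shape of the pattern (illustrative, not the btrfs code):

	static int worker_fn(void *arg)
	{
		while (!kthread_should_stop()) {
			/* ... drain pending work ... */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop())	/* re-check in new state */
				schedule();
			__set_current_state(TASK_RUNNING);
		}
		return 0;
	}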
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dbb724124633..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves, finding things that are close
  * to the block in 'slot', and triggering ra on them.
  */
-static noinline void reada_for_search(struct btrfs_root *root,
+static void reada_for_search(struct btrfs_root *root,
 			     struct btrfs_path *path,
 			     int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
 	struct btrfs_disk_key disk_key;
@@ -1447,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 }
 
 /*
+ * helper function for btrfs_search_slot.  The goal is to find a block
+ * in cache without setting the path to blocking.  If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada.  -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, struct btrfs_path *p,
+		      struct extent_buffer **eb_ret, int level, int slot,
+		      struct btrfs_key *key)
+{
+	u64 blocknr;
+	u64 gen;
+	u32 blocksize;
+	struct extent_buffer *b = *eb_ret;
+	struct extent_buffer *tmp;
+
+	blocknr = btrfs_node_blockptr(b, slot);
+	gen = btrfs_node_ptr_generation(b, slot);
+	blocksize = btrfs_level_size(root, level - 1);
+
+	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+		*eb_ret = tmp;
+		return 0;
+	}
+
+	/*
+	 * reduce lock contention at high levels
+	 * of the btree by dropping locks before
+	 * we read.
+	 */
+	btrfs_release_path(NULL, p);
+	if (tmp)
+		free_extent_buffer(tmp);
+	if (p->reada)
+		reada_for_search(root, p, level, slot, key->objectid);
+
+	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	if (tmp)
+		free_extent_buffer(tmp);
+	return -EAGAIN;
+}
+
+/*
+ * helper function for btrfs_search_slot.  This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned.  If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *p,
+		       struct extent_buffer *b, int level, int ins_len)
+{
+	int ret;
+	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+		int sret;
+
+		sret = reada_for_balance(root, p, level);
+		if (sret)
+			goto again;
+
+		btrfs_set_path_blocking(p);
+		sret = split_node(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL);
+
+		BUG_ON(sret > 0);
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+	} else if (ins_len < 0 && btrfs_header_nritems(b) <
+		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+		int sret;
+
+		sret = reada_for_balance(root, p, level);
+		if (sret)
+			goto again;
+
+		btrfs_set_path_blocking(p);
+		sret = balance_level(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL);
+
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+		if (!b) {
+			btrfs_release_path(NULL, p);
+			goto again;
+		}
+		BUG_ON(btrfs_header_nritems(b) == 1);
+	}
+	return 0;
+
+again:
+	ret = -EAGAIN;
+done:
+	return ret;
+}
+
+/*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
  * level of the path (level 0)
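Note: both new helpers use the same contract — -EAGAIN means "I had to release your path; restart the descent". The caller's side of the contract reduces to this schematic shape (the descend step is abbreviated):

	again:
		b = btrfs_lock_root_node(root);		/* fresh descent */
		...
		ret = setup_nodes_for_search(trans, root, p, b, level, ins_len);
		if (ret == -EAGAIN)
			goto again;	/* path was released; no locks held */
		else if (ret)
			goto done;	/* hard error */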
@@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
-	struct extent_buffer *tmp;
 	int slot;
 	int ret;
 	int level;
-	int should_reada = p->reada;
 	int lowest_unlock = 1;
-	int blocksize;
 	u8 lowest_level = 0;
-	u64 blocknr;
-	u64 gen;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -1502,7 +1608,11 @@ again:
 		if (cow) {
 			int wret;
 
-			/* is a cow on this block not required */
+			/*
+			 * if we don't really need to cow this block
+			 * then we don't want to set the path blocking,
+			 * so we test it here
+			 */
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -1557,51 +1667,15 @@ cow_done:
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if ((p->search_for_split || ins_len > 0) &&
-			    btrfs_header_nritems(b) >=
-			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret;
-
-				sret = reada_for_balance(root, p, level);
-				if (sret)
-					goto again;
-
-				btrfs_set_path_blocking(p);
-				sret = split_node(trans, root, p, level);
-				btrfs_clear_path_blocking(p, NULL);
-
-				BUG_ON(sret > 0);
-				if (sret) {
-					ret = sret;
-					goto done;
-				}
-				b = p->nodes[level];
-				slot = p->slots[level];
-			} else if (ins_len < 0 &&
-				   btrfs_header_nritems(b) <
-				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
-				int sret;
-
-				sret = reada_for_balance(root, p, level);
-				if (sret)
-					goto again;
-
-				btrfs_set_path_blocking(p);
-				sret = balance_level(trans, root, p, level);
-				btrfs_clear_path_blocking(p, NULL);
+			ret = setup_nodes_for_search(trans, root, p, b, level,
+						     ins_len);
+			if (ret == -EAGAIN)
+				goto again;
+			else if (ret)
+				goto done;
+			b = p->nodes[level];
+			slot = p->slots[level];
 
-				if (sret) {
-					ret = sret;
-					goto done;
-				}
-				b = p->nodes[level];
-				if (!b) {
-					btrfs_release_path(NULL, p);
-					goto again;
-				}
-				slot = p->slots[level];
-				BUG_ON(btrfs_header_nritems(b) == 1);
-			}
 			unlock_up(p, level, lowest_unlock);
 
 			/* this is only true while dropping a snapshot */
@@ -1610,44 +1684,11 @@ cow_done:
 				goto done;
 			}
 
-			blocknr = btrfs_node_blockptr(b, slot);
-			gen = btrfs_node_ptr_generation(b, slot);
-			blocksize = btrfs_level_size(root, level - 1);
+			ret = read_block_for_search(trans, root, p,
+						    &b, level, slot, key);
+			if (ret == -EAGAIN)
+				goto again;
 
-			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-				b = tmp;
-			} else {
-				/*
-				 * reduce lock contention at high levels
-				 * of the btree by dropping locks before
-				 * we read.
-				 */
-				if (level > 0) {
-					btrfs_release_path(NULL, p);
-					if (tmp)
-						free_extent_buffer(tmp);
-					if (should_reada)
-						reada_for_search(root, p,
-								 level, slot,
-								 key->objectid);
-
-					tmp = read_tree_block(root, blocknr,
-							      blocksize, gen);
-					if (tmp)
-						free_extent_buffer(tmp);
-					goto again;
-				} else {
-					btrfs_set_path_blocking(p);
-					if (tmp)
-						free_extent_buffer(tmp);
-					if (should_reada)
-						reada_for_search(root, p,
-								 level, slot,
-								 key->objectid);
-					b = read_node_slot(root, b, slot);
-				}
-			}
 			if (!p->skip_locking) {
 				int lret;
 
@@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(!path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
-	if (slot > nritems)
-		BUG();
+	BUG_ON(slot > nritems);
 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
@@ -4086,28 +4126,44 @@ next:
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
-	int level = 1;
+	int level;
 	struct extent_buffer *c;
-	struct extent_buffer *next = NULL;
+	struct extent_buffer *next;
 	struct btrfs_key key;
 	u32 nritems;
 	int ret;
+	int old_spinning = path->leave_spinning;
+	int force_blocking = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
 		return 1;
 
-	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+	/*
+	 * we take the blocks in an order that upsets lockdep.  Using
+	 * blocking mode is the only way around it.
+	 */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	force_blocking = 1;
+#endif
 
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+	level = 1;
+	next = NULL;
 	btrfs_release_path(root, path);
+
 	path->keep_locks = 1;
+
+	if (!force_blocking)
+		path->leave_spinning = 1;
+
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
 
 	if (ret < 0)
 		return ret;
 
-	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
 		path->slots[0]++;
+		ret = 0;
 		goto done;
 	}
 
 	while (level < BTRFS_MAX_LEVEL) {
-		if (!path->nodes[level])
-			return 1;
+		if (!path->nodes[level]) {
+			ret = 1;
+			goto done;
+		}
 
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL)
-				return 1;
+			if (level == BTRFS_MAX_LEVEL) {
+				ret = 1;
+				goto done;
+			}
 			continue;
 		}
 
@@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
-		/* the path was set to blocking above */
-		if (level == 1 && (path->locks[1] || path->skip_locking) &&
-		    path->reada)
-			reada_for_search(root, path, level, slot, 0);
+		next = c;
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    slot, &key);
+		if (ret == -EAGAIN)
+			goto again;
 
-		next = read_node_slot(root, c, slot);
 		if (!path->skip_locking) {
-			btrfs_assert_tree_locked(c);
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
+			ret = btrfs_try_spin_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next);
+			}
+			if (force_blocking)
+				btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (path->locks[level])
 			btrfs_tree_unlock(c);
+
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
 			path->locks[level] = 1;
+
 		if (!level)
 			break;
 
-		btrfs_set_path_blocking(path);
-		if (level == 1 && path->locks[1] && path->reada)
-			reada_for_search(root, path, level, slot, 0);
-		next = read_node_slot(root, next, 0);
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    0, &key);
+		if (ret == -EAGAIN)
+			goto again;
+
 		if (!path->skip_locking) {
 			btrfs_assert_tree_locked(path->nodes[level]);
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
+			ret = btrfs_try_spin_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next);
+			}
+			if (force_blocking)
+				btrfs_set_lock_blocking(next);
 		}
 	}
+	ret = 0;
done:
 	unlock_up(path, 0, 1);
-	return 0;
+	path->leave_spinning = old_spinning;
+	if (!old_spinning)
+		btrfs_set_path_blocking(path);
+
+	return ret;
 }
 
 /*
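Note: both loops in btrfs_next_leaf() now take the child lock opportunistically. Distilled from the hunks above, the shape of the pattern is: try the cheap spinning lock first, and only on contention flip the whole path to blocking, take the lock the slow way, then restore spinning unless lockdep (force_blocking) forbids it:

	if (!btrfs_try_spin_lock(next)) {
		btrfs_set_path_blocking(path);	/* we are about to sleep */
		btrfs_tree_lock(next);
		if (!force_blocking)
			btrfs_clear_path_blocking(path, next);
	}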
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9417713542a2..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_MAX		9
 
 /*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout. objectid corresonds to the inode number. The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
  *
  * offset is the starting byte offset for this key in the stream.
  *
@@ -200,7 +203,7 @@ struct btrfs_dev_item {
 
 	/*
 	 * starting byte of this partition on the device,
-	 * to allowr for stripe alignment in the future
+	 * to allow for stripe alignment in the future
 	 */
 	__le64 start_offset;
 
@@ -633,18 +636,35 @@ struct btrfs_space_info {
 	struct rw_semaphore groups_sem;
 };
 
-struct btrfs_free_space {
-	struct rb_node bytes_index;
-	struct rb_node offset_index;
-	u64 offset;
-	u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+	spinlock_t lock;
+	spinlock_t refill_lock;
+	struct rb_root root;
+
+	/* largest extent in this cluster */
+	u64 max_size;
+
+	/* first extent starting offset */
+	u64 window_start;
+
+	struct btrfs_block_group_cache *block_group;
+	/*
+	 * when a cluster is allocated from a block group, we put the
+	 * cluster onto a list in the block group so that it can
+	 * be freed before the block group is freed.
+	 */
+	struct list_head block_group_list;
 };
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
-	struct mutex alloc_mutex;
 	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
@@ -656,6 +676,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
+	spinlock_t tree_lock;
 	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
 
@@ -667,6 +688,11 @@ struct btrfs_block_group_cache {
 
 	/* usage count */
 	atomic_t count;
+
+	/* List of struct btrfs_free_clusters for this block group.
+	 * Today it will only have one thing on it, but that may change
+	 */
+	struct list_head cluster_list;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -728,7 +754,6 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
@@ -839,8 +864,12 @@ struct btrfs_fs_info {
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
-	u64 last_alloc;
-	u64 last_data_alloc;
+
+	/* data_alloc_cluster is only used in ssd mode */
+	struct btrfs_free_cluster data_alloc_cluster;
+
+	/* all metadata allocations go through this cluster */
+	struct btrfs_free_cluster meta_alloc_cluster;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
@@ -932,7 +961,6 @@ struct btrfs_root {
 };
 
 /*
-
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -963,7 +991,7 @@ struct btrfs_root {
 #define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
- * root items point to tree roots.  There are typically in the root
+ * root items point to tree roots.  They are typically in the root
  * tree used by the super block to find all the other trees
  */
 #define BTRFS_ROOT_ITEM_KEY	132
@@ -1010,6 +1038,8 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
+#define BTRFS_MOUNT_NOTREELOG		(1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
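Note: the mount options are plain bit flags, and the ##-pasting macros keep call sites terse. A usage sketch — the Opt_notreelog parser token is shown from memory as an illustration, not quoted from fs/btrfs/super.c:

	case Opt_notreelog:
		btrfs_set_opt(info->mount_opt, NOTREELOG);	/* sets (1 << 6) */
		break;
	...
	if (btrfs_test_opt(root, NOTREELOG))
		/* tree logging disabled: fall back to a full commit */;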
@@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-			      u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-				 u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-				   *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-					       *block_group, u64 offset,
-					       u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
-			   u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cbf7dc8ae3ec..d6c01c096a40 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -18,7 +18,6 @@
 
 #include <linux/sched.h>
 #include <linux/sort.h>
-#include <linux/ftrace.h>
 #include "ctree.h"
 #include "delayed-ref.h"
 #include "transaction.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92d73929d381..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
 #include "locking.h"
 #include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
 
 	ret = extent_range_uptodate(io_tree, start + length,
 				    start + buf_len - 1);
-	if (ret == 1)
-		return ret;
 	return ret;
 }
 
@@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+
+	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
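Note: btrfs_init_free_cluster() itself lands in the new free-space-cache.c (see the diffstat). Going only by the struct btrfs_free_cluster fields shown earlier, a plausible sketch of it — not the verbatim implementation — is:

	void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
	{
		spin_lock_init(&cluster->lock);
		spin_lock_init(&cluster->refill_lock);
		cluster->root.rb_node = NULL;	/* empty rbtree of free extents */
		cluster->max_size = 0;
		cluster->window_start = 0;
		cluster->block_group = NULL;
		INIT_LIST_HEAD(&cluster->block_group_list);
	}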
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5e7cae63d80..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "free-space-cache.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
-	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
-	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -291,8 +290,8 @@ next:
 			  block_group->key.objectid +
 			  block_group->key.offset);
 
-	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
+	remove_sb_from_cache(root, block_group);
 	ret = 0;
 err:
 	btrfs_free_path(path);
@@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 	return cache;
 }
 
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
 	if (atomic_dec_and_test(&cache->count))
 		kfree(cache);
@@ -399,12 +398,12 @@ again:
 		    div_factor(cache->key.offset, factor)) {
 			group_start = cache->key.objectid;
 			spin_unlock(&cache->lock);
-			put_block_group(cache);
+			btrfs_put_block_group(cache);
 			goto found;
 		}
 	}
 	spin_unlock(&cache->lock);
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 	cond_resched();
 }
 if (!wrapped) {
@@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	if (!block_group || block_group->ro)
 		readonly = 1;
 	if (block_group)
-		put_block_group(block_group);
+		btrfs_put_block_group(block_group);
 	return readonly;
 }
 
@@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				WARN_ON(ret);
 			}
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 		return 0;
 
 	bytenr = cache->key.objectid;
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 
 	return bytenr;
 }
@@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		clear_extent_dirty(&fs_info->pinned_extents,
 				   bytenr, bytenr + num - 1, GFP_NOFS);
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			if (cache->cached)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
-	mutex_lock(&root->fs_info->pinned_mutex);
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 
 	while (1) {
-		mutex_lock(&root->fs_info->pinned_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 		cond_resched();
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return ret;
 }
 
@@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	free_extent_buffer(buf);
 pinit:
 	btrfs_set_path_blocking(path);
-	mutex_lock(&root->fs_info->pinned_mutex);
 	/* unlocks the pinned mutex */
 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
@@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	 */
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-		mutex_lock(&root->fs_info->pinned_mutex);
-
 		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
@@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
-	u64 total_needed = num_bytes;
-	u64 *last_ptr = NULL;
-	u64 last_wanted = 0;
+	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
-	struct list_head *head = NULL, *cur = NULL;
-	int loop = 0;
-	int extra_loop = 0;
 	struct btrfs_space_info *space_info;
+	int last_ptr_loop = 0;
+	int loop = 0;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 	ins->objectid = 0;
 	ins->offset = 0;
 
+	space_info = __find_space_info(root->fs_info, data);
+
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
-		last_ptr = &root->fs_info->last_alloc;
+		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
 			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-		last_ptr = &root->fs_info->last_data_alloc;
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+		last_ptr = &root->fs_info->data_alloc_cluster;
+	}
 
 	if (last_ptr) {
-		if (*last_ptr) {
-			hint_byte = *last_ptr;
-			last_wanted = *last_ptr;
-		} else
-			empty_size += empty_cluster;
-	} else {
-		empty_cluster = 0;
+		spin_lock(&last_ptr->lock);
+		if (last_ptr->block_group)
+			hint_byte = last_ptr->window_start;
+		spin_unlock(&last_ptr->lock);
 	}
+
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
 
-	if (last_wanted && search_start != last_wanted) {
-		last_wanted = 0;
-		empty_size += empty_cluster;
+	if (!last_ptr) {
+		empty_cluster = 0;
+		loop = 1;
 	}
 
-	total_needed += empty_size;
-	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
-	if (!block_group)
-		block_group = btrfs_lookup_first_block_group(root->fs_info,
-							     search_start);
-	space_info = __find_space_info(root->fs_info, data);
+	if (search_start == hint_byte) {
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						       search_start);
+		if (block_group && block_group_bits(block_group, data)) {
+			down_read(&space_info->groups_sem);
+			goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
2609 2600
2601search:
2610 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
2611 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
2612 struct btrfs_free_space *free_space; 2604 u64 offset;
2613 /*
2614 * the only way this happens is if our hint points to a block
2615 * group that's not of the proper type, while looping this
2616 * should never happen
2617 */
2618 if (empty_size)
2619 extra_loop = 1;
2620 2605
2621 if (!block_group) 2606 atomic_inc(&block_group->count);
2622 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
2623 2608
2609have_block_group:
2624 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
2625 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
2626 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
2627 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
2628 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
2629 break; 2616 break;
2617 }
2630 } 2618 }
2631 2619
2632 mutex_lock(&block_group->alloc_mutex);
2633 if (unlikely(!block_group_bits(block_group, data)))
2634 goto new_group;
2635
2636 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
2637 goto new_group; 2621 goto loop;
2638 2622
2639 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
2640 total_needed); 2624 /*
2641 if (free_space) { 2625 * the refill lock keeps out other
2642 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
2643 u64 end = block_group->key.objectid + 2627 */
2644 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
2645 2636
2646 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group it does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
2647 2657
2648 /* move on to the next group */ 2658 /*
2649 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
2650 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2651 2663
2652 /* move on to the next group */ 2664 last_ptr_loop = 0;
2653 if (search_start + num_bytes > end)
2654 goto new_group;
2655 2665
2656 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
2657 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
2658 empty_size += empty_cluster; 2668 block_group, last_ptr,
2659 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
2660 /* 2672 /*
2661 * if search_start is still in this block group 2673 * now pull our allocation out of this
2662 * then we just re-search this block group 2674 * cluster
2663 */ 2675 */
2664 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
2665 search_start < end) { 2677 last_ptr, num_bytes,
2666 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
2667 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2668 } 2683 }
2669
2670 /* else we go to the next block group */
2671 goto new_group;
2672 } 2684 }
2673 2685 /*
2674 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
2675 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
2676 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
2677 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
2678 /* 2690 */
2679 * if search_start is still in this block group 2691 if (loop < 2) {
2680 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
2681 */ 2693 last_ptr);
2682 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
2683 search_start < end) { 2695 goto loop;
2684 mutex_unlock(&block_group->alloc_mutex);
2685 last_wanted = 0;
2686 continue;
2687 }
2688
2689 /* else we go to the next block group */
2690 goto new_group;
2691 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
2692 2699
2693 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
2694 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
2706
2707 /* move on to the next group */
2708 if (search_start + num_bytes >= search_end) {
2709 btrfs_add_free_space(block_group, offset, num_bytes);
2710 goto loop;
2711 }
2695 2712
2696 btrfs_remove_free_space_lock(block_group, search_start, 2713 /* move on to the next group */
2697 num_bytes); 2714 if (search_start + num_bytes >
2698 /* we are all good, lets return */ 2715 block_group->key.objectid + block_group->key.offset) {
2699 mutex_unlock(&block_group->alloc_mutex); 2716 btrfs_add_free_space(block_group, offset, num_bytes);
2700 break; 2717 goto loop;
2701 } 2718 }
2702new_group:
2703 mutex_unlock(&block_group->alloc_mutex);
2704 put_block_group(block_group);
2705 block_group = NULL;
2706new_group_no_lock:
2707 /* don't try to compare new allocations against the
2708 * last allocation any more
2709 */
2710 last_wanted = 0;
2711 2719
2712 /* 2720 if (exclude_nr > 0 &&
2713 * Here's how this works. 2721 (search_start + num_bytes > exclude_start &&
2714 * loop == 0: we were searching a block group via a hint 2722 search_start < exclude_start + exclude_nr)) {
2715 * and didn't find anything, so we start at 2723 search_start = exclude_start + exclude_nr;
2716 * the head of the block groups and keep searching 2724
2717 * loop == 1: we're searching through all of the block groups 2725 btrfs_add_free_space(block_group, offset, num_bytes);
2718 * if we hit the head again we have searched 2726 /*
2719 * all of the block groups for this space and we 2727 * if search_start is still in this block group
2720 * need to try and allocate, if we cant error out. 2728 * then we just re-search this block group
2721 * loop == 2: we allocated more space and are looping through
2722 * all of the block groups again.
2723 */
2724 if (loop == 0) {
2725 head = &space_info->block_groups;
2726 cur = head->next;
2727 loop++;
2728 } else if (loop == 1 && cur == head) {
2729 int keep_going;
2730
2731 /* at this point we give up on the empty_size
2732 * allocations and just try to allocate the min
2733 * space.
2734 *
2735 * The extra_loop field was set if an empty_size
2736 * allocation was attempted above, and if this
2737 * is try we need to try the loop again without
2738 * the additional empty_size.
2739 */ 2729 */
2740 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
2741 empty_size = 0; 2731 search_start < (block_group->key.objectid +
2742 keep_going = extra_loop; 2732 block_group->key.offset))
2743 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
2744 2736
2745 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
2746 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
2747 ret = do_chunk_alloc(trans, root, num_bytes + 2739
2748 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
2749 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
2750 if (ret < 0) 2742 search_start - offset);
2751 goto loop_check; 2743 BUG_ON(offset > search_start);
2752 head = &space_info->block_groups; 2744
2753 /* 2745 /* we are all good, lets return */
2754 * we've allocated a new chunk, keep 2746 break;
2755 * trying 2747loop:
2756 */ 2748 btrfs_put_block_group(block_group);
2757 keep_going = 1; 2749 }
2758 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
2759 } else if (!allowed_chunk_alloc) { 2751
2760 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
2761 } 2753 * loop == 1, try again after forcing a chunk allocation
2762loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
2763 if (keep_going) { 2755 */
2764 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
2765 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
2766 } else { 2758 if (loop >= 2) {
2767 break; 2759 empty_size = 0;
2768 } 2760 empty_cluster = 0;
2769 } else if (cur == head) {
2770 break;
2771 } 2761 }
2772 2762
2773 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
2774 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
2775 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
2776 2770
2777 search_start = block_group->key.objectid; 2771 if (loop < 3) {
2778 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
2779 } 2778 }
2780 2779
2781 /* we found what we needed */ 2780 /* we found what we needed */
@@ -2783,21 +2782,10 @@ loop_check:
2783 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2784 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
2785 2784
2786 if (last_ptr) 2785 btrfs_put_block_group(block_group);
2787 *last_ptr = ins->objectid + ins->offset;
2788 ret = 0; 2786 ret = 0;
2789 } else if (!ret) {
2790 printk(KERN_ERR "btrfs searching for %llu bytes, "
2791 "num_bytes %llu, loop %d, allowed_alloc %d\n",
2792 (unsigned long long)total_needed,
2793 (unsigned long long)num_bytes,
2794 loop, allowed_chunk_alloc);
2795 ret = -ENOSPC;
2796 } 2787 }
2797 if (block_group)
2798 put_block_group(block_group);
2799 2788
2800 up_read(&space_info->groups_sem);
2801 return ret; 2789 return ret;
2802} 2790}
2803 2791
@@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2902 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
2903 2891
2904 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
2905 put_block_group(cache); 2893 btrfs_put_block_group(cache);
2906 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
2907 2895
2908 return ret; 2896 return ret;
@@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3040 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
3041 ins->offset); 3029 ins->offset);
3042 BUG_ON(ret); 3030 BUG_ON(ret);
3043 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
3044 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3045 ref_generation, owner, ins, 1); 3033 ref_generation, owner, ins, 1);
3046 return ret; 3034 return ret;
@@ -5729,7 +5717,7 @@ next:
5729 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
5730 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5731 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
5732 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
5733 ret = 0; 5721 ret = 0;
5734out: 5722out:
5735 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5856 5844
5857 atomic_set(&cache->count, 1); 5845 atomic_set(&cache->count, 1);
5858 spin_lock_init(&cache->lock); 5846 spin_lock_init(&cache->lock);
5859 mutex_init(&cache->alloc_mutex); 5847 spin_lock_init(&cache->tree_lock);
5860 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
5861 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
5862 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
5863 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
5864 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5912 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5901 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5913 atomic_set(&cache->count, 1); 5902 atomic_set(&cache->count, 1);
5914 spin_lock_init(&cache->lock); 5903 spin_lock_init(&cache->lock);
5915 mutex_init(&cache->alloc_mutex); 5904 spin_lock_init(&cache->tree_lock);
5916 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
5917 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
5918 5908
5919 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
5920 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5974 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
5975 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
5976 5966
5977 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
5978 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
5979 5969
5980 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5981 if (ret > 0) 5971 if (ret > 0)
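
The rewritten find_free_extent() above reduces to a small retry ladder: pass 0 walks every block group trying the cluster plus the full empty_size/empty_cluster padding, pass 1 retries after forcing a chunk allocation, and the last pass drops the padding before settling on -ENOSPC. A minimal userspace sketch of that control flow, with hypothetical helpers standing in for the real btrfs allocation paths:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-ins for the real allocation paths */
static bool try_cluster_alloc(void) { return false; }
static bool try_unclustered_alloc(int padding) { return padding == 0; }
static void force_chunk_alloc(void) { }

static int find_free_extent_model(int empty_size, int empty_cluster,
                                  bool allowed_chunk_alloc)
{
        int loop = 0;

search:
        if (try_cluster_alloc() ||
            try_unclustered_alloc(empty_size + empty_cluster))
                return 0;       /* found space */

        if (loop < 3 && (empty_size || empty_cluster || allowed_chunk_alloc)) {
                if (loop >= 2) {
                        /* last resort: retry without any padding */
                        empty_size = 0;
                        empty_cluster = 0;
                }
                if (allowed_chunk_alloc) {
                        force_chunk_alloc();
                        allowed_chunk_alloc = false;
                }
                loop++;
                goto search;
        }
        return -1;      /* -ENOSPC */
}

int main(void)
{
        printf("%d\n", find_free_extent_model(4096, 4096, true));
        return 0;
}

With the padding-bearing passes failing, the model only succeeds on the final pass, mirroring how the kernel loop degrades to a plain minimum-size allocation before giving up.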
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 08085af089e2..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2884 disko = 0;
2885 flags = 0; 2885 flags = 0;
2886 2886
2887 switch (em->block_start) { 2887 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1; 2888 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST; 2889 flags |= FIEMAP_EXTENT_LAST;
2891 break; 2890 } else if (em->block_start == EXTENT_MAP_HOLE) {
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN; 2891 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break; 2892 } else if (em->block_start == EXTENT_MAP_INLINE) {
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED); 2894 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break; 2895 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC | 2896 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN); 2897 FIEMAP_EXTENT_UNKNOWN);
2902 break; 2898 } else {
2903 default:
2904 disko = em->block_start; 2899 disko = em->block_start;
2905 break;
2906 } 2900 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2902 flags |= FIEMAP_EXTENT_ENCODED;
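
The switch-to-if/else conversion in extent_fiemap() above is more than style: em->block_start is a u64 and the EXTENT_MAP_* markers are 64-bit sentinels, and a switch over a 64-bit value can make gcc on 32-bit targets emit a call to the libgcc helper __ucmpdi2, which the kernel does not link against. Chained comparisons compile to plain tests everywhere. A standalone sketch of the same dispatch (sentinel values modeled after the btrfs headers):

#include <stdint.h>
#include <stdio.h>

#define EXTENT_MAP_LAST_BYTE ((uint64_t)-4)
#define EXTENT_MAP_HOLE      ((uint64_t)-3)
#define EXTENT_MAP_INLINE    ((uint64_t)-2)
#define EXTENT_MAP_DELALLOC  ((uint64_t)-1)

/* if/else avoids a 64-bit switch and the __ucmpdi2 helper it can pull in */
static const char *classify(uint64_t block_start)
{
        if (block_start == EXTENT_MAP_LAST_BYTE)
                return "last byte";
        else if (block_start == EXTENT_MAP_HOLE)
                return "hole";
        else if (block_start == EXTENT_MAP_INLINE)
                return "inline";
        else if (block_start == EXTENT_MAP_DELALLOC)
                return "delalloc";
        return "mapped";
}

int main(void)
{
        printf("%s\n", classify(EXTENT_MAP_HOLE));
        return 0;
}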
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space, we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes in size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes in size, and if it's not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
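
The fuzzy flag described above is easiest to check against a small model. A userspace sketch over a sorted array, standing in for the offset-indexed rbtree (the struct and loop are illustrative, not the kernel types):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct free_space { uint64_t offset, bytes; };

/*
 * fuzzy == 0: only an entry starting exactly at 'offset' qualifies.
 * fuzzy == 1: an entry containing 'offset' also qualifies, as does
 * the closest large-enough entry starting after it.
 */
static struct free_space *search(struct free_space *e, size_t n,
                                 uint64_t offset, uint64_t bytes, int fuzzy)
{
        struct free_space *ret = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (e[i].offset == offset && e[i].bytes >= bytes)
                        return &e[i];
                if (!fuzzy)
                        continue;
                if (e[i].offset < offset &&
                    e[i].offset + e[i].bytes - 1 >= offset &&
                    e[i].bytes >= bytes)
                        return &e[i];
                if (e[i].offset > offset && e[i].bytes >= bytes &&
                    (!ret || e[i].offset < ret->offset))
                        ret = &e[i];
        }
        return ret;
}

int main(void)
{
        struct free_space map[] = { { 0, 4096 }, { 8192, 16384 } };
        struct free_space *hit = search(map, 2, 4096, 4096, 1);

        printf("%llu\n", hit ? (unsigned long long)hit->offset : 0ULL);
        return 0;
}

With fuzzy set, the lookup for offset 4096 falls through to the entry at 8192; with fuzzy clear it would return NULL.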
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while (node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
498 /* someone else has already freed it don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while (1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least bytes free and up to empty_size + bytes free.
571 * We might not find them all in one contiguous area.
572 *
573 * returns zero and sets up cluster if things worked out, otherwise
574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
592 /* for metadata, allow allocations with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
596 * flushing out the delayed refs, it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while (1) {
628 /* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while (1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
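
The window walk inside btrfs_find_space_cluster() above is the heart of the new clustering: take free extents in offset order, accumulate them into a window until it covers bytes + empty_size, and restart the window whenever the next extent sits too far from the window start. A minimal userspace model of just that walk (array instead of rbtree, and without the min_bytes retry logic):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct free_space { uint64_t offset, bytes; };

static int build_cluster(struct free_space *e, size_t n,
                         uint64_t bytes, uint64_t empty_size,
                         uint64_t *window_start)
{
        uint64_t target = bytes + empty_size;
        uint64_t window_free = 0;
        size_t first = 0, i;

        for (i = 0; i < n; i++) {
                if (window_free == 0 ||
                    e[i].offset - e[first].offset > target * 2) {
                        /* window empty or too sparse: restart here */
                        first = i;
                        window_free = e[i].bytes;
                } else {
                        window_free += e[i].bytes;
                }
                if (window_free >= target) {
                        *window_start = e[first].offset;
                        return 0;
                }
        }
        return -1;      /* -ENOSPC in the kernel version */
}

int main(void)
{
        struct free_space map[] = { { 0, 4096 }, { 4096, 4096 },
                                    { 1048576, 8192 } };
        uint64_t start;

        if (!build_cluster(map, 3, 4096, 4096, &start))
                printf("window at %llu\n", (unsigned long long)start);
        return 0;
}

A far-away extent resets the window rather than inflating it, which is why the kernel version tracks window_start and gives up on overly sparse regions.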
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
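
The header above exposes the cluster API that find_free_extent() drives with two locks: a short-lived lock around the cluster contents, and a separate refill_lock so only one task at a time runs the slow path that rebuilds a cluster. A userspace sketch of that two-lock split, with pthread mutexes standing in for kernel spinlocks and an assumed (not copied) field layout:

#include <pthread.h>
#include <stdio.h>

struct free_cluster {
        pthread_mutex_t lock;           /* guards the fields below */
        pthread_mutex_t refill_lock;    /* serializes rebuilds only */
        unsigned long window_start;
        unsigned long free_bytes;
};

static unsigned long alloc_fast(struct free_cluster *c, unsigned long bytes)
{
        unsigned long ret = 0;

        pthread_mutex_lock(&c->lock);
        if (bytes <= c->free_bytes) {
                ret = c->window_start;
                c->window_start += bytes;
                c->free_bytes -= bytes;
        }
        pthread_mutex_unlock(&c->lock);
        return ret;
}

static void refill_slow(struct free_cluster *c)
{
        pthread_mutex_lock(&c->refill_lock);
        /* the long search would happen here without holding c->lock */
        pthread_mutex_lock(&c->lock);
        c->window_start = 1048576;      /* pretend we found a window */
        c->free_bytes = 1048576;
        pthread_mutex_unlock(&c->lock);
        pthread_mutex_unlock(&c->refill_lock);
}

int main(void)
{
        struct free_cluster c = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0, 0
        };

        refill_slow(&c);
        printf("%lu\n", alloc_fast(&c, 4096));
        return 0;
}

Allocations never wait behind a refill, and refills never race each other, which matches the comment in find_free_extent() that the refill lock "keeps out other people trying to start a new cluster".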
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06d8db5afb08..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3481 3481
3482 if (dir) { 3482 if (dir) {
3483 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3484 if (ret) 3484 if (ret) {
3485 iput(inode);
3485 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3486 } 3488 }
3487 /* 3489 /*
3488 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3565,6 +3567,7 @@ fail:
3565 if (dir) 3567 if (dir)
3566 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3567 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3568 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3569} 3572}
3570 3573
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a5310c0f41e2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 396 return ret;
386} 397}
387 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
388static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
389{ 435{
390 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
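
btrfs_show_options() above replaces generic_show_options(), so /proc/mounts now reflects the parsed mount state instead of whatever string was last passed in. The convention visible in the hunk is to print only values that differ from their defaults. A compact userspace model of that convention (fields and defaults illustrative):

#include <stdio.h>

struct opts {
        int compress;                   /* default off */
        int ssd;                        /* default off */
        unsigned long long max_inline;  /* default 8192 * 1024 */
};

static void show_options(const struct opts *o)
{
        /* emit only non-default options, so the line stays short */
        if (o->compress)
                printf(",compress");
        if (o->ssd)
                printf(",ssd");
        if (o->max_inline != 8192 * 1024ULL)
                printf(",max_inline=%llu", o->max_inline);
        printf("\n");
}

int main(void)
{
        struct opts o = { 1, 0, 4096 };

        show_options(&o);
        return 0;
}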
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 664782c6a2df..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -974,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
974 int ret; 972 int ret;
975 int should_grow = 0; 973 int should_grow = 0;
976 unsigned long now = get_seconds(); 974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
977 976
978 btrfs_run_ordered_operations(root, 0); 977 btrfs_run_ordered_operations(root, 0);
979 978
@@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1053 1052
1054 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
1055 1054
1056 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
1057 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
1058 BUG_ON(ret); 1059 BUG_ON(ret);
1059 } 1060 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc9b87a7975b..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
262 struct extent_buffer *eb, 262 struct extent_buffer *eb,
263 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
264{ 264{
265 if (wc->pin) { 265 if (wc->pin)
266 mutex_lock(&log->fs_info->pinned_mutex);
267 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
268 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
269 }
270 268
271 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
272 if (wc->write) 270 if (wc->write)
@@ -1224,8 +1222,7 @@ insert:
1224 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1225 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1226 1224
1227 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1228 BUG();
1229 goto out; 1226 goto out;
1230} 1227}
1231 1228
@@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2900 2897
2901 sb = inode->i_sb; 2898 sb = inode->i_sb;
2902 2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2903 if (root->fs_info->last_trans_log_full_commit > 2905 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) { 2906 root->fs_info->last_trans_committed) {
2905 ret = 1; 2907 ret = 1;
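In the tree-log.c hunk, returning 1 from btrfs_log_inode_parent() tells the caller the tree-log could not be used, so the fsync falls back to a full transaction commit; the new NOTREELOG mount option simply forces that path unconditionally. A reduced sketch of the early-exit contract (names simplified from the kernel's):

    #include <stdbool.h>

    /* Sketch: 0 = fsync satisfied via the tree-log, 1 = caller must do a
     * full transaction commit instead. */
    static int try_tree_log(bool notreelog_opt, bool log_needs_full_commit)
    {
            if (notreelog_opt)
                    return 1;        /* logging disabled by mount option */
            if (log_needs_full_commit)
                    return 1;        /* log already invalidated this transaction */
            return 0;                /* safe to use the tree-log */
    }

    int main(void)
    {
            /* with -o notreelog every fsync becomes a full commit */
            return try_tree_log(true, false);   /* -> 1 */
    }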
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
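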
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
221 * This code does two great things, it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
 268 * This will help anyone who is waiting on the IO; they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
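The scheduler-poking heuristic added to run_scheduled_bios() keeps the submission loop spinning while the current thread still holds a fresh elevator batch, so it can queue more bios without blocking; only once the batch goes stale does it fall through and requeue the work. A user-space approximation of that test (HZ and the io_context fields are stand-ins for the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    #define HZ 250UL                        /* hypothetical tick rate */

    /* jiffies-style wraparound-safe "a is before b" comparison */
    static bool time_before_j(unsigned long a, unsigned long b)
    {
            return (long)(a - b) < 0;
    }

    /* Sketch of the batching test: keep looping rather than blocking while
     * we are still inside the elevator batch we last waited on and that
     * batch is younger than HZ/50 ticks. */
    static bool still_in_batch(unsigned long now, int nr_batch_requests,
                               unsigned long ioc_last_waited,
                               unsigned long last_waited)
    {
            return nr_batch_requests > 0 &&
                   time_before_j(now, ioc_last_waited + HZ / 50UL) &&
                   (last_waited == 0 || ioc_last_waited == last_waited);
    }

    int main(void)
    {
            /* batch granted 2 ticks ago, 4 requests left: keep going */
            printf("%d\n", still_in_batch(1002, 4, 1000, 0));
            /* batch older than HZ/50 ticks (5 at HZ=250): stop and requeue */
            printf("%d\n", still_in_batch(1006, 4, 1000, 1000));
            return 0;
    }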
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index f5f8b15a6e40..6e35762b6169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
199 head = page_buffers(page); 199 head = page_buffers(page);
200 bh = head; 200 bh = head;
201 do { 201 do {
202 if (bh->b_blocknr == block) { 202 if (!buffer_mapped(bh))
203 all_mapped = 0;
204 else if (bh->b_blocknr == block) {
203 ret = bh; 205 ret = bh;
204 get_bh(bh); 206 get_bh(bh);
205 goto out_unlock; 207 goto out_unlock;
206 } 208 }
207 if (!buffer_mapped(bh))
208 all_mapped = 0;
209 bh = bh->b_this_page; 209 bh = bh->b_this_page;
210 } while (bh != head); 210 } while (bh != head);
211 211
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
737{ 737{
738 struct buffer_head *bh; 738 struct buffer_head *bh;
739 struct list_head tmp; 739 struct list_head tmp;
740 struct address_space *mapping; 740 struct address_space *mapping, *prev_mapping = NULL;
741 int err = 0, err2; 741 int err = 0, err2;
742 742
743 INIT_LIST_HEAD(&tmp); 743 INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
762 * contents - it is a noop if I/O is still in 762 * contents - it is a noop if I/O is still in
763 * flight on potentially older contents. 763 * flight on potentially older contents.
764 */ 764 */
765 ll_rw_block(SWRITE_SYNC, 1, &bh); 765 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
766
767 /*
768 * Kick off IO for the previous mapping. Note
 769 * that we will not run the very last mapping;
 770 * wait_on_buffer() will do that for us
771 * through sync_buffer().
772 */
773 if (prev_mapping && prev_mapping != mapping)
774 blk_run_address_space(prev_mapping);
775 prev_mapping = mapping;
776
766 brelse(bh); 777 brelse(bh);
767 spin_lock(lock); 778 spin_lock(lock);
768 } 779 }
@@ -1595,6 +1606,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1595 struct buffer_head *bh, *head; 1606 struct buffer_head *bh, *head;
1596 const unsigned blocksize = 1 << inode->i_blkbits; 1607 const unsigned blocksize = 1 << inode->i_blkbits;
1597 int nr_underway = 0; 1608 int nr_underway = 0;
1609 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
1598 1610
1599 BUG_ON(!PageLocked(page)); 1611 BUG_ON(!PageLocked(page));
1600 1612
@@ -1686,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1686 do { 1698 do {
1687 struct buffer_head *next = bh->b_this_page; 1699 struct buffer_head *next = bh->b_this_page;
1688 if (buffer_async_write(bh)) { 1700 if (buffer_async_write(bh)) {
1689 submit_bh(WRITE, bh); 1701 submit_bh(write_op, bh);
1690 nr_underway++; 1702 nr_underway++;
1691 } 1703 }
1692 bh = next; 1704 bh = next;
@@ -1740,7 +1752,7 @@ recover:
1740 struct buffer_head *next = bh->b_this_page; 1752 struct buffer_head *next = bh->b_this_page;
1741 if (buffer_async_write(bh)) { 1753 if (buffer_async_write(bh)) {
1742 clear_buffer_dirty(bh); 1754 clear_buffer_dirty(bh);
1743 submit_bh(WRITE, bh); 1755 submit_bh(write_op, bh);
1744 nr_underway++; 1756 nr_underway++;
1745 } 1757 }
1746 bh = next; 1758 bh = next;
@@ -2956,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2956 for (i = 0; i < nr; i++) { 2968 for (i = 0; i < nr; i++) {
2957 struct buffer_head *bh = bhs[i]; 2969 struct buffer_head *bh = bhs[i];
2958 2970
2959 if (rw == SWRITE || rw == SWRITE_SYNC) 2971 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2960 lock_buffer(bh); 2972 lock_buffer(bh);
2961 else if (!trylock_buffer(bh)) 2973 else if (!trylock_buffer(bh))
2962 continue; 2974 continue;
2963 2975
2964 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 2976 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
2977 rw == SWRITE_SYNC_PLUG) {
2965 if (test_clear_buffer_dirty(bh)) { 2978 if (test_clear_buffer_dirty(bh)) {
2966 bh->b_end_io = end_buffer_write_sync; 2979 bh->b_end_io = end_buffer_write_sync;
2967 get_bh(bh); 2980 get_bh(bh);
@@ -2997,7 +3010,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2997 if (test_clear_buffer_dirty(bh)) { 3010 if (test_clear_buffer_dirty(bh)) {
2998 get_bh(bh); 3011 get_bh(bh);
2999 bh->b_end_io = end_buffer_write_sync; 3012 bh->b_end_io = end_buffer_write_sync;
3000 ret = submit_bh(WRITE, bh); 3013 ret = submit_bh(WRITE_SYNC, bh);
3001 wait_on_buffer(bh); 3014 wait_on_buffer(bh);
3002 if (buffer_eopnotsupp(bh)) { 3015 if (buffer_eopnotsupp(bh)) {
3003 clear_buffer_eopnotsupp(bh); 3016 clear_buffer_eopnotsupp(bh);
@@ -3315,7 +3328,6 @@ EXPORT_SYMBOL(cont_write_begin);
3315EXPORT_SYMBOL(end_buffer_read_sync); 3328EXPORT_SYMBOL(end_buffer_read_sync);
3316EXPORT_SYMBOL(end_buffer_write_sync); 3329EXPORT_SYMBOL(end_buffer_write_sync);
3317EXPORT_SYMBOL(file_fsync); 3330EXPORT_SYMBOL(file_fsync);
3318EXPORT_SYMBOL(fsync_bdev);
3319EXPORT_SYMBOL(generic_block_bmap); 3331EXPORT_SYMBOL(generic_block_bmap);
3320EXPORT_SYMBOL(generic_cont_expand_simple); 3332EXPORT_SYMBOL(generic_cont_expand_simple);
3321EXPORT_SYMBOL(init_buffer); 3333EXPORT_SYMBOL(init_buffer);
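Two of the buffer.c hunks deserve a closer look. In __find_get_block_slow() the buffer-mapped test now runs before the block-number comparison, so a buffer whose b_blocknr is stale garbage (because it was never mapped) can no longer be returned as a match; fsync_buffers_list(), meanwhile, submits under a plug and kicks each mapping's queue as soon as the loop moves on to the next one. A distilled sketch of the corrected walk, with the struct reduced to the fields that matter:

    #include <stddef.h>

    struct bh {
            unsigned long b_blocknr;
            int           mapped;
            struct bh    *b_this_page;     /* circular list of page buffers */
    };

    /* Sketch of the fixed loop in __find_get_block_slow(): test the mapped
     * state first, so only a mapped buffer can match on block number. */
    static struct bh *find_block(struct bh *head, unsigned long block,
                                 int *all_mapped)
    {
            struct bh *bh = head;

            *all_mapped = 1;
            do {
                    if (!bh->mapped)
                            *all_mapped = 0;
                    else if (bh->b_blocknr == block)
                            return bh;
                    bh = bh->b_this_page;
            } while (bh != head);
            return NULL;
    }

    int main(void)
    {
            struct bh a, b;
            int all_mapped;

            a = (struct bh){ .b_blocknr = 7, .mapped = 0, .b_this_page = &b };
            b = (struct bh){ .b_blocknr = 7, .mapped = 1, .b_this_page = &a };

            /* unmapped 'a' is skipped even though its stale b_blocknr
             * matches; only the mapped buffer 'b' is returned */
            return find_block(&a, 7, &all_mapped) == &b ? 0 : 1;
    }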
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
1
2config CACHEFILES
3 tristate "Filesystem caching on files"
4 depends on FSCACHE && BLOCK
5 help
6 This permits use of a mounted filesystem as a cache for other
7 filesystems - primarily networking filesystems - thus allowing fast
8 local disk to enhance the speed of slower devices.
9
10 See Documentation/filesystems/caching/cachefiles.txt for more
11 information.
12
13config CACHEFILES_DEBUG
14 bool "Debug CacheFiles"
15 depends on CACHEFILES
16 help
17 This permits debugging to be dynamically enabled in the filesystem
18 caching on files module. If this is set, the debugging output may be
 19 enabled by setting bits in /sys/module/cachefiles/parameters/debug or
20 by including a debugging specifier in /etc/cachefilesd.conf.
21
22config CACHEFILES_HISTOGRAM
23 bool "Gather latency information on CacheFiles"
24 depends on CACHEFILES && PROC_FS
25 help
26
 27 This option causes latency information to be gathered on CacheFiles
 28 operations and exported through the file:
29
30 /proc/fs/cachefiles/histogram
31
32 The generation of this histogram adds a certain amount of overhead to
33 execution as there are a number of points at which data is gathered,
34 and on a multi-CPU system these may be on cachelines that keep
35 bouncing between CPUs. On the other hand, the histogram may be
36 useful for debugging purposes. Saying 'N' here is recommended.
37
38 See Documentation/filesystems/caching/cachefiles.txt for more
39 information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
1#
2# Makefile for caching in a mounted filesystem
3#
4
5cachefiles-y := \
6 bind.o \
7 daemon.o \
8 interface.o \
9 key.o \
10 main.o \
11 namei.o \
12 rdwr.o \
13 security.o \
14 xattr.o
15
16cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
17
18obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
1/* Bind and unbind a cache from the filesystem backing it
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/mount.h>
21#include <linux/statfs.h>
22#include <linux/ctype.h>
23#include "internal.h"
24
 25static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache);
26
27/*
28 * bind a directory as a cache
29 */
30int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
31{
32 _enter("{%u,%u,%u,%u,%u,%u},%s",
33 cache->frun_percent,
34 cache->fcull_percent,
35 cache->fstop_percent,
36 cache->brun_percent,
37 cache->bcull_percent,
38 cache->bstop_percent,
39 args);
40
41 /* start by checking things over */
42 ASSERT(cache->fstop_percent >= 0 &&
43 cache->fstop_percent < cache->fcull_percent &&
44 cache->fcull_percent < cache->frun_percent &&
45 cache->frun_percent < 100);
46
47 ASSERT(cache->bstop_percent >= 0 &&
48 cache->bstop_percent < cache->bcull_percent &&
49 cache->bcull_percent < cache->brun_percent &&
50 cache->brun_percent < 100);
51
52 if (*args) {
53 kerror("'bind' command doesn't take an argument");
54 return -EINVAL;
55 }
56
57 if (!cache->rootdirname) {
58 kerror("No cache directory specified");
59 return -EINVAL;
60 }
61
62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 kerror("Cache already bound");
65 return -EBUSY;
66 }
67
68 /* make sure we have copies of the tag and dirname strings */
69 if (!cache->tag) {
70 /* the tag string is released by the fops->release()
71 * function, so we don't release it on error here */
72 cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
73 if (!cache->tag)
74 return -ENOMEM;
75 }
76
77 /* add the cache */
78 return cachefiles_daemon_add_cache(cache);
79}
80
81/*
82 * add a cache
83 */
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{
86 struct cachefiles_object *fsdef;
87 struct nameidata nd;
88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred;
91 int ret;
92
93 _enter("");
94
95 /* we want to work under the module's security ID */
96 ret = cachefiles_get_security_ID(cache);
97 if (ret < 0)
98 return ret;
99
100 cachefiles_begin_secure(cache, &saved_cred);
101
102 /* allocate the root index object */
103 ret = -ENOMEM;
104
105 fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
106 if (!fsdef)
107 goto error_root_object;
108
109 ASSERTCMP(fsdef->backer, ==, NULL);
110
111 atomic_set(&fsdef->usage, 1);
112 fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
113
114 _debug("- fsdef %p", fsdef);
115
116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd));
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0)
121 goto error_open_root;
122
123 cache->mnt = mntget(nd.path.mnt);
124 root = dget(nd.path.dentry);
125 path_put(&nd.path);
126
127 /* check parameters */
128 ret = -EOPNOTSUPP;
129 if (!root->d_inode ||
130 !root->d_inode->i_op ||
131 !root->d_inode->i_op->lookup ||
132 !root->d_inode->i_op->mkdir ||
133 !root->d_inode->i_op->setxattr ||
134 !root->d_inode->i_op->getxattr ||
135 !root->d_sb ||
136 !root->d_sb->s_op ||
137 !root->d_sb->s_op->statfs ||
138 !root->d_sb->s_op->sync_fs)
139 goto error_unsupported;
140
141 ret = -EROFS;
142 if (root->d_sb->s_flags & MS_RDONLY)
143 goto error_unsupported;
144
145 /* determine the security of the on-disk cache as this governs
146 * security ID of files we create */
147 ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
148 if (ret < 0)
149 goto error_unsupported;
150
151 /* get the cache size and blocksize */
152 ret = vfs_statfs(root, &stats);
153 if (ret < 0)
154 goto error_unsupported;
155
156 ret = -ERANGE;
157 if (stats.f_bsize <= 0)
158 goto error_unsupported;
159
160 ret = -EOPNOTSUPP;
161 if (stats.f_bsize > PAGE_SIZE)
162 goto error_unsupported;
163
164 cache->bsize = stats.f_bsize;
165 cache->bshift = 0;
166 if (stats.f_bsize < PAGE_SIZE)
167 cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
168
169 _debug("blksize %u (shift %u)",
170 cache->bsize, cache->bshift);
171
172 _debug("size %llu, avail %llu",
173 (unsigned long long) stats.f_blocks,
174 (unsigned long long) stats.f_bavail);
175
176 /* set up caching limits */
177 do_div(stats.f_files, 100);
178 cache->fstop = stats.f_files * cache->fstop_percent;
179 cache->fcull = stats.f_files * cache->fcull_percent;
180 cache->frun = stats.f_files * cache->frun_percent;
181
182 _debug("limits {%llu,%llu,%llu} files",
183 (unsigned long long) cache->frun,
184 (unsigned long long) cache->fcull,
185 (unsigned long long) cache->fstop);
186
187 stats.f_blocks >>= cache->bshift;
188 do_div(stats.f_blocks, 100);
189 cache->bstop = stats.f_blocks * cache->bstop_percent;
190 cache->bcull = stats.f_blocks * cache->bcull_percent;
191 cache->brun = stats.f_blocks * cache->brun_percent;
192
193 _debug("limits {%llu,%llu,%llu} blocks",
194 (unsigned long long) cache->brun,
195 (unsigned long long) cache->bcull,
196 (unsigned long long) cache->bstop);
197
198 /* get the cache directory and check its type */
199 cachedir = cachefiles_get_directory(cache, root, "cache");
200 if (IS_ERR(cachedir)) {
201 ret = PTR_ERR(cachedir);
202 goto error_unsupported;
203 }
204
205 fsdef->dentry = cachedir;
206 fsdef->fscache.cookie = NULL;
207
208 ret = cachefiles_check_object_type(fsdef);
209 if (ret < 0)
210 goto error_unsupported;
211
212 /* get the graveyard directory */
213 graveyard = cachefiles_get_directory(cache, root, "graveyard");
214 if (IS_ERR(graveyard)) {
215 ret = PTR_ERR(graveyard);
216 goto error_unsupported;
217 }
218
219 cache->graveyard = graveyard;
220
221 /* publish the cache */
222 fscache_init_cache(&cache->cache,
223 &cachefiles_cache_ops,
224 "%s",
225 fsdef->dentry->d_sb->s_id);
226
227 fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
228
229 ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
230 if (ret < 0)
231 goto error_add_cache;
232
233 /* done */
234 set_bit(CACHEFILES_READY, &cache->flags);
235 dput(root);
236
237 printk(KERN_INFO "CacheFiles:"
238 " File cache on %s registered\n",
239 cache->cache.identifier);
240
241 /* check how much space the cache has */
242 cachefiles_has_space(cache, 0, 0);
243 cachefiles_end_secure(cache, saved_cred);
244 return 0;
245
246error_add_cache:
247 dput(cache->graveyard);
248 cache->graveyard = NULL;
249error_unsupported:
250 mntput(cache->mnt);
251 cache->mnt = NULL;
252 dput(fsdef->dentry);
253 fsdef->dentry = NULL;
254 dput(root);
255error_open_root:
256 kmem_cache_free(cachefiles_object_jar, fsdef);
257error_root_object:
258 cachefiles_end_secure(cache, saved_cred);
259 kerror("Failed to register: %d", ret);
260 return ret;
261}
262
263/*
264 * unbind a cache on fd release
265 */
266void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
267{
268 _enter("");
269
270 if (test_bit(CACHEFILES_READY, &cache->flags)) {
271 printk(KERN_INFO "CacheFiles:"
272 " File cache on %s unregistering\n",
273 cache->cache.identifier);
274
275 fscache_withdraw_cache(&cache->cache);
276 }
277
278 dput(cache->graveyard);
279 mntput(cache->mnt);
280
281 kfree(cache->rootdirname);
282 kfree(cache->secctx);
283 kfree(cache->tag);
284
285 _leave("");
286}
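cachefiles_daemon_add_cache() converts the configured percentages into absolute file and block counts by dividing the statfs totals by 100 and scaling by each limit; a freestanding rendition of the same arithmetic using the module's default limits (the filesystem figures below are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t f_files = 1310720;                /* hypothetical inode count */
            unsigned fstop = 1, fcull = 5, frun = 7;   /* default percentages */

            f_files /= 100;            /* mirrors do_div(stats.f_files, 100) */
            printf("fstop=%llu fcull=%llu frun=%llu\n",
                   (unsigned long long)(f_files * fstop),
                   (unsigned long long)(f_files * fcull),
                   (unsigned long long)(f_files * frun));
            /* culling starts when free files drop below fcull and ceases once
             * they climb back above frun; allocation stops below fstop */
            return 0;
    }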
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
1/* Daemon interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/poll.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/ctype.h>
24#include <linux/fs_struct.h>
25#include "internal.h"
26
27static int cachefiles_daemon_open(struct inode *, struct file *);
28static int cachefiles_daemon_release(struct inode *, struct file *);
29static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
30 loff_t *);
31static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
32 size_t, loff_t *);
33static unsigned int cachefiles_daemon_poll(struct file *,
34 struct poll_table_struct *);
35static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
36static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
37static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
38static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
39static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
40static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
41static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
42static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
43static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
44static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
45static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
46static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
47
48static unsigned long cachefiles_open;
49
50const struct file_operations cachefiles_daemon_fops = {
51 .owner = THIS_MODULE,
52 .open = cachefiles_daemon_open,
53 .release = cachefiles_daemon_release,
54 .read = cachefiles_daemon_read,
55 .write = cachefiles_daemon_write,
56 .poll = cachefiles_daemon_poll,
57};
58
59struct cachefiles_daemon_cmd {
60 char name[8];
61 int (*handler)(struct cachefiles_cache *cache, char *args);
62};
63
64static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
65 { "bind", cachefiles_daemon_bind },
66 { "brun", cachefiles_daemon_brun },
67 { "bcull", cachefiles_daemon_bcull },
68 { "bstop", cachefiles_daemon_bstop },
69 { "cull", cachefiles_daemon_cull },
70 { "debug", cachefiles_daemon_debug },
71 { "dir", cachefiles_daemon_dir },
72 { "frun", cachefiles_daemon_frun },
73 { "fcull", cachefiles_daemon_fcull },
74 { "fstop", cachefiles_daemon_fstop },
75 { "inuse", cachefiles_daemon_inuse },
76 { "secctx", cachefiles_daemon_secctx },
77 { "tag", cachefiles_daemon_tag },
78 { "", NULL }
79};
80
81
82/*
83 * do various checks
84 */
85static int cachefiles_daemon_open(struct inode *inode, struct file *file)
86{
87 struct cachefiles_cache *cache;
88
89 _enter("");
90
91 /* only the superuser may do this */
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94
95 /* the cachefiles device may only be open once at a time */
96 if (xchg(&cachefiles_open, 1) == 1)
97 return -EBUSY;
98
99 /* allocate a cache record */
100 cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
101 if (!cache) {
102 cachefiles_open = 0;
103 return -ENOMEM;
104 }
105
106 mutex_init(&cache->daemon_mutex);
107 cache->active_nodes = RB_ROOT;
108 rwlock_init(&cache->active_lock);
109 init_waitqueue_head(&cache->daemon_pollwq);
110
111 /* set default caching limits
112 * - limit at 1% free space and/or free files
113 * - cull below 5% free space and/or free files
114 * - cease culling above 7% free space and/or free files
115 */
116 cache->frun_percent = 7;
117 cache->fcull_percent = 5;
118 cache->fstop_percent = 1;
119 cache->brun_percent = 7;
120 cache->bcull_percent = 5;
121 cache->bstop_percent = 1;
122
123 file->private_data = cache;
124 cache->cachefilesd = file;
125 return 0;
126}
127
128/*
129 * release a cache
130 */
131static int cachefiles_daemon_release(struct inode *inode, struct file *file)
132{
133 struct cachefiles_cache *cache = file->private_data;
134
135 _enter("");
136
137 ASSERT(cache);
138
139 set_bit(CACHEFILES_DEAD, &cache->flags);
140
141 cachefiles_daemon_unbind(cache);
142
143 ASSERT(!cache->active_nodes.rb_node);
144
145 /* clean up the control file interface */
146 cache->cachefilesd = NULL;
147 file->private_data = NULL;
148 cachefiles_open = 0;
149
150 kfree(cache);
151
152 _leave("");
153 return 0;
154}
155
156/*
157 * read the cache state
158 */
159static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
160 size_t buflen, loff_t *pos)
161{
162 struct cachefiles_cache *cache = file->private_data;
163 char buffer[256];
164 int n;
165
166 //_enter(",,%zu,", buflen);
167
168 if (!test_bit(CACHEFILES_READY, &cache->flags))
169 return 0;
170
171 /* check how much space the cache has */
172 cachefiles_has_space(cache, 0, 0);
173
174 /* summarise */
175 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
176
177 n = snprintf(buffer, sizeof(buffer),
178 "cull=%c"
179 " frun=%llx"
180 " fcull=%llx"
181 " fstop=%llx"
182 " brun=%llx"
183 " bcull=%llx"
184 " bstop=%llx",
185 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
186 (unsigned long long) cache->frun,
187 (unsigned long long) cache->fcull,
188 (unsigned long long) cache->fstop,
189 (unsigned long long) cache->brun,
190 (unsigned long long) cache->bcull,
191 (unsigned long long) cache->bstop
192 );
193
194 if (n > buflen)
195 return -EMSGSIZE;
196
197 if (copy_to_user(_buffer, buffer, n) != 0)
198 return -EFAULT;
199
200 return n;
201}
202
203/*
204 * command the cache
205 */
206static ssize_t cachefiles_daemon_write(struct file *file,
207 const char __user *_data,
208 size_t datalen,
209 loff_t *pos)
210{
211 const struct cachefiles_daemon_cmd *cmd;
212 struct cachefiles_cache *cache = file->private_data;
213 ssize_t ret;
214 char *data, *args, *cp;
215
216 //_enter(",,%zu,", datalen);
217
218 ASSERT(cache);
219
220 if (test_bit(CACHEFILES_DEAD, &cache->flags))
221 return -EIO;
222
223 if (datalen < 0 || datalen > PAGE_SIZE - 1)
224 return -EOPNOTSUPP;
225
226 /* drag the command string into the kernel so we can parse it */
227 data = kmalloc(datalen + 1, GFP_KERNEL);
228 if (!data)
229 return -ENOMEM;
230
231 ret = -EFAULT;
232 if (copy_from_user(data, _data, datalen) != 0)
233 goto error;
234
235 data[datalen] = '\0';
236
237 ret = -EINVAL;
238 if (memchr(data, '\0', datalen))
239 goto error;
240
241 /* strip any newline */
242 cp = memchr(data, '\n', datalen);
243 if (cp) {
244 if (cp == data)
245 goto error;
246
247 *cp = '\0';
248 }
249
250 /* parse the command */
251 ret = -EOPNOTSUPP;
252
253 for (args = data; *args; args++)
254 if (isspace(*args))
255 break;
256 if (*args) {
257 if (args == data)
258 goto error;
259 *args = '\0';
260 for (args++; isspace(*args); args++)
261 continue;
262 }
263
264 /* run the appropriate command handler */
265 for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
266 if (strcmp(cmd->name, data) == 0)
267 goto found_command;
268
269error:
270 kfree(data);
271 //_leave(" = %zd", ret);
272 return ret;
273
274found_command:
275 mutex_lock(&cache->daemon_mutex);
276
277 ret = -EIO;
278 if (!test_bit(CACHEFILES_DEAD, &cache->flags))
279 ret = cmd->handler(cache, args);
280
281 mutex_unlock(&cache->daemon_mutex);
282
283 if (ret == 0)
284 ret = datalen;
285 goto error;
286}
287
288/*
289 * poll for culling state
290 * - use POLLOUT to indicate culling state
291 */
292static unsigned int cachefiles_daemon_poll(struct file *file,
293 struct poll_table_struct *poll)
294{
295 struct cachefiles_cache *cache = file->private_data;
296 unsigned int mask;
297
298 poll_wait(file, &cache->daemon_pollwq, poll);
299 mask = 0;
300
301 if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
302 mask |= POLLIN;
303
304 if (test_bit(CACHEFILES_CULLING, &cache->flags))
305 mask |= POLLOUT;
306
307 return mask;
308}
309
310/*
311 * give a range error for cache space constraints
312 * - can be tail-called
313 */
314static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
315 char *args)
316{
317 kerror("Free space limits must be in range"
318 " 0%%<=stop<cull<run<100%%");
319
320 return -EINVAL;
321}
322
323/*
324 * set the percentage of files at which to stop culling
325 * - command: "frun <N>%"
326 */
327static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
328{
329 unsigned long frun;
330
331 _enter(",%s", args);
332
333 if (!*args)
334 return -EINVAL;
335
336 frun = simple_strtoul(args, &args, 10);
337 if (args[0] != '%' || args[1] != '\0')
338 return -EINVAL;
339
340 if (frun <= cache->fcull_percent || frun >= 100)
341 return cachefiles_daemon_range_error(cache, args);
342
343 cache->frun_percent = frun;
344 return 0;
345}
346
347/*
348 * set the percentage of files at which to start culling
349 * - command: "fcull <N>%"
350 */
351static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
352{
353 unsigned long fcull;
354
355 _enter(",%s", args);
356
357 if (!*args)
358 return -EINVAL;
359
360 fcull = simple_strtoul(args, &args, 10);
361 if (args[0] != '%' || args[1] != '\0')
362 return -EINVAL;
363
364 if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
365 return cachefiles_daemon_range_error(cache, args);
366
367 cache->fcull_percent = fcull;
368 return 0;
369}
370
371/*
372 * set the percentage of files at which to stop allocating
373 * - command: "fstop <N>%"
374 */
375static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
376{
377 unsigned long fstop;
378
379 _enter(",%s", args);
380
381 if (!*args)
382 return -EINVAL;
383
384 fstop = simple_strtoul(args, &args, 10);
385 if (args[0] != '%' || args[1] != '\0')
386 return -EINVAL;
387
388 if (fstop < 0 || fstop >= cache->fcull_percent)
389 return cachefiles_daemon_range_error(cache, args);
390
391 cache->fstop_percent = fstop;
392 return 0;
393}
394
395/*
396 * set the percentage of blocks at which to stop culling
397 * - command: "brun <N>%"
398 */
399static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
400{
401 unsigned long brun;
402
403 _enter(",%s", args);
404
405 if (!*args)
406 return -EINVAL;
407
408 brun = simple_strtoul(args, &args, 10);
409 if (args[0] != '%' || args[1] != '\0')
410 return -EINVAL;
411
412 if (brun <= cache->bcull_percent || brun >= 100)
413 return cachefiles_daemon_range_error(cache, args);
414
415 cache->brun_percent = brun;
416 return 0;
417}
418
419/*
420 * set the percentage of blocks at which to start culling
421 * - command: "bcull <N>%"
422 */
423static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
424{
425 unsigned long bcull;
426
427 _enter(",%s", args);
428
429 if (!*args)
430 return -EINVAL;
431
432 bcull = simple_strtoul(args, &args, 10);
433 if (args[0] != '%' || args[1] != '\0')
434 return -EINVAL;
435
436 if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
437 return cachefiles_daemon_range_error(cache, args);
438
439 cache->bcull_percent = bcull;
440 return 0;
441}
442
443/*
444 * set the percentage of blocks at which to stop allocating
445 * - command: "bstop <N>%"
446 */
447static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
448{
449 unsigned long bstop;
450
451 _enter(",%s", args);
452
453 if (!*args)
454 return -EINVAL;
455
456 bstop = simple_strtoul(args, &args, 10);
457 if (args[0] != '%' || args[1] != '\0')
458 return -EINVAL;
459
460 if (bstop < 0 || bstop >= cache->bcull_percent)
461 return cachefiles_daemon_range_error(cache, args);
462
463 cache->bstop_percent = bstop;
464 return 0;
465}
466
467/*
468 * set the cache directory
469 * - command: "dir <name>"
470 */
471static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
472{
473 char *dir;
474
475 _enter(",%s", args);
476
477 if (!*args) {
478 kerror("Empty directory specified");
479 return -EINVAL;
480 }
481
482 if (cache->rootdirname) {
483 kerror("Second cache directory specified");
484 return -EEXIST;
485 }
486
487 dir = kstrdup(args, GFP_KERNEL);
488 if (!dir)
489 return -ENOMEM;
490
491 cache->rootdirname = dir;
492 return 0;
493}
494
495/*
496 * set the cache security context
497 * - command: "secctx <ctx>"
498 */
499static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
500{
501 char *secctx;
502
503 _enter(",%s", args);
504
505 if (!*args) {
506 kerror("Empty security context specified");
507 return -EINVAL;
508 }
509
510 if (cache->secctx) {
511 kerror("Second security context specified");
512 return -EINVAL;
513 }
514
515 secctx = kstrdup(args, GFP_KERNEL);
516 if (!secctx)
517 return -ENOMEM;
518
519 cache->secctx = secctx;
520 return 0;
521}
522
523/*
524 * set the cache tag
525 * - command: "tag <name>"
526 */
527static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
528{
529 char *tag;
530
531 _enter(",%s", args);
532
533 if (!*args) {
534 kerror("Empty tag specified");
535 return -EINVAL;
536 }
537
538 if (cache->tag)
539 return -EEXIST;
540
541 tag = kstrdup(args, GFP_KERNEL);
542 if (!tag)
543 return -ENOMEM;
544
545 cache->tag = tag;
546 return 0;
547}
548
549/*
550 * request a node in the cache be culled from the current working directory
551 * - command: "cull <name>"
552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{
555 struct fs_struct *fs;
556 struct dentry *dir;
557 const struct cred *saved_cred;
558 int ret;
559
560 _enter(",%s", args);
561
562 if (strchr(args, '/'))
563 goto inval;
564
565 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
566 kerror("cull applied to unready cache");
567 return -EIO;
568 }
569
570 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
571 kerror("cull applied to dead cache");
572 return -EIO;
573 }
574
575 /* extract the directory dentry from the cwd */
576 fs = current->fs;
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580
581 if (!S_ISDIR(dir->d_inode->i_mode))
582 goto notdir;
583
584 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args);
586 cachefiles_end_secure(cache, saved_cred);
587
588 dput(dir);
589 _leave(" = %d", ret);
590 return ret;
591
592notdir:
593 dput(dir);
594 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR;
596
597inval:
598 kerror("cull command requires dirfd and filename");
599 return -EINVAL;
600}
601
602/*
603 * set debugging mode
604 * - command: "debug <mask>"
605 */
606static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
607{
608 unsigned long mask;
609
610 _enter(",%s", args);
611
612 mask = simple_strtoul(args, &args, 0);
613 if (args[0] != '\0')
614 goto inval;
615
616 cachefiles_debug = mask;
617 _leave(" = 0");
618 return 0;
619
620inval:
621 kerror("debug command requires mask");
622 return -EINVAL;
623}
624
625/*
626 * find out whether an object in the current working directory is in use or not
627 * - command: "inuse <name>"
628 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{
631 struct fs_struct *fs;
632 struct dentry *dir;
633 const struct cred *saved_cred;
634 int ret;
635
636 //_enter(",%s", args);
637
638 if (strchr(args, '/'))
639 goto inval;
640
641 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
642 kerror("inuse applied to unready cache");
643 return -EIO;
644 }
645
646 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
647 kerror("inuse applied to dead cache");
648 return -EIO;
649 }
650
651 /* extract the directory dentry from the cwd */
652 fs = current->fs;
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656
657 if (!S_ISDIR(dir->d_inode->i_mode))
658 goto notdir;
659
660 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args);
662 cachefiles_end_secure(cache, saved_cred);
663
664 dput(dir);
665 //_leave(" = %d", ret);
666 return ret;
667
668notdir:
669 dput(dir);
670 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR;
672
673inval:
674 kerror("inuse command requires dirfd and filename");
675 return -EINVAL;
676}
677
678/*
679 * see if we have space for a number of pages and/or a number of files in the
680 * cache
681 */
682int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr)
684{
685 struct kstatfs stats;
686 int ret;
687
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
689 // (unsigned long long) cache->frun,
690 // (unsigned long long) cache->fcull,
691 // (unsigned long long) cache->fstop,
692 // (unsigned long long) cache->brun,
693 // (unsigned long long) cache->bcull,
694 // (unsigned long long) cache->bstop,
695 // fnr, bnr);
696
697 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats));
699
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats);
701 if (ret < 0) {
702 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed");
704 _leave(" = %d", ret);
705 return ret;
706 }
707
708 stats.f_bavail >>= cache->bshift;
709
710 //_debug("avail %llu,%llu",
711 // (unsigned long long) stats.f_ffree,
712 // (unsigned long long) stats.f_bavail);
713
714 /* see if there is sufficient space */
715 if (stats.f_ffree > fnr)
716 stats.f_ffree -= fnr;
717 else
718 stats.f_ffree = 0;
719
720 if (stats.f_bavail > bnr)
721 stats.f_bavail -= bnr;
722 else
723 stats.f_bavail = 0;
724
725 ret = -ENOBUFS;
726 if (stats.f_ffree < cache->fstop ||
727 stats.f_bavail < cache->bstop)
728 goto begin_cull;
729
730 ret = 0;
731 if (stats.f_ffree < cache->fcull ||
732 stats.f_bavail < cache->bcull)
733 goto begin_cull;
734
735 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
736 stats.f_ffree >= cache->frun &&
737 stats.f_bavail >= cache->brun &&
738 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
739 ) {
740 _debug("cease culling");
741 cachefiles_state_changed(cache);
742 }
743
744 //_leave(" = 0");
745 return 0;
746
747begin_cull:
748 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
749 _debug("### CULL CACHE ###");
750 cachefiles_state_changed(cache);
751 }
752
753 _leave(" = %d", ret);
754 return ret;
755}
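The command table and handlers above define the whole daemon protocol: one command per write(), configuration first, "bind" last, and the cache unbinds when the fd is released. A sketch of how a management daemon such as cachefilesd might drive it (the cache directory is illustrative, and the device node is assumed to be the module's misc device, /dev/cachefiles):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            static const char *cmds[] = {
                    "dir /var/cache/fscache",
                    "tag mycache",
                    "brun 7%", "bcull 5%", "bstop 1%",
                    "frun 7%", "fcull 5%", "fstop 1%",
                    "bind",
            };
            int fd = open("/dev/cachefiles", O_RDWR);
            unsigned i;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            for (i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++)
                    if (write(fd, cmds[i], strlen(cmds[i])) < 0)
                            perror(cmds[i]);
            /* keep the fd open: cachefiles_daemon_release() unbinds the cache */
            pause();
            return 0;
    }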
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
1/* FS-Cache interface to CacheFiles
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/buffer_head.h>
14#include "internal.h"
15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */
21};
22
23static int cachefiles_attr_changed(struct fscache_object *_object);
24
25/*
26 * allocate an object record for a cookie lookup and prepare the lookup data
27 */
28static struct fscache_object *cachefiles_alloc_object(
29 struct fscache_cache *_cache,
30 struct fscache_cookie *cookie)
31{
32 struct cachefiles_lookup_data *lookup_data;
33 struct cachefiles_object *object;
34 struct cachefiles_cache *cache;
35 struct cachefiles_xattr *auxdata;
36 unsigned keylen, auxlen;
37 void *buffer;
38 char *key;
39
40 cache = container_of(_cache, struct cachefiles_cache, cache);
41
42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
45 if (!lookup_data)
46 goto nomem_lookup_data;
47
48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
50 if (!object)
51 goto nomem_object;
52
53 ASSERTCMP(object->backer, ==, NULL);
54
55 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
56 atomic_set(&object->usage, 1);
57
58 fscache_object_init(&object->fscache, cookie, &cache->cache);
59
60 object->type = cookie->def->type;
61
62 /* get hold of the raw key
63 * - stick the length on the front and leave space on the back for the
64 * encoder
65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
67 if (!buffer)
68 goto nomem_buffer;
69
70 keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
71 ASSERTCMP(keylen, <, 512);
72
73 *(uint16_t *)buffer = keylen;
74 ((char *)buffer)[keylen + 2] = 0;
75 ((char *)buffer)[keylen + 3] = 0;
76 ((char *)buffer)[keylen + 4] = 0;
77
 78 /* turn the raw key into something we can work with as a filename */
79 key = cachefiles_cook_key(buffer, keylen + 2, object->type);
80 if (!key)
81 goto nomem_key;
82
83 /* get hold of the auxiliary data and prepend the object type */
84 auxdata = buffer;
85 auxlen = 0;
86 if (cookie->def->get_aux) {
87 auxlen = cookie->def->get_aux(cookie->netfs_data,
88 auxdata->data, 511);
89 ASSERTCMP(auxlen, <, 511);
90 }
91
92 auxdata->len = auxlen + 1;
93 auxdata->type = cookie->def->type;
94
95 lookup_data->auxdata = auxdata;
96 lookup_data->key = key;
97 object->lookup_data = lookup_data;
98
99 _leave(" = %p [%p]", &object->fscache, lookup_data);
100 return &object->fscache;
101
102nomem_key:
103 kfree(buffer);
104nomem_buffer:
105 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
106 kmem_cache_free(cachefiles_object_jar, object);
107 fscache_object_destroyed(&cache->cache);
108nomem_object:
109 kfree(lookup_data);
110nomem_lookup_data:
111 _leave(" = -ENOMEM");
112 return ERR_PTR(-ENOMEM);
113}
114
115/*
116 * attempt to look up the nominated node in this cache
117 */
118static void cachefiles_lookup_object(struct fscache_object *_object)
119{
120 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object;
122 struct cachefiles_cache *cache;
123 const struct cred *saved_cred;
124 int ret;
125
126 _enter("{OBJ%x}", _object->debug_id);
127
128 cache = container_of(_object->cache, struct cachefiles_cache, cache);
129 parent = container_of(_object->parent,
130 struct cachefiles_object, fscache);
131 object = container_of(_object, struct cachefiles_object, fscache);
132 lookup_data = object->lookup_data;
133
134 ASSERTCMP(lookup_data, !=, NULL);
135
136 /* look up the key, creating any missing bits */
137 cachefiles_begin_secure(cache, &saved_cred);
138 ret = cachefiles_walk_to_object(parent, object,
139 lookup_data->key,
140 lookup_data->auxdata);
141 cachefiles_end_secure(cache, saved_cred);
142
143 /* polish off by setting the attributes of non-index files */
144 if (ret == 0 &&
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache);
147
148 if (ret < 0) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
150 ret);
151 fscache_object_lookup_error(&object->fscache);
152 }
153
154 _leave(" [%d]", ret);
155}
156
157/*
158 * indication of lookup completion
159 */
160static void cachefiles_lookup_complete(struct fscache_object *_object)
161{
162 struct cachefiles_object *object;
163
164 object = container_of(_object, struct cachefiles_object, fscache);
165
166 _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
167
168 if (object->lookup_data) {
169 kfree(object->lookup_data->key);
170 kfree(object->lookup_data->auxdata);
171 kfree(object->lookup_data);
172 object->lookup_data = NULL;
173 }
174}
175
176/*
177 * increment the usage count on an inode object (may fail if unmounting)
178 */
179static
180struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
181{
182 struct cachefiles_object *object =
183 container_of(_object, struct cachefiles_object, fscache);
184
185 _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
186
187#ifdef CACHEFILES_DEBUG_SLAB
188 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
189#endif
190
191 atomic_inc(&object->usage);
192 return &object->fscache;
193}
194
195/*
 196 * update the auxiliary data for an object on disk
197 */
198static void cachefiles_update_object(struct fscache_object *_object)
199{
200 struct cachefiles_object *object;
201 struct cachefiles_xattr *auxdata;
202 struct cachefiles_cache *cache;
203 struct fscache_cookie *cookie;
204 const struct cred *saved_cred;
205 unsigned auxlen;
206
207 _enter("{OBJ%x}", _object->debug_id);
208
209 object = container_of(_object, struct cachefiles_object, fscache);
210 cache = container_of(object->fscache.cache, struct cachefiles_cache,
211 cache);
212 cookie = object->fscache.cookie;
213
214 if (!cookie->def->get_aux) {
215 _leave(" [no aux]");
216 return;
217 }
218
219 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
220 if (!auxdata) {
221 _leave(" [nomem]");
222 return;
223 }
224
225 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
226 ASSERTCMP(auxlen, <, 511);
227
228 auxdata->len = auxlen + 1;
229 auxdata->type = cookie->def->type;
230
231 cachefiles_begin_secure(cache, &saved_cred);
232 cachefiles_update_object_xattr(object, auxdata);
233 cachefiles_end_secure(cache, saved_cred);
234 kfree(auxdata);
235 _leave("");
236}
237
238/*
239 * discard the resources pinned by an object and effect retirement if
240 * requested
241 */
242static void cachefiles_drop_object(struct fscache_object *_object)
243{
244 struct cachefiles_object *object;
245 struct cachefiles_cache *cache;
246 const struct cred *saved_cred;
247
248 ASSERT(_object);
249
250 object = container_of(_object, struct cachefiles_object, fscache);
251
252 _enter("{OBJ%x,%d}",
253 object->fscache.debug_id, atomic_read(&object->usage));
254
255 cache = container_of(object->fscache.cache,
256 struct cachefiles_cache, cache);
257
258#ifdef CACHEFILES_DEBUG_SLAB
259 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
260#endif
261
262 /* delete retired objects */
263 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
264 _object != cache->cache.fsdef
265 ) {
266 _debug("- retire object OBJ%x", object->fscache.debug_id);
267 cachefiles_begin_secure(cache, &saved_cred);
268 cachefiles_delete_object(cache, object);
269 cachefiles_end_secure(cache, saved_cred);
270 }
271
272 /* close the filesystem stuff attached to the object */
273 if (object->backer != object->dentry)
274 dput(object->backer);
275 object->backer = NULL;
276
277 /* note that the object is now inactive */
278 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
279 write_lock(&cache->active_lock);
280 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
281 &object->flags))
282 BUG();
283 rb_erase(&object->active_node, &cache->active_nodes);
284 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
285 write_unlock(&cache->active_lock);
286 }
287
288 dput(object->dentry);
289 object->dentry = NULL;
290
291 _leave("");
292}
293
294/*
295 * dispose of a reference to an object
296 */
297static void cachefiles_put_object(struct fscache_object *_object)
298{
299 struct cachefiles_object *object;
300 struct fscache_cache *cache;
301
302 ASSERT(_object);
303
304 object = container_of(_object, struct cachefiles_object, fscache);
305
306 _enter("{OBJ%x,%d}",
307 object->fscache.debug_id, atomic_read(&object->usage));
308
309#ifdef CACHEFILES_DEBUG_SLAB
310 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
311#endif
312
313 ASSERTIFCMP(object->fscache.parent,
314 object->fscache.parent->n_children, >, 0);
315
316 if (atomic_dec_and_test(&object->usage)) {
317 _debug("- kill object OBJ%x", object->fscache.debug_id);
318
319 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
320 ASSERTCMP(object->fscache.parent, ==, NULL);
321 ASSERTCMP(object->backer, ==, NULL);
322 ASSERTCMP(object->dentry, ==, NULL);
323 ASSERTCMP(object->fscache.n_ops, ==, 0);
324 ASSERTCMP(object->fscache.n_children, ==, 0);
325
326 if (object->lookup_data) {
327 kfree(object->lookup_data->key);
328 kfree(object->lookup_data->auxdata);
329 kfree(object->lookup_data);
330 object->lookup_data = NULL;
331 }
332
333 cache = object->fscache.cache;
334 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache);
336 }
337
338 _leave("");
339}
340
341/*
342 * sync a cache
343 */
344static void cachefiles_sync_cache(struct fscache_cache *_cache)
345{
346 struct cachefiles_cache *cache;
347 const struct cred *saved_cred;
348 int ret;
349
350 _enter("%p", _cache);
351
352 cache = container_of(_cache, struct cachefiles_cache, cache);
353
354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb);
358 cachefiles_end_secure(cache, saved_cred);
359
360 if (ret == -EIO)
361 cachefiles_io_error(cache,
362 "Attempt to sync backing fs superblock"
363 " returned error %d",
364 ret);
365}
366
367/*
368 * notification the attributes on an object have changed
369 * - called with reads/writes excluded by FS-Cache
370 */
371static int cachefiles_attr_changed(struct fscache_object *_object)
372{
373 struct cachefiles_object *object;
374 struct cachefiles_cache *cache;
375 const struct cred *saved_cred;
376 struct iattr newattrs;
377 uint64_t ni_size;
378 loff_t oi_size;
379 int ret;
380
381 _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
382
383 _enter("{OBJ%x},[%llu]",
384 _object->debug_id, (unsigned long long) ni_size);
385
386 object = container_of(_object, struct cachefiles_object, fscache);
387 cache = container_of(object->fscache.cache,
388 struct cachefiles_cache, cache);
389
390 if (ni_size == object->i_size)
391 return 0;
392
393 if (!object->backer)
394 return -ENOBUFS;
395
396 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
397
398 fscache_set_store_limit(&object->fscache, ni_size);
399
400 oi_size = i_size_read(object->backer->d_inode);
401 if (oi_size == ni_size)
402 return 0;
403
404 newattrs.ia_size = ni_size;
405 newattrs.ia_valid = ATTR_SIZE;
406
407 cachefiles_begin_secure(cache, &saved_cred);
408 mutex_lock(&object->backer->d_inode->i_mutex);
409 ret = notify_change(object->backer, &newattrs);
410 mutex_unlock(&object->backer->d_inode->i_mutex);
411 cachefiles_end_secure(cache, saved_cred);
412
413 if (ret == -EIO) {
414 fscache_set_store_limit(&object->fscache, 0);
415 cachefiles_io_error_obj(object, "Size set failed");
416 ret = -ENOBUFS;
417 }
418
419 _leave(" = %d", ret);
420 return ret;
421}
422
423/*
424 * dissociate a cache from all the pages it was backing
425 */
426static void cachefiles_dissociate_pages(struct fscache_cache *cache)
427{
428 _enter("");
429}
430
431const struct fscache_cache_ops cachefiles_cache_ops = {
432 .name = "cachefiles",
433 .alloc_object = cachefiles_alloc_object,
434 .lookup_object = cachefiles_lookup_object,
435 .lookup_complete = cachefiles_lookup_complete,
436 .grab_object = cachefiles_grab_object,
437 .update_object = cachefiles_update_object,
438 .drop_object = cachefiles_drop_object,
439 .put_object = cachefiles_put_object,
440 .sync_cache = cachefiles_sync_cache,
441 .attr_changed = cachefiles_attr_changed,
442 .read_or_alloc_page = cachefiles_read_or_alloc_page,
443 .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
444 .allocate_page = cachefiles_allocate_page,
445 .allocate_pages = cachefiles_allocate_pages,
446 .write_page = cachefiles_write_page,
447 .uncache_page = cachefiles_uncache_page,
448 .dissociate_pages = cachefiles_dissociate_pages,
449};
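cachefiles_alloc_object() above packs the netfs cookie key with a 16-bit length on the front and three NUL bytes of slack on the back before handing it to cachefiles_cook_key(); a freestanding illustration of that buffer layout (the key contents here are hypothetical):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned char buffer[2 + 512 + 3];      /* same size as the kernel's */
            const char raw_key[] = "example-key";   /* hypothetical cookie key */
            uint16_t keylen = sizeof(raw_key) - 1;

            memcpy(buffer, &keylen, sizeof(keylen)); /* length on the front */
            memcpy(buffer + 2, raw_key, keylen);     /* raw key body */
            buffer[keylen + 2] = 0;                  /* slack for the encoder */
            buffer[keylen + 3] = 0;
            buffer[keylen + 4] = 0;

            printf("len=%u, body starts with '%c'\n", keylen, buffer[2]);
            return 0;
    }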
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..19218e1463d6
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
1/* General netfs cache on cache files internal defs
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fscache-cache.h>
13#include <linux/timer.h>
14#include <linux/wait.h>
15#include <linux/workqueue.h>
16#include <linux/security.h>
17
18struct cachefiles_cache;
19struct cachefiles_object;
20
21extern unsigned cachefiles_debug;
22#define CACHEFILES_DEBUG_KENTER 1
23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4
25
26/*
27 * node records
28 */
29struct cachefiles_object {
30 struct fscache_object fscache; /* fscache handle */
31 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
32 struct dentry *dentry; /* the file/dir representing this object */
33 struct dentry *backer; /* backing file */
34 loff_t i_size; /* object size */
35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */
40 spinlock_t work_lock;
41 struct rb_node active_node; /* link in active tree (dentry is key) */
42};
43
44extern struct kmem_cache *cachefiles_object_jar;
45
46/*
47 * Cache files cache definition
48 */
49struct cachefiles_cache {
50 struct fscache_cache cache; /* FS-Cache record */
51 struct vfsmount *mnt; /* mountpoint holding the cache */
52 struct dentry *graveyard; /* directory into which dead objects go */
53 struct file *cachefilesd; /* manager daemon handle */
54 const struct cred *cache_cred; /* security override for accessing cache */
55 struct mutex daemon_mutex; /* command serialisation mutex */
56 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
57 struct rb_root active_nodes; /* active nodes (can't be culled) */
58 rwlock_t active_lock; /* lock for active_nodes */
59 atomic_t gravecounter; /* graveyard uniquifier */
60 unsigned frun_percent; /* when to stop culling (% files) */
61 unsigned fcull_percent; /* when to start culling (% files) */
62 unsigned fstop_percent; /* when to stop allocating (% files) */
63 unsigned brun_percent; /* when to stop culling (% blocks) */
64 unsigned bcull_percent; /* when to start culling (% blocks) */
65 unsigned bstop_percent; /* when to stop allocating (% blocks) */
66 unsigned bsize; /* cache's block size */
 67 unsigned bshift; /* max(ilog2(PAGE_SIZE / bsize), 0) */
68 uint64_t frun; /* when to stop culling */
69 uint64_t fcull; /* when to start culling */
70 uint64_t fstop; /* when to stop allocating */
71 sector_t brun; /* when to stop culling */
72 sector_t bcull; /* when to start culling */
73 sector_t bstop; /* when to stop allocating */
74 unsigned long flags;
75#define CACHEFILES_READY 0 /* T if cache prepared */
76#define CACHEFILES_DEAD 1 /* T if cache dead */
77#define CACHEFILES_CULLING 2 /* T if cull engaged */
78#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
79 char *rootdirname; /* name of cache root directory */
80 char *secctx; /* LSM security context */
81 char *tag; /* cache binding tag */
82};
83
84/*
85 * backing file read tracking
86 */
87struct cachefiles_one_read {
88 wait_queue_t monitor; /* link into monitored waitqueue */
89 struct page *back_page; /* backing file page we're waiting for */
90 struct page *netfs_page; /* netfs page we're going to fill */
91 struct fscache_retrieval *op; /* retrieval op covering this */
92 struct list_head op_link; /* link in op's todo list */
93};
94
95/*
96 * backing file write tracking
97 */
98struct cachefiles_one_write {
99 struct page *netfs_page; /* netfs page to copy */
100 struct cachefiles_object *object;
101 struct list_head obj_link; /* link in object's lists */
102 fscache_rw_complete_t end_io_func;
103 void *context;
104};
105
106/*
107 * auxiliary data xattr buffer
108 */
109struct cachefiles_xattr {
110 uint16_t len;
111 uint8_t type;
112 uint8_t data[];
113};
114
115/*
116 * note change of state for daemon
117 */
118static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
119{
120 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
121 wake_up_all(&cache->daemon_pollwq);
122}
123
124/*
125 * cf-bind.c
126 */
127extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
128extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
129
130/*
131 * cf-daemon.c
132 */
133extern const struct file_operations cachefiles_daemon_fops;
134
135extern int cachefiles_has_space(struct cachefiles_cache *cache,
136 unsigned fnr, unsigned bnr);
137
138/*
139 * cf-interface.c
140 */
141extern const struct fscache_cache_ops cachefiles_cache_ops;
142
143/*
144 * cf-key.c
145 */
146extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
147
148/*
149 * cf-namei.c
150 */
151extern int cachefiles_delete_object(struct cachefiles_cache *cache,
152 struct cachefiles_object *object);
153extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
154 struct cachefiles_object *object,
155 const char *key,
156 struct cachefiles_xattr *auxdata);
157extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
158 struct dentry *dir,
159 const char *name);
160
161extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
162 char *filename);
163
164extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
165 struct dentry *dir, char *filename);
166
167/*
168 * cf-proc.c
169 */
170#ifdef CONFIG_CACHEFILES_HISTOGRAM
171extern atomic_t cachefiles_lookup_histogram[HZ];
172extern atomic_t cachefiles_mkdir_histogram[HZ];
173extern atomic_t cachefiles_create_histogram[HZ];
174
175extern int __init cachefiles_proc_init(void);
176extern void cachefiles_proc_cleanup(void);
177static inline
178void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
179{
180 unsigned long jif = jiffies - start_jif;
181 if (jif >= HZ)
182 jif = HZ - 1;
183 atomic_inc(&histogram[jif]);
184}
185
186#else
187#define cachefiles_proc_init() (0)
188#define cachefiles_proc_cleanup() do {} while (0)
189#define cachefiles_hist(hist, start_jif) do {} while (0)
190#endif
191
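cachefiles_hist() clamps a jiffies delta into one of HZ one-jiffy buckets, so every latency of a full second or more piles into the last bucket. The same clamp-and-count logic, transplanted into a standalone userspace sketch with a stand-in HZ value and a plain counter in place of atomic_t:

    /* Userspace sketch of the latency bucketing done by cachefiles_hist().
     * HZ is a stand-in for the kernel tick rate; deltas of HZ jiffies or
     * more all land in the final bucket. */
    #include <stdio.h>

    #define HZ 250

    static unsigned long histogram[HZ];

    static void hist(unsigned long start_jif, unsigned long now_jif)
    {
    	unsigned long jif = now_jif - start_jif;

    	if (jif >= HZ)
    		jif = HZ - 1;	/* clamp long delays into the last bucket */
    	histogram[jif]++;
    }

    int main(void)
    {
    	hist(100, 103);		/* a 3-jiffy lookup */
    	hist(100, 1000);	/* a pathological delay, clamped */
    	printf("bucket 3 = %lu, bucket %d = %lu\n",
    	       histogram[3], HZ - 1, histogram[HZ - 1]);
    	return 0;
    }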
192/*
193 * cf-rdwr.c
194 */
195extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
196 struct page *, gfp_t);
197extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
198 struct list_head *, unsigned *,
199 gfp_t);
200extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
201 gfp_t);
202extern int cachefiles_allocate_pages(struct fscache_retrieval *,
203 struct list_head *, unsigned *, gfp_t);
204extern int cachefiles_write_page(struct fscache_storage *, struct page *);
205extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
206
207/*
208 * cf-security.c
209 */
210extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
211extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
212 struct dentry *root,
213 const struct cred **_saved_cred);
214
215static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
216 const struct cred **_saved_cred)
217{
218 *_saved_cred = override_creds(cache->cache_cred);
219}
220
221static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
222 const struct cred *saved_cred)
223{
224 revert_creds(saved_cred);
225}
226
227/*
228 * cf-xattr.c
229 */
230extern int cachefiles_check_object_type(struct cachefiles_object *object);
231extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
232 struct cachefiles_xattr *auxdata);
233extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
234 struct cachefiles_xattr *auxdata);
235extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
236 struct cachefiles_xattr *auxdata);
237extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
238 struct dentry *dentry);
239
240
241/*
242 * error handling
243 */
244#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
245
246#define cachefiles_io_error(___cache, FMT, ...) \
247do { \
248 kerror("I/O Error: " FMT, ##__VA_ARGS__); \
249 fscache_io_error(&(___cache)->cache); \
250 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
251} while (0)
252
253#define cachefiles_io_error_obj(object, FMT, ...) \
254do { \
255 struct cachefiles_cache *___cache; \
256 \
257 ___cache = container_of((object)->fscache.cache, \
258 struct cachefiles_cache, cache); \
259 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
260} while (0)
261
262
263/*
264 * debug tracing
265 */
266#define dbgprintk(FMT, ...) \
267 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
268
269/* make sure we maintain the format strings, even when debugging is disabled */
270static inline void _dbprintk(const char *fmt, ...)
271 __attribute__((format(printf, 1, 2)));
272static inline void _dbprintk(const char *fmt, ...)
273{
274}
275
276#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
277#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
278#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
279
280
281#if defined(__KDEBUG)
282#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
283#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
284#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
285
286#elif defined(CONFIG_CACHEFILES_DEBUG)
287#define _enter(FMT, ...) \
288do { \
289 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
290 kenter(FMT, ##__VA_ARGS__); \
291} while (0)
292
293#define _leave(FMT, ...) \
294do { \
295 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
296 kleave(FMT, ##__VA_ARGS__); \
297} while (0)
298
299#define _debug(FMT, ...) \
300do { \
301 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
302 kdebug(FMT, ##__VA_ARGS__); \
303} while (0)
304
305#else
306#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
307#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
308#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
309#endif
310
311#if 1 /* defined(__KDEBUGALL) */
312
313#define ASSERT(X) \
314do { \
315 if (unlikely(!(X))) { \
316 printk(KERN_ERR "\n"); \
317 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
318 BUG(); \
319 } \
320} while (0)
321
322#define ASSERTCMP(X, OP, Y) \
323do { \
324 if (unlikely(!((X) OP (Y)))) { \
325 printk(KERN_ERR "\n"); \
326 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
327 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
328 (unsigned long)(X), (unsigned long)(Y)); \
329 BUG(); \
330 } \
331} while (0)
332
333#define ASSERTIF(C, X) \
334do { \
335 if (unlikely((C) && !(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTIFCMP(C, X, OP, Y) \
343do { \
344 if (unlikely((C) && !((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#else
354
355#define ASSERT(X) do {} while (0)
356#define ASSERTCMP(X, OP, Y) do {} while (0)
357#define ASSERTIF(C, X) do {} while (0)
358#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
359
360#endif
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
1/* Key to pathname encoder
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include "internal.h"
14
15static const char cachefiles_charmap[64] =
16 "0123456789" /* 0 - 9 */
17 "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
18 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
19 "_-" /* 62 - 63 */
20 ;
21
22static const char cachefiles_filecharmap[256] = {
23 /* we skip space and tab and control chars */
24 [33 ... 46] = 1, /* '!' -> '.' */
25 /* we skip '/' as it's significant to pathwalk */
26 [48 ... 127] = 1, /* '0' -> '~' */
27};
28
29/*
30 * turn the raw key into something cooked
31 * - the raw key should include the length in the two bytes at the front
32 * - the key may be up to 514 bytes in length (including the length word)
33 * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
34 * cooked
35 * - need to cut the cooked key into 252 char lengths (189 raw bytes)
36 */
37char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
38{
39 unsigned char csum, ch;
40 unsigned int acc;
41 char *key;
42 int loop, len, max, seg, mark, print;
43
44 _enter(",%d", keylen);
45
46 BUG_ON(keylen < 2 || keylen > 514);
47
48 csum = raw[0] + raw[1];
49 print = 1;
50 for (loop = 2; loop < keylen; loop++) {
51 ch = raw[loop];
52 csum += ch;
53 print &= cachefiles_filecharmap[ch];
54 }
55
56 if (print) {
57 /* if the path is usable ASCII, then we render it directly */
58 max = keylen - 2;
59 max += 2; /* two base64'd length chars on the front */
60 max += 5; /* @checksum/M */
61 max += 3 * 2; /* maximum number of segment dividers (".../M")
62 * is ((514 + 251) / 252) = 3
63 */
64 max += 1; /* NUL on end */
65 } else {
66 /* calculate the maximum length of the cooked key */
67 keylen = (keylen + 2) / 3;
68
69 max = keylen * 4;
70 max += 5; /* @checksum/M */
71 max += 3 * 2; /* maximum number of segment dividers (".../M")
72 * is ((514 + 188) / 189) = 3
73 */
74 max += 1; /* NUL on end */
75 }
76
77 max += 1; /* 2nd NUL on end */
78
79 _debug("max: %d", max);
80
81 key = kmalloc(max, GFP_KERNEL);
82 if (!key)
83 return NULL;
84
85 len = 0;
86
87 /* build the cooked key */
88 sprintf(key, "@%02x%c+", (unsigned) csum, 0);
89 len = 5;
90 mark = len - 1;
91
92 if (print) {
93 acc = *(uint16_t *) raw;
94 raw += 2;
95
96 key[len + 1] = cachefiles_charmap[acc & 63];
97 acc >>= 6;
98 key[len] = cachefiles_charmap[acc & 63];
99 len += 2;
100
101 seg = 250;
102 for (loop = keylen; loop > 0; loop--) {
103 if (seg <= 0) {
104 key[len++] = '\0';
105 mark = len;
106 key[len++] = '+';
107 seg = 252;
108 }
109
110 key[len++] = *raw++;
111 ASSERT(len < max);
112 }
113
114 switch (type) {
115 case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
116 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
117 default: type = 'S'; break;
118 }
119 } else {
120 seg = 252;
121 for (loop = keylen; loop > 0; loop--) {
122 if (seg <= 0) {
123 key[len++] = '\0';
124 mark = len;
125 key[len++] = '+';
126 seg = 252;
127 }
128
129 acc = *raw++;
130 acc |= *raw++ << 8;
131 acc |= *raw++ << 16;
132
133 _debug("acc: %06x", acc);
134
135 key[len++] = cachefiles_charmap[acc & 63];
136 acc >>= 6;
137 key[len++] = cachefiles_charmap[acc & 63];
138 acc >>= 6;
139 key[len++] = cachefiles_charmap[acc & 63];
140 acc >>= 6;
141 key[len++] = cachefiles_charmap[acc & 63];
142
143 ASSERT(len < max);
144 }
145
146 switch (type) {
147 case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
148 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
149 default: type = 'T'; break;
150 }
151 }
152
153 key[mark] = type;
154 key[len++] = 0;
155 key[len] = 0;
156
157 _leave(" = %p %d", key, len);
158 return key;
159}
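The non-printable branch above packs each 3 raw bytes into an accumulator and emits four characters from the 64-entry charmap, least-significant 6 bits first. That inner step, isolated into a runnable sketch (the charmap is copied from this file; the driver around it is illustrative only):

    /* Userspace sketch of the 3-raw-bytes -> 4-cooked-chars step used by
     * cachefiles_cook_key() for keys that are not usable ASCII. */
    #include <stdio.h>

    static const char charmap[64] =
    	"0123456789"
    	"abcdefghijklmnopqrstuvwxyz"
    	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    	"_-";

    static void cook3(const unsigned char raw[3], char out[4])
    {
    	unsigned int acc = raw[0] | raw[1] << 8 | raw[2] << 16;
    	int i;

    	for (i = 0; i < 4; i++) {	/* low 6 bits first, as in key.c */
    		out[i] = charmap[acc & 63];
    		acc >>= 6;
    	}
    }

    int main(void)
    {
    	const unsigned char raw[3] = { 0x01, 0x02, 0x03 };
    	char out[5] = { 0 };

    	cook3(raw, out);
    	printf("%02x%02x%02x -> \"%s\"\n", raw[0], raw[1], raw[2], out);
    	return 0;
    }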
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
1/* Network filesystem caching backend to use cache files on a premounted
2 * filesystem
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/namei.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/sysctl.h>
24#include <linux/miscdevice.h>
25#include "internal.h"
26
27unsigned cachefiles_debug;
28module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(debug, "CacheFiles debugging mask");
30
31MODULE_DESCRIPTION("Mounted-filesystem based cache");
32MODULE_AUTHOR("Red Hat, Inc.");
33MODULE_LICENSE("GPL");
34
35struct kmem_cache *cachefiles_object_jar;
36
37static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops,
41};
42
43static void cachefiles_object_init_once(void *_object)
44{
45 struct cachefiles_object *object = _object;
46
47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock);
49}
50
51/*
52 * initialise the fs caching module
53 */
54static int __init cachefiles_init(void)
55{
56 int ret;
57
58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0)
60 goto error_dev;
61
62 /* create an object jar */
63 ret = -ENOMEM;
64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object),
67 0,
68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) {
71 printk(KERN_NOTICE
72 "CacheFiles: Failed to allocate an object jar\n");
73 goto error_object_jar;
74 }
75
76 ret = cachefiles_proc_init();
77 if (ret < 0)
78 goto error_proc;
79
80 printk(KERN_INFO "CacheFiles: Loaded\n");
81 return 0;
82
83error_proc:
84 kmem_cache_destroy(cachefiles_object_jar);
85error_object_jar:
86 misc_deregister(&cachefiles_dev);
87error_dev:
88 kerror("failed to register: %d", ret);
89 return ret;
90}
91
92fs_initcall(cachefiles_init);
93
94/*
95 * clean up on module removal
96 */
97static void __exit cachefiles_exit(void)
98{
99 printk(KERN_INFO "CacheFiles: Unloading\n");
100
101 cachefiles_proc_cleanup();
102 kmem_cache_destroy(cachefiles_object_jar);
103 misc_deregister(&cachefiles_dev);
104}
105
106module_exit(cachefiles_exit);
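cachefiles_init() follows the kernel's usual goto-unwind idiom: each acquired resource has a matching error label, and a failure jumps to the label that releases everything obtained so far, in reverse order. The shape, reduced to a neutral userspace sketch (the acquire/release functions are placeholders, not CacheFiles APIs):

    /* Neutral sketch of the goto-based unwind used in cachefiles_init():
     * resources are released in reverse order of acquisition, and each
     * failure jumps past the release of anything not yet acquired. */
    #include <stdio.h>

    static int acquire_a(void) { return 0; }
    static int acquire_b(void) { return 0; }
    static int acquire_c(void) { return -1; }	/* pretend this fails */
    static void release_a(void) { puts("release a"); }
    static void release_b(void) { puts("release b"); }

    static int init(void)
    {
    	int ret;

    	ret = acquire_a();
    	if (ret < 0)
    		goto err;
    	ret = acquire_b();
    	if (ret < 0)
    		goto err_a;
    	ret = acquire_c();
    	if (ret < 0)
    		goto err_b;
    	return 0;

    err_b:
    	release_b();
    err_a:
    	release_a();
    err:
    	fprintf(stderr, "init failed: %d\n", ret);
    	return ret;
    }

    int main(void)
    {
    	return init() ? 1 : 0;
    }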
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
1/* CacheFiles path walking and related routines
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include <linux/mount.h>
20#include <linux/namei.h>
21#include <linux/security.h>
22#include "internal.h"
23
24static int cachefiles_wait_bit(void *flags)
25{
26 schedule();
27 return 0;
28}
29
30/*
31 * record the fact that an object is now active
32 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object)
35{
36 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL;
38 struct dentry *dentry;
39
40 _enter(",%p", object);
41
42try_again:
43 write_lock(&cache->active_lock);
44
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
46 BUG();
47
48 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node;
50 while (*_p) {
51 _parent = *_p;
52 xobject = rb_entry(_parent,
53 struct cachefiles_object, active_node);
54
55 ASSERT(xobject != object);
56
57 if (xobject->dentry > dentry)
58 _p = &(*_p)->rb_left;
59 else if (xobject->dentry < dentry)
60 _p = &(*_p)->rb_right;
61 else
62 goto wait_for_old_object;
63 }
64
65 rb_link_node(&object->active_node, _parent, _p);
66 rb_insert_color(&object->active_node, &cache->active_nodes);
67
68 write_unlock(&cache->active_lock);
69 _leave("");
70 return;
71
72 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */
74wait_for_old_object:
75 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
76 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n",
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG();
106 }
107 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock);
109
110 _debug(">>> wait");
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
113 _debug("<<< waited");
114
115 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again;
117}
118
119/*
120 * delete an object representation from the cache
121 * - file-backed objects are unlinked
122 * - directory-backed objects are stuffed into the graveyard for userspace to
123 * delete
124 * - unlocks the directory mutex
125 */
126static int cachefiles_bury_object(struct cachefiles_cache *cache,
127 struct dentry *dir,
128 struct dentry *rep)
129{
130 struct dentry *grave, *trap;
131 char nbuffer[8 + 8 + 1];
132 int ret;
133
134 _enter(",'%*.*s','%*.*s'",
135 dir->d_name.len, dir->d_name.len, dir->d_name.name,
136 rep->d_name.len, rep->d_name.len, rep->d_name.name);
137
138 /* non-directories can just be unlinked */
139 if (!S_ISDIR(rep->d_inode->i_mode)) {
140 _debug("unlink stale object");
141 ret = vfs_unlink(dir->d_inode, rep);
142
143 mutex_unlock(&dir->d_inode->i_mutex);
144
145 if (ret == -EIO)
146 cachefiles_io_error(cache, "Unlink failed");
147
148 _leave(" = %d", ret);
149 return ret;
150 }
151
152 /* directories have to be moved to the graveyard */
153 _debug("move stale object to graveyard");
154 mutex_unlock(&dir->d_inode->i_mutex);
155
156try_again:
157 /* first step is to make up a grave dentry in the graveyard */
158 sprintf(nbuffer, "%08x%08x",
159 (uint32_t) get_seconds(),
160 (uint32_t) atomic_inc_return(&cache->gravecounter));
161
162 /* do the multiway lock magic */
163 trap = lock_rename(cache->graveyard, dir);
164
165 /* do some checks before getting the grave dentry */
166 if (rep->d_parent != dir) {
167 /* the entry was probably culled when we dropped the parent dir
168 * lock */
169 unlock_rename(cache->graveyard, dir);
170 _leave(" = 0 [culled?]");
171 return 0;
172 }
173
174 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
175 unlock_rename(cache->graveyard, dir);
176 cachefiles_io_error(cache, "Graveyard no longer a directory");
177 return -EIO;
178 }
179
180 if (trap == rep) {
181 unlock_rename(cache->graveyard, dir);
182 cachefiles_io_error(cache, "May not make directory loop");
183 return -EIO;
184 }
185
186 if (d_mountpoint(rep)) {
187 unlock_rename(cache->graveyard, dir);
188 cachefiles_io_error(cache, "Mountpoint in cache");
189 return -EIO;
190 }
191
192 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
193 if (IS_ERR(grave)) {
194 unlock_rename(cache->graveyard, dir);
195
196 if (PTR_ERR(grave) == -ENOMEM) {
197 _leave(" = -ENOMEM");
198 return -ENOMEM;
199 }
200
201 cachefiles_io_error(cache, "Lookup error %ld",
202 PTR_ERR(grave));
203 return -EIO;
204 }
205
206 if (grave->d_inode) {
207 unlock_rename(cache->graveyard, dir);
208 dput(grave);
209 grave = NULL;
210 cond_resched();
211 goto try_again;
212 }
213
214 if (d_mountpoint(grave)) {
215 unlock_rename(cache->graveyard, dir);
216 dput(grave);
217 cachefiles_io_error(cache, "Mountpoint in graveyard");
218 return -EIO;
219 }
220
221 /* target should not be an ancestor of source */
222 if (trap == grave) {
223 unlock_rename(cache->graveyard, dir);
224 dput(grave);
225 cachefiles_io_error(cache, "May not make directory loop");
226 return -EIO;
227 }
228
229 /* attempt the rename */
230 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
231 if (ret != 0 && ret != -ENOMEM)
232 cachefiles_io_error(cache, "Rename failed with error %d", ret);
233
234 unlock_rename(cache->graveyard, dir);
235 dput(grave);
236 _leave(" = 0");
237 return 0;
238}
239
240/*
241 * delete an object representation from the cache
242 */
243int cachefiles_delete_object(struct cachefiles_cache *cache,
244 struct cachefiles_object *object)
245{
246 struct dentry *dir;
247 int ret;
248
249 _enter(",{%p}", object->dentry);
250
251 ASSERT(object->dentry);
252 ASSERT(object->dentry->d_inode);
253 ASSERT(object->dentry->d_parent);
254
255 dir = dget_parent(object->dentry);
256
257 mutex_lock(&dir->d_inode->i_mutex);
258 ret = cachefiles_bury_object(cache, dir, object->dentry);
259
260 dput(dir);
261 _leave(" = %d", ret);
262 return ret;
263}
264
265/*
266 * walk from the parent object to the child object through the backing
267 * filesystem, creating directories as we go
268 */
269int cachefiles_walk_to_object(struct cachefiles_object *parent,
270 struct cachefiles_object *object,
271 const char *key,
272 struct cachefiles_xattr *auxdata)
273{
274 struct cachefiles_cache *cache;
275 struct dentry *dir, *next = NULL;
276 unsigned long start;
277 const char *name;
278 int ret, nlen;
279
280 _enter("{%p},,%s,", parent->dentry, key);
281
282 cache = container_of(parent->fscache.cache,
283 struct cachefiles_cache, cache);
284
285 ASSERT(parent->dentry);
286 ASSERT(parent->dentry->d_inode);
287
288 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
289 // TODO: convert file to dir
290 _leave("looking up in non-directory");
291 return -ENOBUFS;
292 }
293
294 dir = dget(parent->dentry);
295
296advance:
297 /* attempt to transit the first directory component */
298 name = key;
299 nlen = strlen(key);
300
301 /* key ends in a double NUL */
302 key = key + nlen + 1;
303 if (!*key)
304 key = NULL;
305
306lookup_again:
307 /* search the current directory for the element name */
308 _debug("lookup '%s'", name);
309
310 mutex_lock(&dir->d_inode->i_mutex);
311
312 start = jiffies;
313 next = lookup_one_len(name, dir, nlen);
314 cachefiles_hist(cachefiles_lookup_histogram, start);
315 if (IS_ERR(next))
316 goto lookup_error;
317
318 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
319
320 if (!key)
321 object->new = !next->d_inode;
322
323 /* if this element of the path doesn't exist, then the lookup phase
324 * failed, and we can release any readers in the certain knowledge that
325 * there's nothing for them to actually read */
326 if (!next->d_inode)
327 fscache_object_lookup_negative(&object->fscache);
328
329 /* we need to create the object if it's negative */
330 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
331 /* index objects and intervening tree levels must be subdirs */
332 if (!next->d_inode) {
333 ret = cachefiles_has_space(cache, 1, 0);
334 if (ret < 0)
335 goto create_error;
336
337 start = jiffies;
338 ret = vfs_mkdir(dir->d_inode, next, 0);
339 cachefiles_hist(cachefiles_mkdir_histogram, start);
340 if (ret < 0)
341 goto create_error;
342
343 ASSERT(next->d_inode);
344
345 _debug("mkdir -> %p{%p{ino=%lu}}",
346 next, next->d_inode, next->d_inode->i_ino);
347
348 } else if (!S_ISDIR(next->d_inode->i_mode)) {
349 kerror("inode %lu is not a directory",
350 next->d_inode->i_ino);
351 ret = -ENOBUFS;
352 goto error;
353 }
354
355 } else {
356 /* non-index objects start out life as files */
357 if (!next->d_inode) {
358 ret = cachefiles_has_space(cache, 1, 0);
359 if (ret < 0)
360 goto create_error;
361
362 start = jiffies;
363 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
364 cachefiles_hist(cachefiles_create_histogram, start);
365 if (ret < 0)
366 goto create_error;
367
368 ASSERT(next->d_inode);
369
370 _debug("create -> %p{%p{ino=%lu}}",
371 next, next->d_inode, next->d_inode->i_ino);
372
373 } else if (!S_ISDIR(next->d_inode->i_mode) &&
374 !S_ISREG(next->d_inode->i_mode)
375 ) {
376 kerror("inode %lu is not a file or directory",
377 next->d_inode->i_ino);
378 ret = -ENOBUFS;
379 goto error;
380 }
381 }
382
383 /* process the next component */
384 if (key) {
385 _debug("advance");
386 mutex_unlock(&dir->d_inode->i_mutex);
387 dput(dir);
388 dir = next;
389 next = NULL;
390 goto advance;
391 }
392
393 /* we've found the object we were looking for */
394 object->dentry = next;
395
396 /* if we've found that the terminal object exists, then we need to
397 * check its attributes and delete it if it's out of date */
398 if (!object->new) {
399 _debug("validate '%*.*s'",
400 next->d_name.len, next->d_name.len, next->d_name.name);
401
402 ret = cachefiles_check_object_xattr(object, auxdata);
403 if (ret == -ESTALE) {
404 /* delete the object (the deleter drops the directory
405 * mutex) */
406 object->dentry = NULL;
407
408 ret = cachefiles_bury_object(cache, dir, next);
409 dput(next);
410 next = NULL;
411
412 if (ret < 0)
413 goto delete_error;
414
415 _debug("redo lookup");
416 goto lookup_again;
417 }
418 }
419
420 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object);
422
423 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir);
425 dir = NULL;
426
427 _debug("=== OBTAINED_OBJECT ===");
428
429 if (object->new) {
430 /* attach data to a newly constructed terminal object */
431 ret = cachefiles_set_object_xattr(object, auxdata);
432 if (ret < 0)
433 goto check_error;
434 } else {
435 /* always update the atime on an object we've just looked up
436 * (this is used to keep track of culling, and atimes are only
437 * updated by read, write and readdir but not lookup or
438 * open) */
439 touch_atime(cache->mnt, next);
440 }
441
442 /* open a file interface onto a data file */
443 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
444 if (S_ISREG(object->dentry->d_inode->i_mode)) {
445 const struct address_space_operations *aops;
446
447 ret = -EPERM;
448 aops = object->dentry->d_inode->i_mapping->a_ops;
449 if (!aops->bmap)
450 goto check_error;
451
452 object->backer = object->dentry;
453 } else {
454 BUG(); // TODO: open file in data-class subdir
455 }
456 }
457
458 object->new = 0;
459 fscache_obtained_object(&object->fscache);
460
461 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
462 return 0;
463
464create_error:
465 _debug("create error %d", ret);
466 if (ret == -EIO)
467 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error;
469
470check_error:
471 _debug("check error %d", ret);
472 write_lock(&cache->active_lock);
473 rb_erase(&object->active_node, &cache->active_nodes);
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock);
477
478 dput(object->dentry);
479 object->dentry = NULL;
480 goto error_out;
481
482delete_error:
483 _debug("delete error %d", ret);
484 goto error_out2;
485
486lookup_error:
487 _debug("lookup error %ld", PTR_ERR(next));
488 ret = PTR_ERR(next);
489 if (ret == -EIO)
490 cachefiles_io_error(cache, "Lookup failed");
491 next = NULL;
492error:
493 mutex_unlock(&dir->d_inode->i_mutex);
494 dput(next);
495error_out2:
496 dput(dir);
497error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret);
502 return ret;
503}
504
505/*
506 * get a subdirectory
507 */
508struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
509 struct dentry *dir,
510 const char *dirname)
511{
512 struct dentry *subdir;
513 unsigned long start;
514 int ret;
515
516 _enter(",,%s", dirname);
517
518 /* search the current directory for the element name */
519 mutex_lock(&dir->d_inode->i_mutex);
520
521 start = jiffies;
522 subdir = lookup_one_len(dirname, dir, strlen(dirname));
523 cachefiles_hist(cachefiles_lookup_histogram, start);
524 if (IS_ERR(subdir)) {
525 if (PTR_ERR(subdir) == -ENOMEM)
526 goto nomem_d_alloc;
527 goto lookup_error;
528 }
529
530 _debug("subdir -> %p %s",
531 subdir, subdir->d_inode ? "positive" : "negative");
532
533 /* we need to create the subdir if it doesn't exist yet */
534 if (!subdir->d_inode) {
535 ret = cachefiles_has_space(cache, 1, 0);
536 if (ret < 0)
537 goto mkdir_error;
538
539 _debug("attempt mkdir");
540
541 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
542 if (ret < 0)
543 goto mkdir_error;
544
545 ASSERT(subdir->d_inode);
546
547 _debug("mkdir -> %p{%p{ino=%lu}}",
548 subdir,
549 subdir->d_inode,
550 subdir->d_inode->i_ino);
551 }
552
553 mutex_unlock(&dir->d_inode->i_mutex);
554
555 /* we need to make sure the subdir is a directory */
556 ASSERT(subdir->d_inode);
557
558 if (!S_ISDIR(subdir->d_inode->i_mode)) {
559 kerror("%s is not a directory", dirname);
560 ret = -EIO;
561 goto check_error;
562 }
563
564 ret = -EPERM;
565 if (!subdir->d_inode->i_op ||
566 !subdir->d_inode->i_op->setxattr ||
567 !subdir->d_inode->i_op->getxattr ||
568 !subdir->d_inode->i_op->lookup ||
569 !subdir->d_inode->i_op->mkdir ||
570 !subdir->d_inode->i_op->create ||
571 !subdir->d_inode->i_op->rename ||
572 !subdir->d_inode->i_op->rmdir ||
573 !subdir->d_inode->i_op->unlink)
574 goto check_error;
575
576 _leave(" = [%lu]", subdir->d_inode->i_ino);
577 return subdir;
578
579check_error:
580 dput(subdir);
581 _leave(" = %d [check]", ret);
582 return ERR_PTR(ret);
583
584mkdir_error:
585 mutex_unlock(&dir->d_inode->i_mutex);
586 dput(subdir);
587 kerror("mkdir %s failed with error %d", dirname, ret);
588 return ERR_PTR(ret);
589
590lookup_error:
591 mutex_unlock(&dir->d_inode->i_mutex);
592 ret = PTR_ERR(subdir);
593 kerror("Lookup %s failed with error %d", dirname, ret);
594 return ERR_PTR(ret);
595
596nomem_d_alloc:
597 mutex_unlock(&dir->d_inode->i_mutex);
598 _leave(" = -ENOMEM");
599 return ERR_PTR(-ENOMEM);
600}
601
602/*
603 * find out if an object is in use or not
604 * - if it finds the object and it's not in use:
605 * - returns a pointer to its dentry and a reference on it
606 * - returns with the directory locked
607 */
608static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
609 struct dentry *dir,
610 char *filename)
611{
612 struct cachefiles_object *object;
613 struct rb_node *_n;
614 struct dentry *victim;
615 unsigned long start;
616 int ret;
617
618 //_enter(",%*.*s/,%s",
619 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
620
621 /* look up the victim */
622 mutex_lock_nested(&dir->d_inode->i_mutex, 1);
623
624 start = jiffies;
625 victim = lookup_one_len(filename, dir, strlen(filename));
626 cachefiles_hist(cachefiles_lookup_histogram, start);
627 if (IS_ERR(victim))
628 goto lookup_error;
629
630 //_debug("victim -> %p %s",
631 // victim, victim->d_inode ? "positive" : "negative");
632
633 /* if the object is no longer there then we probably retired the object
634 * at the netfs's request whilst the cull was in progress
635 */
636 if (!victim->d_inode) {
637 mutex_unlock(&dir->d_inode->i_mutex);
638 dput(victim);
639 _leave(" = -ENOENT [absent]");
640 return ERR_PTR(-ENOENT);
641 }
642
643 /* check to see if we're using this object */
644 read_lock(&cache->active_lock);
645
646 _n = cache->active_nodes.rb_node;
647
648 while (_n) {
649 object = rb_entry(_n, struct cachefiles_object, active_node);
650
651 if (object->dentry > victim)
652 _n = _n->rb_left;
653 else if (object->dentry < victim)
654 _n = _n->rb_right;
655 else
656 goto object_in_use;
657 }
658
659 read_unlock(&cache->active_lock);
660
661 //_leave(" = %p", victim);
662 return victim;
663
664object_in_use:
665 read_unlock(&cache->active_lock);
666 mutex_unlock(&dir->d_inode->i_mutex);
667 dput(victim);
668 //_leave(" = -EBUSY [in use]");
669 return ERR_PTR(-EBUSY);
670
671lookup_error:
672 mutex_unlock(&dir->d_inode->i_mutex);
673 ret = PTR_ERR(victim);
674 if (ret == -ENOENT) {
675 /* file or dir now absent - probably retired by netfs */
676 _leave(" = -ESTALE [absent]");
677 return ERR_PTR(-ESTALE);
678 }
679
680 if (ret == -EIO) {
681 cachefiles_io_error(cache, "Lookup failed");
682 } else if (ret != -ENOMEM) {
683 kerror("Internal error: %d", ret);
684 ret = -EIO;
685 }
686
687 _leave(" = %d", ret);
688 return ERR_PTR(ret);
689}
690
691/*
692 * cull an object if it's not in use
693 * - called only by cache manager daemon
694 */
695int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
696 char *filename)
697{
698 struct dentry *victim;
699 int ret;
700
701 _enter(",%*.*s/,%s",
702 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
703
704 victim = cachefiles_check_active(cache, dir, filename);
705 if (IS_ERR(victim))
706 return PTR_ERR(victim);
707
708 _debug("victim -> %p %s",
709 victim, victim->d_inode ? "positive" : "negative");
710
711 /* okay... the victim is not being used so we can cull it
712 * - start by marking it as stale
713 */
714 _debug("victim is cullable");
715
716 ret = cachefiles_remove_object_xattr(cache, victim);
717 if (ret < 0)
718 goto error_unlock;
719
720 /* actually remove the victim (drops the dir mutex) */
721 _debug("bury");
722
723 ret = cachefiles_bury_object(cache, dir, victim);
724 if (ret < 0)
725 goto error;
726
727 dput(victim);
728 _leave(" = 0");
729 return 0;
730
731error_unlock:
732 mutex_unlock(&dir->d_inode->i_mutex);
733error:
734 dput(victim);
735 if (ret == -ENOENT) {
736 /* file or dir now absent - probably retired by netfs */
737 _leave(" = -ESTALE [absent]");
738 return -ESTALE;
739 }
740
741 if (ret != -ENOMEM) {
742 kerror("Internal error: %d", ret);
743 ret = -EIO;
744 }
745
746 _leave(" = %d", ret);
747 return ret;
748}
749
750/*
751 * find out if an object is in use or not
752 * - called only by cache manager daemon
753 * - returns -EBUSY or 0 to indicate whether an object is in use or not
754 */
755int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
756 char *filename)
757{
758 struct dentry *victim;
759
760 //_enter(",%*.*s/,%s",
761 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
762
763 victim = cachefiles_check_active(cache, dir, filename);
764 if (IS_ERR(victim))
765 return PTR_ERR(victim);
766
767 mutex_unlock(&dir->d_inode->i_mutex);
768 dput(victim);
769 //_leave(" = 0");
770 return 0;
771}
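cachefiles_walk_to_object() consumes its key as a list of NUL-separated path elements terminated by a double NUL: each pass takes one element, steps over its terminator with key + nlen + 1, and stops when the next byte is itself NUL. The walk in isolation, using a made-up three-element key rather than a real cooked key:

    /* Userspace sketch of the double-NUL-terminated segment walk used by
     * cachefiles_walk_to_object().  The bytes are "@c1\0I3a\0qux\0\0":
     * three elements, then an empty string that ends the walk. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	const char *key = "@c1\0I3a\0qux\0";	/* literal adds the 2nd NUL */

    	while (key) {
    		const char *name = key;
    		size_t nlen = strlen(key);

    		key += nlen + 1;	/* step over this element and its NUL */
    		if (!*key)
    			key = NULL;	/* double NUL: last element reached */

    		printf("lookup '%s'%s\n", name, key ? "" : " (terminal)");
    	}
    	return 0;
    }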
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
1/* CacheFiles statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "internal.h"
16
17atomic_t cachefiles_lookup_histogram[HZ];
18atomic_t cachefiles_mkdir_histogram[HZ];
19atomic_t cachefiles_create_histogram[HZ];
20
21/*
22 * display the latency histogram
23 */
24static int cachefiles_histogram_show(struct seq_file *m, void *v)
25{
26 unsigned long index;
27 unsigned x, y, z, t;
28
29 switch ((unsigned long) v) {
30 case 1:
31 seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
32 return 0;
33 case 2:
34 seq_puts(m, "===== ===== ========= ========= =========\n");
35 return 0;
36 default:
37 index = (unsigned long) v - 3;
38 x = atomic_read(&cachefiles_lookup_histogram[index]);
39 y = atomic_read(&cachefiles_mkdir_histogram[index]);
40 z = atomic_read(&cachefiles_create_histogram[index]);
41 if (x == 0 && y == 0 && z == 0)
42 return 0;
43
44 t = (index * 1000) / HZ;
45
46 seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
47 return 0;
48 }
49}
50
51/*
52 * set up the iterator to start reading from the first line
53 */
54static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
55{
56 if ((unsigned long long)*_pos >= HZ + 2)
57 return NULL;
58 if (*_pos == 0)
59 *_pos = 1;
60 return (void *)(unsigned long) *_pos;
61}
62
63/*
64 * move to the next line
65 */
66static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
67{
68 (*pos)++;
69 return (unsigned long long)*pos > HZ + 2 ?
70 NULL : (void *)(unsigned long) *pos;
71}
72
73/*
74 * clean up after reading
75 */
76static void cachefiles_histogram_stop(struct seq_file *m, void *v)
77{
78}
79
80static const struct seq_operations cachefiles_histogram_ops = {
81 .start = cachefiles_histogram_start,
82 .stop = cachefiles_histogram_stop,
83 .next = cachefiles_histogram_next,
84 .show = cachefiles_histogram_show,
85};
86
87/*
88 * open "/proc/fs/cachefiles/XXX" which provides statistics summaries
89 */
90static int cachefiles_histogram_open(struct inode *inode, struct file *file)
91{
92 return seq_open(file, &cachefiles_histogram_ops);
93}
94
95static const struct file_operations cachefiles_histogram_fops = {
96 .owner = THIS_MODULE,
97 .open = cachefiles_histogram_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = seq_release,
101};
102
103/*
104 * initialise the /proc/fs/cachefiles/ directory
105 */
106int __init cachefiles_proc_init(void)
107{
108 _enter("");
109
110 if (!proc_mkdir("fs/cachefiles", NULL))
111 goto error_dir;
112
113 if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
114 &cachefiles_histogram_fops))
115 goto error_histogram;
116
117 _leave(" = 0");
118 return 0;
119
120error_histogram:
121 remove_proc_entry("fs/cachefiles", NULL);
122error_dir:
123 _leave(" = -ENOMEM");
124 return -ENOMEM;
125}
126
127/*
128 * clean up the /proc/fs/cachefiles/ directory
129 */
130void cachefiles_proc_cleanup(void)
131{
132 remove_proc_entry("fs/cachefiles/histogram", NULL);
133 remove_proc_entry("fs/cachefiles", NULL);
134}
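The seq_file iterator above maps positions onto output rows: positions 1 and 2 are the two header lines, and position p >= 3 displays histogram bucket p - 3, with t = (index * 1000) / HZ converting the bucket's jiffy index into milliseconds. A standalone sketch of that mapping with a stand-in HZ value (with HZ = 250, bucket 5 prints as 0.020 seconds, since (5 * 1000) / 250 = 20):

    /* Sketch of the pos -> row mapping used by the histogram seq_file ops:
     * positions 1 and 2 are headers, position p >= 3 is bucket p - 3. */
    #include <stdio.h>

    #define HZ 250

    int main(void)
    {
    	unsigned long pos;

    	for (pos = 1; pos < HZ + 3; pos++) {
    		if (pos == 1)
    			puts("JIFS   SECS");
    		else if (pos == 2)
    			puts("=====  =====");
    		else {
    			unsigned long index = pos - 3;
    			unsigned t = (index * 1000) / HZ;	/* jiffies -> ms */

    			if (index == 5)	/* print one sample row */
    				printf("%4lu   0.%03u\n", index, t);
    		}
    	}
    	return 0;
    }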
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
1/* Storage object read/write
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/file.h>
14#include "internal.h"
15
16/*
17 * detect wake up events generated by the unlocking of pages in which we're
18 * interested
19 * - we use this to detect read completion of backing pages
20 * - the caller holds the waitqueue lock
21 */
22static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
23 int sync, void *_key)
24{
25 struct cachefiles_one_read *monitor =
26 container_of(wait, struct cachefiles_one_read, monitor);
27 struct cachefiles_object *object;
28 struct wait_bit_key *key = _key;
29 struct page *page = wait->private;
30
31 ASSERT(key);
32
33 _enter("{%lu},%u,%d,{%p,%u}",
34 monitor->netfs_page->index, mode, sync,
35 key->flags, key->bit_nr);
36
37 if (key->flags != &page->flags ||
38 key->bit_nr != PG_locked)
39 return 0;
40
41 _debug("--- monitor %p %lx ---", page, page->flags);
42
43 if (!PageUptodate(page) && !PageError(page))
44 dump_stack();
45
46 /* remove from the waitqueue */
47 list_del(&wait->task_list);
48
49 /* move onto the action list and queue for FS-Cache thread pool */
50 ASSERT(monitor->op);
51
52 object = container_of(monitor->op->op.object,
53 struct cachefiles_object, fscache);
54
55 spin_lock(&object->work_lock);
56 list_add_tail(&monitor->op_link, &monitor->op->to_do);
57 spin_unlock(&object->work_lock);
58
59 fscache_enqueue_retrieval(monitor->op);
60 return 0;
61}
62
63/*
64 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool
66 */
67static void cachefiles_read_copier(struct fscache_operation *_op)
68{
69 struct cachefiles_one_read *monitor;
70 struct cachefiles_object *object;
71 struct fscache_retrieval *op;
72 struct pagevec pagevec;
73 int error, max;
74
75 op = container_of(_op, struct fscache_retrieval, op);
76 object = container_of(op->op.object,
77 struct cachefiles_object, fscache);
78
79 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
80
81 pagevec_init(&pagevec, 0);
82
83 max = 8;
84 spin_lock_irq(&object->work_lock);
85
86 while (!list_empty(&op->to_do)) {
87 monitor = list_entry(op->to_do.next,
88 struct cachefiles_one_read, op_link);
89 list_del(&monitor->op_link);
90
91 spin_unlock_irq(&object->work_lock);
92
93 _debug("- copy {%lu}", monitor->back_page->index);
94
95 error = -EIO;
96 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page);
98
99 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0;
102 }
103
104 if (error)
105 cachefiles_io_error_obj(
106 object,
107 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags);
109
110 page_cache_release(monitor->back_page);
111
112 fscache_end_io(op, monitor->netfs_page, error);
113 page_cache_release(monitor->netfs_page);
114 fscache_put_retrieval(op);
115 kfree(monitor);
116
117 /* let the thread pool have some air occasionally */
118 max--;
119 if (max < 0 || need_resched()) {
120 if (!list_empty(&op->to_do))
121 fscache_enqueue_retrieval(op);
122 _leave(" [maxed out]");
123 return;
124 }
125
126 spin_lock_irq(&object->work_lock);
127 }
128
129 spin_unlock_irq(&object->work_lock);
130 _leave("");
131}
132
133/*
134 * read the corresponding page to the given set from the backing file
135 * - an uncertain page is simply discarded, to be tried again another time
136 */
137static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
138 struct fscache_retrieval *op,
139 struct page *netpage,
140 struct pagevec *pagevec)
141{
142 struct cachefiles_one_read *monitor;
143 struct address_space *bmapping;
144 struct page *newpage, *backpage;
145 int ret;
146
147 _enter("");
148
149 pagevec_reinit(pagevec);
150
151 _debug("read back %p{%lu,%d}",
152 netpage, netpage->index, page_count(netpage));
153
154 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
155 if (!monitor)
156 goto nomem;
157
158 monitor->netfs_page = netpage;
159 monitor->op = fscache_get_retrieval(op);
160
161 init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
162
163 /* attempt to get hold of the backing page */
164 bmapping = object->backer->d_inode->i_mapping;
165 newpage = NULL;
166
167 for (;;) {
168 backpage = find_get_page(bmapping, netpage->index);
169 if (backpage)
170 goto backing_page_already_present;
171
172 if (!newpage) {
173 newpage = page_cache_alloc_cold(bmapping);
174 if (!newpage)
175 goto nomem_monitor;
176 }
177
178 ret = add_to_page_cache(newpage, bmapping,
179 netpage->index, GFP_KERNEL);
180 if (ret == 0)
181 goto installed_new_backing_page;
182 if (ret != -EEXIST)
183 goto nomem_page;
184 }
185
186 /* we've installed a new backing page, so now we need to add it
187 * to the LRU list and start it reading */
188installed_new_backing_page:
189 _debug("- new %p", newpage);
190
191 backpage = newpage;
192 newpage = NULL;
193
194 page_cache_get(backpage);
195 pagevec_add(pagevec, backpage);
196 __pagevec_lru_add_file(pagevec);
197
198read_backing_page:
199 ret = bmapping->a_ops->readpage(NULL, backpage);
200 if (ret < 0)
201 goto read_error;
202
203 /* set the monitor to transfer the data across */
204monitor_backing_page:
205 _debug("- monitor add");
206
207 /* install the monitor */
208 page_cache_get(monitor->netfs_page);
209 page_cache_get(backpage);
210 monitor->back_page = backpage;
211 monitor->monitor.private = backpage;
212 add_page_wait_queue(backpage, &monitor->monitor);
213 monitor = NULL;
214
215 /* but the page may have been read before the monitor was installed, so
216 * the monitor may miss the event - so we have to ensure that we do get
217 * one in such a case */
218 if (trylock_page(backpage)) {
219 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
220 unlock_page(backpage);
221 }
222 goto success;
223
224 /* if the backing page is already present, it can be in one of
225 * three states: read in progress, read failed or read okay */
226backing_page_already_present:
227 _debug("- present");
228
229 if (newpage) {
230 page_cache_release(newpage);
231 newpage = NULL;
232 }
233
234 if (PageError(backpage))
235 goto io_error;
236
237 if (PageUptodate(backpage))
238 goto backing_page_already_uptodate;
239
240 if (!trylock_page(backpage))
241 goto monitor_backing_page;
242 _debug("read %p {%lx}", backpage, backpage->flags);
243 goto read_backing_page;
244
245 /* the backing page is already up to date, attach the netfs
246 * page to the pagecache and LRU and copy the data across */
247backing_page_already_uptodate:
248 _debug("- uptodate");
249
250 pagevec_add(pagevec, netpage);
251 fscache_mark_pages_cached(op, pagevec);
252
253 copy_highpage(netpage, backpage);
254 fscache_end_io(op, netpage, 0);
255
256success:
257 _debug("success");
258 ret = 0;
259
260out:
261 if (backpage)
262 page_cache_release(backpage);
263 if (monitor) {
264 fscache_put_retrieval(monitor->op);
265 kfree(monitor);
266 }
267 _leave(" = %d", ret);
268 return ret;
269
270read_error:
271 _debug("read error %d", ret);
272 if (ret == -ENOMEM)
273 goto out;
274io_error:
275 cachefiles_io_error_obj(object, "Page read error on backing file");
276 ret = -ENOBUFS;
277 goto out;
278
279nomem_page:
280 page_cache_release(newpage);
281nomem_monitor:
282 fscache_put_retrieval(monitor->op);
283 kfree(monitor);
284nomem:
285 _leave(" = -ENOMEM");
286 return -ENOMEM;
287}
288
289/*
290 * read a page from the cache or allocate a block in which to store it
291 * - cache withdrawal is prevented by the caller
292 * - returns -EINTR if interrupted
293 * - returns -ENOMEM if we ran out of memory
294 * - returns -ENOBUFS if no buffers can be made available
295 * - returns -ENOBUFS if page is beyond EOF
296 * - if the page is backed by a block in the cache:
297 * - a read will be started which will call the callback on completion
298 * - 0 will be returned
299 * - else if the page is unbacked:
300 * - the metadata will be retained
301 * - -ENODATA will be returned
302 */
303int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
304 struct page *page,
305 gfp_t gfp)
306{
307 struct cachefiles_object *object;
308 struct cachefiles_cache *cache;
309 struct pagevec pagevec;
310 struct inode *inode;
311 sector_t block0, block;
312 unsigned shift;
313 int ret;
314
315 object = container_of(op->op.object,
316 struct cachefiles_object, fscache);
317 cache = container_of(object->fscache.cache,
318 struct cachefiles_cache, cache);
319
320 _enter("{%p},{%lx},,,", object, page->index);
321
322 if (!object->backer)
323 return -ENOBUFS;
324
325 inode = object->backer->d_inode;
326 ASSERT(S_ISREG(inode->i_mode));
327 ASSERT(inode->i_mapping->a_ops->bmap);
328 ASSERT(inode->i_mapping->a_ops->readpages);
329
330 /* calculate the shift required to use bmap */
331 if (inode->i_sb->s_blocksize > PAGE_SIZE)
332 return -ENOBUFS;
333
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335
336 op->op.flags = FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier;
338
339 pagevec_init(&pagevec, 0);
340
341 /* we assume the absence or presence of the first block is a good
342 * enough indication for the page as a whole
343 * - TODO: don't use bmap() for this as it is _not_ actually good
344 * enough for this as it doesn't indicate errors, but it's all we've
345 * got for the moment
346 */
347 block0 = page->index;
348 block0 <<= shift;
349
350 block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
351 _debug("%llx -> %llx",
352 (unsigned long long) block0,
353 (unsigned long long) block);
354
355 if (block) {
356 /* submit the apparently valid page to the backing fs to be
357 * read from disk */
358 ret = cachefiles_read_backing_file_one(object, op, page,
359 &pagevec);
360 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
361 /* there's space in the cache we can use */
362 pagevec_add(&pagevec, page);
363 fscache_mark_pages_cached(op, &pagevec);
364 ret = -ENODATA;
365 } else {
366 ret = -ENOBUFS;
367 }
368
369 _leave(" = %d", ret);
370 return ret;
371}
372
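The shift computed in cachefiles_read_or_alloc_page() turns a page index into the index of the first backing-fs block under that page, which is what gets probed with bmap(). A worked example with illustrative sizes (4 KiB pages over a 1 KiB-block filesystem):

    /* Worked example of the page-index -> block-index conversion done
     * before the bmap() probe.  The sizes here are examples only. */
    #include <stdio.h>

    int main(void)
    {
    	unsigned page_shift = 12;		/* 4096-byte pages */
    	unsigned blocksize_bits = 10;		/* 1024-byte blocks */
    	unsigned shift = page_shift - blocksize_bits;	/* = 2 */
    	unsigned long long index = 5;		/* sixth page of the file */
    	unsigned long long block0 = index << shift;

    	/* page 5 spans blocks 20..23; block 20 is the one probed */
    	printf("page %llu -> first block %llu\n", index, block0);
    	return 0;
    }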
373/*
374 * read the corresponding pages to the given set from the backing file
375 * - any uncertain pages are simply discarded, to be tried again another time
376 */
377static int cachefiles_read_backing_file(struct cachefiles_object *object,
378 struct fscache_retrieval *op,
379 struct list_head *list,
380 struct pagevec *mark_pvec)
381{
382 struct cachefiles_one_read *monitor = NULL;
383 struct address_space *bmapping = object->backer->d_inode->i_mapping;
384 struct pagevec lru_pvec;
385 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
386 int ret = 0;
387
388 _enter("");
389
390 pagevec_init(&lru_pvec, 0);
391
392 list_for_each_entry_safe(netpage, _n, list, lru) {
393 list_del(&netpage->lru);
394
395 _debug("read back %p{%lu,%d}",
396 netpage, netpage->index, page_count(netpage));
397
398 if (!monitor) {
399 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
400 if (!monitor)
401 goto nomem;
402
403 monitor->op = fscache_get_retrieval(op);
404 init_waitqueue_func_entry(&monitor->monitor,
405 cachefiles_read_waiter);
406 }
407
408 for (;;) {
409 backpage = find_get_page(bmapping, netpage->index);
410 if (backpage)
411 goto backing_page_already_present;
412
413 if (!newpage) {
414 newpage = page_cache_alloc_cold(bmapping);
415 if (!newpage)
416 goto nomem;
417 }
418
419 ret = add_to_page_cache(newpage, bmapping,
420 netpage->index, GFP_KERNEL);
421 if (ret == 0)
422 goto installed_new_backing_page;
423 if (ret != -EEXIST)
424 goto nomem;
425 }
426
427 /* we've installed a new backing page, so now we need to add it
428 * to the LRU list and start it reading */
429 installed_new_backing_page:
430 _debug("- new %p", newpage);
431
432 backpage = newpage;
433 newpage = NULL;
434
435 page_cache_get(backpage);
436 if (!pagevec_add(&lru_pvec, backpage))
437 __pagevec_lru_add_file(&lru_pvec);
438
439 reread_backing_page:
440 ret = bmapping->a_ops->readpage(NULL, backpage);
441 if (ret < 0)
442 goto read_error;
443
444 /* add the netfs page to the pagecache and LRU, and set the
445 * monitor to transfer the data across */
446 monitor_backing_page:
447 _debug("- monitor add");
448
449 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
450 GFP_KERNEL);
451 if (ret < 0) {
452 if (ret == -EEXIST) {
453 page_cache_release(netpage);
454 continue;
455 }
456 goto nomem;
457 }
458
459 page_cache_get(netpage);
460 if (!pagevec_add(&lru_pvec, netpage))
461 __pagevec_lru_add_file(&lru_pvec);
462
463 /* install a monitor */
464 page_cache_get(netpage);
465 monitor->netfs_page = netpage;
466
467 page_cache_get(backpage);
468 monitor->back_page = backpage;
469 monitor->monitor.private = backpage;
470 add_page_wait_queue(backpage, &monitor->monitor);
471 monitor = NULL;
472
473 /* but the page may have been read before the monitor was
474 * installed, so the monitor may miss the event - so we have to
475 * ensure that we do get one in such a case */
476 if (trylock_page(backpage)) {
477 _debug("2unlock %p {%lx}", backpage, backpage->flags);
478 unlock_page(backpage);
479 }
480
481 page_cache_release(backpage);
482 backpage = NULL;
483
484 page_cache_release(netpage);
485 netpage = NULL;
486 continue;
487
488 /* if the backing page is already present, it can be in one of
489 * three states: read in progress, read failed or read okay */
490 backing_page_already_present:
491 _debug("- present %p", backpage);
492
493 if (PageError(backpage))
494 goto io_error;
495
496 if (PageUptodate(backpage))
497 goto backing_page_already_uptodate;
498
499 _debug("- not ready %p{%lx}", backpage, backpage->flags);
500
501 if (!trylock_page(backpage))
502 goto monitor_backing_page;
503
504 if (PageError(backpage)) {
505 _debug("error %lx", backpage->flags);
506 unlock_page(backpage);
507 goto io_error;
508 }
509
510 if (PageUptodate(backpage))
511 goto backing_page_already_uptodate_unlock;
512
513 /* we've locked a page that's neither up to date nor erroneous,
514 * so we need to attempt to read it again */
515 goto reread_backing_page;
516
517 /* the backing page is already up to date, attach the netfs
518 * page to the pagecache and LRU and copy the data across */
519 backing_page_already_uptodate_unlock:
520 _debug("uptodate %lx", backpage->flags);
521 unlock_page(backpage);
522 backing_page_already_uptodate:
523 _debug("- uptodate");
524
525 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
526 GFP_KERNEL);
527 if (ret < 0) {
528 if (ret == -EEXIST) {
529 page_cache_release(netpage);
530 continue;
531 }
532 goto nomem;
533 }
534
535 copy_highpage(netpage, backpage);
536
537 page_cache_release(backpage);
538 backpage = NULL;
539
540 if (!pagevec_add(mark_pvec, netpage))
541 fscache_mark_pages_cached(op, mark_pvec);
542
543 page_cache_get(netpage);
544 if (!pagevec_add(&lru_pvec, netpage))
545 __pagevec_lru_add_file(&lru_pvec);
546
547 fscache_end_io(op, netpage, 0);
548 page_cache_release(netpage);
549 netpage = NULL;
550 continue;
551 }
552
553 netpage = NULL;
554
555 _debug("out");
556
557out:
558 /* tidy up */
559 pagevec_lru_add_file(&lru_pvec);
560
561 if (newpage)
562 page_cache_release(newpage);
563 if (netpage)
564 page_cache_release(netpage);
565 if (backpage)
566 page_cache_release(backpage);
567 if (monitor) {
568 fscache_put_retrieval(op);
569 kfree(monitor);
570 }
571
572 list_for_each_entry_safe(netpage, _n, list, lru) {
573 list_del(&netpage->lru);
574 page_cache_release(netpage);
575 }
576
577 _leave(" = %d", ret);
578 return ret;
579
580nomem:
581 _debug("nomem");
582 ret = -ENOMEM;
583 goto out;
584
585read_error:
586 _debug("read error %d", ret);
587 if (ret == -ENOMEM)
588 goto out;
589io_error:
590 cachefiles_io_error_obj(object, "Page read error on backing file");
591 ret = -ENOBUFS;
592 goto out;
593}
594
595/*
596 * read a list of pages from the cache or allocate blocks in which to store
597 * them
598 */
599int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
600 struct list_head *pages,
601 unsigned *nr_pages,
602 gfp_t gfp)
603{
604 struct cachefiles_object *object;
605 struct cachefiles_cache *cache;
606 struct list_head backpages;
607 struct pagevec pagevec;
608 struct inode *inode;
609 struct page *page, *_n;
610 unsigned shift, nrbackpages;
611 int ret, ret2, space;
612
613 object = container_of(op->op.object,
614 struct cachefiles_object, fscache);
615 cache = container_of(object->fscache.cache,
616 struct cachefiles_cache, cache);
617
618 _enter("{OBJ%x,%d},,%d,,",
619 object->fscache.debug_id, atomic_read(&op->op.usage),
620 *nr_pages);
621
622 if (!object->backer)
623 return -ENOBUFS;
624
625 space = 1;
626 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
627 space = 0;
628
629 inode = object->backer->d_inode;
630 ASSERT(S_ISREG(inode->i_mode));
631 ASSERT(inode->i_mapping->a_ops->bmap);
632 ASSERT(inode->i_mapping->a_ops->readpages);
633
634 /* calculate the shift required to use bmap */
635 if (inode->i_sb->s_blocksize > PAGE_SIZE)
636 return -ENOBUFS;
637
638 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
639
640 pagevec_init(&pagevec, 0);
641
642 op->op.flags = FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier;
644
645 INIT_LIST_HEAD(&backpages);
646 nrbackpages = 0;
647
648 ret = space ? -ENODATA : -ENOBUFS;
649 list_for_each_entry_safe(page, _n, pages, lru) {
650 sector_t block0, block;
651
652 /* we assume the absence or presence of the first block is a
653 * good enough indication for the page as a whole
654 * - TODO: don't use bmap() for this as it is _not_ actually
655 * good enough for this as it doesn't indicate errors, but
656 * it's all we've got for the moment
657 */
658 block0 = page->index;
659 block0 <<= shift;
660
661 block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
662 block0);
663 _debug("%llx -> %llx",
664 (unsigned long long) block0,
665 (unsigned long long) block);
666
667 if (block) {
668 /* we have data - add it to the list to give to the
669 * backing fs */
670 list_move(&page->lru, &backpages);
671 (*nr_pages)--;
672 nrbackpages++;
673 } else if (space && pagevec_add(&pagevec, page) == 0) {
674 fscache_mark_pages_cached(op, &pagevec);
675 ret = -ENODATA;
676 }
677 }
678
679 if (pagevec_count(&pagevec) > 0)
680 fscache_mark_pages_cached(op, &pagevec);
681
682 if (list_empty(pages))
683 ret = 0;
684
685 /* submit the apparently valid pages to the backing fs to be read from
686 * disk */
687 if (nrbackpages > 0) {
688 ret2 = cachefiles_read_backing_file(object, op, &backpages,
689 &pagevec);
690 if (ret2 == -ENOMEM || ret2 == -EINTR)
691 ret = ret2;
692 }
693
694 if (pagevec_count(&pagevec) > 0)
695 fscache_mark_pages_cached(op, &pagevec);
696
697 _leave(" = %d [nr=%u%s]",
698 ret, *nr_pages, list_empty(pages) ? " empty" : "");
699 return ret;
700}
701
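The shift computed in cachefiles_read_or_alloc_pages() above turns a page index into a backing-filesystem block number for bmap(): shift = PAGE_SHIFT - s_blocksize_bits, so with 4 KiB pages over a 1 KiB-block filesystem, page N starts at block N << 2. A minimal user-space sketch of that arithmetic (the 4 KiB page size and 1 KiB block size are assumptions for illustration):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;		/* assumed: 4 KiB pages */
	unsigned blocksize_bits = 10;		/* assumed: 1 KiB backing blocks */
	unsigned shift = page_shift - blocksize_bits;
	unsigned long long index = 3;		/* page 3 of the file */
	unsigned long long block0 = index << shift;

	/* page 3 covers backing blocks 12..15 */
	printf("page %llu -> first backing block %llu\n", index, block0);
	return 0;
}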
702/*
703 * allocate a block in the cache in which to store a page
704 * - cache withdrawal is prevented by the caller
705 * - returns -EINTR if interrupted
706	 * - returns -ENOMEM if we ran out of memory
707 * - returns -ENOBUFS if no buffers can be made available
708 * - returns -ENOBUFS if page is beyond EOF
709 * - otherwise:
710 * - the metadata will be retained
711 * - 0 will be returned
712 */
713int cachefiles_allocate_page(struct fscache_retrieval *op,
714 struct page *page,
715 gfp_t gfp)
716{
717 struct cachefiles_object *object;
718 struct cachefiles_cache *cache;
719 struct pagevec pagevec;
720 int ret;
721
722 object = container_of(op->op.object,
723 struct cachefiles_object, fscache);
724 cache = container_of(object->fscache.cache,
725 struct cachefiles_cache, cache);
726
727 _enter("%p,{%lx},", object, page->index);
728
729 ret = cachefiles_has_space(cache, 0, 1);
730 if (ret == 0) {
731 pagevec_init(&pagevec, 0);
732 pagevec_add(&pagevec, page);
733 fscache_mark_pages_cached(op, &pagevec);
734 } else {
735 ret = -ENOBUFS;
736 }
737
738 _leave(" = %d", ret);
739 return ret;
740}
741
742/*
743 * allocate blocks in the cache in which to store a set of pages
744 * - cache withdrawal is prevented by the caller
745 * - returns -EINTR if interrupted
746	 * - returns -ENOMEM if we ran out of memory
747 * - returns -ENOBUFS if some buffers couldn't be made available
748 * - returns -ENOBUFS if some pages are beyond EOF
749 * - otherwise:
750 * - -ENODATA will be returned
751 * - metadata will be retained for any page marked
752 */
753int cachefiles_allocate_pages(struct fscache_retrieval *op,
754 struct list_head *pages,
755 unsigned *nr_pages,
756 gfp_t gfp)
757{
758 struct cachefiles_object *object;
759 struct cachefiles_cache *cache;
760 struct pagevec pagevec;
761 struct page *page;
762 int ret;
763
764 object = container_of(op->op.object,
765 struct cachefiles_object, fscache);
766 cache = container_of(object->fscache.cache,
767 struct cachefiles_cache, cache);
768
769 _enter("%p,,,%d,", object, *nr_pages);
770
771 ret = cachefiles_has_space(cache, 0, *nr_pages);
772 if (ret == 0) {
773 pagevec_init(&pagevec, 0);
774
775 list_for_each_entry(page, pages, lru) {
776 if (pagevec_add(&pagevec, page) == 0)
777 fscache_mark_pages_cached(op, &pagevec);
778 }
779
780 if (pagevec_count(&pagevec) > 0)
781 fscache_mark_pages_cached(op, &pagevec);
782 ret = -ENODATA;
783 } else {
784 ret = -ENOBUFS;
785 }
786
787 _leave(" = %d", ret);
788 return ret;
789}
790
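Both allocation functions above use the same pagevec idiom: pagevec_add() reports the remaining capacity, so a zero return means the vector just became full and must be handed off in one call, with a final hand-off after the loop for any partial tail. A stand-alone model of that batch-and-flush pattern (BATCH and flush() are stand-ins for PAGEVEC_SIZE and fscache_mark_pages_cached(), not kernel API):

#define BATCH 14			/* stands in for PAGEVEC_SIZE */

static void flush(void **vec, unsigned *count)
{
	/* hand the whole batch off in one call, then reset */
	(void)vec;
	*count = 0;
}

static void mark_all(void **items, unsigned n)
{
	void *vec[BATCH];
	unsigned count = 0;
	unsigned i;

	for (i = 0; i < n; i++) {
		vec[count++] = items[i];
		if (count == BATCH)	/* vector full: flush now */
			flush(vec, &count);
	}
	if (count > 0)			/* don't forget the partial tail */
		flush(vec, &count);
}

int main(void)
{
	void *items[30] = { 0 };

	mark_all(items, 30);		/* flushes at 14, 28, then a tail of 2 */
	return 0;
}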
791/*
792 * request a page be stored in the cache
793 * - cache withdrawal is prevented by the caller
794 * - this request may be ignored if there's no cache block available, in which
795 * case -ENOBUFS will be returned
796 * - if the op is in progress, 0 will be returned
797 */
798int cachefiles_write_page(struct fscache_storage *op, struct page *page)
799{
800 struct cachefiles_object *object;
801 struct cachefiles_cache *cache;
802 mm_segment_t old_fs;
803 struct file *file;
804 loff_t pos;
805 void *data;
806 int ret;
807
808 ASSERT(op != NULL);
809 ASSERT(page != NULL);
810
811 object = container_of(op->op.object,
812 struct cachefiles_object, fscache);
813
814 _enter("%p,%p{%lx},,,", object, page, page->index);
815
816 if (!object->backer) {
817 _leave(" = -ENOBUFS");
818 return -ENOBUFS;
819 }
820
821 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
822
823 cache = container_of(object->fscache.cache,
824 struct cachefiles_cache, cache);
825
826 /* write the page to the backing filesystem and let it store it in its
827 * own time */
828 dget(object->backer);
829 mntget(cache->mnt);
830 file = dentry_open(object->backer, cache->mnt, O_RDWR,
831 cache->cache_cred);
832 if (IS_ERR(file)) {
833 ret = PTR_ERR(file);
834 } else {
835 ret = -EIO;
836 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT;
838 data = kmap(page);
839 old_fs = get_fs();
840 set_fs(KERNEL_DS);
841 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE,
843 &pos);
844 set_fs(old_fs);
845 kunmap(page);
846 if (ret != PAGE_SIZE)
847 ret = -EIO;
848 }
849 fput(file);
850 }
851
852 if (ret < 0) {
853 if (ret == -EIO)
854 cachefiles_io_error_obj(
855 object, "Write page to backing file failed");
856 ret = -ENOBUFS;
857 }
858
859 _leave(" = %d", ret);
860 return ret;
861}
862
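One subtlety in cachefiles_write_page() above is the cast in pos = (loff_t)page->index << PAGE_SHIFT: the index must be widened to 64 bits before shifting, or the shift happens at the index's own width and offsets of 4 GiB and up are silently truncated where that width is 32 bits. Demonstrated in isolation (assumes the usual 32-bit unsigned int and 4 KiB pages):

#include <stdio.h>

int main(void)
{
	unsigned int index = 0x100000;			/* page at the 4 GiB mark */

	long long bad  = index << 12;			/* shifted at 32 bits: wraps to 0 */
	long long good = (long long)index << 12;	/* widened first: 4294967296 */

	printf("bad=%lld good=%lld\n", bad, good);
	return 0;
}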
863/*
864 * detach a backing block from a page
865 * - cache withdrawal is prevented by the caller
866 */
867void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
868{
869 struct cachefiles_object *object;
870 struct cachefiles_cache *cache;
871
872 object = container_of(_object, struct cachefiles_object, fscache);
873 cache = container_of(object->fscache.cache,
874 struct cachefiles_cache, cache);
875
876 _enter("%p,{%lu}", object, page->index);
877
878 spin_unlock(&object->fscache.cookie->lock);
879}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
1/* CacheFiles security management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/cred.h>
14#include "internal.h"
15
16/*
17 * determine the security context within which we access the cache from within
18 * the kernel
19 */
20int cachefiles_get_security_ID(struct cachefiles_cache *cache)
21{
22 struct cred *new;
23 int ret;
24
25 _enter("{%s}", cache->secctx);
26
27 new = prepare_kernel_cred(current);
28 if (!new) {
29 ret = -ENOMEM;
30 goto error;
31 }
32
33 if (cache->secctx) {
34 ret = set_security_override_from_ctx(new, cache->secctx);
35 if (ret < 0) {
36 put_cred(new);
37 printk(KERN_ERR "CacheFiles:"
38 " Security denies permission to nominate"
39 " security context: error %d\n",
40 ret);
41 goto error;
42 }
43 }
44
45 cache->cache_cred = new;
46 ret = 0;
47error:
48 _leave(" = %d", ret);
49 return ret;
50}
51
52/*
53 * see if mkdir and create can be performed in the root directory
54 */
55static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
56 struct dentry *root)
57{
58 int ret;
59
60 ret = security_inode_mkdir(root->d_inode, root, 0);
61 if (ret < 0) {
62 printk(KERN_ERR "CacheFiles:"
63	       " Security denies permission to make dirs: error %d\n",
64 ret);
65 return ret;
66 }
67
68 ret = security_inode_create(root->d_inode, root, 0);
69 if (ret < 0)
70 printk(KERN_ERR "CacheFiles:"
71	       " Security denies permission to create files: error %d\n",
72 ret);
73
74 return ret;
75}
76
77/*
78 * check the security details of the on-disk cache
79 * - must be called with security override in force
80 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root,
83 const struct cred **_saved_cred)
84{
85 struct cred *new;
86 int ret;
87
88 _enter("");
89
90 /* duplicate the cache creds for COW (the override is currently in
91 * force, so we can use prepare_creds() to do this) */
92 new = prepare_creds();
93 if (!new)
94 return -ENOMEM;
95
96 cachefiles_end_secure(cache, *_saved_cred);
97
98 /* use the cache root dir's security context as the basis with
99	 * which to create files */
100 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) {
102 _leave(" = %d [cfa]", ret);
103 return ret;
104 }
105
106 put_cred(cache->cache_cred);
107 cache->cache_cred = new;
108
109 cachefiles_begin_secure(cache, _saved_cred);
110 ret = cachefiles_check_cache_dir(cache, root);
111
112 if (ret == -EOPNOTSUPP)
113 ret = 0;
114 _leave(" = %d", ret);
115 return ret;
116}
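cachefiles_begin_secure() and cachefiles_end_secure() are defined elsewhere (internal.h, not in this hunk); from their use here they presumably wrap the kernel's override_creds()/revert_creds() pair around cache->cache_cred. A user-space model of that save-override-restore shape (every name below is an illustrative stand-in, not the kernel API):

#include <stdio.h>

struct cred { const char *label; };

static struct cred task_creds = { "task creds" };
static const struct cred *current_cred = &task_creds;

static const struct cred *override_creds_sim(const struct cred *new_cred)
{
	const struct cred *old = current_cred;
	current_cred = new_cred;	/* act with the cache's identity */
	return old;			/* caller must keep this to restore */
}

static void revert_creds_sim(const struct cred *old)
{
	current_cred = old;		/* drop the override */
}

int main(void)
{
	struct cred cache_cred = { "cache creds" };
	const struct cred *saved;

	saved = override_creds_sim(&cache_cred);	/* begin_secure */
	printf("acting as: %s\n", current_cred->label);
	revert_creds_sim(saved);			/* end_secure */
	printf("back to:   %s\n", current_cred->label);
	return 0;
}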
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
1/* CacheFiles extended attribute management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include "internal.h"
20
21static const char cachefiles_xattr_cache[] =
22 XATTR_USER_PREFIX "CacheFiles.cache";
23
24/*
25 * check the type label on an object
26 * - done using xattrs
27 */
28int cachefiles_check_object_type(struct cachefiles_object *object)
29{
30 struct dentry *dentry = object->dentry;
31 char type[3], xtype[3];
32 int ret;
33
34 ASSERT(dentry);
35 ASSERT(dentry->d_inode);
36
37 if (!object->fscache.cookie)
38 strcpy(type, "C3");
39 else
40 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
41
42 _enter("%p{%s}", object, type);
43
44 /* attempt to install a type label directly */
45 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
46 XATTR_CREATE);
47 if (ret == 0) {
48 _debug("SET"); /* we succeeded */
49 goto error;
50 }
51
52 if (ret != -EEXIST) {
53 kerror("Can't set xattr on %*.*s [%lu] (err %d)",
54 dentry->d_name.len, dentry->d_name.len,
55 dentry->d_name.name, dentry->d_inode->i_ino,
56 -ret);
57 goto error;
58 }
59
60 /* read the current type label */
61 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
62 if (ret < 0) {
63 if (ret == -ERANGE)
64 goto bad_type_length;
65
66 kerror("Can't read xattr on %*.*s [%lu] (err %d)",
67 dentry->d_name.len, dentry->d_name.len,
68 dentry->d_name.name, dentry->d_inode->i_ino,
69 -ret);
70 goto error;
71 }
72
73 /* check the type is what we're expecting */
74 if (ret != 2)
75 goto bad_type_length;
76
77 if (xtype[0] != type[0] || xtype[1] != type[1])
78 goto bad_type;
79
80 ret = 0;
81
82error:
83 _leave(" = %d", ret);
84 return ret;
85
86bad_type_length:
87 kerror("Cache object %lu type xattr length incorrect",
88 dentry->d_inode->i_ino);
89 ret = -EIO;
90 goto error;
91
92bad_type:
93 xtype[2] = 0;
94 kerror("Cache object %*.*s [%lu] type %s not %s",
95 dentry->d_name.len, dentry->d_name.len,
96 dentry->d_name.name, dentry->d_inode->i_ino,
97 xtype, type);
98 ret = -EIO;
99 goto error;
100}
101
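The type label checked above is exactly two bytes: the literal "C3" for a cookie-less object, otherwise the cookie's type rendered as two hex digits (the xattr stores the 2 characters without a NUL). The encoding step in isolation (the type value 0 is just an example input):

#include <stdio.h>

int main(void)
{
	char type[3];
	unsigned char cookie_type = 0;	/* example: an index cookie */

	/* mirrors: snprintf(type, 3, "%02x", cookie->def->type) */
	snprintf(type, sizeof(type), "%02x", cookie_type);
	printf("xattr payload: \"%s\" (2 bytes stored, no NUL)\n", type);
	return 0;
}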
102/*
103 * set the state xattr on a cache file
104 */
105int cachefiles_set_object_xattr(struct cachefiles_object *object,
106 struct cachefiles_xattr *auxdata)
107{
108 struct dentry *dentry = object->dentry;
109 int ret;
110
111 ASSERT(object->fscache.cookie);
112 ASSERT(dentry);
113
114 _enter("%p,#%d", object, auxdata->len);
115
116 /* attempt to install the cache metadata directly */
117 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len,
121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj(
124 object,
125 "Failed to set xattr with error %d", ret);
126
127 _leave(" = %d", ret);
128 return ret;
129}
130
131/*
132 * update the state xattr on a cache file
133 */
134int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata)
136{
137 struct dentry *dentry = object->dentry;
138 int ret;
139
140 ASSERT(object->fscache.cookie);
141 ASSERT(dentry);
142
143 _enter("%p,#%d", object, auxdata->len);
144
145 /* attempt to install the cache metadata directly */
146 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
147
148 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
149 &auxdata->type, auxdata->len,
150 XATTR_REPLACE);
151 if (ret < 0 && ret != -ENOMEM)
152 cachefiles_io_error_obj(
153 object,
154 "Failed to update xattr with error %d", ret);
155
156 _leave(" = %d", ret);
157 return ret;
158}
159
160/*
161 * check the state xattr on a cache file
162 * - return -ESTALE if the object should be deleted
163 */
164int cachefiles_check_object_xattr(struct cachefiles_object *object,
165 struct cachefiles_xattr *auxdata)
166{
167 struct cachefiles_xattr *auxbuf;
168 struct dentry *dentry = object->dentry;
169 int ret;
170
171 _enter("%p,#%d", object, auxdata->len);
172
173 ASSERT(dentry);
174 ASSERT(dentry->d_inode);
175
176 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
177 if (!auxbuf) {
178 _leave(" = -ENOMEM");
179 return -ENOMEM;
180 }
181
182 /* read the current type label */
183 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
184 &auxbuf->type, 512 + 1);
185 if (ret < 0) {
186 if (ret == -ENODATA)
187 goto stale; /* no attribute - power went off
188 * mid-cull? */
189
190 if (ret == -ERANGE)
191 goto bad_type_length;
192
193 cachefiles_io_error_obj(object,
194 "Can't read xattr on %lu (err %d)",
195 dentry->d_inode->i_ino, -ret);
196 goto error;
197 }
198
199 /* check the on-disk object */
200 if (ret < 1)
201 goto bad_type_length;
202
203 if (auxbuf->type != auxdata->type)
204 goto stale;
205
206 auxbuf->len = ret;
207
208 /* consult the netfs */
209 if (object->fscache.cookie->def->check_aux) {
210 enum fscache_checkaux result;
211 unsigned int dlen;
212
213 dlen = auxbuf->len - 1;
214
215 _debug("checkaux %s #%u",
216 object->fscache.cookie->def->name, dlen);
217
218 result = fscache_check_aux(&object->fscache,
219 &auxbuf->data, dlen);
220
221 switch (result) {
222 /* entry okay as is */
223 case FSCACHE_CHECKAUX_OKAY:
224 goto okay;
225
226 /* entry requires update */
227 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
228 break;
229
230 /* entry requires deletion */
231 case FSCACHE_CHECKAUX_OBSOLETE:
232 goto stale;
233
234 default:
235 BUG();
236 }
237
238 /* update the current label */
239 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
240 &auxdata->type, auxdata->len,
241 XATTR_REPLACE);
242 if (ret < 0) {
243 cachefiles_io_error_obj(object,
244 "Can't update xattr on %lu"
245 " (error %d)",
246 dentry->d_inode->i_ino, -ret);
247 goto error;
248 }
249 }
250
251okay:
252 ret = 0;
253
254error:
255 kfree(auxbuf);
256 _leave(" = %d", ret);
257 return ret;
258
259bad_type_length:
260 kerror("Cache object %lu xattr length incorrect",
261 dentry->d_inode->i_ino);
262 ret = -EIO;
263 goto error;
264
265stale:
266 ret = -ESTALE;
267 goto error;
268}
269
270/*
271 * remove the object's xattr to mark it stale
272 */
273int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
274 struct dentry *dentry)
275{
276 int ret;
277
278 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
279 if (ret < 0) {
280 if (ret == -ENOENT || ret == -ENODATA)
281 ret = 0;
282 else if (ret != -ENOMEM)
283 cachefiles_io_error(cache,
284 "Can't remove xattr from %lu"
285 " (error %d)",
286 dentry->d_inode->i_ino, -ret);
287 }
288
289 _leave(" = %d", ret);
290 return ret;
291}
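The auxdata handling in this file (reading into &auxbuf->type, taking byte 0 of the payload as the type, dlen = len - 1 for the netfs data) implies a header-plus-flexible-array layout for struct cachefiles_xattr, which is declared in internal.h rather than here. A sketch of the presumed shape and the offsets the code relies on:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* presumed shape of struct cachefiles_xattr (from internal.h usage) */
struct cachefiles_xattr_sketch {
	uint16_t len;		/* bytes of type+data actually present */
	uint8_t  type;		/* first byte of the on-disk xattr payload */
	uint8_t  data[];	/* netfs auxiliary data follows contiguously */
};

int main(void)
{
	/* vfs_getxattr() above reads into &auxbuf->type, so type and
	 * data must be adjacent: data starts 1 byte after type */
	printf("type at offset %zu, data at offset %zu\n",
	       offsetof(struct cachefiles_xattr_sketch, type),
	       offsetof(struct cachefiles_xattr_sketch, data));
	return 0;
}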
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8d..54dce78fbb73 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
254 return -ENOMEM; 254 return -ENOMEM;
255 } 255 }
256 256
257 mode &= ~current->fs->umask; 257 mode &= ~current_umask();
258 if (oplockEnabled) 258 if (oplockEnabled)
259 oplock = REQ_OPLOCK; 259 oplock = REQ_OPLOCK;
260 260
@@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
479 rc = -ENOMEM; 479 rc = -ENOMEM;
480 else if (pTcon->unix_ext) { 480 else if (pTcon->unix_ext) {
481 struct cifs_unix_set_info_args args = { 481 struct cifs_unix_set_info_args args = {
482 .mode = mode & ~current->fs->umask, 482 .mode = mode & ~current_umask(),
483 .ctime = NO_CHANGE_64, 483 .ctime = NO_CHANGE_64,
484 .atime = NO_CHANGE_64, 484 .atime = NO_CHANGE_64,
485 .mtime = NO_CHANGE_64, 485 .mtime = NO_CHANGE_64,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc60805..f121a80fdd6f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1125 goto mkdir_out; 1125 goto mkdir_out;
1126 } 1126 }
1127 1127
1128 mode &= ~current->fs->umask; 1128 mode &= ~current_umask();
1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, 1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
1130 mode, NULL /* netfid */, pInfo, &oplock, 1130 mode, NULL /* netfid */, pInfo, &oplock,
1131 full_path, cifs_sb->local_nls, 1131 full_path, cifs_sb->local_nls,
@@ -1204,7 +1204,7 @@ mkdir_get_info:
1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1205 direntry->d_inode->i_nlink = 2; 1205 direntry->d_inode->i_nlink = 2;
1206 1206
1207 mode &= ~current->fs->umask; 1207 mode &= ~current_umask();
1208 /* must turn on setgid bit if parent dir has it */ 1208 /* must turn on setgid bit if parent dir has it */
1209 if (inode->i_mode & S_ISGID) 1209 if (inode->i_mode & S_ISGID)
1210 mode |= S_ISGID; 1210 mode |= S_ISGID;
diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5a..3f84d5f15889 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
51#include <linux/poll.h> 51#include <linux/poll.h>
52#include <linux/mm.h> 52#include <linux/mm.h>
53#include <linux/eventpoll.h> 53#include <linux/eventpoll.h>
54#include <linux/fs_struct.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
@@ -1195,16 +1196,12 @@ out:
1195 return ret; 1196 return ret;
1196} 1197}
1197 1198
1198asmlinkage ssize_t 1199static ssize_t compat_readv(struct file *file,
1199compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1200 const struct compat_iovec __user *vec,
1201 unsigned long vlen, loff_t *pos)
1200{ 1202{
1201 struct file *file;
1202 ssize_t ret = -EBADF; 1203 ssize_t ret = -EBADF;
1203 1204
1204 file = fget(fd);
1205 if (!file)
1206 return -EBADF;
1207
1208 if (!(file->f_mode & FMODE_READ)) 1205 if (!(file->f_mode & FMODE_READ))
1209 goto out; 1206 goto out;
1210 1207
@@ -1212,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1212 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 1209 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1213 goto out; 1210 goto out;
1214 1211
1215 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1212 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1216 1213
1217out: 1214out:
1218 if (ret > 0) 1215 if (ret > 0)
1219 add_rchar(current, ret); 1216 add_rchar(current, ret);
1220 inc_syscr(current); 1217 inc_syscr(current);
1221 fput(file);
1222 return ret; 1218 return ret;
1223} 1219}
1224 1220
1225asmlinkage ssize_t 1221asmlinkage ssize_t
1226compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1222compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1223 unsigned long vlen)
1227{ 1224{
1228 struct file *file; 1225 struct file *file;
1229 ssize_t ret = -EBADF; 1226 int fput_needed;
1227 ssize_t ret;
1230 1228
1231 file = fget(fd); 1229 file = fget_light(fd, &fput_needed);
1232 if (!file) 1230 if (!file)
1233 return -EBADF; 1231 return -EBADF;
1232 ret = compat_readv(file, vec, vlen, &file->f_pos);
1233 fput_light(file, fput_needed);
1234 return ret;
1235}
1236
1237asmlinkage ssize_t
1238compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1239 unsigned long vlen, u32 pos_low, u32 pos_high)
1240{
1241 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1242 struct file *file;
1243 int fput_needed;
1244 ssize_t ret;
1245
1246 if (pos < 0)
1247 return -EINVAL;
1248 file = fget_light(fd, &fput_needed);
1249 if (!file)
1250 return -EBADF;
1251 ret = compat_readv(file, vec, vlen, &pos);
1252 fput_light(file, fput_needed);
1253 return ret;
1254}
1255
1256static ssize_t compat_writev(struct file *file,
1257 const struct compat_iovec __user *vec,
1258 unsigned long vlen, loff_t *pos)
1259{
1260 ssize_t ret = -EBADF;
1261
1234 if (!(file->f_mode & FMODE_WRITE)) 1262 if (!(file->f_mode & FMODE_WRITE))
1235 goto out; 1263 goto out;
1236 1264
@@ -1238,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1238 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 1266 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1239 goto out; 1267 goto out;
1240 1268
1241 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1269 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1242 1270
1243out: 1271out:
1244 if (ret > 0) 1272 if (ret > 0)
1245 add_wchar(current, ret); 1273 add_wchar(current, ret);
1246 inc_syscw(current); 1274 inc_syscw(current);
1247 fput(file); 1275 return ret;
1276}
1277
1278asmlinkage ssize_t
1279compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1280 unsigned long vlen)
1281{
1282 struct file *file;
1283 int fput_needed;
1284 ssize_t ret;
1285
1286 file = fget_light(fd, &fput_needed);
1287 if (!file)
1288 return -EBADF;
1289 ret = compat_writev(file, vec, vlen, &file->f_pos);
1290 fput_light(file, fput_needed);
1291 return ret;
1292}
1293
1294asmlinkage ssize_t
1295compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1296 unsigned long vlen, u32 pos_low, u32 pos_high)
1297{
1298 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1299 struct file *file;
1300 int fput_needed;
1301 ssize_t ret;
1302
1303 if (pos < 0)
1304 return -EINVAL;
1305 file = fget_light(fd, &fput_needed);
1306 if (!file)
1307 return -EBADF;
1308 ret = compat_writev(file, vec, vlen, &pos);
1309 fput_light(file, fput_needed);
1248 return ret; 1310 return ret;
1249} 1311}
1250 1312
@@ -1441,12 +1503,15 @@ int compat_do_execve(char * filename,
1441 bprm->cred = prepare_exec_creds(); 1503 bprm->cred = prepare_exec_creds();
1442 if (!bprm->cred) 1504 if (!bprm->cred)
1443 goto out_unlock; 1505 goto out_unlock;
1444 check_unsafe_exec(bprm); 1506
1507 retval = check_unsafe_exec(bprm);
1508 if (retval)
1509 goto out_unlock;
1445 1510
1446 file = open_exec(filename); 1511 file = open_exec(filename);
1447 retval = PTR_ERR(file); 1512 retval = PTR_ERR(file);
1448 if (IS_ERR(file)) 1513 if (IS_ERR(file))
1449 goto out_unlock; 1514 goto out_unmark;
1450 1515
1451 sched_exec(); 1516 sched_exec();
1452 1517
@@ -1488,6 +1553,9 @@ int compat_do_execve(char * filename,
1488 goto out; 1553 goto out;
1489 1554
1490 /* execve succeeded */ 1555 /* execve succeeded */
1556 write_lock(&current->fs->lock);
1557 current->fs->in_exec = 0;
1558 write_unlock(&current->fs->lock);
1491 current->in_execve = 0; 1559 current->in_execve = 0;
1492 mutex_unlock(&current->cred_exec_mutex); 1560 mutex_unlock(&current->cred_exec_mutex);
1493 acct_update_integrals(current); 1561 acct_update_integrals(current);
@@ -1506,6 +1574,11 @@ out_file:
1506 fput(bprm->file); 1574 fput(bprm->file);
1507 } 1575 }
1508 1576
1577out_unmark:
1578 write_lock(&current->fs->lock);
1579 current->fs->in_exec = 0;
1580 write_unlock(&current->fs->lock);
1581
1509out_unlock: 1582out_unlock:
1510 current->in_execve = 0; 1583 current->in_execve = 0;
1511 mutex_unlock(&current->cred_exec_mutex); 1584 mutex_unlock(&current->cred_exec_mutex);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ff786687e93b..3e87ce443ea2 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
23#include <linux/if.h> 23#include <linux/if.h>
24#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/raid/md.h> 26#include <linux/raid/md_u.h>
27#include <linux/kd.h> 27#include <linux/kd.h>
28#include <linux/route.h> 28#include <linux/route.h>
29#include <linux/in6.h> 29#include <linux/in6.h>
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
319{ 319{
320 struct super_block *sb = dentry->d_sb; 320 struct super_block *sb = dentry->d_sb;
321 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
321 322
322 buf->f_type = CRAMFS_MAGIC; 323 buf->f_type = CRAMFS_MAGIC;
323 buf->f_bsize = PAGE_CACHE_SIZE; 324 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
326 buf->f_bavail = 0; 327 buf->f_bavail = 0;
327 buf->f_files = CRAMFS_SB(sb)->files; 328 buf->f_files = CRAMFS_SB(sb)->files;
328 buf->f_ffree = 0; 329 buf->f_ffree = 0;
330 buf->f_fsid.val[0] = (u32)id;
331 buf->f_fsid.val[1] = (u32)(id >> 32);
329 buf->f_namelen = CRAMFS_MAXPATHLEN; 332 buf->f_namelen = CRAMFS_MAXPATHLEN;
330 return 0; 333 return 0;
331} 334}
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
459static int cramfs_readpage(struct file *file, struct page * page) 462static int cramfs_readpage(struct file *file, struct page * page)
460{ 463{
461 struct inode *inode = page->mapping->host; 464 struct inode *inode = page->mapping->host;
462 u32 maxblock, bytes_filled; 465 u32 maxblock;
466 int bytes_filled;
463 void *pgdata; 467 void *pgdata;
464 468
465 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 469 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
466 bytes_filled = 0; 470 bytes_filled = 0;
471 pgdata = kmap(page);
472
467 if (page->index < maxblock) { 473 if (page->index < maxblock) {
468 struct super_block *sb = inode->i_sb; 474 struct super_block *sb = inode->i_sb;
469 u32 blkptr_offset = OFFSET(inode) + page->index*4; 475 u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
472 start_offset = OFFSET(inode) + maxblock*4; 478 start_offset = OFFSET(inode) + maxblock*4;
473 mutex_lock(&read_mutex); 479 mutex_lock(&read_mutex);
474 if (page->index) 480 if (page->index)
475 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); 481 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
476 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); 482 4);
483 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
484 start_offset);
477 mutex_unlock(&read_mutex); 485 mutex_unlock(&read_mutex);
478 pgdata = kmap(page); 486
479 if (compr_len == 0) 487 if (compr_len == 0)
480 ; /* hole */ 488 ; /* hole */
481 else if (compr_len > (PAGE_CACHE_SIZE << 1)) 489 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
482 printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); 490 pr_err("cramfs: bad compressed blocksize %u\n",
483 else { 491 compr_len);
492 goto err;
493 } else {
484 mutex_lock(&read_mutex); 494 mutex_lock(&read_mutex);
485 bytes_filled = cramfs_uncompress_block(pgdata, 495 bytes_filled = cramfs_uncompress_block(pgdata,
486 PAGE_CACHE_SIZE, 496 PAGE_CACHE_SIZE,
487 cramfs_read(sb, start_offset, compr_len), 497 cramfs_read(sb, start_offset, compr_len),
488 compr_len); 498 compr_len);
489 mutex_unlock(&read_mutex); 499 mutex_unlock(&read_mutex);
500 if (unlikely(bytes_filled < 0))
501 goto err;
490 } 502 }
491 } else 503 }
492 pgdata = kmap(page); 504
493 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); 505 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
494 kunmap(page);
495 flush_dcache_page(page); 506 flush_dcache_page(page);
507 kunmap(page);
496 SetPageUptodate(page); 508 SetPageUptodate(page);
497 unlock_page(page); 509 unlock_page(page);
498 return 0; 510 return 0;
511
512err:
513 kunmap(page);
514 ClearPageUptodate(page);
515 SetPageError(page);
516 unlock_page(page);
517 return 0;
499} 518}
500 519
501static const struct address_space_operations cramfs_aops = { 520static const struct address_space_operations cramfs_aops = {
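The maxblock computation in cramfs_readpage() is the usual round-up division: add PAGE_CACHE_SIZE - 1 before shifting so a partial trailing page still counts as a page. Worked through for a small file (4 KiB pages assumed here):

#include <stdio.h>

int main(void)
{
	unsigned long long i_size = 10000;	/* example file size */
	unsigned long maxblock = (unsigned long)((i_size + 4096 - 1) >> 12);

	printf("%llu bytes -> %lu pages\n", i_size, maxblock);	/* 3 */
	return 0;
}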
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
50err: 50err:
51 printk("Error %d while decompressing!\n", err); 51 printk("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return 0; 53 return -EIO;
54} 54}
55 55
56int cramfs_uncompress_init(void) 56int cramfs_uncompress_init(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b116..761d30be2683 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/fdtable.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
@@ -32,6 +31,7 @@
32#include <linux/seqlock.h> 31#include <linux/seqlock.h>
33#include <linux/swap.h> 32#include <linux/swap.h>
34#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h>
35#include "internal.h" 35#include "internal.h"
36 36
37int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
30 30
31static struct vfsmount *debugfs_mount; 31static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 32static int debugfs_mount_count;
33static bool debugfs_registered;
33 34
34static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
35{ 36{
@@ -496,6 +497,16 @@ exit:
496} 497}
497EXPORT_SYMBOL_GPL(debugfs_rename); 498EXPORT_SYMBOL_GPL(debugfs_rename);
498 499
500/**
501 * debugfs_initialized - Tells whether debugfs has been registered
502 */
503bool debugfs_initialized(void)
504{
505 return debugfs_registered;
506}
507EXPORT_SYMBOL_GPL(debugfs_initialized);
508
509
499static struct kobject *debug_kobj; 510static struct kobject *debug_kobj;
500 511
501static int __init debugfs_init(void) 512static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
509 retval = register_filesystem(&debug_fs_type); 520 retval = register_filesystem(&debug_fs_type);
510 if (retval) 521 if (retval)
511 kobject_put(debug_kobj); 522 kobject_put(debug_kobj);
523 else
524 debugfs_registered = true;
525
512 return retval; 526 return retval;
513} 527}
514 528
515static void __exit debugfs_exit(void) 529static void __exit debugfs_exit(void)
516{ 530{
531 debugfs_registered = false;
532
517 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 533 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
518 unregister_filesystem(&debug_fs_type); 534 unregister_filesystem(&debug_fs_type);
519 kobject_put(debug_kobj); 535 kobject_put(debug_kobj);
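debugfs_initialized(), added above, lets other kernel code ask whether debugfs ever registered before trying to create entries in it. A hedged sketch of a consumer module (the "mydrv" names are illustrative, not from this patch):

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/err.h>

static struct dentry *mydrv_dir;	/* illustrative */

static int __init mydrv_init(void)
{
	if (!debugfs_initialized())
		return 0;		/* run without debug instrumentation */

	mydrv_dir = debugfs_create_dir("mydrv", NULL);
	if (IS_ERR(mydrv_dir))		/* e.g. -ENODEV */
		mydrv_dir = NULL;
	return 0;
}

static void __exit mydrv_exit(void)
{
	debugfs_remove(mydrv_dir);	/* NULL-safe */
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");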
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 int acquire_i_mutex = 0; 1126 int acquire_i_mutex = 0;
1127 1127
1128 if (rw & WRITE) 1128 if (rw & WRITE)
1129 rw = WRITE_SYNC; 1129 rw = WRITE_ODIRECT;
1130 1130
1131 if (bdev) 1131 if (bdev)
1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 44d725f612cf..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
329} 329}
330 330
331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
332 struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); 332 struct super_block *sb = dentry->d_sb;
333 struct efs_sb_info *sbi = SUPER_INFO(sb);
334 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
333 335
334 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ 336 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
335 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ 337 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
336 buf->f_blocks = sb->total_groups * /* total data blocks */ 338 buf->f_blocks = sbi->total_groups * /* total data blocks */
337 (sb->group_size - sb->inode_blocks); 339 (sbi->group_size - sbi->inode_blocks);
338 buf->f_bfree = sb->data_free; /* free data blocks */ 340 buf->f_bfree = sbi->data_free; /* free data blocks */
339 buf->f_bavail = sb->data_free; /* free blocks for non-root */ 341 buf->f_bavail = sbi->data_free; /* free blocks for non-root */
340 buf->f_files = sb->total_groups * /* total inodes */ 342 buf->f_files = sbi->total_groups * /* total inodes */
341 sb->inode_blocks * 343 sbi->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 344 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 345 buf->f_ffree = sbi->inode_free; /* free inodes */
346 buf->f_fsid.val[0] = (u32)id;
347 buf->f_fsid.val[1] = (u32)(id >> 32);
344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 348 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
345 349
346 return 0; 350 return 0;
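This efs hunk and the cramfs hunk earlier fill f_fsid identically: huge_encode_dev() packs the backing device number into a u64, which is then split across the two 32-bit val slots. The split-and-recombine arithmetic in isolation (the id value is an arbitrary example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000002100000040ULL;	/* example encoded dev_t */
	uint32_t val[2];

	val[0] = (uint32_t)id;			/* low 32 bits */
	val[1] = (uint32_t)(id >> 32);		/* high 32 bits */

	/* recombining proves nothing was lost */
	uint64_t back = ((uint64_t)val[1] << 32) | val[0];
	printf("round trip ok: %s\n", back == id ? "yes" : "no");
	return 0;
}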
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc9165..052a961e41aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h> 55#include <linux/fsnotify.h>
56#include <linux/fs_struct.h>
56 57
57#include <asm/uaccess.h> 58#include <asm/uaccess.h>
58#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
@@ -1056,28 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds);
1056 * - the caller must hold current->cred_exec_mutex to protect against 1057 * - the caller must hold current->cred_exec_mutex to protect against
1057 * PTRACE_ATTACH 1058 * PTRACE_ATTACH
1058 */ 1059 */
1059void check_unsafe_exec(struct linux_binprm *bprm) 1060int check_unsafe_exec(struct linux_binprm *bprm)
1060{ 1061{
1061 struct task_struct *p = current, *t; 1062 struct task_struct *p = current, *t;
1062 unsigned long flags; 1063 unsigned long flags;
1063 unsigned n_fs, n_sighand; 1064 unsigned n_fs;
1065 int res = 0;
1064 1066
1065 bprm->unsafe = tracehook_unsafe_exec(p); 1067 bprm->unsafe = tracehook_unsafe_exec(p);
1066 1068
1067 n_fs = 1; 1069 n_fs = 1;
1068 n_sighand = 1; 1070 write_lock(&p->fs->lock);
1069 lock_task_sighand(p, &flags); 1071 lock_task_sighand(p, &flags);
1070 for (t = next_thread(p); t != p; t = next_thread(t)) { 1072 for (t = next_thread(p); t != p; t = next_thread(t)) {
1071 if (t->fs == p->fs) 1073 if (t->fs == p->fs)
1072 n_fs++; 1074 n_fs++;
1073 n_sighand++;
1074 } 1075 }
1075 1076
1076 if (atomic_read(&p->fs->count) > n_fs || 1077 if (p->fs->users > n_fs) {
1077 atomic_read(&p->sighand->count) > n_sighand)
1078 bprm->unsafe |= LSM_UNSAFE_SHARE; 1078 bprm->unsafe |= LSM_UNSAFE_SHARE;
1079 } else {
1080 if (p->fs->in_exec)
1081 res = -EAGAIN;
1082 p->fs->in_exec = 1;
1083 }
1079 1084
1080 unlock_task_sighand(p, &flags); 1085 unlock_task_sighand(p, &flags);
1086 write_unlock(&p->fs->lock);
1087
1088 return res;
1081} 1089}
1082 1090
1083/* 1091/*
@@ -1296,12 +1304,15 @@ int do_execve(char * filename,
1296 bprm->cred = prepare_exec_creds(); 1304 bprm->cred = prepare_exec_creds();
1297 if (!bprm->cred) 1305 if (!bprm->cred)
1298 goto out_unlock; 1306 goto out_unlock;
1299 check_unsafe_exec(bprm); 1307
1308 retval = check_unsafe_exec(bprm);
1309 if (retval)
1310 goto out_unlock;
1300 1311
1301 file = open_exec(filename); 1312 file = open_exec(filename);
1302 retval = PTR_ERR(file); 1313 retval = PTR_ERR(file);
1303 if (IS_ERR(file)) 1314 if (IS_ERR(file))
1304 goto out_unlock; 1315 goto out_unmark;
1305 1316
1306 sched_exec(); 1317 sched_exec();
1307 1318
@@ -1344,6 +1355,9 @@ int do_execve(char * filename,
1344 goto out; 1355 goto out;
1345 1356
1346 /* execve succeeded */ 1357 /* execve succeeded */
1358 write_lock(&current->fs->lock);
1359 current->fs->in_exec = 0;
1360 write_unlock(&current->fs->lock);
1347 current->in_execve = 0; 1361 current->in_execve = 0;
1348 mutex_unlock(&current->cred_exec_mutex); 1362 mutex_unlock(&current->cred_exec_mutex);
1349 acct_update_integrals(current); 1363 acct_update_integrals(current);
@@ -1362,6 +1376,11 @@ out_file:
1362 fput(bprm->file); 1376 fput(bprm->file);
1363 } 1377 }
1364 1378
1379out_unmark:
1380 write_lock(&current->fs->lock);
1381 current->fs->in_exec = 0;
1382 write_unlock(&current->fs->lock);
1383
1365out_unlock: 1384out_unlock:
1366 current->in_execve = 0; 1385 current->in_execve = 0;
1367 mutex_unlock(&current->cred_exec_mutex); 1386 mutex_unlock(&current->cred_exec_mutex);
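The rewritten check_unsafe_exec() counts the caller's own threads that share its fs_struct (n_fs) and compares that with the structure's total user count: fs->users > n_fs means some task outside the thread group also holds the fs, so LSM_UNSAFE_SHARE is set; otherwise the fs is private to the group and can be marked in_exec to fence off a concurrent CLONE_FS. The comparison reduced to a stand-alone model (simplified stand-ins, not kernel code):

#include <stdio.h>

/* threads_sharing_fs stands for the next_thread() walk above;
 * users stands for fs_struct.users */
static int fs_shared_outside_group(int threads_sharing_fs, int users)
{
	return users > threads_sharing_fs;
}

int main(void)
{
	/* 3 group threads use this fs but 4 references exist: something
	 * outside the group (e.g. a CLONE_FS child) shares it -> unsafe */
	printf("unsafe: %d\n", fs_shared_outside_group(3, 4));	/* 1 */

	/* all references accounted for by the group: safe to mark in_exec */
	printf("unsafe: %d\n", fs_shared_outside_group(3, 3));	/* 0 */
	return 0;
}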
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
1- Out-of-space may cause a severe problem if the object (and directory entry)
2  were written, but writing the inode attributes failed. Then if the filesystem is
3  unmounted and remounted, the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
1#
2# Kbuild for the EXOFS module
3#
4# Copyright (C) 2008 Panasas Inc. All rights reserved.
5#
6# Authors:
7# Boaz Harrosh <bharrosh@panasas.com>
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License version 2
11#
12# Kbuild - Gets included from the Kernels Makefile and build system
13#
14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
1config EXOFS_FS
2 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD
4 help
5	  EXOFS is a file system that uses an OSD storage device
6	  as its backing storage.
7
8# Debugging-related stuff
9config EXOFS_DEBUG
10 bool "Enable debugging"
11 depends on EXOFS_FS
12 help
13 This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
1/*
2 * common.h - Common definitions for both Kernel and user-mode utilities
3 *
4 * Copyright (C) 2005, 2006
5 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
6 * Copyright (C) 2005, 2006
7 * International Business Machines
8 * Copyright (C) 2008, 2009
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * Copyrights for code taken from ext2:
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 * from
17 * linux/fs/minix/inode.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
19 *
20 * This file is part of exofs.
21 *
22 * exofs is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation. Since it is based on ext2, and the only
25 * valid version of GPL for the Linux kernel is version 2, the only valid
26 * version of GPL for exofs is version 2.
27 *
28 * exofs is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with exofs; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
36 */
37
38#ifndef __EXOFS_COM_H__
39#define __EXOFS_COM_H__
40
41#include <linux/types.h>
42
43#include <scsi/osd_attributes.h>
44#include <scsi/osd_initiator.h>
45#include <scsi/osd_sec.h>
46
47/****************************************************************************
48 * Object ID related defines
49 * NOTE: inode# = object ID - EXOFS_OBJ_OFF
50 ****************************************************************************/
51#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
52#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
53#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
54#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
55
56/* exofs Application specific page/attribute */
57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
58# define EXOFS_ATTR_INODE_DATA 1
59
60/*
61 * The maximum number of files we can have is limited by the size of the
62 * inode number. This is the largest object ID that the file system supports.
63 * Object IDs 0, 1, and 2 are always in use (see above defines).
64 */
65enum {
66 EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
67 (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
68 EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
69};
70
71/****************************************************************************
72 * Misc.
73 ****************************************************************************/
74#define EXOFS_BLKSHIFT 12
75#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
76
77/****************************************************************************
78 * superblock-related things
79 ****************************************************************************/
80#define EXOFS_SUPER_MAGIC 0x5DF5
81
82/*
83 * The file system control block - stored in an object's data (mainly, the one
84 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
85 * on disk. Right now it just has a magic value, which is basically a sanity
86 * check on our ability to communicate with the object store.
87 */
88struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */
90 __le32 s_numfiles; /* Number of files on fs */
91 __le16 s_magic; /* Magic signature */
92 __le16 s_newfs; /* Non-zero if this is a new fs */
93};
94
95/****************************************************************************
96 * inode-related things
97 ****************************************************************************/
98#define EXOFS_IDATA 5
99
100/*
101 * The file control block - stored in an object's attributes. This is where
102 * the in-memory inode is stored on disk.
103 */
104struct exofs_fcb {
105 __le64 i_size; /* Size of the file */
106 __le16 i_mode; /* File mode */
107 __le16 i_links_count; /* Links count */
108 __le32 i_uid; /* Owner Uid */
109 __le32 i_gid; /* Group Id */
110 __le32 i_atime; /* Access time */
111 __le32 i_ctime; /* Creation time */
112 __le32 i_mtime; /* Modification time */
113 __le32 i_flags; /* File flags (unused for now)*/
114 __le32 i_generation; /* File version (for NFS) */
115 __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
116};
117
118#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
119
120/* This is the Attribute the fcb is stored in */
121static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
122 EXOFS_APAGE_FS_DATA,
123 EXOFS_ATTR_INODE_DATA,
124 EXOFS_INO_ATTR_SIZE);
125
126/****************************************************************************
127 * dentry-related things
128 ****************************************************************************/
129#define EXOFS_NAME_LEN 255
130
131/*
132 * The on-disk directory entry
133 */
134struct exofs_dir_entry {
135 __le64 inode_no; /* inode number */
136 __le16 rec_len; /* directory entry length */
137 u8 name_len; /* name length */
138 u8 file_type; /* umm...file type */
139 char name[EXOFS_NAME_LEN]; /* file name */
140};
141
142enum {
143 EXOFS_FT_UNKNOWN,
144 EXOFS_FT_REG_FILE,
145 EXOFS_FT_DIR,
146 EXOFS_FT_CHRDEV,
147 EXOFS_FT_BLKDEV,
148 EXOFS_FT_FIFO,
149 EXOFS_FT_SOCK,
150 EXOFS_FT_SYMLINK,
151 EXOFS_FT_MAX
152};
153
154#define EXOFS_DIR_PAD 4
155#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
156#define EXOFS_DIR_REC_LEN(name_len) \
157 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
158 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
159
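EXOFS_DIR_REC_LEN rounds the fixed entry header plus the name up to the next 4-byte boundary; the header is offsetof(struct exofs_dir_entry, name) = 8 + 2 + 1 + 1 = 12 bytes (the hard-coded 12 below assumes no padding, which holds for this layout). A few values worked through:

#include <stdio.h>

/* (name_len + 12 + 3) & ~3, with 12 = offsetof(exofs_dir_entry, name) */
#define REC_LEN(name_len) (((name_len) + 12 + 3) & ~3)

int main(void)
{
	printf("name_len 1   -> rec_len %d\n", REC_LEN(1));	/* 16 */
	printf("name_len 4   -> rec_len %d\n", REC_LEN(4));	/* 16 */
	printf("name_len 5   -> rec_len %d\n", REC_LEN(5));	/* 20 */
	printf("name_len 255 -> rec_len %d\n", REC_LEN(255));	/* 268 */
	return 0;
}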
160/*************************
161 * function declarations *
162 *************************/
163/* osd.c */
164void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
165 const struct osd_obj_id *obj);
166
167int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
168static inline int exofs_check_ok(struct osd_request *or)
169{
170 return exofs_check_ok_resid(or, NULL, NULL);
171}
172int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
173int exofs_async_op(struct osd_request *or,
174 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline unsigned exofs_chunk_size(struct inode *inode)
39{
40 return inode->i_sb->s_blocksize;
41}
42
43static inline void exofs_put_page(struct page *page)
44{
45 kunmap(page);
46 page_cache_release(page);
47}
48
49/* Accesses to dir's inode->i_size must be made under the inode lock */
50static inline unsigned long dir_pages(struct inode *inode)
51{
52 return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
53}
54
55static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
56{
57 loff_t last_byte = inode->i_size;
58
59 last_byte -= page_nr << PAGE_CACHE_SHIFT;
60 if (last_byte > PAGE_CACHE_SIZE)
61 last_byte = PAGE_CACHE_SIZE;
62 return last_byte;
63}
64
65static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
66{
67 struct address_space *mapping = page->mapping;
68 struct inode *dir = mapping->host;
69 int err = 0;
70
71 dir->i_version++;
72
73 if (!PageUptodate(page))
74 SetPageUptodate(page);
75
76 if (pos+len > dir->i_size) {
77 i_size_write(dir, pos+len);
78 mark_inode_dirty(dir);
79 }
80 set_page_dirty(page);
81
82 if (IS_DIRSYNC(dir))
83 err = write_one_page(page, 1);
84 else
85 unlock_page(page);
86
87 return err;
88}
89
90static void exofs_check_page(struct page *page)
91{
92 struct inode *dir = page->mapping->host;
93 unsigned chunk_size = exofs_chunk_size(dir);
94 char *kaddr = page_address(page);
95 unsigned offs, rec_len;
96 unsigned limit = PAGE_CACHE_SIZE;
97 struct exofs_dir_entry *p;
98 char *error;
99
100 /* if the page is the last one in the directory */
101 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
102 limit = dir->i_size & ~PAGE_CACHE_MASK;
103 if (limit & (chunk_size - 1))
104 goto Ebadsize;
105 if (!limit)
106 goto out;
107 }
108 for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
109 p = (struct exofs_dir_entry *)(kaddr + offs);
110 rec_len = le16_to_cpu(p->rec_len);
111
112 if (rec_len < EXOFS_DIR_REC_LEN(1))
113 goto Eshort;
114 if (rec_len & 3)
115 goto Ealign;
116 if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
117 goto Enamelen;
118 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
119 goto Espan;
120 }
121 if (offs != limit)
122 goto Eend;
123out:
124 SetPageChecked(page);
125 return;
126
127Ebadsize:
128 EXOFS_ERR("ERROR [exofs_check_page]: "
129 "size of directory #%lu is not a multiple of chunk size",
130 dir->i_ino
131 );
132 goto fail;
133Eshort:
134 error = "rec_len is smaller than minimal";
135 goto bad_entry;
136Ealign:
137 error = "unaligned directory entry";
138 goto bad_entry;
139Enamelen:
140 error = "rec_len is too small for name_len";
141 goto bad_entry;
142Espan:
143 error = "directory entry across blocks";
144 goto bad_entry;
145bad_entry:
146 EXOFS_ERR(
147 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
148 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
149 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
150 _LLU(le64_to_cpu(p->inode_no)),
151 rec_len, p->name_len);
152 goto fail;
153Eend:
154 p = (struct exofs_dir_entry *)(kaddr + offs);
155 EXOFS_ERR("ERROR [exofs_check_page]: "
156		  "entry in directory #%lu spans the page boundary, "
157 "offset=%lu, inode=%llu",
158 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
159 _LLU(le64_to_cpu(p->inode_no)));
160fail:
161 SetPageChecked(page);
162 SetPageError(page);
163}
164
165static struct page *exofs_get_page(struct inode *dir, unsigned long n)
166{
167 struct address_space *mapping = dir->i_mapping;
168 struct page *page = read_mapping_page(mapping, n, NULL);
169
170 if (!IS_ERR(page)) {
171 kmap(page);
172 if (!PageChecked(page))
173 exofs_check_page(page);
174 if (PageError(page))
175 goto fail;
176 }
177 return page;
178
179fail:
180 exofs_put_page(page);
181 return ERR_PTR(-EIO);
182}
183
184static inline int exofs_match(int len, const unsigned char *name,
185 struct exofs_dir_entry *de)
186{
187 if (len != de->name_len)
188 return 0;
189 if (!de->inode_no)
190 return 0;
191 return !memcmp(name, de->name, len);
192}
193
194static inline
195struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
196{
197 return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
198}
199
200static inline unsigned
201exofs_validate_entry(char *base, unsigned offset, unsigned mask)
202{
203 struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
204 struct exofs_dir_entry *p =
205 (struct exofs_dir_entry *)(base + (offset&mask));
206 while ((char *)p < (char *)de) {
207 if (p->rec_len == 0)
208 break;
209 p = exofs_next_entry(p);
210 }
211 return (char *)p - base;
212}
213
214static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
215 [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
216 [EXOFS_FT_REG_FILE] = DT_REG,
217 [EXOFS_FT_DIR] = DT_DIR,
218 [EXOFS_FT_CHRDEV] = DT_CHR,
219 [EXOFS_FT_BLKDEV] = DT_BLK,
220 [EXOFS_FT_FIFO] = DT_FIFO,
221 [EXOFS_FT_SOCK] = DT_SOCK,
222 [EXOFS_FT_SYMLINK] = DT_LNK,
223};
224
225#define S_SHIFT 12
226static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
227 [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
228 [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
229 [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
230 [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
231 [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
232 [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
233 [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
234};
235
236static inline
237void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
238{
239 mode_t mode = inode->i_mode;
240 de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
241}
242
243static int
244exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
245{
246 loff_t pos = filp->f_pos;
247 struct inode *inode = filp->f_path.dentry->d_inode;
248 unsigned int offset = pos & ~PAGE_CACHE_MASK;
249 unsigned long n = pos >> PAGE_CACHE_SHIFT;
250 unsigned long npages = dir_pages(inode);
251 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
252 unsigned char *types = NULL;
253 int need_revalidate = (filp->f_version != inode->i_version);
254
255 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
256 return 0;
257
258 types = exofs_filetype_table;
259
260 for ( ; n < npages; n++, offset = 0) {
261 char *kaddr, *limit;
262 struct exofs_dir_entry *de;
263 struct page *page = exofs_get_page(inode, n);
264
265 if (IS_ERR(page)) {
266 EXOFS_ERR("ERROR: "
267 "bad page in #%lu",
268 inode->i_ino);
269 filp->f_pos += PAGE_CACHE_SIZE - offset;
270 return PTR_ERR(page);
271 }
272 kaddr = page_address(page);
273 if (unlikely(need_revalidate)) {
274 if (offset) {
275 offset = exofs_validate_entry(kaddr, offset,
276 chunk_mask);
277 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
278 }
279 filp->f_version = inode->i_version;
280 need_revalidate = 0;
281 }
282 de = (struct exofs_dir_entry *)(kaddr + offset);
283 limit = kaddr + exofs_last_byte(inode, n) -
284 EXOFS_DIR_REC_LEN(1);
285 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
286 if (de->rec_len == 0) {
287 EXOFS_ERR("ERROR: "
288 "zero-length directory entry");
289 exofs_put_page(page);
290 return -EIO;
291 }
292 if (de->inode_no) {
293 int over;
294 unsigned char d_type = DT_UNKNOWN;
295
296 if (types && de->file_type < EXOFS_FT_MAX)
297 d_type = types[de->file_type];
298
299 offset = (char *)de - kaddr;
300 over = filldir(dirent, de->name, de->name_len,
301 (n<<PAGE_CACHE_SHIFT) | offset,
302 le64_to_cpu(de->inode_no),
303 d_type);
304 if (over) {
305 exofs_put_page(page);
306 return 0;
307 }
308 }
309 filp->f_pos += le16_to_cpu(de->rec_len);
310 }
311 exofs_put_page(page);
312 }
313
314 return 0;
315}
316
317struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
318 struct dentry *dentry, struct page **res_page)
319{
320 const unsigned char *name = dentry->d_name.name;
321 int namelen = dentry->d_name.len;
322 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
323 unsigned long start, n;
324 unsigned long npages = dir_pages(dir);
325 struct page *page = NULL;
326 struct exofs_i_info *oi = exofs_i(dir);
327 struct exofs_dir_entry *de;
328
329 if (npages == 0)
330 goto out;
331
332 *res_page = NULL;
333
334 start = oi->i_dir_start_lookup;
335 if (start >= npages)
336 start = 0;
337 n = start;
338 do {
339 char *kaddr;
340 page = exofs_get_page(dir, n);
341 if (!IS_ERR(page)) {
342 kaddr = page_address(page);
343 de = (struct exofs_dir_entry *) kaddr;
344 kaddr += exofs_last_byte(dir, n) - reclen;
345 while ((char *) de <= kaddr) {
346 if (de->rec_len == 0) {
347 EXOFS_ERR(
348 "ERROR: exofs_find_entry: "
349 "zero-length directory entry");
350 exofs_put_page(page);
351 goto out;
352 }
353 if (exofs_match(namelen, name, de))
354 goto found;
355 de = exofs_next_entry(de);
356 }
357 exofs_put_page(page);
358 }
359 if (++n >= npages)
360 n = 0;
361 } while (n != start);
362out:
363 return NULL;
364
365found:
366 *res_page = page;
367 oi->i_dir_start_lookup = n;
368 return de;
369}
370
371struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
372{
373 struct page *page = exofs_get_page(dir, 0);
374 struct exofs_dir_entry *de = NULL;
375
376 if (!IS_ERR(page)) {
377 de = exofs_next_entry(
378 (struct exofs_dir_entry *)page_address(page));
379 *p = page;
380 }
381 return de;
382}
383
384ino_t exofs_parent_ino(struct dentry *child)
385{
386 struct page *page;
387 struct exofs_dir_entry *de;
388 ino_t ino;
389
390 de = exofs_dotdot(child->d_inode, &page);
391 if (!de)
392 return 0;
393
394 ino = le64_to_cpu(de->inode_no);
395 exofs_put_page(page);
396 return ino;
397}
398
399ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
400{
401 ino_t res = 0;
402 struct exofs_dir_entry *de;
403 struct page *page;
404
405 de = exofs_find_entry(dir, dentry, &page);
406 if (de) {
407 res = le64_to_cpu(de->inode_no);
408 exofs_put_page(page);
409 }
410 return res;
411}
412
413int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
414 struct page *page, struct inode *inode)
415{
416 loff_t pos = page_offset(page) +
417 (char *) de - (char *) page_address(page);
418 unsigned len = le16_to_cpu(de->rec_len);
419 int err;
420
421 lock_page(page);
422 err = exofs_write_begin(NULL, page->mapping, pos, len,
423 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
424 if (err)
425		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
426 err);
427
428 de->inode_no = cpu_to_le64(inode->i_ino);
429 exofs_set_de_type(de, inode);
430 if (likely(!err))
431 err = exofs_commit_chunk(page, pos, len);
432 exofs_put_page(page);
433 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
434 mark_inode_dirty(dir);
435 return err;
436}
437
438int exofs_add_link(struct dentry *dentry, struct inode *inode)
439{
440 struct inode *dir = dentry->d_parent->d_inode;
441 const unsigned char *name = dentry->d_name.name;
442 int namelen = dentry->d_name.len;
443 unsigned chunk_size = exofs_chunk_size(dir);
444 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
445 unsigned short rec_len, name_len;
446 struct page *page = NULL;
447 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
448 struct exofs_dir_entry *de;
449 unsigned long npages = dir_pages(dir);
450 unsigned long n;
451 char *kaddr;
452 loff_t pos;
453 int err;
454
455 for (n = 0; n <= npages; n++) {
456 char *dir_end;
457
458 page = exofs_get_page(dir, n);
459 err = PTR_ERR(page);
460 if (IS_ERR(page))
461 goto out;
462 lock_page(page);
463 kaddr = page_address(page);
464 dir_end = kaddr + exofs_last_byte(dir, n);
465 de = (struct exofs_dir_entry *)kaddr;
466 kaddr += PAGE_CACHE_SIZE - reclen;
467 while ((char *)de <= kaddr) {
468 if ((char *)de == dir_end) {
469 name_len = 0;
470 rec_len = chunk_size;
471 de->rec_len = cpu_to_le16(chunk_size);
472 de->inode_no = 0;
473 goto got_it;
474 }
475 if (de->rec_len == 0) {
476 EXOFS_ERR("ERROR: exofs_add_link: "
477					  "zero-length directory entry\n");
478 err = -EIO;
479 goto out_unlock;
480 }
481 err = -EEXIST;
482 if (exofs_match(namelen, name, de))
483 goto out_unlock;
484 name_len = EXOFS_DIR_REC_LEN(de->name_len);
485 rec_len = le16_to_cpu(de->rec_len);
486 if (!de->inode_no && rec_len >= reclen)
487 goto got_it;
488 if (rec_len >= name_len + reclen)
489 goto got_it;
490 de = (struct exofs_dir_entry *) ((char *) de + rec_len);
491 }
492 unlock_page(page);
493 exofs_put_page(page);
494 }
495
496	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p\n", dentry, inode);
497 return -EINVAL;
498
499got_it:
500 pos = page_offset(page) +
501 (char *)de - (char *)page_address(page);
502 err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
503 &page, NULL);
504 if (err)
505 goto out_unlock;
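	/* A live entry with trailing slack is split: the existing entry is
	 * trimmed to the record length its own name needs (name_len here),
	 * and the new entry is carved out of the remainder, e.g. a 64-byte
	 * record whose name needs 16 bytes leaves 48 for the new entry.
	 */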
506 if (de->inode_no) {
507 struct exofs_dir_entry *de1 =
508 (struct exofs_dir_entry *)((char *)de + name_len);
509 de1->rec_len = cpu_to_le16(rec_len - name_len);
510 de->rec_len = cpu_to_le16(name_len);
511 de = de1;
512 }
513 de->name_len = namelen;
514 memcpy(de->name, name, namelen);
515 de->inode_no = cpu_to_le64(inode->i_ino);
516 exofs_set_de_type(de, inode);
517 err = exofs_commit_chunk(page, pos, rec_len);
518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
519 mark_inode_dirty(dir);
520 sbi->s_numfiles++;
521
522out_put:
523 exofs_put_page(page);
524out:
525 return err;
526out_unlock:
527 unlock_page(page);
528 goto out_put;
529}
530
531int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
532{
533 struct address_space *mapping = page->mapping;
534 struct inode *inode = mapping->host;
535 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
536 char *kaddr = page_address(page);
537 unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
538 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
539 loff_t pos;
540 struct exofs_dir_entry *pde = NULL;
541 struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
542 int err;
543
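	/* Entries only chain forward, so scan the chunk from its start to
	 * find the entry (if any) preceding the victim; deletion merges
	 * the victim into that predecessor by extending its rec_len.
	 */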
544 while (de < dir) {
545 if (de->rec_len == 0) {
546			EXOFS_ERR("ERROR: exofs_delete_entry: "
547				  "zero-length directory entry\n");
548 err = -EIO;
549 goto out;
550 }
551 pde = de;
552 de = exofs_next_entry(de);
553 }
554 if (pde)
555 from = (char *)pde - (char *)page_address(page);
556 pos = page_offset(page) + from;
557 lock_page(page);
558 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
559 &page, NULL);
560 if (err)
561		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
562 err);
563 if (pde)
564 pde->rec_len = cpu_to_le16(to - from);
565 dir->inode_no = 0;
566 if (likely(!err))
567 err = exofs_commit_chunk(page, pos, to - from);
568 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
569 mark_inode_dirty(inode);
570 sbi->s_numfiles--;
571out:
572 exofs_put_page(page);
573 return err;
574}
575
576/* kept aligned on 4 bytes */
577#define THIS_DIR ".\0\0"
578#define PARENT_DIR "..\0"
579
580int exofs_make_empty(struct inode *inode, struct inode *parent)
581{
582 struct address_space *mapping = inode->i_mapping;
583 struct page *page = grab_cache_page(mapping, 0);
584 unsigned chunk_size = exofs_chunk_size(inode);
585 struct exofs_dir_entry *de;
586 int err;
587 void *kaddr;
588
589 if (!page)
590 return -ENOMEM;
591
592 err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
593 &page, NULL);
594 if (err) {
595 unlock_page(page);
596 goto fail;
597 }
598
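	/* Lay out "." and ".." so that together they span the whole chunk:
	 * "." gets a minimal record and ".." gets all remaining bytes.
	 */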
599 kaddr = kmap_atomic(page, KM_USER0);
600 de = (struct exofs_dir_entry *)kaddr;
601 de->name_len = 1;
602 de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
603 memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
604 de->inode_no = cpu_to_le64(inode->i_ino);
605 exofs_set_de_type(de, inode);
606
607 de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
608 de->name_len = 2;
609 de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
610 de->inode_no = cpu_to_le64(parent->i_ino);
611 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
612 exofs_set_de_type(de, inode);
613 kunmap_atomic(page, KM_USER0);
614 err = exofs_commit_chunk(page, 0, chunk_size);
615fail:
616 page_cache_release(page);
617 return err;
618}
619
620int exofs_empty_dir(struct inode *inode)
621{
622 struct page *page = NULL;
623 unsigned long i, npages = dir_pages(inode);
624
625 for (i = 0; i < npages; i++) {
626 char *kaddr;
627 struct exofs_dir_entry *de;
628 page = exofs_get_page(inode, i);
629
630 if (IS_ERR(page))
631 continue;
632
633 kaddr = page_address(page);
634 de = (struct exofs_dir_entry *)kaddr;
635 kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
636
637 while ((char *)de <= kaddr) {
638 if (de->rec_len == 0) {
639				EXOFS_ERR("ERROR: exofs_empty_dir: "
640					  "zero-length directory entry, "
641					  "kaddr=%p, de=%p\n", kaddr, de);
642 goto not_empty;
643 }
644 if (de->inode_no != 0) {
645 /* check for . and .. */
646 if (de->name[0] != '.')
647 goto not_empty;
648 if (de->name_len > 2)
649 goto not_empty;
650 if (de->name_len < 2) {
651 if (le64_to_cpu(de->inode_no) !=
652 inode->i_ino)
653 goto not_empty;
654 } else if (de->name[1] != '.')
655 goto not_empty;
656 }
657 de = exofs_next_entry(de);
658 }
659 exofs_put_page(page);
660 }
661 return 1;
662
663not_empty:
664 exofs_put_page(page);
665 return 0;
666}
667
668const struct file_operations exofs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = exofs_readdir,
672};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/fs.h>
37#include <linux/time.h>
38#include "common.h"
39
40#ifndef __EXOFS_H__
41#define __EXOFS_H__
42
43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
44
45#ifdef CONFIG_EXOFS_DEBUG
46#define EXOFS_DBGMSG(fmt, a...) \
47 printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
48#else
49#define EXOFS_DBGMSG(fmt, a...) \
50 do { if (0) printk(fmt, ##a); } while (0)
51#endif
52
53/* u64 is problematic in printk; this casts it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x)
55
56/*
57 * our extension to the in-memory superblock
58 */
59struct exofs_sb_info {
60 struct osd_dev *s_dev; /* returned by get_osd_dev */
61 osd_id s_pid; /* partition ID of file system*/
62 int s_timeout; /* timeout for OSD operations */
63 uint64_t s_nextid; /* highest object ID used */
64 uint32_t s_numfiles; /* number of files on fs */
65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
66 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */
68 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
69};
70
71/*
72 * our extension to the in-memory inode
73 */
74struct exofs_i_info {
75 unsigned long i_flags; /* various atomic flags */
76	uint32_t  i_data[EXOFS_IDATA];/* short symlink names and device #s */
77 uint32_t i_dir_start_lookup; /* which page to start lookup */
78 wait_queue_head_t i_wq; /* wait queue for inode */
79 uint64_t i_commit_size; /* the object's written length */
80 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
81 struct inode vfs_inode; /* normal in-memory inode */
82};
83
84/*
85 * our inode flags
86 */
87#define OBJ_2BCREATED	0	/* object will be created soon */
88#define OBJ_CREATED	1	/* object has been created on the osd */
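/* Object creation is asynchronous: OBJ_2BCREATED is set when the create is
 * queued and OBJ_CREATED is set by the completion callback, which also wakes
 * sleepers on i_wq (see wait_obj_created() below).
 */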
89
90static inline int obj_2bcreated(struct exofs_i_info *oi)
91{
92 return test_bit(OBJ_2BCREATED, &oi->i_flags);
93}
94
95static inline void set_obj_2bcreated(struct exofs_i_info *oi)
96{
97 set_bit(OBJ_2BCREATED, &oi->i_flags);
98}
99
100static inline int obj_created(struct exofs_i_info *oi)
101{
102 return test_bit(OBJ_CREATED, &oi->i_flags);
103}
104
105static inline void set_obj_created(struct exofs_i_info *oi)
106{
107 set_bit(OBJ_CREATED, &oi->i_flags);
108}
109
110int __exofs_wait_obj_created(struct exofs_i_info *oi);
111static inline int wait_obj_created(struct exofs_i_info *oi)
112{
113 if (likely(obj_created(oi)))
114 return 0;
115
116 return __exofs_wait_obj_created(oi);
117}
118
119/*
120 * get to our inode from the vfs inode
121 */
122static inline struct exofs_i_info *exofs_i(struct inode *inode)
123{
124 return container_of(inode, struct exofs_i_info, vfs_inode);
125}
126
127/*
128 * Maximum count of links to a file
129 */
130#define EXOFS_LINK_MAX 32000
131
132/*************************
133 * function declarations *
134 *************************/
135/* inode.c */
136void exofs_truncate(struct inode *inode);
137int exofs_setattr(struct dentry *, struct iattr *);
138int exofs_write_begin(struct file *file, struct address_space *mapping,
139 loff_t pos, unsigned len, unsigned flags,
140 struct page **pagep, void **fsdata);
141extern struct inode *exofs_iget(struct super_block *, unsigned long);
142struct inode *exofs_new_inode(struct inode *, int);
143extern int exofs_write_inode(struct inode *, int);
144extern void exofs_delete_inode(struct inode *);
145
146/* dir.c: */
147int exofs_add_link(struct dentry *, struct inode *);
148ino_t exofs_inode_by_name(struct inode *, struct dentry *);
149int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
150int exofs_make_empty(struct inode *, struct inode *);
151struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
152 struct page **);
153int exofs_empty_dir(struct inode *);
154struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
155ino_t exofs_parent_ino(struct dentry *child);
156int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
157 struct inode *);
158
159/*********************
160 * operation vectors *
161 *********************/
162/* dir.c: */
163extern const struct file_operations exofs_dir_operations;
164
165/* file.c */
166extern const struct inode_operations exofs_file_inode_operations;
167extern const struct file_operations exofs_file_operations;
168
169/* inode.c */
170extern const struct address_space_operations exofs_aops;
171
172/* namei.c */
173extern const struct inode_operations exofs_dir_inode_operations;
174extern const struct inode_operations exofs_special_inode_operations;
175
176/* symlink.c */
177extern const struct inode_operations exofs_symlink_inode_operations;
178extern const struct inode_operations exofs_fast_symlink_inode_operations;
179
180#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/buffer_head.h>
37
38#include "exofs.h"
39
40static int exofs_release_file(struct inode *inode, struct file *filp)
41{
42 return 0;
43}
44
45static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
46 int datasync)
47{
48 int ret;
49 struct address_space *mapping = filp->f_mapping;
50
51 ret = filemap_write_and_wait(mapping);
52 if (ret)
53 return ret;
54
55	/* Note: file_fsync below also calls sync_blockdev, which is a no-op
56	 * for exofs, but other than that it does sync_inode and
57	 * sync_superblock, which is what we need here.
58	 */
59 return file_fsync(filp, dentry, datasync);
60}
61
62static int exofs_flush(struct file *file, fl_owner_t id)
63{
64 exofs_file_fsync(file, file->f_path.dentry, 1);
65 /* TODO: Flush the OSD target */
66 return 0;
67}
68
69const struct file_operations exofs_file_operations = {
70 .llseek = generic_file_llseek,
71 .read = do_sync_read,
72 .write = do_sync_write,
73 .aio_read = generic_file_aio_read,
74 .aio_write = generic_file_aio_write,
75 .mmap = generic_file_mmap,
76 .open = generic_file_open,
77 .release = exofs_release_file,
78 .fsync = exofs_file_fsync,
79 .flush = exofs_flush,
80 .splice_read = generic_file_splice_read,
81 .splice_write = generic_file_splice_write,
82};
83
84const struct inode_operations exofs_file_inode_operations = {
85 .truncate = exofs_truncate,
86 .setattr = exofs_setattr,
87};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/writeback.h>
37#include <linux/buffer_head.h>
38#include <scsi/scsi_device.h>
39
40#include "exofs.h"
41
42#ifdef CONFIG_EXOFS_DEBUG
43# define EXOFS_DEBUG_OBJ_ISIZE 1
44#endif
45
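/* A page_collect gathers contiguous pages into one bio so a single OSD
 * request can cover the whole run; on a discontinuity, or when the bio
 * fills up, the collection is submitted and restarted.
 */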
46struct page_collect {
47 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode;
50 unsigned expected_pages;
51
52 struct bio *bio;
53 unsigned nr_pages;
54 unsigned long length;
55	loff_t pg_first; /* keep it 64-bit even on 32-bit arches */
56};
57
58static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode)
60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63
64 pcol->sbi = sbi;
65 pcol->req_q = req_q;
66 pcol->inode = inode;
67 pcol->expected_pages = expected_pages;
68
69 pcol->bio = NULL;
70 pcol->nr_pages = 0;
71 pcol->length = 0;
72 pcol->pg_first = -1;
73
74 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
75 expected_pages);
76}
77
78static void _pcol_reset(struct page_collect *pcol)
79{
80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
81
82 pcol->bio = NULL;
83 pcol->nr_pages = 0;
84 pcol->length = 0;
85 pcol->pg_first = -1;
86 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
87 pcol->inode->i_ino, pcol->expected_pages);
88
89	/* This is usually the end of the loop, but for writes it might not
90	 * end here; don't be left with nothing to collect.
91	 */
92 if (!pcol->expected_pages)
93 pcol->expected_pages = 128;
94}
95
96static int pcol_try_alloc(struct page_collect *pcol)
97{
98 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
99
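	/* bio_alloc() can fail for large page counts under memory
	 * pressure, so retry with progressively smaller bios.
	 */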
100 for (; pages; pages >>= 1) {
101 pcol->bio = bio_alloc(GFP_KERNEL, pages);
102 if (likely(pcol->bio))
103 return 0;
104 }
105
106	EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
107 pcol->expected_pages);
108 return -ENOMEM;
109}
110
111static void pcol_free(struct page_collect *pcol)
112{
113 bio_put(pcol->bio);
114 pcol->bio = NULL;
115}
116
117static int pcol_add_page(struct page_collect *pcol, struct page *page,
118 unsigned len)
119{
120 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
121 if (unlikely(len != added_len))
122 return -ENOMEM;
123
124 ++pcol->nr_pages;
125 pcol->length += len;
126 return 0;
127}
128
129static int update_read_page(struct page *page, int ret)
130{
131 if (ret == 0) {
132 /* Everything is OK */
133 SetPageUptodate(page);
134 if (PageError(page))
135 ClearPageError(page);
136 } else if (ret == -EFAULT) {
137 /* In this case we were trying to read something that wasn't on
138 * disk yet - return a page full of zeroes. This should be OK,
139 * because the object should be empty (if there was a write
140 * before this read, the read would be waiting with the page
141 * locked */
142 clear_highpage(page);
143
144 SetPageUptodate(page);
145 if (PageError(page))
146 ClearPageError(page);
147 ret = 0; /* recovered error */
148 EXOFS_DBGMSG("recovered read error\n");
149 } else /* Error */
150 SetPageError(page);
151
152 return ret;
153}
154
155static void update_write_page(struct page *page, int ret)
156{
157 if (ret) {
158 mapping_set_error(page->mapping, ret);
159 SetPageError(page);
160 }
161 end_page_writeback(page);
162}
163
164/* Called at the end of reads, to optionally unlock pages and update their
165 * status.
166 */
167static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
168 bool do_unlock)
169{
170 struct bio_vec *bvec;
171 int i;
172 u64 resid;
173 u64 good_bytes;
174 u64 length = 0;
175 int ret = exofs_check_ok_resid(or, &resid, NULL);
176
177 osd_end_request(or);
178
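	/* resid is how many bytes the OSD did not transfer; everything in
	 * the bio before good_bytes holds valid data.
	 */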
179 if (likely(!ret))
180 good_bytes = pcol->length;
181 else if (!resid)
182 good_bytes = 0;
183 else
184 good_bytes = pcol->length - resid;
185
186 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
187 " length=0x%lx nr_pages=%u\n",
188 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
189 pcol->nr_pages);
190
191 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
192 struct page *page = bvec->bv_page;
193 struct inode *inode = page->mapping->host;
194 int page_stat;
195
196 if (inode != pcol->inode)
197 continue; /* osd might add more pages at end */
198
199 if (likely(length < good_bytes))
200 page_stat = 0;
201 else
202 page_stat = ret;
203
204 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
205 inode->i_ino, page->index,
206 page_stat ? "bad_bytes" : "good_bytes");
207
208 ret = update_read_page(page, page_stat);
209 if (do_unlock)
210 unlock_page(page);
211 length += bvec->bv_len;
212 }
213
214 pcol_free(pcol);
215 EXOFS_DBGMSG("readpages_done END\n");
216 return ret;
217}
218
219/* callback of async reads */
220static void readpages_done(struct osd_request *or, void *p)
221{
222 struct page_collect *pcol = p;
223
224 __readpages_done(or, pcol, true);
225 atomic_dec(&pcol->sbi->s_curr_pending);
226 kfree(p);
227}
228
229static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
230{
231 struct bio_vec *bvec;
232 int i;
233
234 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
235 struct page *page = bvec->bv_page;
236
237 if (rw == READ)
238 update_read_page(page, ret);
239 else
240 update_write_page(page, ret);
241
242 unlock_page(page);
243 }
244 pcol_free(pcol);
245}
246
247static int read_exec(struct page_collect *pcol, bool is_sync)
248{
249 struct exofs_i_info *oi = exofs_i(pcol->inode);
250 struct osd_obj_id obj = {pcol->sbi->s_pid,
251 pcol->inode->i_ino + EXOFS_OBJ_OFF};
252 struct osd_request *or = NULL;
253 struct page_collect *pcol_copy = NULL;
254 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
255 int ret;
256
257 if (!pcol->bio)
258 return 0;
259
260 /* see comment in _readpage() about sync reads */
261 WARN_ON(is_sync && (pcol->nr_pages != 1));
262
263 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
264 if (unlikely(!or)) {
265 ret = -ENOMEM;
266 goto err;
267 }
268
269 osd_req_read(or, &obj, pcol->bio, i_start);
270
271 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
273 return __readpages_done(or, pcol, false);
274 }
275
276 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
277 if (!pcol_copy) {
278 ret = -ENOMEM;
279 goto err;
280 }
281
282 *pcol_copy = *pcol;
283 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
284 if (unlikely(ret))
285 goto err;
286
287 atomic_inc(&pcol->sbi->s_curr_pending);
288
289 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
290 obj.id, _LLU(i_start), pcol->length);
291
292 /* pages ownership was passed to pcol_copy */
293 _pcol_reset(pcol);
294 return 0;
295
296err:
297 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ);
299 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret;
303}
304
305/* readpage_strip is called either directly from readpage() or by the VFS from
306 * within read_cache_pages(), to add one more page to be read. It will try to
307 * collect as many contiguous pages as possible. If a discontinuity is
308 * encountered, or it runs out of resources, it will submit the previous segment
309 * and will start a new collection. Eventually caller must submit the last
310 * segment if present.
311 */
312static int readpage_strip(void *data, struct page *page)
313{
314 struct page_collect *pcol = data;
315 struct inode *inode = pcol->inode;
316 struct exofs_i_info *oi = exofs_i(inode);
317 loff_t i_size = i_size_read(inode);
318 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
319 size_t len;
320 int ret;
321
322 /* FIXME: Just for debugging, will be removed */
323 if (PageUptodate(page))
324 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
325 page->index);
326
327 if (page->index < end_index)
328 len = PAGE_CACHE_SIZE;
329 else if (page->index == end_index)
330 len = i_size & ~PAGE_CACHE_MASK;
331 else
332 len = 0;
333
334 if (!len || !obj_created(oi)) {
335 /* this will be out of bounds, or doesn't exist yet.
336 * Current page is cleared and the request is split
337 */
338 clear_highpage(page);
339
340 SetPageUptodate(page);
341 if (PageError(page))
342 ClearPageError(page);
343
344 unlock_page(page);
345 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
346 " splitting\n", inode->i_ino, page->index);
347
348 return read_exec(pcol, false);
349 }
350
351try_again:
352
353 if (unlikely(pcol->pg_first == -1)) {
354 pcol->pg_first = page->index;
355 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
356 page->index)) {
357 /* Discontinuity detected, split the request */
358 ret = read_exec(pcol, false);
359 if (unlikely(ret))
360 goto fail;
361 goto try_again;
362 }
363
364 if (!pcol->bio) {
365 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret))
367 goto fail;
368 }
369
370 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len);
372
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len);
375
376 ret = pcol_add_page(pcol, page, len);
377 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length);
381
382 /* split the request, and start again with current page */
383 ret = read_exec(pcol, false);
384 if (unlikely(ret))
385 goto fail;
386
387 goto try_again;
388 }
389
390 return 0;
391
392fail:
393 /* SetPageError(page); ??? */
394 unlock_page(page);
395 return ret;
396}
397
398static int exofs_readpages(struct file *file, struct address_space *mapping,
399 struct list_head *pages, unsigned nr_pages)
400{
401 struct page_collect pcol;
402 int ret;
403
404 _pcol_init(&pcol, nr_pages, mapping->host);
405
406 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
407 if (ret) {
408 EXOFS_ERR("read_cache_pages => %d\n", ret);
409 return ret;
410 }
411
412 return read_exec(&pcol, false);
413}
414
415static int _readpage(struct page *page, bool is_sync)
416{
417 struct page_collect pcol;
418 int ret;
419
420 _pcol_init(&pcol, 1, page->mapping->host);
421
422	/* readpage_strip might call read_exec(, async) internally, but for
423	 * a single page those calls see an empty collection and do nothing;
424	 * the only real submission is the read_exec(&pcol, is_sync) below.
425	 */
426 ret = readpage_strip(&pcol, page);
427 if (ret) {
428 EXOFS_ERR("_readpage => %d\n", ret);
429 return ret;
430 }
431
432 return read_exec(&pcol, is_sync);
433}
434
435/*
436 * We don't need the file
437 */
438static int exofs_readpage(struct file *file, struct page *page)
439{
440 return _readpage(page, false);
441}
442
443/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p)
445{
446 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i;
449 u64 resid;
450 u64 good_bytes;
451 u64 length = 0;
452
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending);
457
458 if (likely(!ret))
459 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else
463 good_bytes = pcol->length - resid;
464
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages);
469
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
471 struct page *page = bvec->bv_page;
472 struct inode *inode = page->mapping->host;
473 int page_stat;
474
475 if (inode != pcol->inode)
476 continue; /* osd might add more pages to a bio */
477
478 if (likely(length < good_bytes))
479 page_stat = 0;
480 else
481 page_stat = ret;
482
483 update_write_page(page, page_stat);
484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat);
487
488 length += bvec->bv_len;
489 }
490
491 pcol_free(pcol);
492 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n");
494}
495
496static int write_exec(struct page_collect *pcol)
497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid,
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret;
505
506 if (!pcol->bio)
507 return 0;
508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511		EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) {
518		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
519 ret = -ENOMEM;
520 goto err;
521 }
522
523 *pcol_copy = *pcol;
524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) {
528		EXOFS_ERR("write_exec: exofs_async_op() Failed\n");
529 goto err;
530 }
531
532 atomic_inc(&pcol->sbi->s_curr_pending);
533 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
534 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
535 pcol->length);
536 /* pages ownership was passed to pcol_copy */
537 _pcol_reset(pcol);
538 return 0;
539
540err:
541 _unlock_pcol_pages(pcol, ret, WRITE);
542 kfree(pcol_copy);
543 if (or)
544 osd_end_request(or);
545 return ret;
546}
547
548/* writepage_strip is called either directly from writepage() or by the VFS from
549 * within write_cache_pages(), to add one more page to be written to storage.
550 * It will try to collect as many contiguous pages as possible. If a
551 * discontinuity is encountered or it runs out of resources it will submit the
552 * previous segment and will start a new collection.
553 * Eventually caller must submit the last segment if present.
554 */
555static int writepage_strip(struct page *page,
556 struct writeback_control *wbc_unused, void *data)
557{
558 struct page_collect *pcol = data;
559 struct inode *inode = pcol->inode;
560 struct exofs_i_info *oi = exofs_i(inode);
561 loff_t i_size = i_size_read(inode);
562 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
563 size_t len;
564 int ret;
565
566 BUG_ON(!PageLocked(page));
567
568 ret = wait_obj_created(oi);
569 if (unlikely(ret))
570 goto fail;
571
572 if (page->index < end_index)
573 /* in this case, the page is within the limits of the file */
574 len = PAGE_CACHE_SIZE;
575 else {
576 len = i_size & ~PAGE_CACHE_MASK;
577
578 if (page->index > end_index || !len) {
579 /* in this case, the page is outside the limits
580 * (truncate in progress)
581 */
582 ret = write_exec(pcol);
583 if (unlikely(ret))
584 goto fail;
585 if (PageError(page))
586 ClearPageError(page);
587 unlock_page(page);
588 return 0;
589 }
590 }
591
592try_again:
593
594 if (unlikely(pcol->pg_first == -1)) {
595 pcol->pg_first = page->index;
596 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
597 page->index)) {
598 /* Discontinuity detected, split the request */
599 ret = write_exec(pcol);
600 if (unlikely(ret))
601 goto fail;
602 goto try_again;
603 }
604
605 if (!pcol->bio) {
606 ret = pcol_try_alloc(pcol);
607 if (unlikely(ret))
608 goto fail;
609 }
610
611 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
612 inode->i_ino, page->index, len);
613
614 ret = pcol_add_page(pcol, page, len);
615 if (unlikely(ret)) {
616 EXOFS_DBGMSG("Failed pcol_add_page "
617 "nr_pages=%u total_length=0x%lx\n",
618 pcol->nr_pages, pcol->length);
619
620 /* split the request, next loop will start again */
621 ret = write_exec(pcol);
622 if (unlikely(ret)) {
623				EXOFS_DBGMSG("write_exec failed => %d\n", ret);
624 goto fail;
625 }
626
627 goto try_again;
628 }
629
630 BUG_ON(PageWriteback(page));
631 set_page_writeback(page);
632
633 return 0;
634
635fail:
636 set_bit(AS_EIO, &page->mapping->flags);
637 unlock_page(page);
638 return ret;
639}
640
641static int exofs_writepages(struct address_space *mapping,
642 struct writeback_control *wbc)
643{
644 struct page_collect pcol;
645 long start, end, expected_pages;
646 int ret;
647
648 start = wbc->range_start >> PAGE_CACHE_SHIFT;
649 end = (wbc->range_end == LLONG_MAX) ?
650 start + mapping->nrpages :
651 wbc->range_end >> PAGE_CACHE_SHIFT;
652
653 if (start || end)
654 expected_pages = min(end - start + 1, 32L);
655 else
656 expected_pages = mapping->nrpages;
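	/* expected_pages only sizes the first bio; 32 is an arbitrary cap
	 * for ranged writeback, and _pcol_reset() falls back to 128 once
	 * the estimate runs out.
	 */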
657
658 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
659 " m->nrpages=%lu start=0x%lx end=0x%lx\n",
660 mapping->host->i_ino, wbc->range_start, wbc->range_end,
661 mapping->nrpages, start, end);
662
663 _pcol_init(&pcol, expected_pages, mapping->host);
664
665 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
666 if (ret) {
667 EXOFS_ERR("write_cache_pages => %d\n", ret);
668 return ret;
669 }
670
671 return write_exec(&pcol);
672}
673
674static int exofs_writepage(struct page *page, struct writeback_control *wbc)
675{
676 struct page_collect pcol;
677 int ret;
678
679 _pcol_init(&pcol, 1, page->mapping->host);
680
681 ret = writepage_strip(page, NULL, &pcol);
682 if (ret) {
683 EXOFS_ERR("exofs_writepage => %d\n", ret);
684 return ret;
685 }
686
687 return write_exec(&pcol);
688}
689
690int exofs_write_begin(struct file *file, struct address_space *mapping,
691 loff_t pos, unsigned len, unsigned flags,
692 struct page **pagep, void **fsdata)
693{
694 int ret = 0;
695 struct page *page;
696
697 page = *pagep;
698 if (page == NULL) {
699 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
700 fsdata);
701 if (ret) {
702			EXOFS_DBGMSG("simple_write_begin failed\n");
703 return ret;
704 }
705
706 page = *pagep;
707 }
708
709	/* read-modify-write: a partial write needs the page read in first */
710 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
711 ret = _readpage(page, true);
712 if (ret) {
713			/* SetPageError was done by _readpage. Is it OK? */
714			unlock_page(page);
715			EXOFS_DBGMSG("_readpage failed\n");
716 }
717 }
718
719 return ret;
720}
721
722static int exofs_write_begin_export(struct file *file,
723 struct address_space *mapping,
724 loff_t pos, unsigned len, unsigned flags,
725 struct page **pagep, void **fsdata)
726{
727 *pagep = NULL;
728
729 return exofs_write_begin(file, mapping, pos, len, flags, pagep,
730 fsdata);
731}
732
733const struct address_space_operations exofs_aops = {
734 .readpage = exofs_readpage,
735 .readpages = exofs_readpages,
736 .writepage = exofs_writepage,
737 .writepages = exofs_writepages,
738 .write_begin = exofs_write_begin_export,
739 .write_end = simple_write_end,
740};
741
742/******************************************************************************
743 * INODE OPERATIONS
744 *****************************************************************************/
745
746/*
747 * Test whether an inode is a fast symlink.
748 */
749static inline int exofs_inode_is_fast_symlink(struct inode *inode)
750{
751 struct exofs_i_info *oi = exofs_i(inode);
752
753 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
754}
755
756/*
757 * get_block_t - Fill in a buffer_head
758 * An OSD takes care of block allocation so we just fake an allocation by
759 * putting in the inode's sector_t in the buffer_head.
760 * TODO: What about the case of create==0 and @iblock does not exist in the
761 * object?
762 */
763static int exofs_get_block(struct inode *inode, sector_t iblock,
764 struct buffer_head *bh_result, int create)
765{
766 map_bh(bh_result, inode->i_sb, iblock);
767 return 0;
768}
769
770const struct osd_attr g_attr_logical_length = ATTR_DEF(
771 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
772
773/*
774 * Truncate a file to the specified size - all we have to do is set the size
775 * attribute. We make sure the object exists first.
776 */
777void exofs_truncate(struct inode *inode)
778{
779 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
780 struct exofs_i_info *oi = exofs_i(inode);
781 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
782 struct osd_request *or;
783 struct osd_attr attr;
784 loff_t isize = i_size_read(inode);
785 __be64 newsize;
786 int ret;
787
788 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
789 || S_ISLNK(inode->i_mode)))
790 return;
791 if (exofs_inode_is_fast_symlink(inode))
792 return;
793 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
794 return;
795 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
796
797 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
798
799 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
800 if (unlikely(!or)) {
801 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
802 goto fail;
803 }
804
805 osd_req_set_attributes(or, &obj);
806
807 newsize = cpu_to_be64((u64)isize);
808 attr = g_attr_logical_length;
809 attr.val_ptr = &newsize;
810 osd_req_add_set_attr_list(or, &attr, 1);
811
812 /* if we are about to truncate an object, and it hasn't been
813 * created yet, wait
814 */
815 if (unlikely(wait_obj_created(oi)))
816 goto fail;
817
818 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
819 osd_end_request(or);
820 if (ret)
821 goto fail;
822
823out:
824 mark_inode_dirty(inode);
825 return;
826fail:
827 make_bad_inode(inode);
828 goto out;
829}
830
831/*
832 * Set inode attributes - just call generic functions.
833 */
834int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
835{
836 struct inode *inode = dentry->d_inode;
837 int error;
838
839 error = inode_change_ok(inode, iattr);
840 if (error)
841 return error;
842
843 error = inode_setattr(inode, iattr);
844 return error;
845}
846
847/*
848 * Read an inode from the OSD, and return it as is. We also return the size
849 * attribute in the 'sanity' argument if we got compiled with debugging turned
850 * on.
851 */
852static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
853 struct exofs_fcb *inode, uint64_t *sanity)
854{
855 struct exofs_sb_info *sbi = sb->s_fs_info;
856 struct osd_request *or;
857 struct osd_attr attr;
858 struct osd_obj_id obj = {sbi->s_pid,
859 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
860 int ret;
861
862 exofs_make_credential(oi->i_cred, &obj);
863
864 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
865 if (unlikely(!or)) {
866 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
867 return -ENOMEM;
868 }
869 osd_req_get_attributes(or, &obj);
870
871 /* we need the inode attribute */
872 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
873
874#ifdef EXOFS_DEBUG_OBJ_ISIZE
875 /* we get the size attributes to do a sanity check */
876 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
877#endif
878
879 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
880 if (ret)
881 goto out;
882
883 attr = g_attr_inode_data;
884 ret = extract_attr_from_req(or, &attr);
885 if (ret) {
886 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
887 goto out;
888 }
889
890 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
891 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
892
893#ifdef EXOFS_DEBUG_OBJ_ISIZE
894 attr = g_attr_logical_length;
895 ret = extract_attr_from_req(or, &attr);
896 if (ret) {
897 EXOFS_ERR("ERROR: extract attr from or failed\n");
898 goto out;
899 }
900 *sanity = get_unaligned_be64(attr.val_ptr);
901#endif
902
903out:
904 osd_end_request(or);
905 return ret;
906}
907
908/*
909 * Fill in an inode read from the OSD and set it up for use
910 */
911struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
912{
913 struct exofs_i_info *oi;
914 struct exofs_fcb fcb;
915 struct inode *inode;
916 uint64_t uninitialized_var(sanity);
917 int ret;
918
919 inode = iget_locked(sb, ino);
920 if (!inode)
921 return ERR_PTR(-ENOMEM);
922 if (!(inode->i_state & I_NEW))
923 return inode;
924 oi = exofs_i(inode);
925
926 /* read the inode from the osd */
927 ret = exofs_get_inode(sb, oi, &fcb, &sanity);
928 if (ret)
929 goto bad_inode;
930
931 init_waitqueue_head(&oi->i_wq);
932 set_obj_created(oi);
933
934 /* copy stuff from on-disk struct to in-memory struct */
935 inode->i_mode = le16_to_cpu(fcb.i_mode);
936 inode->i_uid = le32_to_cpu(fcb.i_uid);
937 inode->i_gid = le32_to_cpu(fcb.i_gid);
938 inode->i_nlink = le16_to_cpu(fcb.i_links_count);
939 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
940 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
941 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
942 inode->i_ctime.tv_nsec =
943 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
944 oi->i_commit_size = le64_to_cpu(fcb.i_size);
945 i_size_write(inode, oi->i_commit_size);
946 inode->i_blkbits = EXOFS_BLKSHIFT;
947 inode->i_generation = le32_to_cpu(fcb.i_generation);
948
949#ifdef EXOFS_DEBUG_OBJ_ISIZE
950 if ((inode->i_size != sanity) &&
951 (!exofs_inode_is_fast_symlink(inode))) {
952 EXOFS_ERR("WARNING: Size of object from inode and "
953 "attributes differ (%lld != %llu)\n",
954 inode->i_size, _LLU(sanity));
955 }
956#endif
957
958 oi->i_dir_start_lookup = 0;
959
960 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
961 ret = -ESTALE;
962 goto bad_inode;
963 }
964
965 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
966 if (fcb.i_data[0])
967 inode->i_rdev =
968 old_decode_dev(le32_to_cpu(fcb.i_data[0]));
969 else
970 inode->i_rdev =
971 new_decode_dev(le32_to_cpu(fcb.i_data[1]));
972 } else {
973 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
974 }
975
976 if (S_ISREG(inode->i_mode)) {
977 inode->i_op = &exofs_file_inode_operations;
978 inode->i_fop = &exofs_file_operations;
979 inode->i_mapping->a_ops = &exofs_aops;
980 } else if (S_ISDIR(inode->i_mode)) {
981 inode->i_op = &exofs_dir_inode_operations;
982 inode->i_fop = &exofs_dir_operations;
983 inode->i_mapping->a_ops = &exofs_aops;
984 } else if (S_ISLNK(inode->i_mode)) {
985 if (exofs_inode_is_fast_symlink(inode))
986 inode->i_op = &exofs_fast_symlink_inode_operations;
987 else {
988 inode->i_op = &exofs_symlink_inode_operations;
989 inode->i_mapping->a_ops = &exofs_aops;
990 }
991 } else {
992 inode->i_op = &exofs_special_inode_operations;
993 if (fcb.i_data[0])
994 init_special_inode(inode, inode->i_mode,
995 old_decode_dev(le32_to_cpu(fcb.i_data[0])));
996 else
997 init_special_inode(inode, inode->i_mode,
998 new_decode_dev(le32_to_cpu(fcb.i_data[1])));
999 }
1000
1001 unlock_new_inode(inode);
1002 return inode;
1003
1004bad_inode:
1005 iget_failed(inode);
1006 return ERR_PTR(ret);
1007}
1008
1009int __exofs_wait_obj_created(struct exofs_i_info *oi)
1010{
1011 if (!obj_created(oi)) {
1012 BUG_ON(!obj_2bcreated(oi));
1013 wait_event(oi->i_wq, obj_created(oi));
1014 }
1015 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1016}
1017/*
1018 * Callback function from exofs_new_inode(). The important thing is that we
1019 * set the obj_created flag so that other methods know that the object exists on
1020 * the OSD.
1021 */
1022static void create_done(struct osd_request *or, void *p)
1023{
1024 struct inode *inode = p;
1025 struct exofs_i_info *oi = exofs_i(inode);
1026 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1027 int ret;
1028
1029 ret = exofs_check_ok(or);
1030 osd_end_request(or);
1031 atomic_dec(&sbi->s_curr_pending);
1032
1033 if (unlikely(ret)) {
1034		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
1035			  _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
1036 make_bad_inode(inode);
1037 } else
1038 set_obj_created(oi);
1039
1040 atomic_dec(&inode->i_count);
1041 wake_up(&oi->i_wq);
1042}
1043
1044/*
1045 * Set up a new inode and create an object for it on the OSD
1046 */
1047struct inode *exofs_new_inode(struct inode *dir, int mode)
1048{
1049 struct super_block *sb;
1050 struct inode *inode;
1051 struct exofs_i_info *oi;
1052 struct exofs_sb_info *sbi;
1053 struct osd_request *or;
1054 struct osd_obj_id obj;
1055 int ret;
1056
1057 sb = dir->i_sb;
1058 inode = new_inode(sb);
1059 if (!inode)
1060 return ERR_PTR(-ENOMEM);
1061
1062 oi = exofs_i(inode);
1063
1064 init_waitqueue_head(&oi->i_wq);
1065 set_obj_2bcreated(oi);
1066
1067 sbi = sb->s_fs_info;
1068
1069 sb->s_dirt = 1;
1070 inode->i_uid = current->cred->fsuid;
1071 if (dir->i_mode & S_ISGID) {
1072 inode->i_gid = dir->i_gid;
1073 if (S_ISDIR(mode))
1074 mode |= S_ISGID;
1075 } else {
1076 inode->i_gid = current->cred->fsgid;
1077 }
1078 inode->i_mode = mode;
1079
1080 inode->i_ino = sbi->s_nextid++;
1081 inode->i_blkbits = EXOFS_BLKSHIFT;
1082 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1083 oi->i_commit_size = inode->i_size = 0;
1084 spin_lock(&sbi->s_next_gen_lock);
1085 inode->i_generation = sbi->s_next_generation++;
1086 spin_unlock(&sbi->s_next_gen_lock);
1087 insert_inode_hash(inode);
1088
1089 mark_inode_dirty(inode);
1090
1091 obj.partition = sbi->s_pid;
1092 obj.id = inode->i_ino + EXOFS_OBJ_OFF;
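	/* Inodes map 1:1 to OSD objects: the object id is the inode number
	 * offset by EXOFS_OBJ_OFF, keeping clear of reserved object ids.
	 */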
1093 exofs_make_credential(oi->i_cred, &obj);
1094
1095 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1096 if (unlikely(!or)) {
1097 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1098 return ERR_PTR(-ENOMEM);
1099 }
1100
1101 osd_req_create_object(or, &obj);
1102
1103 /* increment the refcount so that the inode will still be around when we
1104 * reach the callback
1105 */
1106 atomic_inc(&inode->i_count);
1107
1108 ret = exofs_async_op(or, create_done, inode, oi->i_cred);
1109 if (ret) {
1110 atomic_dec(&inode->i_count);
1111 osd_end_request(or);
1112 return ERR_PTR(-EIO);
1113 }
1114 atomic_inc(&sbi->s_curr_pending);
1115
1116 return inode;
1117}
1118
1119/*
1120 * struct to pass two arguments to update_inode's callback
1121 */
1122struct updatei_args {
1123 struct exofs_sb_info *sbi;
1124 struct exofs_fcb fcb;
1125};
1126
1127/*
1128 * Callback function from exofs_update_inode().
1129 */
1130static void updatei_done(struct osd_request *or, void *p)
1131{
1132 struct updatei_args *args = p;
1133
1134 osd_end_request(or);
1135
1136 atomic_dec(&args->sbi->s_curr_pending);
1137
1138 kfree(args);
1139}
1140
1141/*
1142 * Write the inode to the OSD. Just fill up the struct, and set the attribute
1143 * synchronously or asynchronously depending on the do_sync flag.
1144 */
1145static int exofs_update_inode(struct inode *inode, int do_sync)
1146{
1147 struct exofs_i_info *oi = exofs_i(inode);
1148 struct super_block *sb = inode->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1150 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1151 struct osd_request *or;
1152 struct osd_attr attr;
1153 struct exofs_fcb *fcb;
1154 struct updatei_args *args;
1155 int ret;
1156
1157 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args)
1159 return -ENOMEM;
1160
1161 fcb = &args->fcb;
1162
1163 fcb->i_mode = cpu_to_le16(inode->i_mode);
1164 fcb->i_uid = cpu_to_le32(inode->i_uid);
1165 fcb->i_gid = cpu_to_le32(inode->i_gid);
1166 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1167 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1168 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1169 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1170 oi->i_commit_size = i_size_read(inode);
1171 fcb->i_size = cpu_to_le64(oi->i_commit_size);
1172 fcb->i_generation = cpu_to_le32(inode->i_generation);
1173
1174 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1175 if (old_valid_dev(inode->i_rdev)) {
1176 fcb->i_data[0] =
1177 cpu_to_le32(old_encode_dev(inode->i_rdev));
1178 fcb->i_data[1] = 0;
1179 } else {
1180 fcb->i_data[0] = 0;
1181 fcb->i_data[1] =
1182 cpu_to_le32(new_encode_dev(inode->i_rdev));
1183 fcb->i_data[2] = 0;
1184 }
1185 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187
1188 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1189 if (unlikely(!or)) {
1190 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1191 ret = -ENOMEM;
1192 goto free_args;
1193 }
1194
1195 osd_req_set_attributes(or, &obj);
1196
1197 attr = g_attr_inode_data;
1198 attr.val_ptr = fcb;
1199 osd_req_add_set_attr_list(or, &attr, 1);
1200
1201 if (!obj_created(oi)) {
1202 EXOFS_DBGMSG("!obj_created\n");
1203 BUG_ON(!obj_2bcreated(oi));
1204 wait_event(oi->i_wq, obj_created(oi));
1205 EXOFS_DBGMSG("wait_event done\n");
1206 }
1207
1208 if (do_sync) {
1209 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1210 osd_end_request(or);
1211 goto free_args;
1212 } else {
1213 args->sbi = sbi;
1214
1215 ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1216 if (ret) {
1217 osd_end_request(or);
1218 goto free_args;
1219 }
1220 atomic_inc(&sbi->s_curr_pending);
1221 goto out; /* deallocation in updatei_done */
1222 }
1223
1224free_args:
1225 kfree(args);
1226out:
1227 EXOFS_DBGMSG("ret=>%d\n", ret);
1228 return ret;
1229}
1230
1231int exofs_write_inode(struct inode *inode, int wait)
1232{
1233 return exofs_update_inode(inode, wait);
1234}
1235
1236/*
1237 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1238 * do.
1239 */
1240static void delete_done(struct osd_request *or, void *p)
1241{
1242 struct exofs_sb_info *sbi;
1243 osd_end_request(or);
1244 sbi = p;
1245 atomic_dec(&sbi->s_curr_pending);
1246}
1247
1248/*
1249 * Called when the refcount of an inode reaches zero. We remove the object
1250 * from the OSD here. We make sure the object was created before we try and
1251 * delete it.
1252 */
1253void exofs_delete_inode(struct inode *inode)
1254{
1255 struct exofs_i_info *oi = exofs_i(inode);
1256 struct super_block *sb = inode->i_sb;
1257 struct exofs_sb_info *sbi = sb->s_fs_info;
1258 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1259 struct osd_request *or;
1260 int ret;
1261
1262 truncate_inode_pages(&inode->i_data, 0);
1263
1264 if (is_bad_inode(inode))
1265 goto no_delete;
1266
1267 mark_inode_dirty(inode);
1268 exofs_update_inode(inode, inode_needs_sync(inode));
1269
1270 inode->i_size = 0;
1271 if (inode->i_blocks)
1272 exofs_truncate(inode);
1273
1274 clear_inode(inode);
1275
1276 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1277 if (unlikely(!or)) {
1278 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
1279 return;
1280 }
1281
1282 osd_req_remove_object(or, &obj);
1283
1284 /* if we are deleting an obj that hasn't been created yet, wait */
1285 if (!obj_created(oi)) {
1286 BUG_ON(!obj_2bcreated(oi));
1287 wait_event(oi->i_wq, obj_created(oi));
1288 }
1289
1290 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
1291 if (ret) {
1292		EXOFS_ERR(
1293			"ERROR: exofs_delete_inode: exofs_async_op failed\n");
1294 osd_end_request(or);
1295 return;
1296 }
1297 atomic_inc(&sbi->s_curr_pending);
1298
1299 return;
1300
1301no_delete:
1302 clear_inode(inode);
1303}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
39{
40 int err = exofs_add_link(dentry, inode);
41 if (!err) {
42 d_instantiate(dentry, inode);
43 return 0;
44 }
45 inode_dec_link_count(inode);
46 iput(inode);
47 return err;
48}
49
50static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
51 struct nameidata *nd)
52{
53 struct inode *inode;
54 ino_t ino;
55
56 if (dentry->d_name.len > EXOFS_NAME_LEN)
57 return ERR_PTR(-ENAMETOOLONG);
58
59 ino = exofs_inode_by_name(dir, dentry);
60 inode = NULL;
61 if (ino) {
62 inode = exofs_iget(dir->i_sb, ino);
63 if (IS_ERR(inode))
64 return ERR_CAST(inode);
65 }
66 return d_splice_alias(inode, dentry);
67}
68
69static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
70 struct nameidata *nd)
71{
72 struct inode *inode = exofs_new_inode(dir, mode);
73 int err = PTR_ERR(inode);
74 if (!IS_ERR(inode)) {
75 inode->i_op = &exofs_file_inode_operations;
76 inode->i_fop = &exofs_file_operations;
77 inode->i_mapping->a_ops = &exofs_aops;
78 mark_inode_dirty(inode);
79 err = exofs_add_nondir(dentry, inode);
80 }
81 return err;
82}
83
84static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
85 dev_t rdev)
86{
87 struct inode *inode;
88 int err;
89
90 if (!new_valid_dev(rdev))
91 return -EINVAL;
92
93 inode = exofs_new_inode(dir, mode);
94 err = PTR_ERR(inode);
95 if (!IS_ERR(inode)) {
96 init_special_inode(inode, inode->i_mode, rdev);
97 mark_inode_dirty(inode);
98 err = exofs_add_nondir(dentry, inode);
99 }
100 return err;
101}
102
103static int exofs_symlink(struct inode *dir, struct dentry *dentry,
104 const char *symname)
105{
106 struct super_block *sb = dir->i_sb;
107 int err = -ENAMETOOLONG;
108 unsigned l = strlen(symname)+1;
109 struct inode *inode;
110 struct exofs_i_info *oi;
111
112 if (l > sb->s_blocksize)
113 goto out;
114
115 inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
116 err = PTR_ERR(inode);
117 if (IS_ERR(inode))
118 goto out;
119
120 oi = exofs_i(inode);
121 if (l > sizeof(oi->i_data)) {
122 /* slow symlink */
123 inode->i_op = &exofs_symlink_inode_operations;
124 inode->i_mapping->a_ops = &exofs_aops;
125 memset(oi->i_data, 0, sizeof(oi->i_data));
126
127 err = page_symlink(inode, symname, l);
128 if (err)
129 goto out_fail;
130 } else {
131 /* fast symlink */
132 inode->i_op = &exofs_fast_symlink_inode_operations;
133 memcpy(oi->i_data, symname, l);
134 inode->i_size = l-1;
135 }
136 mark_inode_dirty(inode);
137
138 err = exofs_add_nondir(dentry, inode);
139out:
140 return err;
141
142out_fail:
143 inode_dec_link_count(inode);
144 iput(inode);
145 goto out;
146}
147
148static int exofs_link(struct dentry *old_dentry, struct inode *dir,
149 struct dentry *dentry)
150{
151 struct inode *inode = old_dentry->d_inode;
152
153 if (inode->i_nlink >= EXOFS_LINK_MAX)
154 return -EMLINK;
155
156 inode->i_ctime = CURRENT_TIME;
157 inode_inc_link_count(inode);
158 atomic_inc(&inode->i_count);
159
160 return exofs_add_nondir(dentry, inode);
161}
162
163static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
164{
165 struct inode *inode;
166 int err = -EMLINK;
167
168 if (dir->i_nlink >= EXOFS_LINK_MAX)
169 goto out;
170
171 inode_inc_link_count(dir);
172
173 inode = exofs_new_inode(dir, S_IFDIR | mode);
174 err = PTR_ERR(inode);
175 if (IS_ERR(inode))
176 goto out_dir;
177
178 inode->i_op = &exofs_dir_inode_operations;
179 inode->i_fop = &exofs_dir_operations;
180 inode->i_mapping->a_ops = &exofs_aops;
181
182 inode_inc_link_count(inode);
183
184 err = exofs_make_empty(inode, dir);
185 if (err)
186 goto out_fail;
187
188 err = exofs_add_link(dentry, inode);
189 if (err)
190 goto out_fail;
191
192 d_instantiate(dentry, inode);
193out:
194 return err;
195
196out_fail:
197 inode_dec_link_count(inode);
198 inode_dec_link_count(inode);
199 iput(inode);
200out_dir:
201 inode_dec_link_count(dir);
202 goto out;
203}
204
205static int exofs_unlink(struct inode *dir, struct dentry *dentry)
206{
207 struct inode *inode = dentry->d_inode;
208 struct exofs_dir_entry *de;
209 struct page *page;
210 int err = -ENOENT;
211
212 de = exofs_find_entry(dir, dentry, &page);
213 if (!de)
214 goto out;
215
216 err = exofs_delete_entry(de, page);
217 if (err)
218 goto out;
219
220 inode->i_ctime = dir->i_ctime;
221 inode_dec_link_count(inode);
222 err = 0;
223out:
224 return err;
225}
226
227static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
228{
229 struct inode *inode = dentry->d_inode;
230 int err = -ENOTEMPTY;
231
232 if (exofs_empty_dir(inode)) {
233 err = exofs_unlink(dir, dentry);
234 if (!err) {
235 inode->i_size = 0;
236 inode_dec_link_count(inode);
237 inode_dec_link_count(dir);
238 }
239 }
240 return err;
241}
242
243static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
244 struct inode *new_dir, struct dentry *new_dentry)
245{
246 struct inode *old_inode = old_dentry->d_inode;
247 struct inode *new_inode = new_dentry->d_inode;
248 struct page *dir_page = NULL;
249 struct exofs_dir_entry *dir_de = NULL;
250 struct page *old_page;
251 struct exofs_dir_entry *old_de;
252 int err = -ENOENT;
253
254 old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
255 if (!old_de)
256 goto out;
257
258 if (S_ISDIR(old_inode->i_mode)) {
259 err = -EIO;
260 dir_de = exofs_dotdot(old_inode, &dir_page);
261 if (!dir_de)
262 goto out_old;
263 }
264
265 if (new_inode) {
266 struct page *new_page;
267 struct exofs_dir_entry *new_de;
268
269 err = -ENOTEMPTY;
270 if (dir_de && !exofs_empty_dir(new_inode))
271 goto out_dir;
272
273 err = -ENOENT;
274 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
275 if (!new_de)
276 goto out_dir;
277 inode_inc_link_count(old_inode);
278 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
279 new_inode->i_ctime = CURRENT_TIME;
280 if (dir_de)
281 drop_nlink(new_inode);
282 inode_dec_link_count(new_inode);
283 if (err)
284 goto out_dir;
285 } else {
286 if (dir_de) {
287 err = -EMLINK;
288 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
289 goto out_dir;
290 }
291 inode_inc_link_count(old_inode);
292 err = exofs_add_link(new_dentry, old_inode);
293 if (err) {
294 inode_dec_link_count(old_inode);
295 goto out_dir;
296 }
297 if (dir_de)
298 inode_inc_link_count(new_dir);
299 }
300
301 old_inode->i_ctime = CURRENT_TIME;
302
303 exofs_delete_entry(old_de, old_page);
304 inode_dec_link_count(old_inode);
305
306 if (dir_de) {
307 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
308 inode_dec_link_count(old_dir);
309 if (err)
310 goto out_dir;
311 }
312 return 0;
313
314
315out_dir:
316 if (dir_de) {
317 kunmap(dir_page);
318 page_cache_release(dir_page);
319 }
320out_old:
321 kunmap(old_page);
322 page_cache_release(old_page);
323out:
324 return err;
325}
326
327const struct inode_operations exofs_dir_inode_operations = {
328 .create = exofs_create,
329 .lookup = exofs_lookup,
330 .link = exofs_link,
331 .unlink = exofs_unlink,
332 .symlink = exofs_symlink,
333 .mkdir = exofs_mkdir,
334 .rmdir = exofs_rmdir,
335 .mknod = exofs_mknod,
336 .rename = exofs_rename,
337 .setattr = exofs_setattr,
338};
339
340const struct inode_operations exofs_special_inode_operations = {
341 .setattr = exofs_setattr,
342};
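
The directory operations above are a close transliteration of ext2's: each create-style entry point allocates an inode, wires up the operation vectors, and hands off to exofs_add_nondir(), which drops the link count and the inode reference if the directory entry cannot be written. One user-visible consequence of the EXOFS_LINK_MAX checks in exofs_link() and exofs_mkdir() is that the limit surfaces as EMLINK; a hypothetical userspace sketch (paths invented for illustration):

	/* Hypothetical: hitting EXOFS_LINK_MAX is reported as EMLINK. */
	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		if (link("/mnt/exofs/a", "/mnt/exofs/b") != 0 && errno == EMLINK)
			fprintf(stderr, "hard-link limit reached\n");
		return 0;
	}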
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * This file is part of exofs.
10 *
11 * exofs is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation. Since it is based on ext2, and the only
14 * valid version of GPL for the Linux kernel is version 2, the only valid
15 * version of GPL for exofs is version 2.
16 *
17 * exofs is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with exofs; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <scsi/scsi_device.h>
28#include <scsi/osd_sense.h>
29
30#include "exofs.h"
31
32int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
33{
34 struct osd_sense_info osi;
35 int ret = osd_req_decode_sense(or, &osi);
36
37 if (ret) { /* translate to Linux codes */
38 if (osi.additional_code == scsi_invalid_field_in_cdb) {
39 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
40 ret = -EFAULT;
41			else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
42 ret = -ENOENT;
43 else
44 ret = -EINVAL;
45 } else if (osi.additional_code == osd_quota_error)
46 ret = -ENOSPC;
47 else
48 ret = -EIO;
49 }
50
51	/* FIXME: should be included in osd_sense_info */
52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0;
54
55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0;
57
58 return ret;
59}
60
61void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
62{
63 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
64}
65
66/*
67 * Perform a synchronous OSD operation.
68 */
69int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
70{
71 int ret;
72
73 or->timeout = timeout;
74 ret = osd_finalize_request(or, 0, credential, NULL);
75 if (ret) {
76		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
77 return ret;
78 }
79
80 ret = osd_execute_request(or);
81
82 if (ret)
83 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
84 /* osd_req_decode_sense(or, ret); */
85 return ret;
86}
87
88/*
89 * Perform an asynchronous OSD operation.
90 */
91int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
92 void *caller_context, u8 *cred)
93{
94 int ret;
95
96 ret = osd_finalize_request(or, 0, cred, NULL);
97 if (ret) {
98		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
99 return ret;
100 }
101
102 ret = osd_execute_request_async(or, async_done, caller_context);
103
104 if (ret)
105 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
106 return ret;
107}
108
109int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
110{
111 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
112 void *iter = NULL;
113 int nelem;
114
115 do {
116 nelem = 1;
117 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
118 if ((cur_attr.attr_page == attr->attr_page) &&
119 (cur_attr.attr_id == attr->attr_id)) {
120 attr->len = cur_attr.len;
121 attr->val_ptr = cur_attr.val_ptr;
122 return 0;
123 }
124 } while (iter);
125
126 return -EIO;
127}
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
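
Taken together these helpers define exofs' synchronous I/O idiom: start an OSD request against the device, encode the operation, then finalize with the object credential and execute. A minimal sketch of how a caller composes them for a one-shot object read (the helper name is invented; exofs_fill_super() in super.c below follows exactly this shape):

	/* Sketch only: synchronous read of one object into a kernel buffer. */
	static int exofs_read_obj_sync(struct exofs_sb_info *sbi,
				       const struct osd_obj_id *obj,
				       void *buf, u64 len)
	{
		struct osd_request *or;
		int ret;

		or = osd_start_request(sbi->s_dev, GFP_KERNEL);
		if (unlikely(!or))
			return -ENOMEM;

		/* queue the read, then finalize + execute with the credential */
		ret = osd_req_read_kern(or, obj, 0, buf, len);
		if (!ret)
			ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);

		osd_end_request(or);
		return ret;
	}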
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/string.h>
37#include <linux/parser.h>
38#include <linux/vfs.h>
39#include <linux/random.h>
40#include <linux/exportfs.h>
41
42#include "exofs.h"
43
44/******************************************************************************
45 * MOUNT OPTIONS
46 *****************************************************************************/
47
48/*
49 * struct to hold what we get from mount options
50 */
51struct exofs_mountopt {
52 const char *dev_name;
53 uint64_t pid;
54 int timeout;
55};
56
57/*
58 * exofs-specific mount-time options.
59 */
60enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
61
62/*
63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
64 * kernel's parsing functions do not currently support that. 32-bit should be
65 * sufficient for most applications now.
66 */
67static match_table_t tokens = {
68 {Opt_pid, "pid=%u"},
69 {Opt_to, "to=%u"},
70 {Opt_err, NULL}
71};
72
73/*
74 * The main option parsing method. Also makes sure that all of the mandatory
75 * mount options were set.
76 */
77static int parse_options(char *options, struct exofs_mountopt *opts)
78{
79 char *p;
80 substring_t args[MAX_OPT_ARGS];
81 int option;
82 bool s_pid = false;
83
84 EXOFS_DBGMSG("parse_options %s\n", options);
85 /* defaults */
86 memset(opts, 0, sizeof(*opts));
87 opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 char str[32];
92
93 if (!*p)
94 continue;
95
96 token = match_token(p, tokens, args);
97 switch (token) {
98 case Opt_pid:
99 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
100 return -EINVAL;
101 opts->pid = simple_strtoull(str, NULL, 0);
102 if (opts->pid < EXOFS_MIN_PID) {
103 EXOFS_ERR("Partition ID must be >= %u",
104 EXOFS_MIN_PID);
105 return -EINVAL;
106 }
107			s_pid = true;
108 break;
109 case Opt_to:
110 if (match_int(&args[0], &option))
111 return -EINVAL;
112 if (option <= 0) {
113				EXOFS_ERR("Timeout must be > 0");
114 return -EINVAL;
115 }
116 opts->timeout = option * HZ;
117 break;
118 }
119 }
120
121 if (!s_pid) {
122 EXOFS_ERR("Need to specify the following options:\n");
123 EXOFS_ERR(" -o pid=pid_no_to_use\n");
124 return -EINVAL;
125 }
126
127 return 0;
128}
129
130/******************************************************************************
131 * INODE CACHE
132 *****************************************************************************/
133
134/*
135 * Our inode cache. Isn't it pretty?
136 */
137static struct kmem_cache *exofs_inode_cachep;
138
139/*
140 * Allocate an inode in the cache
141 */
142static struct inode *exofs_alloc_inode(struct super_block *sb)
143{
144 struct exofs_i_info *oi;
145
146 oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
147 if (!oi)
148 return NULL;
149
150 oi->vfs_inode.i_version = 1;
151 return &oi->vfs_inode;
152}
153
154/*
155 * Remove an inode from the cache
156 */
157static void exofs_destroy_inode(struct inode *inode)
158{
159 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
160}
161
162/*
163 * Initialize the inode
164 */
165static void exofs_init_once(void *foo)
166{
167 struct exofs_i_info *oi = foo;
168
169 inode_init_once(&oi->vfs_inode);
170}
171
172/*
173 * Create and initialize the inode cache
174 */
175static int init_inodecache(void)
176{
177 exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
178 sizeof(struct exofs_i_info), 0,
179 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
180 exofs_init_once);
181 if (exofs_inode_cachep == NULL)
182 return -ENOMEM;
183 return 0;
184}
185
186/*
187 * Destroy the inode cache
188 */
189static void destroy_inodecache(void)
190{
191 kmem_cache_destroy(exofs_inode_cachep);
192}
193
194/******************************************************************************
195 * SUPERBLOCK FUNCTIONS
196 *****************************************************************************/
197static const struct super_operations exofs_sops;
198static const struct export_operations exofs_export_ops;
199
200/*
201 * Write the superblock to the OSD
202 */
203static void exofs_write_super(struct super_block *sb)
204{
205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb;
207 struct osd_request *or;
208 struct osd_obj_id obj;
209 int ret;
210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return;
215 }
216
217 lock_kernel();
218 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0;
223
224 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
225 if (unlikely(!or)) {
226 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
227 goto out;
228 }
229
230 obj.partition = sbi->s_pid;
231 obj.id = EXOFS_SUPER_ID;
232 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
233 if (unlikely(ret)) {
234 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
235 goto out;
236 }
237
238 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
239 if (unlikely(ret)) {
240 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
241 goto out;
242 }
243 sb->s_dirt = 0;
244
245out:
246 if (or)
247 osd_end_request(or);
248 unlock_kernel();
249 kfree(fscb);
250}
251
252/*
253 * This function is called when the vfs is freeing the superblock. We just
254 * need to free our own part.
255 */
256static void exofs_put_super(struct super_block *sb)
257{
258 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info;
260
261 /* make sure there are no pending commands */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) {
264 wait_queue_head_t wq;
265 init_waitqueue_head(&wq);
266 wait_event_timeout(wq,
267 (atomic_read(&sbi->s_curr_pending) == 0),
268 msecs_to_jiffies(100));
269 }
270
271 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL;
274}
275
276/*
277 * Read the superblock from the OSD and fill in the fields
278 */
279static int exofs_fill_super(struct super_block *sb, void *data, int silent)
280{
281 struct inode *root;
282 struct exofs_mountopt *opts = data;
283	struct exofs_sb_info *sbi;	/* extended info */
284	struct exofs_fscb fscb;		/* on-disk superblock info */
285 struct osd_request *or = NULL;
286 struct osd_obj_id obj;
287 int ret;
288
289 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
290 if (!sbi)
291 return -ENOMEM;
292 sb->s_fs_info = sbi;
293
294 /* use mount options to fill superblock */
295 sbi->s_dev = osduld_path_lookup(opts->dev_name);
296 if (IS_ERR(sbi->s_dev)) {
297 ret = PTR_ERR(sbi->s_dev);
298 sbi->s_dev = NULL;
299 goto free_sbi;
300 }
301
302 sbi->s_pid = opts->pid;
303 sbi->s_timeout = opts->timeout;
304
305 /* fill in some other data by hand */
306 memset(sb->s_id, 0, sizeof(sb->s_id));
307 strcpy(sb->s_id, "exofs");
308 sb->s_blocksize = EXOFS_BLKSIZE;
309 sb->s_blocksize_bits = EXOFS_BLKSHIFT;
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 atomic_set(&sbi->s_curr_pending, 0);
312 sb->s_bdev = NULL;
313 sb->s_dev = 0;
314
315 /* read data from on-disk superblock object */
316 obj.partition = sbi->s_pid;
317 obj.id = EXOFS_SUPER_ID;
318 exofs_make_credential(sbi->s_cred, &obj);
319
320 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
321 if (unlikely(!or)) {
322 if (!silent)
323 EXOFS_ERR(
324 "exofs_fill_super: osd_start_request failed.\n");
325 ret = -ENOMEM;
326 goto free_sbi;
327 }
328 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
329 if (unlikely(ret)) {
330 if (!silent)
331 EXOFS_ERR(
332 "exofs_fill_super: osd_req_read_kern failed.\n");
333 ret = -ENOMEM;
334 goto free_sbi;
335 }
336
337 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
338 if (unlikely(ret)) {
339 if (!silent)
340 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
341 ret = -EIO;
342 goto free_sbi;
343 }
344
345 sb->s_magic = le16_to_cpu(fscb.s_magic);
346 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
347 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
348
349 /* make sure what we read from the object store is correct */
350 if (sb->s_magic != EXOFS_SUPER_MAGIC) {
351 if (!silent)
352 EXOFS_ERR("ERROR: Bad magic value\n");
353 ret = -EINVAL;
354 goto free_sbi;
355 }
356
357 /* start generation numbers from a random point */
358 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
359 spin_lock_init(&sbi->s_next_gen_lock);
360
361 /* set up operation vectors */
362 sb->s_op = &exofs_sops;
363 sb->s_export_op = &exofs_export_ops;
364 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
365 if (IS_ERR(root)) {
366 EXOFS_ERR("ERROR: exofs_iget failed\n");
367 ret = PTR_ERR(root);
368 goto free_sbi;
369 }
370 sb->s_root = d_alloc_root(root);
371 if (!sb->s_root) {
372 iput(root);
373 EXOFS_ERR("ERROR: get root inode failed\n");
374 ret = -ENOMEM;
375 goto free_sbi;
376 }
377
378 if (!S_ISDIR(root->i_mode)) {
379 dput(sb->s_root);
380 sb->s_root = NULL;
381 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
382 root->i_mode);
383 ret = -EINVAL;
384 goto free_sbi;
385 }
386
387 ret = 0;
388out:
389 if (or)
390 osd_end_request(or);
391 return ret;
392
393free_sbi:
394 osduld_put_device(sbi->s_dev); /* NULL safe */
395 kfree(sbi);
396 goto out;
397}
398
399/*
400 * Set up the superblock (calls exofs_fill_super eventually)
401 */
402static int exofs_get_sb(struct file_system_type *type,
403 int flags, const char *dev_name,
404 void *data, struct vfsmount *mnt)
405{
406 struct exofs_mountopt opts;
407 int ret;
408
409 ret = parse_options(data, &opts);
410 if (ret)
411 return ret;
412
413 opts.dev_name = dev_name;
414 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
415}
416
417/*
418 * Return information about the file system state in the buffer. This is used
419 * by the 'df' command, for example.
420 */
421static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
422{
423 struct super_block *sb = dentry->d_sb;
424 struct exofs_sb_info *sbi = sb->s_fs_info;
425 struct osd_obj_id obj = {sbi->s_pid, 0};
426 struct osd_attr attrs[] = {
427 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
428 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
429 ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
430 OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
431 };
432 uint64_t capacity = ULLONG_MAX;
433 uint64_t used = ULLONG_MAX;
434 struct osd_request *or;
435 uint8_t cred_a[OSD_CAP_LEN];
436 int ret;
437
438 /* get used/capacity attributes */
439 exofs_make_credential(cred_a, &obj);
440
441 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
442 if (unlikely(!or)) {
443 EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
444 return -ENOMEM;
445 }
446
447 osd_req_get_attributes(or, &obj);
448 osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
449 ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
450 if (unlikely(ret))
451 goto out;
452
453 ret = extract_attr_from_req(or, &attrs[0]);
454 if (likely(!ret))
455 capacity = get_unaligned_be64(attrs[0].val_ptr);
456 else
457 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
458
459 ret = extract_attr_from_req(or, &attrs[1]);
460 if (likely(!ret))
461 used = get_unaligned_be64(attrs[1].val_ptr);
462 else
463 EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
464
465 /* fill in the stats buffer */
466 buf->f_type = EXOFS_SUPER_MAGIC;
467 buf->f_bsize = EXOFS_BLKSIZE;
468 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
469 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
470 buf->f_bavail = buf->f_bfree;
471 buf->f_files = sbi->s_numfiles;
472 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
473 buf->f_namelen = EXOFS_NAME_LEN;
474
475out:
476 osd_end_request(or);
477 return ret;
478}
479
480static const struct super_operations exofs_sops = {
481 .alloc_inode = exofs_alloc_inode,
482 .destroy_inode = exofs_destroy_inode,
483 .write_inode = exofs_write_inode,
484 .delete_inode = exofs_delete_inode,
485 .put_super = exofs_put_super,
486 .write_super = exofs_write_super,
487 .statfs = exofs_statfs,
488};
489
490/******************************************************************************
491 * EXPORT OPERATIONS
492 *****************************************************************************/
493
494struct dentry *exofs_get_parent(struct dentry *child)
495{
496 unsigned long ino = exofs_parent_ino(child);
497
498 if (!ino)
499 return NULL;
500
501 return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
502}
503
504static struct inode *exofs_nfs_get_inode(struct super_block *sb,
505 u64 ino, u32 generation)
506{
507 struct inode *inode;
508
509 inode = exofs_iget(sb, ino);
510 if (IS_ERR(inode))
511 return ERR_CAST(inode);
512 if (generation && inode->i_generation != generation) {
513 /* we didn't find the right inode.. */
514 iput(inode);
515 return ERR_PTR(-ESTALE);
516 }
517 return inode;
518}
519
520static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
521 struct fid *fid, int fh_len, int fh_type)
522{
523 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
524 exofs_nfs_get_inode);
525}
526
527static struct dentry *exofs_fh_to_parent(struct super_block *sb,
528 struct fid *fid, int fh_len, int fh_type)
529{
530 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
531 exofs_nfs_get_inode);
532}
533
534static const struct export_operations exofs_export_ops = {
535 .fh_to_dentry = exofs_fh_to_dentry,
536 .fh_to_parent = exofs_fh_to_parent,
537 .get_parent = exofs_get_parent,
538};
539
540/******************************************************************************
541 * INSMOD/RMMOD
542 *****************************************************************************/
543
544/*
545 * struct that describes this file system
546 */
547static struct file_system_type exofs_type = {
548 .owner = THIS_MODULE,
549 .name = "exofs",
550 .get_sb = exofs_get_sb,
551 .kill_sb = generic_shutdown_super,
552};
553
554static int __init init_exofs(void)
555{
556 int err;
557
558 err = init_inodecache();
559 if (err)
560 goto out;
561
562 err = register_filesystem(&exofs_type);
563 if (err)
564 goto out_d;
565
566 return 0;
567out_d:
568 destroy_inodecache();
569out:
570 return err;
571}
572
573static void __exit exit_exofs(void)
574{
575 unregister_filesystem(&exofs_type);
576 destroy_inodecache();
577}
578
579MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
580MODULE_DESCRIPTION("exofs");
581MODULE_LICENSE("GPL");
582
583module_init(init_exofs)
584module_exit(exit_exofs)
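
Per parse_options() above, pid= is the only mandatory option (it must be at least EXOFS_MIN_PID) and to= overrides the default request timeout in seconds. The dev_name handed to exofs_get_sb() is resolved through osduld_path_lookup(), i.e. it names an OSD ULD character device. A hypothetical mount from userspace (device path and partition id are examples only):

	/* Hypothetical userspace sketch: mount exofs on OSD partition 65536. */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		if (mount("/dev/osd0", "/mnt/exofs", "exofs", 0,
			  "pid=65536,to=30") != 0)
			perror("mount exofs");
		return 0;
	}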
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/namei.h>
37
38#include "exofs.h"
39
40static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
41{
42 struct exofs_i_info *oi = exofs_i(dentry->d_inode);
43
44 nd_set_link(nd, (char *)oi->i_data);
45 return NULL;
46}
47
48const struct inode_operations exofs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52};
53
54const struct inode_operations exofs_fast_symlink_inode_operations = {
55 .readlink = generic_readlink,
56 .follow_link = exofs_follow_link,
57};
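
As in ext2, the target of a "fast" symlink lives in the inode's own data area and is handed straight to nd_set_link() by exofs_follow_link(), while longer targets are stored like file data and resolved through page_follow_link_light(). The distinction is invisible to userspace; a hypothetical check:

	/* Hypothetical: fast and slow exofs symlinks resolve identically. */
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];
		ssize_t n = readlink("/mnt/exofs/some_link", buf, sizeof(buf) - 1);

		if (n >= 0) {
			buf[n] = '\0';
			printf("-> %s\n", buf);
		}
		return 0;
	}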
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
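
This is one instance of a substitution repeated in ext3, ext4 and fat below: open-coded reads of current->fs->umask become the new current_umask() helper, keeping umask access behind a single accessor. To a first approximation the helper is just the old expression wrapped in a function (paraphrased, not part of this diff):

	/* Rough shape of the new accessor (paraphrased): */
	static inline int current_umask(void)
	{
		return current->fs->umask;
	}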
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem did not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 historically
+	  defaulted to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.ioctl		= ext3_ioctl,		/* BKL held */
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+		filemap_flush(inode->i_mapping);
+		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.ioctl		= ext3_ioctl,
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4a09ff169870..466a332e0bd1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
 	}
 write_begin_failed:
 	if (ret) {
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size. Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes.
 		 */
 		if (pos + len > inode->i_size)
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
 			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
 		mark_inode_dirty(inode);
 	}
-
-	return copied;
 }
 
 /*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
 	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes. So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
-
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
+	int ret;
 
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
-		page_zero_new_buffers(page, from+copied, to);
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
 	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
-}
-
 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 {
 	return !buffer_mapped(bh);
 }
+
 /*
  * Note that we always start a transaction even if we're not journalling
  * data. This is to preserve ordering: any hole instantiation within
@@ -2354,6 +2363,9 @@ void ext3_truncate(struct inode *inode)
 	if (!ext3_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
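
The three write_end variants now share one recovery protocol for short copies: account only the bytes actually copied via update_file_sizes(), put the inode on the orphan list while allocated blocks may extend past i_size, and trim the excess only after the handle is closed (vmtruncate() may itself need a transaction, so it must not run under the live handle). Schematically, using only functions from the hunks above:

	/* Shared epilogue of the ordered/writeback write_end paths above. */
	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	update_file_sizes(inode, pos, copied);
	if (pos + len > inode->i_size)
		ext3_orphan_add(handle, inode);	/* crash-safe until truncated */
	ret = ext3_journal_stop(handle);
	unlock_page(page);
	page_cache_release(page);
	if (pos + len > inode->i_size)
		vmtruncate(inode, inode->i_size);	/* trim past-EOF blocks */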
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto flags_out;
-		}
-
-		if (get_user(flags, (int __user *) arg)) {
-			err = -EFAULT;
-			goto flags_out;
-		}
-
 		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
+
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			err = -EPERM;
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
 			goto flags_out;
-		}
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
-			}
 		}
 
 		/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
-			}
 		}
 
-
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
 			err = PTR_ERR(handle);
 			goto flags_out;
 		}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
-		mutex_unlock(&inode->i_mutex);
 flags_out:
+		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
@@ -140,6 +125,7 @@ flags_out:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
@@ -147,6 +133,7 @@ flags_out:
 			err = -EFAULT;
 			goto setversion_out;
 		}
+
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
 #ifdef CONFIG_COMPAT
 long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
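
This is the standard BKL-removal shape: the handler moves from the old .ioctl slot (entered with the big kernel lock held and the inode passed in) to .unlocked_ioctl, deriving the inode from the file itself and serializing flag updates on i_mutex alone; the error paths are reworked so the mutex is dropped exactly once at flags_out. The general conversion pattern, as an illustration (foo_ioctl is a made-up name):

	/* Old form: int foo_ioctl(struct inode *inode, struct file *filp,
	 *                        unsigned int cmd, unsigned long arg);  BKL held.
	 * New form: */
	long foo_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
	{
		struct inode *inode = filp->f_dentry->d_inode;

		/* ...per-command work, taking only the locks it needs... */
		return 0;
	}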
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e2fc63cbba8b..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
 			       struct dx_frame *frame,
 			       int *err);
 static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
+	while ((char *) de < base + blocksize)
 	{
 		if (de->name_len && de->inode) {
 			ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 			return ERR_PTR(-EIO);
 		}
 		inode = ext3_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext3_error(dir->i_sb, __func__,
+					   "deleted inode referenced: %lu",
+					   ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
 {
-	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char *)de < base + blocksize) {
 		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2265,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct inode * old_inode, * new_inode;
 	struct buffer_head * old_bh, * new_bh, * dir_bh;
 	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval;
+	int retval, flush_file = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
@@ -2401,6 +2410,8 @@
 			ext3_mark_inode_dirty(handle, new_inode);
 			if (!new_inode->i_nlink)
 				ext3_orphan_add(handle, new_inode);
+			if (ext3_should_writeback_data(new_inode))
+				flush_file = 1;
 		}
 		retval = 0;
 
@@ -2409,6 +2420,8 @@ end_rename:
 	brelse (old_bh);
 	brelse (new_bh);
 	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
 	return retval;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			   cope, else JOURNAL_DATA */
 			if (journal_check_available_features
 			    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-				set_opt(sbi->s_mount_opt, ORDERED_DATA);
+				set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 			else
 				set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 			break;
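
The single call site can pick up either mode because ext3's set_opt() macro token-pastes its argument onto the EXT3_MOUNT_ prefix; roughly (paraphrased from ext3's headers, not part of this diff):

	#define set_opt(o, opt)		o |= EXT3_MOUNT_##opt

	/* So set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE) expands to
	 * sbi->s_mount_opt |= EXT3_MOUNT_DEFAULT_DATA_MODE, which the
	 * #ifdef above resolves to either EXT3_MOUNT_ORDERED_DATA or
	 * EXT3_MOUNT_WRITEBACK_DATA. */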
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e00..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
523 523
524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) 524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
525{ 525{
526 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 526 struct super_block *sb = dentry->d_sb;
527 struct msdos_sb_info *sbi = MSDOS_SB(sb);
528 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
527 529
528 /* If the count of free cluster is still unknown, counts it here. */ 530 /* If the count of free cluster is still unknown, counts it here. */
529 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { 531 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
537 buf->f_blocks = sbi->max_cluster - FAT_START_ENT; 539 buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
538 buf->f_bfree = sbi->free_clusters; 540 buf->f_bfree = sbi->free_clusters;
539 buf->f_bavail = sbi->free_clusters; 541 buf->f_bavail = sbi->free_clusters;
542 buf->f_fsid.val[0] = (u32)id;
543 buf->f_fsid.val[1] = (u32)(id >> 32);
540 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 544 buf->f_namelen = sbi->options.isvfat ? 260 : 12;
541 545
542 return 0; 546 return 0;
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
930 934
931 opts->fs_uid = current_uid(); 935 opts->fs_uid = current_uid();
932 opts->fs_gid = current_gid(); 936 opts->fs_gid = current_gid();
 933 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 937 opts->fs_fmask = opts->fs_dmask = current_umask();
934 opts->allow_utime = -1; 938 opts->allow_utime = -1;
935 opts->codepage = fat_default_codepage; 939 opts->codepage = fat_default_codepage;
936 opts->iocharset = fat_default_iocharset; 940 opts->iocharset = fat_default_iocharset;
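The two f_fsid lines added above pack the 64-bit value from huge_encode_dev() into statfs's pair of 32-bit words. The same packing, runnable in userspace (the id value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000001200000034ULL;	/* hypothetical encoded dev_t */
	uint32_t val0 = (uint32_t)id;		/* low 32 bits  -> f_fsid.val[0] */
	uint32_t val1 = (uint32_t)(id >> 32);	/* high 32 bits -> f_fsid.val[1] */

	printf("val[0]=%#x val[1]=%#x\n", val0, val1);
	return 0;
}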
diff --git a/fs/file_table.c b/fs/file_table.c
index b74a8e1da913..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
169 fmode_t mode, const struct file_operations *fop) 169 fmode_t mode, const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 struct path;
173 172
174 file = get_empty_filp(); 173 file = get_empty_filp();
175 if (!file) 174 if (!file)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e3fe9918faaf..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
196 struct inode *tail_inode; 196 struct inode *tail_inode;
197 197
198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
199 if (!time_after_eq(inode->dirtied_when, 199 if (time_before(inode->dirtied_when,
200 tail_inode->dirtied_when)) 200 tail_inode->dirtied_when))
201 inode->dirtied_when = jiffies; 201 inode->dirtied_when = jiffies;
202 } 202 }
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
220 wake_up_bit(&inode->i_state, __I_SYNC); 220 wake_up_bit(&inode->i_state, __I_SYNC);
221} 221}
222 222
223static bool inode_dirtied_after(struct inode *inode, unsigned long t)
224{
225 bool ret = time_after(inode->dirtied_when, t);
226#ifndef CONFIG_64BIT
227 /*
228 * For inodes being constantly redirtied, dirtied_when can get stuck.
229 * It _appears_ to be in the future, but is actually in distant past.
230 * This test is necessary to prevent such wrapped-around relative times
231 * from permanently stopping the whole pdflush writeback.
232 */
233 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
234#endif
235 return ret;
236}
237
223/* 238/*
224 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 239 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
225 */ 240 */
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
231 struct inode *inode = list_entry(delaying_queue->prev, 246 struct inode *inode = list_entry(delaying_queue->prev,
232 struct inode, i_list); 247 struct inode, i_list);
233 if (older_than_this && 248 if (older_than_this &&
234 time_after(inode->dirtied_when, *older_than_this)) 249 inode_dirtied_after(inode, *older_than_this))
235 break; 250 break;
236 list_move(&inode->i_list, dispatch_queue); 251 list_move(&inode->i_list, dispatch_queue);
237 } 252 }
@@ -420,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
420 * If older_than_this is non-NULL, then only write out inodes which 435 * If older_than_this is non-NULL, then only write out inodes which
421 * had their first dirtying at a time earlier than *older_than_this. 436 * had their first dirtying at a time earlier than *older_than_this.
422 * 437 *
423 * If we're a pdlfush thread, then implement pdflush collision avoidance 438 * If we're a pdflush thread, then implement pdflush collision avoidance
424 * against the entire list. 439 * against the entire list.
425 * 440 *
426 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 441 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
492 continue; /* blockdev has wrong queue */ 507 continue; /* blockdev has wrong queue */
493 } 508 }
494 509
495 /* Was this inode dirtied after sync_sb_inodes was called? */ 510 /*
496 if (time_after(inode->dirtied_when, start)) 511 * Was this inode dirtied after sync_sb_inodes was called?
512 * This keeps sync from extra jobs and livelock.
513 */
514 if (inode_dirtied_after(inode, start))
497 break; 515 break;
498 516
499 /* Is another pdflush already flushing this queue? */ 517 /* Is another pdflush already flushing this queue? */
@@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 556 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
539 struct address_space *mapping; 557 struct address_space *mapping;
540 558
541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 559 if (inode->i_state &
560 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
542 continue; 561 continue;
543 mapping = inode->i_mapping; 562 mapping = inode->i_mapping;
544 if (mapping->nrpages == 0) 563 if (mapping->nrpages == 0)
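inode_dirtied_after() above exists because 32-bit jiffies wrap (roughly every 49.7 days at HZ=1000), so a long-stale dirtied_when can satisfy time_after() purely through wraparound and appear to be in the future. A runnable userspace restatement of the two comparisons, with made-up timestamp values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 32-bit equivalents of the kernel's time_after()/time_before_eq() */
static bool time_after32(uint32_t a, uint32_t b)     { return (int32_t)(b - a) < 0; }
static bool time_before_eq32(uint32_t a, uint32_t b) { return (int32_t)(b - a) >= 0; }

static bool inode_dirtied_after32(uint32_t dirtied_when, uint32_t t, uint32_t now)
{
	/* reject stamps that only look future because the counter wrapped */
	return time_after32(dirtied_when, t) && time_before_eq32(dirtied_when, now);
}

int main(void)
{
	uint32_t now = 1000;
	uint32_t stale = now + 0x40000000;	/* wrapped: far "ahead", actually far behind */

	printf("naive  : %d\n", time_after32(stale, 900));		 /* 1 - looks future */
	printf("guarded: %d\n", inode_dirtied_after32(stale, 900, now)); /* 0 */
	return 0;
}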
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/fs.h>
4#include <linux/path.h>
5#include <linux/slab.h>
6#include <linux/fs_struct.h>
7
8/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
10 * It can block.
11 */
12void set_fs_root(struct fs_struct *fs, struct path *path)
13{
14 struct path old_root;
15
16 write_lock(&fs->lock);
17 old_root = fs->root;
18 fs->root = *path;
19 path_get(path);
20 write_unlock(&fs->lock);
21 if (old_root.dentry)
22 path_put(&old_root);
23}
24
25/*
26 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
27 * It can block.
28 */
29void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{
31 struct path old_pwd;
32
33 write_lock(&fs->lock);
34 old_pwd = fs->pwd;
35 fs->pwd = *path;
36 path_get(path);
37 write_unlock(&fs->lock);
38
39 if (old_pwd.dentry)
40 path_put(&old_pwd);
41}
42
43void chroot_fs_refs(struct path *old_root, struct path *new_root)
44{
45 struct task_struct *g, *p;
46 struct fs_struct *fs;
47 int count = 0;
48
49 read_lock(&tasklist_lock);
50 do_each_thread(g, p) {
51 task_lock(p);
52 fs = p->fs;
53 if (fs) {
54 write_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root);
58 fs->root = *new_root;
59 count++;
60 }
61 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root);
64 fs->pwd = *new_root;
65 count++;
66 }
67 write_unlock(&fs->lock);
68 }
69 task_unlock(p);
70 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock);
72 while (count--)
73 path_put(old_root);
74}
75
76void free_fs_struct(struct fs_struct *fs)
77{
78 path_put(&fs->root);
79 path_put(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs);
81}
82
83void exit_fs(struct task_struct *tsk)
84{
85 struct fs_struct *fs = tsk->fs;
86
87 if (fs) {
88 int kill;
89 task_lock(tsk);
90 write_lock(&fs->lock);
91 tsk->fs = NULL;
92 kill = !--fs->users;
93 write_unlock(&fs->lock);
94 task_unlock(tsk);
95 if (kill)
96 free_fs_struct(fs);
97 }
98}
99
100struct fs_struct *copy_fs_struct(struct fs_struct *old)
101{
102 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
103 /* We don't need to lock fs - think why ;-) */
104 if (fs) {
105 fs->users = 1;
106 fs->in_exec = 0;
107 rwlock_init(&fs->lock);
108 fs->umask = old->umask;
109 read_lock(&old->lock);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 }
116 return fs;
117}
118
119int unshare_fs_struct(void)
120{
121 struct fs_struct *fs = current->fs;
122 struct fs_struct *new_fs = copy_fs_struct(fs);
123 int kill;
124
125 if (!new_fs)
126 return -ENOMEM;
127
128 task_lock(current);
129 write_lock(&fs->lock);
130 kill = !--fs->users;
131 current->fs = new_fs;
132 write_unlock(&fs->lock);
133 task_unlock(current);
134
135 if (kill)
136 free_fs_struct(fs);
137
138 return 0;
139}
140EXPORT_SYMBOL_GPL(unshare_fs_struct);
141
142int current_umask(void)
143{
144 return current->fs->umask;
145}
146EXPORT_SYMBOL(current_umask);
147
148/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = {
150 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022,
153};
154
155void daemonize_fs_struct(void)
156{
157 struct fs_struct *fs = current->fs;
158
159 if (fs) {
160 int kill;
161
162 task_lock(current);
163
164 write_lock(&init_fs.lock);
165 init_fs.users++;
166 write_unlock(&init_fs.lock);
167
168 write_lock(&fs->lock);
169 current->fs = &init_fs;
170 kill = !--fs->users;
171 write_unlock(&fs->lock);
172
173 task_unlock(current);
174 if (kill)
175 free_fs_struct(fs);
176 }
177}
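current_umask() above replaces open-coded current->fs->umask dereferences (the ext4 and fat hunks earlier in this diff are converted the same way). What callers do with the value is plain mode arithmetic; a runnable sketch, using the 0022 default that init_fs sets above:

#include <stdio.h>

int main(void)
{
	unsigned int requested = 0666;
	unsigned int umask_val = 0022;	/* stand-in for current_umask() */
	unsigned int effective = requested & ~umask_val;

	printf("%04o & ~%04o -> %04o\n", requested, umask_val, effective);
	return 0;	/* prints 0666 & ~0022 -> 0644 */
}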
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
1
2config FSCACHE
3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK
6 help
7 This option enables a generic filesystem caching manager that can be
8 used by various network and other filesystems to cache data locally.
9 Different sorts of caches can be plugged in, depending on the
10 resources available.
11
12 See Documentation/filesystems/caching/fscache.txt for more information.
13
14config FSCACHE_STATS
15 bool "Gather statistical information on local caching"
16 depends on FSCACHE && PROC_FS
17 help
18 This option causes statistical information to be gathered on local
19 caching and exported through file:
20
21 /proc/fs/fscache/stats
22
23 The gathering of statistics adds a certain amount of overhead to
 24 execution as there are quite a few stats gathered, and on a
25 multi-CPU system these may be on cachelines that keep bouncing
26 between CPUs. On the other hand, the stats are very useful for
27 debugging purposes. Saying 'Y' here is recommended.
28
29 See Documentation/filesystems/caching/fscache.txt for more information.
30
31config FSCACHE_HISTOGRAM
32 bool "Gather latency information on local caching"
33 depends on FSCACHE && PROC_FS
34 help
35 This option causes latency information to be gathered on local
36 caching and exported through file:
37
38 /proc/fs/fscache/histogram
39
40 The generation of this histogram adds a certain amount of overhead to
41 execution as there are a number of points at which data is gathered,
42 and on a multi-CPU system these may be on cachelines that keep
43 bouncing between CPUs. On the other hand, the histogram may be
44 useful for debugging purposes. Saying 'N' here is recommended.
45
46 See Documentation/filesystems/caching/fscache.txt for more information.
47
48config FSCACHE_DEBUG
49 bool "Debug FS-Cache"
50 depends on FSCACHE
51 help
52 This permits debugging to be dynamically enabled in the local caching
53 management module. If this is set, the debugging output may be
 54 enabled by setting bits in /sys/module/fscache/parameters/debug.
55
56 See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for general filesystem caching code
3#
4
5fscache-y := \
6 cache.o \
7 cookie.o \
8 fsdef.o \
9 main.o \
10 netfs.o \
11 object.o \
12 operation.o \
13 page.o
14
15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18
19obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
1/* FS-Cache cache handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17LIST_HEAD(fscache_cache_list);
18DECLARE_RWSEM(fscache_addremove_sem);
19DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
20EXPORT_SYMBOL(fscache_cache_cleared_wq);
21
22static LIST_HEAD(fscache_cache_tag_list);
23
24/*
25 * look up a cache tag
26 */
27struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
28{
29 struct fscache_cache_tag *tag, *xtag;
30
31 /* firstly check for the existence of the tag under read lock */
32 down_read(&fscache_addremove_sem);
33
34 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
35 if (strcmp(tag->name, name) == 0) {
36 atomic_inc(&tag->usage);
37 up_read(&fscache_addremove_sem);
38 return tag;
39 }
40 }
41
42 up_read(&fscache_addremove_sem);
43
44 /* the tag does not exist - create a candidate */
45 xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
46 if (!xtag)
47 /* return a dummy tag if out of memory */
48 return ERR_PTR(-ENOMEM);
49
50 atomic_set(&xtag->usage, 1);
51 strcpy(xtag->name, name);
52
53 /* write lock, search again and add if still not present */
54 down_write(&fscache_addremove_sem);
55
56 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
57 if (strcmp(tag->name, name) == 0) {
58 atomic_inc(&tag->usage);
59 up_write(&fscache_addremove_sem);
60 kfree(xtag);
61 return tag;
62 }
63 }
64
65 list_add_tail(&xtag->link, &fscache_cache_tag_list);
66 up_write(&fscache_addremove_sem);
67 return xtag;
68}
69
70/*
71 * release a reference to a cache tag
72 */
73void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
74{
75 if (tag != ERR_PTR(-ENOMEM)) {
76 down_write(&fscache_addremove_sem);
77
78 if (atomic_dec_and_test(&tag->usage))
79 list_del_init(&tag->link);
80 else
81 tag = NULL;
82
83 up_write(&fscache_addremove_sem);
84
85 kfree(tag);
86 }
87}
88
89/*
90 * select a cache in which to store an object
91 * - the cache addremove semaphore must be at least read-locked by the caller
92 * - the object will never be an index
93 */
94struct fscache_cache *fscache_select_cache_for_object(
95 struct fscache_cookie *cookie)
96{
97 struct fscache_cache_tag *tag;
98 struct fscache_object *object;
99 struct fscache_cache *cache;
100
101 _enter("");
102
103 if (list_empty(&fscache_cache_list)) {
104 _leave(" = NULL [no cache]");
105 return NULL;
106 }
107
108 /* we check the parent to determine the cache to use */
109 spin_lock(&cookie->lock);
110
111 /* the first in the parent's backing list should be the preferred
112 * cache */
113 if (!hlist_empty(&cookie->backing_objects)) {
114 object = hlist_entry(cookie->backing_objects.first,
115 struct fscache_object, cookie_link);
116
117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING ||
119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL;
121
122 spin_unlock(&cookie->lock);
123 _leave(" = %p [parent]", cache);
124 return cache;
125 }
126
127 /* the parent is unbacked */
128 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
129 /* cookie not an index and is unbacked */
130 spin_unlock(&cookie->lock);
131 _leave(" = NULL [cookie ub,ni]");
132 return NULL;
133 }
134
135 spin_unlock(&cookie->lock);
136
137 if (!cookie->def->select_cache)
138 goto no_preference;
139
140 /* ask the netfs for its preference */
141 tag = cookie->def->select_cache(cookie->parent->netfs_data,
142 cookie->netfs_data);
143 if (!tag)
144 goto no_preference;
145
146 if (tag == ERR_PTR(-ENOMEM)) {
147 _leave(" = NULL [nomem tag]");
148 return NULL;
149 }
150
151 if (!tag->cache) {
152 _leave(" = NULL [unbacked tag]");
153 return NULL;
154 }
155
156 if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
157 return NULL;
158
159 _leave(" = %p [specific]", tag->cache);
160 return tag->cache;
161
162no_preference:
163 /* netfs has no preference - just select first cache */
164 cache = list_entry(fscache_cache_list.next,
165 struct fscache_cache, link);
166 _leave(" = %p [first]", cache);
167 return cache;
168}
169
170/**
171 * fscache_init_cache - Initialise a cache record
172 * @cache: The cache record to be initialised
173 * @ops: The cache operations to be installed in that record
174 * @idfmt: Format string to define identifier
175 * @...: sprintf-style arguments
176 *
177 * Initialise a record of a cache and fill in the name.
178 *
179 * See Documentation/filesystems/caching/backend-api.txt for a complete
180 * description.
181 */
182void fscache_init_cache(struct fscache_cache *cache,
183 const struct fscache_cache_ops *ops,
184 const char *idfmt,
185 ...)
186{
187 va_list va;
188
189 memset(cache, 0, sizeof(*cache));
190
191 cache->ops = ops;
192
193 va_start(va, idfmt);
194 vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
195 va_end(va);
196
197 INIT_WORK(&cache->op_gc, fscache_operation_gc);
198 INIT_LIST_HEAD(&cache->link);
199 INIT_LIST_HEAD(&cache->object_list);
200 INIT_LIST_HEAD(&cache->op_gc_list);
201 spin_lock_init(&cache->object_list_lock);
202 spin_lock_init(&cache->op_gc_list_lock);
203}
204EXPORT_SYMBOL(fscache_init_cache);
205
206/**
207 * fscache_add_cache - Declare a cache as being open for business
208 * @cache: The record describing the cache
209 * @ifsdef: The record of the cache object describing the top-level index
210 * @tagname: The tag describing this cache
211 *
212 * Add a cache to the system, making it available for netfs's to use.
213 *
214 * See Documentation/filesystems/caching/backend-api.txt for a complete
215 * description.
216 */
217int fscache_add_cache(struct fscache_cache *cache,
218 struct fscache_object *ifsdef,
219 const char *tagname)
220{
221 struct fscache_cache_tag *tag;
222
223 BUG_ON(!cache->ops);
224 BUG_ON(!ifsdef);
225
226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
229
230 if (!tagname)
231 tagname = cache->identifier;
232
233 BUG_ON(!tagname[0]);
234
235 _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
236
237 /* we use the cache tag to uniquely identify caches */
238 tag = __fscache_lookup_cache_tag(tagname);
239 if (IS_ERR(tag))
240 goto nomem;
241
242 if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
243 goto tag_in_use;
244
245 cache->kobj = kobject_create_and_add(tagname, fscache_root);
246 if (!cache->kobj)
247 goto error;
248
249 ifsdef->cookie = &fscache_fsdef_index;
250 ifsdef->cache = cache;
251 cache->fsdef = ifsdef;
252
253 down_write(&fscache_addremove_sem);
254
255 tag->cache = cache;
256 cache->tag = tag;
257
258 /* add the cache to the list */
259 list_add(&cache->link, &fscache_cache_list);
260
261 /* add the cache's netfs definition index object to the cache's
262 * list */
263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock);
266
267 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */
269 spin_lock(&fscache_fsdef_index.lock);
270
271 hlist_add_head(&ifsdef->cookie_link,
272 &fscache_fsdef_index.backing_objects);
273
274 atomic_inc(&fscache_fsdef_index.usage);
275
276 /* done */
277 spin_unlock(&fscache_fsdef_index.lock);
278 up_write(&fscache_addremove_sem);
279
280 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
281 cache->tag->name, cache->ops->name);
282 kobject_uevent(cache->kobj, KOBJ_ADD);
283
284 _leave(" = 0 [%s]", cache->identifier);
285 return 0;
286
287tag_in_use:
288 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
289 __fscache_release_cache_tag(tag);
290 _leave(" = -EEXIST");
291 return -EEXIST;
292
293error:
294 __fscache_release_cache_tag(tag);
295 _leave(" = -EINVAL");
296 return -EINVAL;
297
298nomem:
299 _leave(" = -ENOMEM");
300 return -ENOMEM;
301}
302EXPORT_SYMBOL(fscache_add_cache);
303
304/**
305 * fscache_io_error - Note a cache I/O error
306 * @cache: The record describing the cache
307 *
308 * Note that an I/O error occurred in a cache and that it should no longer be
309 * used for anything. This also reports the error into the kernel log.
310 *
311 * See Documentation/filesystems/caching/backend-api.txt for a complete
312 * description.
313 */
314void fscache_io_error(struct fscache_cache *cache)
315{
316 set_bit(FSCACHE_IOERROR, &cache->flags);
317
318 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
319 cache->ops->name);
320}
321EXPORT_SYMBOL(fscache_io_error);
322
323/*
324 * request withdrawal of all the objects in a cache
325 * - all the objects being withdrawn are moved onto the supplied list
326 */
327static void fscache_withdraw_all_objects(struct fscache_cache *cache,
328 struct list_head *dying_objects)
329{
330 struct fscache_object *object;
331
332 spin_lock(&cache->object_list_lock);
333
334 while (!list_empty(&cache->object_list)) {
335 object = list_entry(cache->object_list.next,
336 struct fscache_object, cache_link);
337 list_move_tail(&object->cache_link, dying_objects);
338
339 _debug("withdraw %p", object->cookie);
340
341 spin_lock(&object->lock);
342 spin_unlock(&cache->object_list_lock);
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
344 spin_unlock(&object->lock);
345
346 cond_resched();
347 spin_lock(&cache->object_list_lock);
348 }
349
350 spin_unlock(&cache->object_list_lock);
351}
352
353/**
354 * fscache_withdraw_cache - Withdraw a cache from the active service
355 * @cache: The record describing the cache
356 *
357 * Withdraw a cache from service, unbinding all its cache objects from the
358 * netfs cookies they're currently representing.
359 *
360 * See Documentation/filesystems/caching/backend-api.txt for a complete
361 * description.
362 */
363void fscache_withdraw_cache(struct fscache_cache *cache)
364{
365 LIST_HEAD(dying_objects);
366
367 _enter("");
368
369 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
370 cache->tag->name);
371
372 /* make the cache unavailable for cookie acquisition */
373 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
374 BUG();
375
376 down_write(&fscache_addremove_sem);
377 list_del_init(&cache->link);
378 cache->tag->cache = NULL;
379 up_write(&fscache_addremove_sem);
380
381 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */
383 cache->ops->sync_cache(cache);
384
385 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */
387 cache->ops->dissociate_pages(cache);
388
389 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be
391 * disposed of */
392 _debug("destroy");
393
394 fscache_withdraw_all_objects(cache, &dying_objects);
395
396 /* wait for all extant objects to finish their outstanding operations
397 * and go away */
398 _debug("wait for finish");
399 wait_event(fscache_cache_cleared_wq,
400 atomic_read(&cache->object_count) == 0);
401 _debug("wait for clearance");
402 wait_event(fscache_cache_cleared_wq,
403 list_empty(&cache->object_list));
404 _debug("cleared");
405 ASSERT(list_empty(&dying_objects));
406
407 kobject_put(cache->kobj);
408
409 clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
410 fscache_release_cache_tag(cache->tag);
411 cache->tag = NULL;
412
413 _leave("");
414}
415EXPORT_SYMBOL(fscache_withdraw_cache);
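For a backend, the two exported setup calls above combine into a registration sequence along these lines. This is only a sketch against the API as added here: my_cache, my_cache_ops and the fsdef object are hypothetical placeholders that a real backend (such as CacheFiles, added elsewhere in this merge) supplies itself:

#include <linux/fscache-cache.h>

static struct fscache_cache my_cache;

static int my_backend_register(struct fscache_object *my_fsdef,
			       const struct fscache_cache_ops *my_cache_ops)
{
	int ret;

	/* install the ops and build the "mycache-disk0" identifier */
	fscache_init_cache(&my_cache, my_cache_ops, "mycache-%s", "disk0");

	/* passing a NULL tagname would fall back to that identifier */
	ret = fscache_add_cache(&my_cache, my_fsdef, "mytag");
	if (ret < 0)
		return ret;	/* -EEXIST (tag taken), -EINVAL or -ENOMEM */
	return 0;
}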
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
1/* netfs cookie management
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/netfs-api.txt for more information on
12 * the netfs API.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20struct kmem_cache *fscache_cookie_jar;
21
22static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
23
24static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
25static int fscache_alloc_object(struct fscache_cache *cache,
26 struct fscache_cookie *cookie);
27static int fscache_attach_object(struct fscache_cookie *cookie,
28 struct fscache_object *object);
29
30/*
 31 * initialise a cookie jar slab element prior to any use
32 */
33void fscache_cookie_init_once(void *_cookie)
34{
35 struct fscache_cookie *cookie = _cookie;
36
37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects);
40}
41
42/*
43 * request a cookie to represent an object (index, datafile, xattr, etc)
44 * - parent specifies the parent object
45 * - the top level index cookie for each netfs is stored in the fscache_netfs
46 * struct upon registration
47 * - def points to the definition
48 * - the netfs_data will be passed to the functions pointed to in *def
49 * - all attached caches will be searched to see if they contain this object
50 * - index objects aren't stored on disk until there's a dependent file that
51 * needs storing
52 * - other objects are stored in a selected cache immediately, and all the
53 * indices forming the path to it are instantiated if necessary
54 * - we never let on to the netfs about errors
55 * - we may set a negative cookie pointer, but that's okay
56 */
57struct fscache_cookie *__fscache_acquire_cookie(
58 struct fscache_cookie *parent,
59 const struct fscache_cookie_def *def,
60 void *netfs_data)
61{
62 struct fscache_cookie *cookie;
63
64 BUG_ON(!def);
65
66 _enter("{%s},{%s},%p",
67 parent ? (char *) parent->def->name : "<no-parent>",
68 def->name, netfs_data);
69
70 fscache_stat(&fscache_n_acquires);
71
72 /* if there's no parent cookie, then we don't create one here either */
73 if (!parent) {
74 fscache_stat(&fscache_n_acquires_null);
75 _leave(" [no parent]");
76 return NULL;
77 }
78
79 /* validate the definition */
80 BUG_ON(!def->get_key);
81 BUG_ON(!def->name[0]);
82
83 BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
84 parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
85
86 /* allocate and initialise a cookie */
87 cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
88 if (!cookie) {
89 fscache_stat(&fscache_n_acquires_oom);
90 _leave(" [ENOMEM]");
91 return NULL;
92 }
93
94 atomic_set(&cookie->usage, 1);
95 atomic_set(&cookie->n_children, 0);
96
97 atomic_inc(&parent->usage);
98 atomic_inc(&parent->n_children);
99
100 cookie->def = def;
101 cookie->parent = parent;
102 cookie->netfs_data = netfs_data;
103 cookie->flags = 0;
104
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
106
107 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX:
109 fscache_stat(&fscache_n_cookie_index);
110 break;
111 case FSCACHE_COOKIE_TYPE_DATAFILE:
112 fscache_stat(&fscache_n_cookie_data);
113 break;
114 default:
115 fscache_stat(&fscache_n_cookie_special);
116 break;
117 }
118
119 /* if the object is an index then we need do nothing more here - we
120 * create indices on disk when we need them as an index may exist in
121 * multiple caches */
122 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
123 if (fscache_acquire_non_index_cookie(cookie) < 0) {
124 atomic_dec(&parent->n_children);
125 __fscache_cookie_put(cookie);
126 fscache_stat(&fscache_n_acquires_nobufs);
127 _leave(" = NULL");
128 return NULL;
129 }
130 }
131
132 fscache_stat(&fscache_n_acquires_ok);
133 _leave(" = %p", cookie);
134 return cookie;
135}
136EXPORT_SYMBOL(__fscache_acquire_cookie);
137
138/*
139 * acquire a non-index cookie
140 * - this must make sure the index chain is instantiated and instantiate the
141 * object representation too
142 */
143static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
144{
145 struct fscache_object *object;
146 struct fscache_cache *cache;
147 uint64_t i_size;
148 int ret;
149
150 _enter("");
151
152 cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
153
154 /* now we need to see whether the backing objects for this cookie yet
155 * exist, if not there'll be nothing to search */
156 down_read(&fscache_addremove_sem);
157
158 if (list_empty(&fscache_cache_list)) {
159 up_read(&fscache_addremove_sem);
160 _leave(" = 0 [no caches]");
161 return 0;
162 }
163
164 /* select a cache in which to store the object */
165 cache = fscache_select_cache_for_object(cookie->parent);
166 if (!cache) {
167 up_read(&fscache_addremove_sem);
168 fscache_stat(&fscache_n_acquires_no_cache);
169 _leave(" = -ENOMEDIUM [no cache]");
170 return -ENOMEDIUM;
171 }
172
173 _debug("cache %s", cache->tag->name);
174
175 cookie->flags =
176 (1 << FSCACHE_COOKIE_LOOKING_UP) |
177 (1 << FSCACHE_COOKIE_CREATING) |
178 (1 << FSCACHE_COOKIE_NO_DATA_YET);
179
180 /* ask the cache to allocate objects for this cookie and its parent
181 * chain */
182 ret = fscache_alloc_object(cache, cookie);
183 if (ret < 0) {
184 up_read(&fscache_addremove_sem);
185 _leave(" = %d", ret);
186 return ret;
187 }
188
189 /* pass on how big the object we're caching is supposed to be */
190 cookie->def->get_attr(cookie->netfs_data, &i_size);
191
192 spin_lock(&cookie->lock);
193 if (hlist_empty(&cookie->backing_objects)) {
194 spin_unlock(&cookie->lock);
195 goto unavailable;
196 }
197
198 object = hlist_entry(cookie->backing_objects.first,
199 struct fscache_object, cookie_link);
200
201 fscache_set_store_limit(object, i_size);
202
203 /* initiate the process of looking up all the objects in the chain
204 * (done by fscache_initialise_object()) */
205 fscache_enqueue_object(object);
206
207 spin_unlock(&cookie->lock);
208
209 /* we may be required to wait for lookup to complete at this point */
210 if (!fscache_defer_lookup) {
211 _debug("non-deferred lookup %p", &cookie->flags);
212 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
213 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
214 _debug("complete");
215 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
216 goto unavailable;
217 }
218
219 up_read(&fscache_addremove_sem);
220 _leave(" = 0 [deferred]");
221 return 0;
222
223unavailable:
224 up_read(&fscache_addremove_sem);
225 _leave(" = -ENOBUFS");
226 return -ENOBUFS;
227}
228
229/*
230 * recursively allocate cache object records for a cookie/cache combination
231 * - caller must be holding the addremove sem
232 */
233static int fscache_alloc_object(struct fscache_cache *cache,
234 struct fscache_cookie *cookie)
235{
236 struct fscache_object *object;
237 struct hlist_node *_n;
238 int ret;
239
240 _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
241
242 spin_lock(&cookie->lock);
243 hlist_for_each_entry(object, _n, &cookie->backing_objects,
244 cookie_link) {
245 if (object->cache == cache)
246 goto object_already_extant;
247 }
248 spin_unlock(&cookie->lock);
249
250 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */
252 object = cache->ops->alloc_object(cache, cookie);
253 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object);
256 goto error;
257 }
258
259 fscache_stat(&fscache_n_object_alloc);
260
261 object->debug_id = atomic_inc_return(&fscache_object_debug_id);
262
263 _debug("ALLOC OBJ%x: %s {%lx}",
264 object->debug_id, cookie->def->name, object->events);
265
266 ret = fscache_alloc_object(cache, cookie->parent);
267 if (ret < 0)
268 goto error_put;
269
270 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one
272 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0)
274 cache->ops->put_object(object);
275
276 _leave(" = 0");
277 return 0;
278
279object_already_extant:
280 ret = -ENOBUFS;
281 if (object->state >= FSCACHE_OBJECT_DYING) {
282 spin_unlock(&cookie->lock);
283 goto error;
284 }
285 spin_unlock(&cookie->lock);
286 _leave(" = 0 [found]");
287 return 0;
288
289error_put:
290 cache->ops->put_object(object);
291error:
292 _leave(" = %d", ret);
293 return ret;
294}
295
296/*
297 * attach a cache object to a cookie
298 */
299static int fscache_attach_object(struct fscache_cookie *cookie,
300 struct fscache_object *object)
301{
302 struct fscache_object *p;
303 struct fscache_cache *cache = object->cache;
304 struct hlist_node *_n;
305 int ret;
306
307 _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
308
309 spin_lock(&cookie->lock);
310
311 /* there may be multiple initial creations of this object, but we only
312 * want one */
313 ret = -EEXIST;
314 hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
315 if (p->cache == object->cache) {
316 if (p->state >= FSCACHE_OBJECT_DYING)
317 ret = -ENOBUFS;
318 goto cant_attach_object;
319 }
320 }
321
322 /* pin the parent object */
323 spin_lock_nested(&cookie->parent->lock, 1);
324 hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
325 cookie_link) {
326 if (p->cache == object->cache) {
327 if (p->state >= FSCACHE_OBJECT_DYING) {
328 ret = -ENOBUFS;
329 spin_unlock(&cookie->parent->lock);
330 goto cant_attach_object;
331 }
332 object->parent = p;
333 spin_lock(&p->lock);
334 p->n_children++;
335 spin_unlock(&p->lock);
336 break;
337 }
338 }
339 spin_unlock(&cookie->parent->lock);
340
341 /* attach to the cache's object list */
342 if (list_empty(&object->cache_link)) {
343 spin_lock(&cache->object_list_lock);
344 list_add(&object->cache_link, &cache->object_list);
345 spin_unlock(&cache->object_list_lock);
346 }
347
348 /* attach to the cookie */
349 object->cookie = cookie;
350 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
352 ret = 0;
353
354cant_attach_object:
355 spin_unlock(&cookie->lock);
356 _leave(" = %d", ret);
357 return ret;
358}
359
360/*
361 * update the index entries backing a cookie
362 */
363void __fscache_update_cookie(struct fscache_cookie *cookie)
364{
365 struct fscache_object *object;
366 struct hlist_node *_p;
367
368 fscache_stat(&fscache_n_updates);
369
370 if (!cookie) {
371 fscache_stat(&fscache_n_updates_null);
372 _leave(" [no cookie]");
373 return;
374 }
375
376 _enter("{%s}", cookie->def->name);
377
378 BUG_ON(!cookie->def->get_aux);
379
380 spin_lock(&cookie->lock);
381
382 /* update the index entry on disk in each cache backing this cookie */
383 hlist_for_each_entry(object, _p,
384 &cookie->backing_objects, cookie_link) {
385 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
386 }
387
388 spin_unlock(&cookie->lock);
389 _leave("");
390}
391EXPORT_SYMBOL(__fscache_update_cookie);
392
393/*
394 * release a cookie back to the cache
395 * - the object will be marked as recyclable on disk if retire is true
396 * - all dependents of this cookie must have already been unregistered
397 * (indices/files/pages)
398 */
399void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
400{
401 struct fscache_cache *cache;
402 struct fscache_object *object;
403 unsigned long event;
404
405 fscache_stat(&fscache_n_relinquishes);
406
407 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null);
409 _leave(" [no cookie]");
410 return;
411 }
412
413 _enter("%p{%s,%p},%d",
414 cookie, cookie->def->name, cookie->netfs_data, retire);
415
416 if (atomic_read(&cookie->n_children) != 0) {
417 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
418 cookie->def->name);
419 BUG();
420 }
421
422 /* wait for the cookie to finish being instantiated (or to fail) */
423 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
424 fscache_stat(&fscache_n_relinquishes_waitcrt);
425 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
426 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
427 }
428
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock);
433
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first,
440 struct fscache_object,
441 cookie_link);
442
443 _debug("RELEASE OBJ%x", object->debug_id);
444
445 /* detach each cache object from the object cookie */
446 spin_lock(&object->lock);
447 hlist_del_init(&object->cookie_link);
448
449 cache = object->cache;
450 object->cookie = NULL;
451 fscache_raise_event(object, event);
452 spin_unlock(&object->lock);
453
454 if (atomic_dec_and_test(&cookie->usage))
455 /* the cookie refcount shouldn't be reduced to 0 yet */
456 BUG();
457 }
458
459 spin_unlock(&cookie->lock);
460
461 if (cookie->parent) {
462 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
463 ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
464 atomic_dec(&cookie->parent->n_children);
465 }
466
467 /* finally dispose of the cookie */
468 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
469 fscache_cookie_put(cookie);
470
471 _leave("");
472}
473EXPORT_SYMBOL(__fscache_relinquish_cookie);
474
475/*
476 * destroy a cookie
477 */
478void __fscache_cookie_put(struct fscache_cookie *cookie)
479{
480 struct fscache_cookie *parent;
481
482 _enter("%p", cookie);
483
484 for (;;) {
485 _debug("FREE COOKIE %p", cookie);
486 parent = cookie->parent;
487 BUG_ON(!hlist_empty(&cookie->backing_objects));
488 kmem_cache_free(fscache_cookie_jar, cookie);
489
490 if (!parent)
491 break;
492
493 cookie = parent;
494 BUG_ON(atomic_read(&cookie->usage) <= 0);
495 if (!atomic_dec_and_test(&cookie->usage))
496 break;
497 }
498
499 _leave("");
500}
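On the netfs side, __fscache_acquire_cookie() is normally reached through the fscache_acquire_cookie() wrapper in linux/fscache.h. A hypothetical caller, honouring the rule stated above that errors are never reported to the netfs, so a NULL cookie is always a legal result:

#include <linux/fscache.h>

/* my_file_def and my_inode are placeholders for a real netfs's
 * cookie definition and its per-inode data */
static struct fscache_cookie *my_cache_inode(struct fscache_cookie *parent_index,
					     const struct fscache_cookie_def *my_file_def,
					     void *my_inode)
{
	/* may return NULL (no caches, no memory): caching just stays off */
	return fscache_acquire_cookie(parent_index, my_file_def, my_inode);
}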
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
1/* Filesystem index definition
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include "internal.h"
15
16static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
17 void *buffer, uint16_t bufmax);
18
19static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
20 void *buffer, uint16_t bufmax);
21
22static
23enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
24 const void *data,
25 uint16_t datalen);
26
27/*
28 * The root index is owned by FS-Cache itself.
29 *
30 * When a netfs requests caching facilities, FS-Cache will, if one doesn't
31 * already exist, create an entry in the root index with the key being the name
32 * of the netfs ("AFS" for example), and the auxiliary data holding the index
33 * structure version supplied by the netfs:
34 *
35 * FSDEF
36 * |
37 * +-----------+
38 * | |
39 * NFS AFS
40 * [v=1] [v=1]
41 *
42 * If an entry with the appropriate name does already exist, the version is
43 * compared. If the version is different, the entire subtree from that entry
44 * will be discarded and a new entry created.
45 *
46 * The new entry will be an index, and a cookie referring to it will be passed
47 * to the netfs. This is then the root handle by which the netfs accesses the
48 * cache. It can create whatever objects it likes in that index, including
49 * further indices.
50 */
51static struct fscache_cookie_def fscache_fsdef_index_def = {
52 .name = ".FS-Cache",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54};
55
56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def,
61};
62EXPORT_SYMBOL(fscache_fsdef_index);
63
64/*
65 * Definition of an entry in the root index. Each entry is an index, keyed to
66 * a specific netfs and only applicable to a particular version of the index
67 * structure used by that netfs.
68 */
69struct fscache_cookie_def fscache_fsdef_netfs_def = {
70 .name = "FSDEF.netfs",
71 .type = FSCACHE_COOKIE_TYPE_INDEX,
72 .get_key = fscache_fsdef_netfs_get_key,
73 .get_aux = fscache_fsdef_netfs_get_aux,
74 .check_aux = fscache_fsdef_netfs_check_aux,
75};
76
77/*
78 * get the key data for an FSDEF index record - this is the name of the netfs
79 * for which this entry is created
80 */
81static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
82 void *buffer, uint16_t bufmax)
83{
84 const struct fscache_netfs *netfs = cookie_netfs_data;
85 unsigned klen;
86
87 _enter("{%s.%u},", netfs->name, netfs->version);
88
89 klen = strlen(netfs->name);
90 if (klen > bufmax)
91 return 0;
92
93 memcpy(buffer, netfs->name, klen);
94 return klen;
95}
96
97/*
98 * get the auxiliary data for an FSDEF index record - this is the index
99 * structure version number of the netfs for which this version is created
100 */
101static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
102 void *buffer, uint16_t bufmax)
103{
104 const struct fscache_netfs *netfs = cookie_netfs_data;
105 unsigned dlen;
106
107 _enter("{%s.%u},", netfs->name, netfs->version);
108
109 dlen = sizeof(uint32_t);
110 if (dlen > bufmax)
111 return 0;
112
113 memcpy(buffer, &netfs->version, dlen);
114 return dlen;
115}
116
117/*
118 * check that the index structure version number stored in the auxiliary data
119 * matches the one the netfs gave us
120 */
121static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
122 void *cookie_netfs_data,
123 const void *data,
124 uint16_t datalen)
125{
126 struct fscache_netfs *netfs = cookie_netfs_data;
127 uint32_t version;
128
129 _enter("{%s},,%hu", netfs->name, datalen);
130
131 if (datalen != sizeof(version)) {
132 _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
133 return FSCACHE_CHECKAUX_OBSOLETE;
134 }
135
136 memcpy(&version, data, sizeof(version));
137 if (version != netfs->version) {
138 _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
139 return FSCACHE_CHECKAUX_OBSOLETE;
140 }
141
142 _leave(" = OKAY");
143 return FSCACHE_CHECKAUX_OKAY;
144}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
1/* FS-Cache latency histogram
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18atomic_t fscache_obj_instantiate_histogram[HZ];
19atomic_t fscache_objs_histogram[HZ];
20atomic_t fscache_ops_histogram[HZ];
21atomic_t fscache_retrieval_delay_histogram[HZ];
22atomic_t fscache_retrieval_histogram[HZ];
23
24/*
25 * display the time-taken histogram
26 */
27static int fscache_histogram_show(struct seq_file *m, void *v)
28{
29 unsigned long index;
30 unsigned n[5], t;
31
32 switch ((unsigned long) v) {
33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
35 " RETRV DLY RETRIEVLS\n");
36 return 0;
37 case 2:
38 seq_puts(m, "===== ===== ========= ========= ========="
39 " ========= =========\n");
40 return 0;
41 default:
42 index = (unsigned long) v - 3;
43 n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
44 n[1] = atomic_read(&fscache_ops_histogram[index]);
45 n[2] = atomic_read(&fscache_objs_histogram[index]);
46 n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
47 n[4] = atomic_read(&fscache_retrieval_histogram[index]);
48 if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
49 return 0;
50
51 t = (index * 1000) / HZ;
52
53 seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
54 index, t, n[0], n[1], n[2], n[3], n[4]);
55 return 0;
56 }
57}
58
59/*
60 * set up the iterator to start reading from the first line
61 */
62static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
63{
64 if ((unsigned long long)*_pos >= HZ + 2)
65 return NULL;
66 if (*_pos == 0)
67 *_pos = 1;
68 return (void *)(unsigned long) *_pos;
69}
70
71/*
72 * move to the next line
73 */
74static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
75{
76 (*pos)++;
77 return (unsigned long long)*pos > HZ + 2 ?
78 NULL : (void *)(unsigned long) *pos;
79}
80
81/*
82 * clean up after reading
83 */
84static void fscache_histogram_stop(struct seq_file *m, void *v)
85{
86}
87
88static const struct seq_operations fscache_histogram_ops = {
89 .start = fscache_histogram_start,
90 .stop = fscache_histogram_stop,
91 .next = fscache_histogram_next,
92 .show = fscache_histogram_show,
93};
94
95/*
96 * open "/proc/fs/fscache/histogram" to provide latency data
97 */
98static int fscache_histogram_open(struct inode *inode, struct file *file)
99{
100 return seq_open(file, &fscache_histogram_ops);
101}
102
103const struct file_operations fscache_histogram_fops = {
104 .owner = THIS_MODULE,
105 .open = fscache_histogram_open,
106 .read = seq_read,
107 .llseek = seq_lseek,
108 .release = seq_release,
109};
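Each row of the histogram above is one jiffy-wide bucket; fscache_histogram_show() converts the bucket index to a fraction of a second with (index * 1000) / HZ and prints it via the 0.%03u format. The same conversion in plain C (the HZ value is just for illustration):

#include <stdio.h>

#define HZ 250	/* illustrative tick rate; the kernel uses its configured HZ */

int main(void)
{
	for (unsigned long index = 0; index < 5; index++) {
		unsigned int t = (index * 1000) / HZ;	/* milliseconds */
		printf("%4lu 0.%03u\n", index, t);
	}
	return 0;	/* 0.000, 0.004, 0.008, ... at HZ=250 */
}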
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..e0cbd16f6dc9
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
1/* Internal definitions for FS-Cache
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/*
13 * Lock order, in the order in which multiple locks should be obtained:
14 * - fscache_addremove_sem
15 * - cookie->lock
16 * - cookie->parent->lock
17 * - cache->object_list_lock
18 * - object->lock
19 * - object->parent->lock
20 * - fscache_thread_lock
21 *
22 */
23
24#include <linux/fscache-cache.h>
25#include <linux/sched.h>
26
27#define FSCACHE_MIN_THREADS 4
28#define FSCACHE_MAX_THREADS 32
29
30/*
 31 * cache.c
32 */
33extern struct list_head fscache_cache_list;
34extern struct rw_semaphore fscache_addremove_sem;
35
36extern struct fscache_cache *fscache_select_cache_for_object(
37 struct fscache_cookie *);
38
39/*
 40 * cookie.c
41 */
42extern struct kmem_cache *fscache_cookie_jar;
43
44extern void fscache_cookie_init_once(void *);
45extern void __fscache_cookie_put(struct fscache_cookie *);
46
47/*
 48 * fsdef.c
49 */
50extern struct fscache_cookie fscache_fsdef_index;
51extern struct fscache_cookie_def fscache_fsdef_netfs_def;
52
53/*
 54 * histogram.c
55 */
56#ifdef CONFIG_FSCACHE_HISTOGRAM
57extern atomic_t fscache_obj_instantiate_histogram[HZ];
58extern atomic_t fscache_objs_histogram[HZ];
59extern atomic_t fscache_ops_histogram[HZ];
60extern atomic_t fscache_retrieval_delay_histogram[HZ];
61extern atomic_t fscache_retrieval_histogram[HZ];
62
63static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
64{
65 unsigned long jif = jiffies - start_jif;
66 if (jif >= HZ)
67 jif = HZ - 1;
68 atomic_inc(&histogram[jif]);
69}
70
71extern const struct file_operations fscache_histogram_fops;
72
73#else
74#define fscache_hist(hist, start_jif) do {} while (0)
75#endif
76
77/*
 78 * main.c
79 */
80extern unsigned fscache_defer_lookup;
81extern unsigned fscache_defer_create;
82extern unsigned fscache_debug;
83extern struct kobject *fscache_root;
84
85extern int fscache_wait_bit(void *);
86extern int fscache_wait_bit_interruptible(void *);
87
88/*
 89 * object.c
90 */
91extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *);
94
95/*
 96 * operation.c
97 */
98extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *);
105
106/*
107 * proc.c
108 */
109#ifdef CONFIG_PROC_FS
110extern int __init fscache_proc_init(void);
111extern void fscache_proc_cleanup(void);
112#else
113#define fscache_proc_init() (0)
114#define fscache_proc_cleanup() do {} while (0)
115#endif
116
117/*
118 * stats.c
119 */
120#ifdef CONFIG_FSCACHE_STATS
121extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
122extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
123
124extern atomic_t fscache_n_op_pend;
125extern atomic_t fscache_n_op_run;
126extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc;
130
131extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok;
133extern atomic_t fscache_n_attr_changed_nobufs;
134extern atomic_t fscache_n_attr_changed_nomem;
135extern atomic_t fscache_n_attr_changed_calls;
136
137extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs;
141extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits;
143
144extern atomic_t fscache_n_retrievals;
145extern atomic_t fscache_n_retrievals_ok;
146extern atomic_t fscache_n_retrievals_wait;
147extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem;
151extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits;
153
154extern atomic_t fscache_n_stores;
155extern atomic_t fscache_n_stores_ok;
156extern atomic_t fscache_n_stores_again;
157extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls;
161
162extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches;
164
165extern atomic_t fscache_n_acquires;
166extern atomic_t fscache_n_acquires_null;
167extern atomic_t fscache_n_acquires_no_cache;
168extern atomic_t fscache_n_acquires_ok;
169extern atomic_t fscache_n_acquires_nobufs;
170extern atomic_t fscache_n_acquires_oom;
171
172extern atomic_t fscache_n_updates;
173extern atomic_t fscache_n_updates_null;
174extern atomic_t fscache_n_updates_run;
175
176extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt;
179
180extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data;
182extern atomic_t fscache_n_cookie_special;
183
184extern atomic_t fscache_n_object_alloc;
185extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive;
189extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead;
192
193extern atomic_t fscache_n_checkaux_none;
194extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete;
197
198static inline void fscache_stat(atomic_t *stat)
199{
200 atomic_inc(stat);
201}
202
203extern const struct file_operations fscache_stats_fops;
204#else
205
206#define fscache_stat(stat) do {} while (0)
207#endif
208
209/*
210 * raise an event on an object
211 * - if the event is not masked for that object, then the object is
212 * queued for attention by the thread pool.
213 */
214static inline void fscache_raise_event(struct fscache_object *object,
215 unsigned event)
216{
217 if (!test_and_set_bit(event, &object->events) &&
218 test_bit(event, &object->event_mask))
219 fscache_enqueue_object(object);
220}
221
222/*
223 * drop a reference to a cookie
224 */
225static inline void fscache_cookie_put(struct fscache_cookie *cookie)
226{
227 BUG_ON(atomic_read(&cookie->usage) <= 0);
228 if (atomic_dec_and_test(&cookie->usage))
229 __fscache_cookie_put(cookie);
230}
231
232/*
233 * get an extra reference to a netfs retrieval context
234 */
235static inline
236void *fscache_get_context(struct fscache_cookie *cookie, void *context)
237{
238 if (cookie->def->get_context)
239 cookie->def->get_context(cookie->netfs_data, context);
240 return context;
241}
242
243/*
244 * release a reference to a netfs retrieval context
245 */
246static inline
247void fscache_put_context(struct fscache_cookie *cookie, void *context)
248{
249 if (cookie->def->put_context)
250 cookie->def->put_context(cookie->netfs_data, context);
251}
252
253/*****************************************************************************/
254/*
255 * debug tracing
256 */
257#define dbgprintk(FMT, ...) \
258 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
259
260/* make sure we maintain the format strings, even when debugging is disabled */
261static inline __attribute__((format(printf, 1, 2)))
262void _dbprintk(const char *fmt, ...)
263{
264}
265
266#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
267#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
268#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
269
270#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
271
272#ifdef __KDEBUG
273#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
274#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
275#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
276
277#elif defined(CONFIG_FSCACHE_DEBUG)
278#define _enter(FMT, ...) \
279do { \
280 if (__do_kdebug(ENTER)) \
281 kenter(FMT, ##__VA_ARGS__); \
282} while (0)
283
284#define _leave(FMT, ...) \
285do { \
286 if (__do_kdebug(LEAVE)) \
287 kleave(FMT, ##__VA_ARGS__); \
288} while (0)
289
290#define _debug(FMT, ...) \
291do { \
292 if (__do_kdebug(DEBUG)) \
293 kdebug(FMT, ##__VA_ARGS__); \
294} while (0)
295
296#else
297#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
298#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
299#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
300#endif
301
302/*
303 * determine whether a particular optional debugging point should be logged
304 * - we need to go through three steps to persuade cpp to correctly join the
305 * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
306 */
307#define ____do_kdebug(LEVEL, POINT) \
308 unlikely((fscache_debug & \
309 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
310#define ___do_kdebug(LEVEL, POINT) \
311 ____do_kdebug(LEVEL, POINT)
312#define __do_kdebug(POINT) \
313 ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
314
315#define FSCACHE_DEBUG_CACHE 0
316#define FSCACHE_DEBUG_COOKIE 1
317#define FSCACHE_DEBUG_PAGE 2
318#define FSCACHE_DEBUG_OPERATION 3
319
320#define FSCACHE_POINT_ENTER 1
321#define FSCACHE_POINT_LEAVE 2
322#define FSCACHE_POINT_DEBUG 4
323
324#ifndef FSCACHE_DEBUG_LEVEL
325#define FSCACHE_DEBUG_LEVEL CACHE
326#endif
327
328/*
329 * assertions
330 */
331#if 1 /* defined(__KDEBUGALL) */
332
333#define ASSERT(X) \
334do { \
335 if (unlikely(!(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTCMP(X, OP, Y) \
343do { \
344 if (unlikely(!((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#define ASSERTIF(C, X) \
354do { \
355 if (unlikely((C) && !(X))) { \
356 printk(KERN_ERR "\n"); \
357 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
358 BUG(); \
359 } \
360} while (0)
361
362#define ASSERTIFCMP(C, X, OP, Y) \
363do { \
364 if (unlikely((C) && !((X) OP (Y)))) { \
365 printk(KERN_ERR "\n"); \
366 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
367 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
368 (unsigned long)(X), (unsigned long)(Y)); \
369 BUG(); \
370 } \
371} while (0)
372
373#else
374
375#define ASSERT(X) do {} while (0)
376#define ASSERTCMP(X, OP, Y) do {} while (0)
377#define ASSERTIF(C, X) do {} while (0)
378#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
379
380#endif /* assert or not */
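The three-stage __do_kdebug() indirection above exists because FSCACHE_DEBUG_LEVEL is defined as a bare shorthand (CACHE, COOKIE, ...) and cpp must expand that shorthand before pasting it onto the FSCACHE_DEBUG_ prefix. A minimal stand-alone sketch of the same trick, compilable in user space; the bit values are copied from the header above, everything else is illustrative:

#include <stdio.h>

#define FSCACHE_DEBUG_CACHE	0
#define FSCACHE_DEBUG_COOKIE	1
#define FSCACHE_POINT_ENTER	1
#define FSCACHE_POINT_LEAVE	2

static unsigned fscache_debug;

/* without the middle macro, the ## paste would see the literal token
 * "FSCACHE_DEBUG_LEVEL" rather than its expansion "COOKIE" */
#define ____do_kdebug(LEVEL, POINT) \
	(fscache_debug & \
	 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))
#define ___do_kdebug(LEVEL, POINT) ____do_kdebug(LEVEL, POINT)
#define __do_kdebug(POINT) ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)

#define FSCACHE_DEBUG_LEVEL COOKIE

int main(void)
{
	/* enable only the ENTER point for the COOKIE level */
	fscache_debug = FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3);
	printf("ENTER logged: %d\n", !!__do_kdebug(ENTER));	/* 1 */
	printf("LEAVE logged: %d\n", !!__do_kdebug(LEAVE));	/* 0 */
	return 0;
}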
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
1/* General filesystem local caching manager
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20MODULE_DESCRIPTION("FS Cache Manager");
21MODULE_AUTHOR("Red Hat, Inc.");
22MODULE_LICENSE("GPL");
23
24unsigned fscache_defer_lookup = 1;
25module_param_named(defer_lookup, fscache_defer_lookup, uint,
26 S_IWUSR | S_IRUGO);
27MODULE_PARM_DESC(fscache_defer_lookup,
28 "Defer cookie lookup to background thread");
29
30unsigned fscache_defer_create = 1;
31module_param_named(defer_create, fscache_defer_create, uint,
32 S_IWUSR | S_IRUGO);
33MODULE_PARM_DESC(fscache_defer_create,
34 "Defer cookie creation to background thread");
35
36unsigned fscache_debug;
37module_param_named(debug, fscache_debug, uint,
38 S_IWUSR | S_IRUGO);
39MODULE_PARM_DESC(fscache_debug,
40 "FS-Cache debugging mask");
41
42struct kobject *fscache_root;
43
44/*
45 * initialise the fs caching module
46 */
47static int __init fscache_init(void)
48{
49 int ret;
50
51 ret = slow_work_register_user();
52 if (ret < 0)
53 goto error_slow_work;
54
55 ret = fscache_proc_init();
56 if (ret < 0)
57 goto error_proc;
58
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie),
61 0,
62 0,
63 fscache_cookie_init_once);
64 if (!fscache_cookie_jar) {
65 printk(KERN_NOTICE
66 "FS-Cache: Failed to allocate a cookie jar\n");
67 ret = -ENOMEM;
68 goto error_cookie_jar;
69 }
70
71 ret = -ENOMEM;	/* otherwise a kobject failure would return 0 */
72 fscache_root = kobject_create_and_add("fscache", kernel_kobj);
73 if (!fscache_root)
74 goto error_kobj;
75 printk(KERN_NOTICE "FS-Cache: Loaded\n");
76 return 0;
77
78error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar:
81 fscache_proc_cleanup();
82error_proc:
83 slow_work_unregister_user();
84error_slow_work:
85 return ret;
86}
87
88fs_initcall(fscache_init);
89
90/*
91 * clean up on module removal
92 */
93static void __exit fscache_exit(void)
94{
95 _enter("");
96
97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup();
100 slow_work_unregister_user();
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102}
103
104module_exit(fscache_exit);
105
106/*
107 * wait_on_bit() sleep function for uninterruptible waiting
108 */
109int fscache_wait_bit(void *flags)
110{
111 schedule();
112 return 0;
113}
114EXPORT_SYMBOL(fscache_wait_bit);
115
116/*
117 * wait_on_bit() sleep function for interruptible waiting
118 */
119int fscache_wait_bit_interruptible(void *flags)
120{
121 schedule();
122 return signal_pending(current);
123}
124EXPORT_SYMBOL(fscache_wait_bit_interruptible);
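fscache_wait_bit() and fscache_wait_bit_interruptible() are the action callbacks handed to wait_on_bit() elsewhere in this patch set (see fscache_wait_for_deferred_lookup() in page.c below). A sketch of the waiter/waker pairing they take part in; this is a kernel-side fragment, not a complete compilation unit, and MY_BIT and word are illustrative names:

/* waiter side: sleeps in schedule() via the interruptible sleeper
 * until MY_BIT clears, backing out if a signal arrives */
if (wait_on_bit(&word, MY_BIT,
		fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0)
	return -ERESTARTSYS;

/* waker side: clear the bit with full ordering, then wake the bit
 * waitqueue that wait_on_bit() is sleeping on */
smp_mb__before_clear_bit();
clear_bit(MY_BIT, &word);
smp_mb__after_clear_bit();
wake_up_bit(&word, MY_BIT);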
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
1/* FS-Cache netfs (client) registration
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17static LIST_HEAD(fscache_netfs_list);
18
19/*
20 * register a network filesystem for caching
21 */
22int __fscache_register_netfs(struct fscache_netfs *netfs)
23{
24 struct fscache_netfs *ptr;
25 int ret;
26
27 _enter("{%s}", netfs->name);
28
29 INIT_LIST_HEAD(&netfs->link);
30
31 /* allocate a cookie for the primary index */
32 netfs->primary_index =
33 kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
34
35 if (!netfs->primary_index) {
36 _leave(" = -ENOMEM");
37 return -ENOMEM;
38 }
39
40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0);
43
44 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index;
46 netfs->primary_index->netfs_data = netfs;
47
48 atomic_inc(&netfs->primary_index->parent->usage);
49 atomic_inc(&netfs->primary_index->parent->n_children);
50
51 spin_lock_init(&netfs->primary_index->lock);
52 INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
53
54 /* check the netfs type is not already present */
55 down_write(&fscache_addremove_sem);
56
57 ret = -EEXIST;
58 list_for_each_entry(ptr, &fscache_netfs_list, link) {
59 if (strcmp(ptr->name, netfs->name) == 0)
60 goto already_registered;
61 }
62
63 list_add(&netfs->link, &fscache_netfs_list);
64 ret = 0;
65
66 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
67 netfs->name);
68
69already_registered:
70 up_write(&fscache_addremove_sem);
71
72 if (ret < 0) {
73 netfs->primary_index->parent = NULL;
74 __fscache_cookie_put(netfs->primary_index);
75 netfs->primary_index = NULL;
76 }
77
78 _leave(" = %d", ret);
79 return ret;
80}
81EXPORT_SYMBOL(__fscache_register_netfs);
82
83/*
84 * unregister a network filesystem from the cache
85 * - all cookies must have been released first
86 */
87void __fscache_unregister_netfs(struct fscache_netfs *netfs)
88{
89 _enter("{%s.%u}", netfs->name, netfs->version);
90
91 down_write(&fscache_addremove_sem);
92
93 list_del(&netfs->link);
94 fscache_relinquish_cookie(netfs->primary_index, 0);
95
96 up_write(&fscache_addremove_sem);
97
98 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
99 netfs->name);
100
101 _leave("");
102}
103EXPORT_SYMBOL(__fscache_unregister_netfs);
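For context, the netfs side of the registration above looks roughly like this in this era's API; the struct fscache_netfs fields are those used by the code above, while "examplefs" and the example function names are illustrative:

/* Sketch of a netfs registering itself, via the fscache_register_netfs()
 * wrapper around __fscache_register_netfs() ("examplefs" is illustrative).
 */
static struct fscache_netfs examplefs_cache_netfs = {
	.name		= "examplefs",
	.version	= 0,	/* bumping this invalidates old cache data */
};

static int __init examplefs_init(void)
{
	/* on success, ->primary_index is the cookie under which all of
	 * the netfs's other index and data cookies are acquired */
	return fscache_register_netfs(&examplefs_cache_netfs);
}

static void __exit examplefs_exit(void)
{
	/* all other cookies must have been relinquished first */
	fscache_unregister_netfs(&examplefs_cache_netfs);
}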
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
1/* FS-Cache object state machine handler
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/object.txt for a description of the
12 * object state machine and the in-kernel representations.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include "internal.h"
18
19const char *fscache_object_states[] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
28 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
29 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
30 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
31 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
32 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
33};
34EXPORT_SYMBOL(fscache_object_states);
35
36static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *);
39static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *);
42static void fscache_release_object(struct fscache_object *);
43static void fscache_withdraw_object(struct fscache_object *);
44static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *);
46
47const struct slow_work_ops fscache_object_slow_work_ops = {
48 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute,
51};
52EXPORT_SYMBOL(fscache_object_slow_work_ops);
53
54/*
55 * we need to notify the parent when an op that we had outstanding upon
56 * it completes
57 */
58static inline void fscache_done_parent_op(struct fscache_object *object)
59{
60 struct fscache_object *parent = object->parent;
61
62 _enter("OBJ%x {OBJ%x,%x}",
63 object->debug_id, parent->debug_id, parent->n_ops);
64
65 spin_lock_nested(&parent->lock, 1);
66 parent->n_ops--;
67 parent->n_obj_ops--;
68 if (parent->n_ops == 0)
69 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
70 spin_unlock(&parent->lock);
71}
72
73/*
74 * process events that have been sent to an object's state machine
75 * - initiates parent lookup
76 * - does object lookup
77 * - does object creation
78 * - does object recycling and retirement
79 * - does object withdrawal
80 */
81static void fscache_object_state_machine(struct fscache_object *object)
82{
83 enum fscache_object_state new_state;
84
85 ASSERT(object != NULL);
86
87 _enter("{OBJ%x,%s,%lx}",
88 object->debug_id, fscache_object_states[object->state],
89 object->events);
90
91 switch (object->state) {
92 /* wait for the parent object to become ready */
93 case FSCACHE_OBJECT_INIT:
94 object->event_mask =
95 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
96 fscache_initialise_object(object);
97 goto done;
98
99 /* look up the object metadata on disk */
100 case FSCACHE_OBJECT_LOOKING_UP:
101 fscache_lookup_object(object);
102 goto lookup_transit;
103
104 /* create the object metadata on disk */
105 case FSCACHE_OBJECT_CREATING:
106 fscache_lookup_object(object);
107 goto lookup_transit;
108
109 /* handle an object becoming available; start pending
110 * operations and queue dependent operations for processing */
111 case FSCACHE_OBJECT_AVAILABLE:
112 fscache_object_available(object);
113 goto active_transit;
114
115 /* normal running state */
116 case FSCACHE_OBJECT_ACTIVE:
117 goto active_transit;
118
119 /* update the object metadata on disk */
120 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run);
123 object->cache->ops->update_object(object);
124 goto active_transit;
125
126 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
129 object->cache->ops->lookup_complete(object);
130
131 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
134 &object->cookie->flags))
135 wake_up_bit(&object->cookie->flags,
136 FSCACHE_COOKIE_CREATING);
137 spin_unlock(&object->lock);
138
139 fscache_done_parent_op(object);
140
141 /* wait for completion of all active operations on this object
142 * and the death of all child objects of this object */
143 case FSCACHE_OBJECT_DYING:
144 dying:
145 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
146 spin_lock(&object->lock);
147 _debug("dying OBJ%x {%d,%d}",
148 object->debug_id, object->n_ops, object->n_children);
149 if (object->n_ops == 0 && object->n_children == 0) {
150 object->event_mask &=
151 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
152 object->event_mask |=
153 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
154 (1 << FSCACHE_OBJECT_EV_RETIRE) |
155 (1 << FSCACHE_OBJECT_EV_RELEASE) |
156 (1 << FSCACHE_OBJECT_EV_ERROR);
157 } else {
158 object->event_mask &=
159 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
160 (1 << FSCACHE_OBJECT_EV_RETIRE) |
161 (1 << FSCACHE_OBJECT_EV_RELEASE) |
162 (1 << FSCACHE_OBJECT_EV_ERROR));
163 object->event_mask |=
164 1 << FSCACHE_OBJECT_EV_CLEARED;
165 }
166 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object);
168 goto terminal_transit;
169
170 /* handle an abort during initialisation */
171 case FSCACHE_OBJECT_ABORT_INIT:
172 _debug("handle abort init %lx", object->events);
173 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
174
175 spin_lock(&object->lock);
176 fscache_dequeue_object(object);
177
178 object->state = FSCACHE_OBJECT_DYING;
179 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
180 &object->cookie->flags))
181 wake_up_bit(&object->cookie->flags,
182 FSCACHE_COOKIE_CREATING);
183 spin_unlock(&object->lock);
184 goto dying;
185
186 /* handle the netfs releasing an object and possibly marking it
187 * obsolete too */
188 case FSCACHE_OBJECT_RELEASING:
189 case FSCACHE_OBJECT_RECYCLING:
190 object->event_mask &=
191 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
192 (1 << FSCACHE_OBJECT_EV_RETIRE) |
193 (1 << FSCACHE_OBJECT_EV_RELEASE) |
194 (1 << FSCACHE_OBJECT_EV_ERROR));
195 fscache_release_object(object);
196 spin_lock(&object->lock);
197 object->state = FSCACHE_OBJECT_DEAD;
198 spin_unlock(&object->lock);
199 fscache_stat(&fscache_n_object_dead);
200 goto terminal_transit;
201
202 /* handle the parent cache of this object being withdrawn from
203 * active service */
204 case FSCACHE_OBJECT_WITHDRAWING:
205 object->event_mask &=
206 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
207 (1 << FSCACHE_OBJECT_EV_RETIRE) |
208 (1 << FSCACHE_OBJECT_EV_RELEASE) |
209 (1 << FSCACHE_OBJECT_EV_ERROR));
210 fscache_withdraw_object(object);
211 spin_lock(&object->lock);
212 object->state = FSCACHE_OBJECT_DEAD;
213 spin_unlock(&object->lock);
214 fscache_stat(&fscache_n_object_dead);
215 goto terminal_transit;
216
217 /* complain about the object being woken up once it is
218 * deceased */
219 case FSCACHE_OBJECT_DEAD:
220 printk(KERN_ERR "FS-Cache:"
221 " Unexpected event in dead state %lx\n",
222 object->events & object->event_mask);
223 BUG();
224
225 default:
226 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
227 object->state);
228 BUG();
229 }
230
231 /* determine the transition from a lookup state */
232lookup_transit:
233 switch (fls(object->events & object->event_mask) - 1) {
234 case FSCACHE_OBJECT_EV_WITHDRAW:
235 case FSCACHE_OBJECT_EV_RETIRE:
236 case FSCACHE_OBJECT_EV_RELEASE:
237 case FSCACHE_OBJECT_EV_ERROR:
238 new_state = FSCACHE_OBJECT_LC_DYING;
239 goto change_state;
240 case FSCACHE_OBJECT_EV_REQUEUE:
241 goto done;
242 case -1:
243 goto done; /* sleep until event */
244 default:
245 goto unsupported_event;
246 }
247
248 /* determine the transition from an active state */
249active_transit:
250 switch (fls(object->events & object->event_mask) - 1) {
251 case FSCACHE_OBJECT_EV_WITHDRAW:
252 case FSCACHE_OBJECT_EV_RETIRE:
253 case FSCACHE_OBJECT_EV_RELEASE:
254 case FSCACHE_OBJECT_EV_ERROR:
255 new_state = FSCACHE_OBJECT_DYING;
256 goto change_state;
257 case FSCACHE_OBJECT_EV_UPDATE:
258 new_state = FSCACHE_OBJECT_UPDATING;
259 goto change_state;
260 case -1:
261 new_state = FSCACHE_OBJECT_ACTIVE;
262 goto change_state; /* sleep until event */
263 default:
264 goto unsupported_event;
265 }
266
267 /* determine the transition from a terminal state */
268terminal_transit:
269 switch (fls(object->events & object->event_mask) - 1) {
270 case FSCACHE_OBJECT_EV_WITHDRAW:
271 new_state = FSCACHE_OBJECT_WITHDRAWING;
272 goto change_state;
273 case FSCACHE_OBJECT_EV_RETIRE:
274 new_state = FSCACHE_OBJECT_RECYCLING;
275 goto change_state;
276 case FSCACHE_OBJECT_EV_RELEASE:
277 new_state = FSCACHE_OBJECT_RELEASING;
278 goto change_state;
279 case FSCACHE_OBJECT_EV_ERROR:
280 new_state = FSCACHE_OBJECT_WITHDRAWING;
281 goto change_state;
282 case FSCACHE_OBJECT_EV_CLEARED:
283 new_state = FSCACHE_OBJECT_DYING;
284 goto change_state;
285 case -1:
286 goto done; /* sleep until event */
287 default:
288 goto unsupported_event;
289 }
290
291change_state:
292 spin_lock(&object->lock);
293 object->state = new_state;
294 spin_unlock(&object->lock);
295
296done:
297 _leave(" [->%s]", fscache_object_states[object->state]);
298 return;
299
300unsupported_event:
301 printk(KERN_ERR "FS-Cache:"
302 " Unsupported event %lx [mask %lx] in state %s\n",
303 object->events, object->event_mask,
304 fscache_object_states[object->state]);
305 BUG();
306}
307
308/*
309 * execute an object
310 */
311static void fscache_object_slow_work_execute(struct slow_work *work)
312{
313 struct fscache_object *object =
314 container_of(work, struct fscache_object, work);
315 unsigned long start;
316
317 _enter("{OBJ%x}", object->debug_id);
318
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies;
322 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask)
325 fscache_enqueue_object(object);
326}
327
328/*
329 * initialise an object
330 * - check the specified object's parent to see if we can make use of it
331 * immediately to do a creation
332 * - we may need to start the process of creating a parent and we need to wait
333 * for the parent's lookup and creation to complete if it's not there yet
334 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
335 * leaf-most cookies of the object and all its children
336 */
337static void fscache_initialise_object(struct fscache_object *object)
338{
339 struct fscache_object *parent;
340
341 _enter("");
342 ASSERT(object->cookie != NULL);
343 ASSERT(object->cookie->parent != NULL);
344 ASSERT(list_empty(&object->work.link));
345
346 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
347 (1 << FSCACHE_OBJECT_EV_RELEASE) |
348 (1 << FSCACHE_OBJECT_EV_RETIRE) |
349 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
350 _debug("abort init %lx", object->events);
351 spin_lock(&object->lock);
352 object->state = FSCACHE_OBJECT_ABORT_INIT;
353 spin_unlock(&object->lock);
354 return;
355 }
356
357 spin_lock(&object->cookie->lock);
358 spin_lock_nested(&object->cookie->parent->lock, 1);
359
360 parent = object->parent;
361 if (!parent) {
362 _debug("no parent");
363 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
364 } else {
365 spin_lock(&object->lock);
366 spin_lock_nested(&parent->lock, 1);
367 _debug("parent %s", fscache_object_states[parent->state]);
368
369 if (parent->state >= FSCACHE_OBJECT_DYING) {
370 _debug("bad parent");
371 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
372 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
373 _debug("wait");
374
375 /* we may get woken up in this state by child objects
376 * binding on to us, so we need to make sure we don't
377 * add ourselves to the list multiple times */
378 if (list_empty(&object->dep_link)) {
379 object->cache->ops->grab_object(object);
380 list_add(&object->dep_link,
381 &parent->dependents);
382
383 /* fscache_acquire_non_index_cookie() uses this
384 * to wake the chain up */
385 if (parent->state == FSCACHE_OBJECT_INIT)
386 fscache_enqueue_object(parent);
387 }
388 } else {
389 _debug("go");
390 parent->n_ops++;
391 parent->n_obj_ops++;
392 object->lookup_jif = jiffies;
393 object->state = FSCACHE_OBJECT_LOOKING_UP;
394 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
395 }
396
397 spin_unlock(&parent->lock);
398 spin_unlock(&object->lock);
399 }
400
401 spin_unlock(&object->cookie->parent->lock);
402 spin_unlock(&object->cookie->lock);
403 _leave("");
404}
405
406/*
407 * look an object up in the cache from which it was allocated
408 * - we hold an "access lock" on the parent object, so the parent object cannot
409 * be withdrawn by either party till we've finished
410 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
411 * leaf-most cookies of the object and all its children
412 */
413static void fscache_lookup_object(struct fscache_object *object)
414{
415 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent;
417
418 _enter("");
419
420 parent = object->parent;
421 ASSERT(parent != NULL);
422 ASSERTCMP(parent->n_ops, >, 0);
423 ASSERTCMP(parent->n_obj_ops, >, 0);
424
425 /* make sure the parent is still available */
426 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
427
428 if (parent->state >= FSCACHE_OBJECT_DYING ||
429 test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
430 _debug("unavailable");
431 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
432 _leave("");
433 return;
434 }
435
436 _debug("LOOKUP \"%s/%s\" in \"%s\"",
437 parent->cookie->def->name, cookie->def->name,
438 object->cache->tag->name);
439
440 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object);
442
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445
446 _leave("");
447}
448
449/**
450 * fscache_object_lookup_negative - Note negative cookie lookup
451 * @object: Object pointing to cookie to mark
452 *
453 * Note negative lookup, permitting those waiting to read data from an already
454 * existing backing object to continue as there's no data for them to read.
455 */
456void fscache_object_lookup_negative(struct fscache_object *object)
457{
458 struct fscache_cookie *cookie = object->cookie;
459
460 _enter("{OBJ%x,%s}",
461 object->debug_id, fscache_object_states[object->state]);
462
463 spin_lock(&object->lock);
464 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
465 fscache_stat(&fscache_n_object_lookups_negative);
466
467 /* transit here to allow write requests to begin stacking up
468 * and read requests to begin returning ENODATA */
469 object->state = FSCACHE_OBJECT_CREATING;
470 spin_unlock(&object->lock);
471
472 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
473 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
474
475 _debug("wake up lookup %p", &cookie->flags);
476 smp_mb__before_clear_bit();
477 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
478 smp_mb__after_clear_bit();
479 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
480 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
481 } else {
482 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
483 spin_unlock(&object->lock);
484 }
485
486 _leave("");
487}
488EXPORT_SYMBOL(fscache_object_lookup_negative);
489
490/**
491 * fscache_obtained_object - Note successful object lookup or creation
492 * @object: Object pointing to cookie to mark
493 *
494 * Note successful lookup and/or creation, permitting those waiting to write
495 * data to a backing object to continue.
496 *
497 * Note that after calling this, an object's cookie may be relinquished by the
498 * netfs, and so must be accessed with object lock held.
499 */
500void fscache_obtained_object(struct fscache_object *object)
501{
502 struct fscache_cookie *cookie = object->cookie;
503
504 _enter("{OBJ%x,%s}",
505 object->debug_id, fscache_object_states[object->state]);
506
507 /* if we were still looking up, then we must have a positive lookup
508 * result, in which case there may be data available */
509 spin_lock(&object->lock);
510 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
511 fscache_stat(&fscache_n_object_lookups_positive);
512
513 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
514
515 object->state = FSCACHE_OBJECT_AVAILABLE;
516 spin_unlock(&object->lock);
517
518 smp_mb__before_clear_bit();
519 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
520 smp_mb__after_clear_bit();
521 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
522 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
523 } else {
524 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
525 fscache_stat(&fscache_n_object_created);
526
527 object->state = FSCACHE_OBJECT_AVAILABLE;
528 spin_unlock(&object->lock);
529 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
530 smp_wmb();
531 }
532
533 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
534 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
535
536 _leave("");
537}
538EXPORT_SYMBOL(fscache_obtained_object);
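The two exported helpers above are intended to be called from a cache backend's lookup path. A hedged sketch of that contract as backend-side pseudocode; the example_* names are illustrative, and the shape loosely follows what the CacheFiles backend elsewhere in this merge does:

/* Sketch of a cache backend's ->lookup_object() driving the helpers
 * above (example_* names are illustrative).
 */
static void example_lookup_object(struct fscache_object *object)
{
	if (!example_find_on_disk(object)) {
		/* nothing on disk: lets waiting readers fail with
		 * -ENODATA and lets writes start stacking up while the
		 * object transits to FSCACHE_OBJECT_CREATING */
		fscache_object_lookup_negative(object);
		example_create_on_disk(object);
	}

	/* lookup and/or creation complete: wakes waiters and moves the
	 * object towards FSCACHE_OBJECT_AVAILABLE */
	fscache_obtained_object(object);
}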
539
540/*
541 * handle an object that has just become available
542 */
543static void fscache_object_available(struct fscache_object *object)
544{
545 _enter("{OBJ%x}", object->debug_id);
546
547 spin_lock(&object->lock);
548
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551
552 fscache_done_parent_op(object);
553 if (object->n_in_progress == 0) {
554 if (object->n_ops > 0) {
555 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
556 ASSERTIF(object->n_ops > object->n_obj_ops,
557 !list_empty(&object->pending_ops));
558 fscache_start_operations(object);
559 } else {
560 ASSERT(list_empty(&object->pending_ops));
561 }
562 }
563 spin_unlock(&object->lock);
564
565 object->cache->ops->lookup_complete(object);
566 fscache_enqueue_dependents(object);
567
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
569 fscache_stat(&fscache_n_object_avail);
570
571 _leave("");
572}
573
574/*
575 * drop an object's attachments
576 */
577static void fscache_drop_object(struct fscache_object *object)
578{
579 struct fscache_object *parent = object->parent;
580 struct fscache_cache *cache = object->cache;
581
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583
584 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock);
587
588 cache->ops->drop_object(object);
589
590 if (parent) {
591 _debug("release parent OBJ%x {%d}",
592 parent->debug_id, parent->n_children);
593
594 spin_lock(&parent->lock);
595 parent->n_children--;
596 if (parent->n_children == 0)
597 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
598 spin_unlock(&parent->lock);
599 object->parent = NULL;
600 }
601
602 /* this just shifts the object release to the slow work processor */
603 object->cache->ops->put_object(object);
604
605 _leave("");
606}
607
608/*
609 * release or recycle an object that the netfs has discarded
610 */
611static void fscache_release_object(struct fscache_object *object)
612{
613 _enter("");
614
615 fscache_drop_object(object);
616}
617
618/*
619 * withdraw an object from active service
620 */
621static void fscache_withdraw_object(struct fscache_object *object)
622{
623 struct fscache_cookie *cookie;
624 bool detached;
625
626 _enter("");
627
628 spin_lock(&object->lock);
629 cookie = object->cookie;
630 if (cookie) {
631 /* need to get the cookie lock before the object lock, starting
632 * from the object pointer */
633 atomic_inc(&cookie->usage);
634 spin_unlock(&object->lock);
635
636 detached = false;
637 spin_lock(&cookie->lock);
638 spin_lock(&object->lock);
639
640 if (object->cookie == cookie) {
641 hlist_del_init(&object->cookie_link);
642 object->cookie = NULL;
643 detached = true;
644 }
645 spin_unlock(&cookie->lock);
646 fscache_cookie_put(cookie);
647 if (detached)
648 fscache_cookie_put(cookie);
649 }
650
651 spin_unlock(&object->lock);
652
653 fscache_drop_object(object);
654}
655
656/*
657 * withdraw an object from active service at the behest of the cache
658 * - need to break the links to a cached object cookie
659 * - called under two situations:
660 * (1) recycler decides to reclaim an in-use object
661 * (2) a cache is unmounted
662 * - have to take care as the cookie may be relinquished by the netfs at
663 * the same time
664 * - the object is pinned by the caller holding a refcount on it
665 */
666void fscache_withdrawing_object(struct fscache_cache *cache,
667 struct fscache_object *object)
668{
669 bool enqueue = false;
670
671 _enter(",OBJ%x", object->debug_id);
672
673 spin_lock(&object->lock);
674 if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
675 object->state = FSCACHE_OBJECT_WITHDRAWING;
676 enqueue = true;
677 }
678 spin_unlock(&object->lock);
679
680 if (enqueue)
681 fscache_enqueue_object(object);
682
683 _leave("");
684}
685
686/*
687 * allow the slow work item processor to get a ref on an object
688 */
689static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{
691 struct fscache_object *object =
692 container_of(work, struct fscache_object, work);
693
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
695}
696
697/*
698 * allow the slow work item processor to discard a ref on a work item
699 */
700static void fscache_object_slow_work_put_ref(struct slow_work *work)
701{
702 struct fscache_object *object =
703 container_of(work, struct fscache_object, work);
704
705 object->cache->ops->put_object(object);
706}
707
708/*
709 * enqueue an object for metadata-type processing
710 */
711void fscache_enqueue_object(struct fscache_object *object)
712{
713 _enter("{OBJ%x}", object->debug_id);
714
715 slow_work_enqueue(&object->work);
716}
717
718/*
719 * enqueue the dependents of an object for metadata-type processing
720 * - the object's lock is taken here, so the caller must not already hold it
721 * - this may cause an already locked object to wind up being processed again
722 */
723static void fscache_enqueue_dependents(struct fscache_object *object)
724{
725 struct fscache_object *dep;
726
727 _enter("{OBJ%x}", object->debug_id);
728
729 if (list_empty(&object->dependents))
730 return;
731
732 spin_lock(&object->lock);
733
734 while (!list_empty(&object->dependents)) {
735 dep = list_entry(object->dependents.next,
736 struct fscache_object, dep_link);
737 list_del_init(&dep->dep_link);
738
739
740 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep);
742 dep->cache->ops->put_object(dep);
743
744 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock);
746 }
747
748 spin_unlock(&object->lock);
749}
750
751/*
752 * remove an object from whatever queue it's waiting on
753 * - the caller must hold object->lock
754 */
755void fscache_dequeue_object(struct fscache_object *object)
756{
757 _enter("{OBJ%x}", object->debug_id);
758
759 if (!list_empty(&object->dep_link)) {
760 spin_lock(&object->parent->lock);
761 list_del_init(&object->dep_link);
762 spin_unlock(&object->parent->lock);
763 }
764
765 _leave("");
766}
767
768/**
769 * fscache_check_aux - Ask the netfs whether an object on disk is still valid
770 * @object: The object to ask about
771 * @data: The auxiliary data for the object
772 * @datalen: The size of the auxiliary data
773 *
775 * This function consults the netfs about the coherency state of an object.
775 */
776enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
777 const void *data, uint16_t datalen)
778{
779 enum fscache_checkaux result;
780
781 if (!object->cookie->def->check_aux) {
782 fscache_stat(&fscache_n_checkaux_none);
783 return FSCACHE_CHECKAUX_OKAY;
784 }
785
786 result = object->cookie->def->check_aux(object->cookie->netfs_data,
787 data, datalen);
788 switch (result) {
789 /* entry okay as is */
790 case FSCACHE_CHECKAUX_OKAY:
791 fscache_stat(&fscache_n_checkaux_okay);
792 break;
793
794 /* entry requires update */
795 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
796 fscache_stat(&fscache_n_checkaux_update);
797 break;
798
799 /* entry requires deletion */
800 case FSCACHE_CHECKAUX_OBSOLETE:
801 fscache_stat(&fscache_n_checkaux_obsolete);
802 break;
803
804 default:
805 BUG();
806 }
807
808 return result;
809}
810EXPORT_SYMBOL(fscache_check_aux);
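A note on the *_transit switches earlier in this file: because the event word is scanned with fls(), the highest-numbered pending, unmasked event wins. A stand-alone demonstration of that selection rule; the bit numbers are illustrative, not the real FSCACHE_OBJECT_EV_* values:

#include <stdio.h>

/* open-coded fls(): index of the highest set bit, 1-based; 0 if none */
static int fls_ul(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

enum { EV_UPDATE = 2, EV_ERROR = 3 };	/* illustrative bit numbers */

int main(void)
{
	unsigned long events = (1UL << EV_UPDATE) | (1UL << EV_ERROR);
	unsigned long mask = ~0UL;

	/* EV_ERROR wins: it occupies the higher bit */
	printf("next: %d\n", fls_ul(events & mask) - 1);	/* 3 */

	mask &= ~(1UL << EV_ERROR);	/* mask the error event off */
	printf("next: %d\n", fls_ul(events & mask) - 1);	/* 2 */

	events = 0;				/* nothing pending */
	printf("next: %d\n", fls_ul(events & mask) - 1);	/* -1: sleep */
	return 0;
}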
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
1/* FS-Cache worker operation management routines
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/operations.txt
12 */
13
14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h>
16#include "internal.h"
17
18atomic_t fscache_op_debug_id;
19EXPORT_SYMBOL(fscache_op_debug_id);
20
21/**
22 * fscache_enqueue_operation - Enqueue an operation for processing
23 * @op: The operation to enqueue
24 *
25 * Enqueue an operation for processing by the FS-Cache thread pool.
26 *
27 * This will get its own ref on the object.
28 */
29void fscache_enqueue_operation(struct fscache_operation *op)
30{
31 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33
34 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0);
37
38 if (list_empty(&op->pend_link)) {
39 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST:
41 _debug("queue fast");
42 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op);
45 break;
46 case FSCACHE_OP_SLOW:
47 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work);
49 break;
50 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention");
52 break;
53 default:
54 printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
55 op->flags);
56 BUG();
57 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 }
61}
62EXPORT_SYMBOL(fscache_enqueue_operation);
63
64/*
65 * start an op running
66 */
67static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op)
69{
70 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
73 if (op->processor)
74 fscache_enqueue_operation(op);
75 fscache_stat(&fscache_n_op_run);
76}
77
78/*
79 * submit an exclusive operation for an object
80 * - other ops are excluded from running simultaneously with this one
81 * - this gets any extra refs it needs on an op
82 */
83int fscache_submit_exclusive_op(struct fscache_object *object,
84 struct fscache_operation *op)
85{
86 int ret;
87
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89
90 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
93
94 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) {
96 op->object = object;
97 object->n_ops++;
98 object->n_exclusive++; /* reads and writes must wait */
99
100 if (object->n_ops > 1) { /* this op is already counted */
101 atomic_inc(&op->usage);
102 list_add_tail(&op->pend_link, &object->pending_ops);
103 fscache_stat(&fscache_n_op_pend);
104 } else if (!list_empty(&object->pending_ops)) {
105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend);
108 fscache_start_operations(object);
109 } else {
110 ASSERTCMP(object->n_in_progress, ==, 0);
111 fscache_run_op(object, op);
112 }
113
114 /* need to issue a new write op after this */
115 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
116 ret = 0;
117 } else if (object->state == FSCACHE_OBJECT_CREATING) {
118 op->object = object;
119 object->n_ops++;
120 object->n_exclusive++; /* reads and writes must wait */
121 atomic_inc(&op->usage);
122 list_add_tail(&op->pend_link, &object->pending_ops);
123 fscache_stat(&fscache_n_op_pend);
124 ret = 0;
125 } else {
126 /* not allowed to submit ops in any other state */
127 BUG();
128 }
129
130 spin_unlock(&object->lock);
131 return ret;
132}
133
134/*
135 * report an unexpected submission
136 */
137static void fscache_report_unexpected_submission(struct fscache_object *object,
138 struct fscache_operation *op,
139 unsigned long ostate)
140{
141 static bool once_only;
142 struct fscache_operation *p;
143 unsigned n;
144
145 if (once_only)
146 return;
147 once_only = true;
148
149 kdebug("unexpected submission OP%x [OBJ%x %s]",
150 op->debug_id, object->debug_id,
151 fscache_object_states[object->state]);
152 kdebug("objstate=%s [%s]",
153 fscache_object_states[object->state],
154 fscache_object_states[ostate]);
155 kdebug("objflags=%lx", object->flags);
156 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
157 kdebug("ops=%u inp=%u exc=%u",
158 object->n_ops, object->n_in_progress, object->n_exclusive);
159
160 if (!list_empty(&object->pending_ops)) {
161 n = 0;
162 list_for_each_entry(p, &object->pending_ops, pend_link) {
163 ASSERTCMP(p->object, ==, object);
164 kdebug("%p %p", op->processor, op->release);
165 n++;
166 }
167
168 kdebug("n=%u", n);
169 }
170
171 dump_stack();
172}
173
174/*
175 * submit an operation for an object
176 * - ops may be submitted only while the object is in one of these states:
177 * - during object creation (write ops may be submitted)
178 * - whilst the object is active
179 * - after an I/O error incurred in one of the two above states (op rejected)
180 * - this gets any extra refs it needs on an op
181 */
182int fscache_submit_op(struct fscache_object *object,
183 struct fscache_operation *op)
184{
185 unsigned long ostate;
186 int ret;
187
188 _enter("{OBJ%x OP%x},{%u}",
189 object->debug_id, op->debug_id, atomic_read(&op->usage));
190
191 ASSERTCMP(atomic_read(&op->usage), >, 0);
192
193 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
196
197 ostate = object->state;
198 smp_rmb();
199
200 if (fscache_object_is_active(object)) {
201 op->object = object;
202 object->n_ops++;
203
204 if (object->n_exclusive > 0) {
205 atomic_inc(&op->usage);
206 list_add_tail(&op->pend_link, &object->pending_ops);
207 fscache_stat(&fscache_n_op_pend);
208 } else if (!list_empty(&object->pending_ops)) {
209 atomic_inc(&op->usage);
210 list_add_tail(&op->pend_link, &object->pending_ops);
211 fscache_stat(&fscache_n_op_pend);
212 fscache_start_operations(object);
213 } else {
214 ASSERTCMP(object->n_exclusive, ==, 0);
215 fscache_run_op(object, op);
216 }
217 ret = 0;
218 } else if (object->state == FSCACHE_OBJECT_CREATING) {
219 op->object = object;
220 object->n_ops++;
221 atomic_inc(&op->usage);
222 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend);
224 ret = 0;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object));
228 ret = -ENOBUFS;
229 } else {
230 ret = -ENOBUFS;
231 }
232
233 spin_unlock(&object->lock);
234 return ret;
235}
236
237/*
238 * queue an object for withdrawal on error, aborting all following asynchronous
239 * operations
240 */
241void fscache_abort_object(struct fscache_object *object)
242{
243 _enter("{OBJ%x}", object->debug_id);
244
245 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
246}
247
248/*
249 * jump start the operation processing on an object
250 * - caller must hold object->lock
251 */
252void fscache_start_operations(struct fscache_object *object)
253{
254 struct fscache_operation *op;
255 bool stop = false;
256
257 while (!list_empty(&object->pending_ops) && !stop) {
258 op = list_entry(object->pending_ops.next,
259 struct fscache_operation, pend_link);
260
261 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
262 if (object->n_in_progress > 0)
263 break;
264 stop = true;
265 }
266 list_del_init(&op->pend_link);
267 object->n_in_progress++;
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273
274 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op);
276 }
277
278 ASSERTCMP(object->n_in_progress, <=, object->n_ops);
279
280 _debug("woke %d ops on OBJ%x",
281 object->n_in_progress, object->debug_id);
282}
283
284/*
285 * release an operation
286 * - queues pending ops if this is the last in-progress op
287 */
288void fscache_put_operation(struct fscache_operation *op)
289{
290 struct fscache_object *object;
291 struct fscache_cache *cache;
292
293 _enter("{OBJ%x OP%x,%d}",
294 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
295
296 ASSERTCMP(atomic_read(&op->usage), >, 0);
297
298 if (!atomic_dec_and_test(&op->usage))
299 return;
300
301 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG();
304
305 fscache_stat(&fscache_n_op_release);
306
307 if (op->release) {
308 op->release(op);
309 op->release = NULL;
310 }
311
312 object = op->object;
313
314 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */
317 if (!spin_trylock(&object->lock)) {
318 _debug("defer put");
319 fscache_stat(&fscache_n_op_deferred_release);
320
321 cache = object->cache;
322 spin_lock(&cache->op_gc_list_lock);
323 list_add_tail(&op->pend_link, &cache->op_gc_list);
324 spin_unlock(&cache->op_gc_list_lock);
325 schedule_work(&cache->op_gc);
326 _leave(" [defer]");
327 return;
328 }
329
330 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
331 ASSERTCMP(object->n_exclusive, >, 0);
332 object->n_exclusive--;
333 }
334
335 ASSERTCMP(object->n_in_progress, >, 0);
336 object->n_in_progress--;
337 if (object->n_in_progress == 0)
338 fscache_start_operations(object);
339
340 ASSERTCMP(object->n_ops, >, 0);
341 object->n_ops--;
342 if (object->n_ops == 0)
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
344
345 spin_unlock(&object->lock);
346
347 kfree(op);
348 _leave(" [done]");
349}
350EXPORT_SYMBOL(fscache_put_operation);
351
352/*
353 * garbage collect operations that have had their release deferred
354 */
355void fscache_operation_gc(struct work_struct *work)
356{
357 struct fscache_operation *op;
358 struct fscache_object *object;
359 struct fscache_cache *cache =
360 container_of(work, struct fscache_cache, op_gc);
361 int count = 0;
362
363 _enter("");
364
365 do {
366 spin_lock(&cache->op_gc_list_lock);
367 if (list_empty(&cache->op_gc_list)) {
368 spin_unlock(&cache->op_gc_list_lock);
369 break;
370 }
371
372 op = list_entry(cache->op_gc_list.next,
373 struct fscache_operation, pend_link);
374 list_del(&op->pend_link);
375 spin_unlock(&cache->op_gc_list_lock);
376
377 object = op->object;
378
379 _debug("GC DEFERRED REL OBJ%x OP%x",
380 object->debug_id, op->debug_id);
381 fscache_stat(&fscache_n_op_gc);
382
383 ASSERTCMP(atomic_read(&op->usage), ==, 0);
384
385 spin_lock(&object->lock);
386 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
387 ASSERTCMP(object->n_exclusive, >, 0);
388 object->n_exclusive--;
389 }
390
391 ASSERTCMP(object->n_in_progress, >, 0);
392 object->n_in_progress--;
393 if (object->n_in_progress == 0)
394 fscache_start_operations(object);
395
396 ASSERTCMP(object->n_ops, >, 0);
397 object->n_ops--;
398 if (object->n_ops == 0)
399 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
400
401 spin_unlock(&object->lock);
402
403 } while (count++ < 20);
404
405 if (!list_empty(&cache->op_gc_list))
406 schedule_work(&cache->op_gc);
407
408 _leave("");
409}
410
411/*
412 * allow the slow work item processor to get a ref on an operation
413 */
414static int fscache_op_get_ref(struct slow_work *work)
415{
416 struct fscache_operation *op =
417 container_of(work, struct fscache_operation, slow_work);
418
419 atomic_inc(&op->usage);
420 return 0;
421}
422
423/*
424 * allow the slow work item processor to discard a ref on an operation
425 */
426static void fscache_op_put_ref(struct slow_work *work)
427{
428 struct fscache_operation *op =
429 container_of(work, struct fscache_operation, slow_work);
430
431 fscache_put_operation(op);
432}
433
434/*
435 * execute an operation using the slow thread pool to provide processing context
436 * - the caller holds a ref to this object, so we don't need to hold one
437 */
438static void fscache_op_execute(struct slow_work *work)
439{
440 struct fscache_operation *op =
441 container_of(work, struct fscache_operation, slow_work);
442 unsigned long start;
443
444 _enter("{OBJ%x OP%x,%d}",
445 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
446
447 ASSERT(op->processor != NULL);
448 start = jiffies;
449 op->processor(op);
450 fscache_hist(fscache_ops_histogram, start);
451
452 _leave("");
453}
454
455const struct slow_work_ops fscache_op_slow_work_ops = {
456 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute,
459};
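Tying operation.c together: a caller allocates an op, initialises it, submits it against an object (which either runs it at once or pends it), and then drops its own reference. A hedged sketch of that lifecycle, modelled on __fscache_attr_changed() in page.c below; the example_* names are illustrative:

/* Sketch of the operation lifecycle (example_* names are illustrative). */
static void example_processor(struct fscache_operation *op)
{
	/* does the actual work; for FSCACHE_OP_SLOW this runs in the
	 * slow-work thread pool via fscache_op_execute() */
}

static int example_submit(struct fscache_cookie *cookie,
			  struct fscache_object *object)
{
	struct fscache_operation *op;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	fscache_operation_init(op, NULL);	/* usage count starts at 1 */
	fscache_operation_init_slow(op, example_processor);
	op->flags = FSCACHE_OP_SLOW;

	/* the real callers submit under the cookie lock */
	spin_lock(&cookie->lock);
	if (fscache_submit_op(object, op) < 0) {
		spin_unlock(&cookie->lock);
		kfree(op);		/* rejected: no refs were taken */
		return -ENOBUFS;
	}
	spin_unlock(&cookie->lock);

	fscache_put_operation(op);	/* drop the allocation ref */
	return 0;
}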
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
1/* Cache page management and data I/O routines
2 *
3 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL PAGE
13#include <linux/module.h>
14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19/*
20 * check to see if a page is being written to the cache
21 */
22bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
23{
24 void *val;
25
26 rcu_read_lock();
27 val = radix_tree_lookup(&cookie->stores, page->index);
28 rcu_read_unlock();
29
30 return val != NULL;
31}
32EXPORT_SYMBOL(__fscache_check_page_write);
33
34/*
35 * wait for a page to finish being written to the cache
36 */
37void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
38{
39 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
40
41 wait_event(*wq, !__fscache_check_page_write(cookie, page));
42}
43EXPORT_SYMBOL(__fscache_wait_on_page_write);
44
45/*
46 * note that a page has finished being written to the cache
47 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
49{
50 struct page *xpage;
51
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock);
55 ASSERT(xpage != NULL);
56
57 wake_up_bit(&cookie->flags, 0);
58}
59
60/*
61 * actually apply the changed attributes to a cache object
62 */
63static void fscache_attr_changed_op(struct fscache_operation *op)
64{
65 struct fscache_object *object = op->object;
66
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68
69 fscache_stat(&fscache_n_attr_changed_calls);
70
71 if (fscache_object_is_active(object) &&
72 object->cache->ops->attr_changed(object) < 0)
73 fscache_abort_object(object);
74
75 _leave("");
76}
77
78/*
79 * notification that the attributes on an object have changed
80 */
81int __fscache_attr_changed(struct fscache_cookie *cookie)
82{
83 struct fscache_operation *op;
84 struct fscache_object *object;
85
86 _enter("%p", cookie);
87
88 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
89
90 fscache_stat(&fscache_n_attr_changed);
91
92 op = kzalloc(sizeof(*op), GFP_KERNEL);
93 if (!op) {
94 fscache_stat(&fscache_n_attr_changed_nomem);
95 _leave(" = -ENOMEM");
96 return -ENOMEM;
97 }
98
99 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
102
103 spin_lock(&cookie->lock);
104
105 if (hlist_empty(&cookie->backing_objects))
106 goto nobufs;
107 object = hlist_entry(cookie->backing_objects.first,
108 struct fscache_object, cookie_link);
109
110 if (fscache_submit_exclusive_op(object, op) < 0)
111 goto nobufs;
112 spin_unlock(&cookie->lock);
113 fscache_stat(&fscache_n_attr_changed_ok);
114 fscache_put_operation(op);
115 _leave(" = 0");
116 return 0;
117
118nobufs:
119 spin_unlock(&cookie->lock);
120 kfree(op);
121 fscache_stat(&fscache_n_attr_changed_nobufs);
122 _leave(" = %d", -ENOBUFS);
123 return -ENOBUFS;
124}
125EXPORT_SYMBOL(__fscache_attr_changed);
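The netfs-facing trigger for the above is the fscache_attr_changed() wrapper, called after changing attributes that the cache mirrors, i_size in particular. A minimal sketch using the 2.6.30-era inode_setattr() API; the example_* names are illustrative:

/* Sketch of the netfs side of attribute-change notification
 * (example_* names are illustrative).
 */
static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct example_inode *ei = example_inode(dentry->d_inode);
	int ret;

	ret = inode_setattr(dentry->d_inode, attr);
	if (ret == 0 && (attr->ia_valid & ATTR_SIZE))
		/* queues the exclusive FSCACHE_OP_SLOW op built above */
		fscache_attr_changed(ei->cache_cookie);
	return ret;
}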
126
127/*
128 * handle secondary execution given to a retrieval op on behalf of the
129 * cache
130 */
131static void fscache_retrieval_work(struct work_struct *work)
132{
133 struct fscache_retrieval *op =
134 container_of(work, struct fscache_retrieval, op.fast_work);
135 unsigned long start;
136
137 _enter("{OP%x}", op->op.debug_id);
138
139 start = jiffies;
140 op->op.processor(&op->op);
141 fscache_hist(fscache_ops_histogram, start);
142 fscache_put_operation(&op->op);
143}
144
145/*
146 * release a retrieval op reference
147 */
148static void fscache_release_retrieval_op(struct fscache_operation *_op)
149{
150 struct fscache_retrieval *op =
151 container_of(_op, struct fscache_retrieval, op);
152
153 _enter("{OP%x}", op->op.debug_id);
154
155 fscache_hist(fscache_retrieval_histogram, op->start_time);
156 if (op->context)
157 fscache_put_context(op->op.object->cookie, op->context);
158
159 _leave("");
160}
161
162/*
163 * allocate a retrieval op
164 */
165static struct fscache_retrieval *fscache_alloc_retrieval(
166 struct address_space *mapping,
167 fscache_rw_complete_t end_io_func,
168 void *context)
169{
170 struct fscache_retrieval *op;
171
172 /* allocate a retrieval operation and attempt to submit it */
173 op = kzalloc(sizeof(*op), GFP_NOIO);
174 if (!op) {
175 fscache_stat(&fscache_n_retrievals_nomem);
176 return NULL;
177 }
178
179 fscache_operation_init(&op->op, fscache_release_retrieval_op);
180 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
181 op->mapping = mapping;
182 op->end_io_func = end_io_func;
183 op->context = context;
184 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do);
187 return op;
188}
189
190/*
191 * wait for a deferred lookup to complete
192 */
193static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
194{
195 unsigned long jif;
196
197 _enter("");
198
199 if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
200 _leave(" = 0 [imm]");
201 return 0;
202 }
203
204 fscache_stat(&fscache_n_retrievals_wait);
205
206 jif = jiffies;
207 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
208 fscache_wait_bit_interruptible,
209 TASK_INTERRUPTIBLE) != 0) {
210 fscache_stat(&fscache_n_retrievals_intr);
211 _leave(" = -ERESTARTSYS");
212 return -ERESTARTSYS;
213 }
214
215 ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
216
217 smp_rmb();
218 fscache_hist(fscache_retrieval_delay_histogram, jif);
219 _leave(" = 0 [dly]");
220 return 0;
221}
222
223/*
224 * read a page from the cache or allocate a block in which to store it
225 * - we return:
226 * -ENOMEM - out of memory, nothing done
227 * -ERESTARTSYS - interrupted
228 * -ENOBUFS - no backing object available in which to cache the block
229 * -ENODATA - no data available in the backing object for this block
230 * 0 - dispatched a read - it'll call end_io_func() when finished
231 */
232int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
233 struct page *page,
234 fscache_rw_complete_t end_io_func,
235 void *context,
236 gfp_t gfp)
237{
238 struct fscache_retrieval *op;
239 struct fscache_object *object;
240 int ret;
241
242 _enter("%p,%p,,,", cookie, page);
243
244 fscache_stat(&fscache_n_retrievals);
245
246 if (hlist_empty(&cookie->backing_objects))
247 goto nobufs;
248
249 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
250 ASSERTCMP(page, !=, NULL);
251
252 if (fscache_wait_for_deferred_lookup(cookie) < 0)
253 return -ERESTARTSYS;
254
255 op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
256 if (!op) {
257 _leave(" = -ENOMEM");
258 return -ENOMEM;
259 }
260
261 spin_lock(&cookie->lock);
262
263 if (hlist_empty(&cookie->backing_objects))
264 goto nobufs_unlock;
265 object = hlist_entry(cookie->backing_objects.first,
266 struct fscache_object, cookie_link);
267
268 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
269
270 if (fscache_submit_op(object, &op->op) < 0)
271 goto nobufs_unlock;
272 spin_unlock(&cookie->lock);
273
274 fscache_stat(&fscache_n_retrieval_ops);
275
276 /* pin the netfs read context in case we need to do the actual netfs
277 * read because we've encountered a cache read failure */
278 fscache_get_context(object->cookie, op->context);
279
280 /* we wait for the operation to become active, and then process it
281 * *here*, in this thread, and not in the thread pool */
282 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
283 _debug(">>> WT");
284 fscache_stat(&fscache_n_retrieval_op_waits);
285 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
286 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
287 _debug("<<< GO");
288 }
289
290 /* ask the cache to honour the operation */
291 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
292 ret = object->cache->ops->allocate_page(op, page, gfp);
293 if (ret == 0)
294 ret = -ENODATA;
295 } else {
296 ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
297 }
298
299 if (ret == -ENOMEM)
300 fscache_stat(&fscache_n_retrievals_nomem);
301 else if (ret == -ERESTARTSYS)
302 fscache_stat(&fscache_n_retrievals_intr);
303 else if (ret == -ENODATA)
304 fscache_stat(&fscache_n_retrievals_nodata);
305 else if (ret < 0)
306 fscache_stat(&fscache_n_retrievals_nobufs);
307 else
308 fscache_stat(&fscache_n_retrievals_ok);
309
310 fscache_put_retrieval(op);
311 _leave(" = %d", ret);
312 return ret;
313
314nobufs_unlock:
315 spin_unlock(&cookie->lock);
316 kfree(op);
317nobufs:
318 fscache_stat(&fscache_n_retrievals_nobufs);
319 _leave(" = -ENOBUFS");
320 return -ENOBUFS;
321}
322EXPORT_SYMBOL(__fscache_read_or_alloc_page);
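From the netfs side, the function above is reached through the fscache_read_or_alloc_page() wrapper, typically from ->readpage(). A sketch of the expected handling of each return code documented above; the example_* names are illustrative:

/* Sketch of a netfs ->readpage() using the call above via its public
 * wrapper (example_* names are illustrative).
 */
static int example_readpage(struct file *file, struct page *page)
{
	struct fscache_cookie *cookie = example_cookie(file->f_mapping->host);
	int ret;

	ret = fscache_read_or_alloc_page(cookie, page,
					 example_read_done, NULL, GFP_KERNEL);
	switch (ret) {
	case 0:
		/* read dispatched: example_read_done() runs on completion */
		return 0;
	case -ENODATA:
		/* a block was allocated but holds no data yet: fetch from
		 * the server, then store it with fscache_write_page() */
	case -ENOBUFS:
		/* no cache available: fall back to the server */
		return example_read_from_server(file, page);
	default:
		return ret;	/* -ENOMEM or -ERESTARTSYS */
	}
}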
323
324/*
324 * read a list of pages from the cache or allocate blocks in which to store
326 * them
327 * - we return:
328 * -ENOMEM - out of memory, some pages may be being read
329 * -ERESTARTSYS - interrupted, some pages may be being read
330 * -ENOBUFS - no backing object or space available in which to cache any
331 * pages not being read
332 * -ENODATA - no data available in the backing object for some or all of
333 * the pages
334 * 0 - dispatched a read on all pages
335 *
336 * end_io_func() will be called for each page read from the cache as it
337 * finishes being read
338 *
339 * any pages for which a read is dispatched will be removed from pages and
340 * nr_pages
341 */
342int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned *nr_pages,
346 fscache_rw_complete_t end_io_func,
347 void *context,
348 gfp_t gfp)
349{
350 fscache_pages_retrieval_func_t func;
351 struct fscache_retrieval *op;
352 struct fscache_object *object;
353 int ret;
354
355 _enter("%p,,%d,,,", cookie, *nr_pages);
356
357 fscache_stat(&fscache_n_retrievals);
358
359 if (hlist_empty(&cookie->backing_objects))
360 goto nobufs;
361
362 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
363 ASSERTCMP(*nr_pages, >, 0);
364 ASSERT(!list_empty(pages));
365
366 if (fscache_wait_for_deferred_lookup(cookie) < 0)
367 return -ERESTARTSYS;
368
369 op = fscache_alloc_retrieval(mapping, end_io_func, context);
370 if (!op)
371 return -ENOMEM;
372
373 spin_lock(&cookie->lock);
374
375 if (hlist_empty(&cookie->backing_objects))
376 goto nobufs_unlock;
377 object = hlist_entry(cookie->backing_objects.first,
378 struct fscache_object, cookie_link);
379
380 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock;
382 spin_unlock(&cookie->lock);
383
384 fscache_stat(&fscache_n_retrieval_ops);
385
386 /* pin the netfs read context in case we need to do the actual netfs
387 * read because we've encountered a cache read failure */
388 fscache_get_context(object->cookie, op->context);
389
390 /* we wait for the operation to become active, and then process it
391 * *here*, in this thread, and not in the thread pool */
392 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
393 _debug(">>> WT");
394 fscache_stat(&fscache_n_retrieval_op_waits);
395 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
396 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
397 _debug("<<< GO");
398 }
399
400 /* ask the cache to honour the operation */
401 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
402 func = object->cache->ops->allocate_pages;
403 else
404 func = object->cache->ops->read_or_alloc_pages;
405 ret = func(op, pages, nr_pages, gfp);
406
407 if (ret == -ENOMEM)
408 fscache_stat(&fscache_n_retrievals_nomem);
409 else if (ret == -ERESTARTSYS)
410 fscache_stat(&fscache_n_retrievals_intr);
411 else if (ret == -ENODATA)
412 fscache_stat(&fscache_n_retrievals_nodata);
413 else if (ret < 0)
414 fscache_stat(&fscache_n_retrievals_nobufs);
415 else
416 fscache_stat(&fscache_n_retrievals_ok);
417
418 fscache_put_retrieval(op);
419 _leave(" = %d", ret);
420 return ret;
421
422nobufs_unlock:
423 spin_unlock(&cookie->lock);
424 kfree(op);
425nobufs:
426 fscache_stat(&fscache_n_retrievals_nobufs);
427 _leave(" = -ENOBUFS");
428 return -ENOBUFS;
429}
430EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
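
The multi-page variant is used the same way from a netfs's ->readpages(); the key difference, noted in the comment above, is that pages the cache takes are removed from the list and deducted from *nr_pages, so whatever remains must be fetched from the server. A hedged sketch (my_cookie, my_read_done and my_fetch_list_from_server are again hypothetical):

/* Illustrative sketch -- not part of this patch */
static int my_readpages(struct file *file, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
{
	int ret;

	ret = fscache_read_or_alloc_pages(my_cookie, mapping, pages,
					  &nr_pages, my_read_done, NULL,
					  GFP_KERNEL);
	switch (ret) {
	case 0:		/* the cache took all of them */
		return 0;
	case -ENODATA:
	case -ENOBUFS:	/* read whatever is left on *pages from the server */
		return my_fetch_list_from_server(mapping, pages, nr_pages);
	default:
		return ret;
	}
}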
431
432/*
433 * allocate a block in the cache on which to store a page
434 * - we return:
435 * -ENOMEM - out of memory, nothing done
436 * -ERESTARTSYS - interrupted
437 * -ENOBUFS - no backing object available in which to cache the block
438 * 0 - block allocated
439 */
440int __fscache_alloc_page(struct fscache_cookie *cookie,
441 struct page *page,
442 gfp_t gfp)
443{
444 struct fscache_retrieval *op;
445 struct fscache_object *object;
446 int ret;
447
448 _enter("%p,%p,,,", cookie, page);
449
450 fscache_stat(&fscache_n_allocs);
451
452 if (hlist_empty(&cookie->backing_objects))
453 goto nobufs;
454
455 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
456 ASSERTCMP(page, !=, NULL);
457
458 if (fscache_wait_for_deferred_lookup(cookie) < 0)
459 return -ERESTARTSYS;
460
461 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
462 if (!op)
463 return -ENOMEM;
464
465 spin_lock(&cookie->lock);
466
467 if (hlist_empty(&cookie->backing_objects))
468 goto nobufs_unlock;
469 object = hlist_entry(cookie->backing_objects.first,
470 struct fscache_object, cookie_link);
471
472 if (fscache_submit_op(object, &op->op) < 0)
473 goto nobufs_unlock;
474 spin_unlock(&cookie->lock);
475
476 fscache_stat(&fscache_n_alloc_ops);
477
478 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
479 _debug(">>> WT");
480 fscache_stat(&fscache_n_alloc_op_waits);
481 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
482 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
483 _debug("<<< GO");
484 }
485
486 /* ask the cache to honour the operation */
487 ret = object->cache->ops->allocate_page(op, page, gfp);
488
489 if (ret < 0)
490 fscache_stat(&fscache_n_allocs_nobufs);
491 else
492 fscache_stat(&fscache_n_allocs_ok);
493
494 fscache_put_retrieval(op);
495 _leave(" = %d", ret);
496 return ret;
497
498nobufs_unlock:
499 spin_unlock(&cookie->lock);
500 kfree(op);
501nobufs:
502 fscache_stat(&fscache_n_allocs_nobufs);
503 _leave(" = -ENOBUFS");
504 return -ENOBUFS;
505}
506EXPORT_SYMBOL(__fscache_alloc_page);
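
A netfs that has just generated a page locally (rather than read it) might reserve space with fscache_alloc_page() and then push the data out with fscache_write_page(). This pairing is an assumption about intended use, not something the code above mandates; my_cookie is hypothetical:

/* Illustrative sketch -- not part of this patch */
static void my_cache_fresh_page(struct page *page)
{
	if (fscache_alloc_page(my_cookie, page, GFP_KERNEL) < 0)
		return;		/* no cache space; carry on uncached */
	if (fscache_write_page(my_cookie, page, GFP_KERNEL) < 0)
		fscache_uncache_page(my_cookie, page);
}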
507
508/*
509 * release a write op reference
510 */
511static void fscache_release_write_op(struct fscache_operation *_op)
512{
513 _enter("{OP%x}", _op->debug_id);
514}
515
516/*
517 * perform the background storage of a page into the cache
518 */
519static void fscache_write_op(struct fscache_operation *_op)
520{
521 struct fscache_storage *op =
522 container_of(_op, struct fscache_storage, op);
523 struct fscache_object *object = op->op.object;
524 struct fscache_cookie *cookie = object->cookie;
525 struct page *page;
526 unsigned n;
527 void *results[1];
528 int ret;
529
530 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
531
532 spin_lock(&cookie->lock);
533 spin_lock(&object->lock);
534
535 if (!fscache_object_is_active(object)) {
536 spin_unlock(&object->lock);
537 spin_unlock(&cookie->lock);
538 _leave("");
539 return;
540 }
541
542 fscache_stat(&fscache_n_store_calls);
543
544 /* find a page to store */
545 page = NULL;
546 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
547 FSCACHE_COOKIE_PENDING_TAG);
548 if (n != 1)
549 goto superseded;
550 page = results[0];
551 _debug("gang %d [%lx]", n, page->index);
552 if (page->index > op->store_limit)
553 goto superseded;
554
555 radix_tree_tag_clear(&cookie->stores, page->index,
556 FSCACHE_COOKIE_PENDING_TAG);
557
558 spin_unlock(&object->lock);
559 spin_unlock(&cookie->lock);
560
561 if (page) {
562 ret = object->cache->ops->write_page(op, page);
563 fscache_end_page_write(cookie, page);
564 page_cache_release(page);
565 if (ret < 0)
566 fscache_abort_object(object);
567 else
568 fscache_enqueue_operation(&op->op);
569 }
570
571 _leave("");
572 return;
573
574superseded:
575 /* this writer is going away and there aren't any more things to
576 * write */
577 _debug("cease");
578 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
579 spin_unlock(&object->lock);
580 spin_unlock(&cookie->lock);
581 _leave("");
582}
583
584/*
585 * request a page be stored in the cache
586 * - returns:
587 * -ENOMEM - out of memory, nothing done
588 * -ENOBUFS - no backing object available in which to cache the page
589 * 0 - dispatched a write - it'll call end_io_func() when finished
590 *
591 * if the cookie still has a backing object at this point, that object can be
592 * in one of a few states with respect to storage processing:
593 *
594 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
595 * set)
596 *
597 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
598 * fill op)
599 *
600 * (b) writes deferred till post-creation (mark page for writing and
601 * return immediately)
602 *
603 * (2) negative lookup, object created, initial fill being made from netfs
604 * (FSCACHE_COOKIE_INITIAL_FILL is set)
605 *
606 * (a) fill point not yet reached this page (mark page for writing and
607 * return)
608 *
609 * (b) fill point passed this page (queue op to store this page)
610 *
611 * (3) object extant (queue op to store this page)
612 *
613 * any other state is invalid
614 */
615int __fscache_write_page(struct fscache_cookie *cookie,
616 struct page *page,
617 gfp_t gfp)
618{
619 struct fscache_storage *op;
620 struct fscache_object *object;
621 int ret;
622
623 _enter("%p,%x,", cookie, (u32) page->flags);
624
625 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
626 ASSERT(PageFsCache(page));
627
628 fscache_stat(&fscache_n_stores);
629
630 op = kzalloc(sizeof(*op), GFP_NOIO);
631 if (!op)
632 goto nomem;
633
634 fscache_operation_init(&op->op, fscache_release_write_op);
635 fscache_operation_init_slow(&op->op, fscache_write_op);
636 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
637
638 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
639 if (ret < 0)
640 goto nomem_free;
641
642 ret = -ENOBUFS;
643 spin_lock(&cookie->lock);
644
645 if (hlist_empty(&cookie->backing_objects))
646 goto nobufs;
647 object = hlist_entry(cookie->backing_objects.first,
648 struct fscache_object, cookie_link);
649 if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
650 goto nobufs;
651
652 /* add the page to the pending-storage radix tree on the backing
653 * object */
654 spin_lock(&object->lock);
655
656 _debug("store limit %llx", (unsigned long long) object->store_limit);
657
658 ret = radix_tree_insert(&cookie->stores, page->index, page);
659 if (ret < 0) {
660 if (ret == -EEXIST)
661 goto already_queued;
662 _debug("insert failed %d", ret);
663 goto nobufs_unlock_obj;
664 }
665
666 radix_tree_tag_set(&cookie->stores, page->index,
667 FSCACHE_COOKIE_PENDING_TAG);
668 page_cache_get(page);
669
670 /* we only want one writer at a time, but we do need to queue new
671 * writers after exclusive ops */
672 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
673 goto already_pending;
674
675 spin_unlock(&object->lock);
676
677 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
678 op->store_limit = object->store_limit;
679
680 if (fscache_submit_op(object, &op->op) < 0)
681 goto submit_failed;
682
683 spin_unlock(&cookie->lock);
684 radix_tree_preload_end();
685 fscache_stat(&fscache_n_store_ops);
686 fscache_stat(&fscache_n_stores_ok);
687
688 /* the slow work queue now carries its own ref on the object */
689 fscache_put_operation(&op->op);
690 _leave(" = 0");
691 return 0;
692
693already_queued:
694 fscache_stat(&fscache_n_stores_again);
695already_pending:
696 spin_unlock(&object->lock);
697 spin_unlock(&cookie->lock);
698 radix_tree_preload_end();
699 kfree(op);
700 fscache_stat(&fscache_n_stores_ok);
701 _leave(" = 0");
702 return 0;
703
704submit_failed:
705 radix_tree_delete(&cookie->stores, page->index);
706 page_cache_release(page);
707 ret = -ENOBUFS;
708 goto nobufs;
709
710nobufs_unlock_obj:
711 spin_unlock(&object->lock);
712nobufs:
713 spin_unlock(&cookie->lock);
714 radix_tree_preload_end();
715 kfree(op);
716 fscache_stat(&fscache_n_stores_nobufs);
717 _leave(" = -ENOBUFS");
718 return -ENOBUFS;
719
720nomem_free:
721 kfree(op);
722nomem:
723 fscache_stat(&fscache_n_stores_oom);
724 _leave(" = -ENOMEM");
725 return -ENOMEM;
726}
727EXPORT_SYMBOL(__fscache_write_page);
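
From the netfs side the contract above reduces to: a page carrying PG_fscache may be handed to fscache_write_page(), and on failure the mark must be surrendered with fscache_uncache_page(). A minimal sketch, modelled on the AFS usage elsewhere in this merge (my_cookie is hypothetical):

/* Illustrative sketch -- not part of this patch */
static void my_store_page(struct page *page)
{
	int ret;

	ret = fscache_write_page(my_cookie, page, GFP_KERNEL);
	if (ret != 0) {
		/* -ENOMEM or -ENOBUFS: nothing was queued */
		fscache_uncache_page(my_cookie, page);
		BUG_ON(PageFsCache(page));
	}
}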
728
729/*
730 * remove a page from the cache
731 */
732void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
733{
734 struct fscache_object *object;
735
736 _enter(",%p", page);
737
738 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
739 ASSERTCMP(page, !=, NULL);
740
741 fscache_stat(&fscache_n_uncaches);
742
743 /* cache withdrawal may beat us to it */
744 if (!PageFsCache(page))
745 goto done;
746
747 /* get the object */
748 spin_lock(&cookie->lock);
749
750 if (hlist_empty(&cookie->backing_objects)) {
751 ClearPageFsCache(page);
752 goto done_unlock;
753 }
754
755 object = hlist_entry(cookie->backing_objects.first,
756 struct fscache_object, cookie_link);
757
758 /* there might now be stuff on disk we could read */
759 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
760
761 /* only invoke the cache backend if we managed to mark the page
762 * uncached here; this deals with synchronisation vs withdrawal */
763 if (TestClearPageFsCache(page) &&
764 object->cache->ops->uncache_page) {
765 /* the cache backend releases the cookie lock */
766 object->cache->ops->uncache_page(object, page);
767 goto done;
768 }
769
770done_unlock:
771 spin_unlock(&cookie->lock);
772done:
773 _leave("");
774}
775EXPORT_SYMBOL(__fscache_uncache_page);
776
777/**
778 * fscache_mark_pages_cached - Mark pages as being cached
779 * @op: The retrieval op pages are being marked for
780 * @pagevec: The pages to be marked
781 *
782 * Mark a bunch of netfs pages as being cached. After this is called,
783 * the netfs must call fscache_uncache_page() to remove the mark.
784 */
785void fscache_mark_pages_cached(struct fscache_retrieval *op,
786 struct pagevec *pagevec)
787{
788 struct fscache_cookie *cookie = op->op.object->cookie;
789 unsigned long loop;
790
791#ifdef CONFIG_FSCACHE_STATS
792 atomic_add(pagevec->nr, &fscache_n_marks);
793#endif
794
795 for (loop = 0; loop < pagevec->nr; loop++) {
796 struct page *page = pagevec->pages[loop];
797
798 _debug("- mark %p{%lx}", page, page->index);
799 if (TestSetPageFsCache(page)) {
800 static bool once_only;
801 if (!once_only) {
802 once_only = true;
803 printk(KERN_WARNING "FS-Cache:"
804 " Cookie type %s marked page %lx"
805 " multiple times\n",
806 cookie->def->name, page->index);
807 }
808 }
809 }
810
811 if (cookie->def->mark_pages_cached)
812 cookie->def->mark_pages_cached(cookie->netfs_data,
813 op->mapping, pagevec);
814 pagevec_reinit(pagevec);
815}
816EXPORT_SYMBOL(fscache_mark_pages_cached);
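
The mark_pages_cached() hook invoked at the end of this function is the optional per-cookie callback a netfs may supply in its cookie definition. A hedged sketch of such a definition (the my_* names are hypothetical; only the field names come from the FS-Cache netfs API):

/* Illustrative sketch -- not part of this patch */
static void my_mark_pages_cached(void *cookie_netfs_data,
				 struct address_space *mapping,
				 struct pagevec *cached_pvec)
{
	/* record that these pages now carry PG_fscache and will need
	 * fscache_uncache_page() before they are finally released */
}

static const struct fscache_cookie_def my_data_cookie_def = {
	.name			= "myfs.data",
	.type			= FSCACHE_COOKIE_TYPE_DATAFILE,
	.mark_pages_cached	= my_mark_pages_cached,
};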
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
1/* FS-Cache statistics viewing interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL OPERATION
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * initialise the /proc/fs/fscache/ directory
20 */
21int __init fscache_proc_init(void)
22{
23 _enter("");
24
25 if (!proc_mkdir("fs/fscache", NULL))
26 goto error_dir;
27
28#ifdef CONFIG_FSCACHE_STATS
29 if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
30 &fscache_stats_fops))
31 goto error_stats;
32#endif
33
34#ifdef CONFIG_FSCACHE_HISTOGRAM
35 if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
36 &fscache_histogram_fops))
37 goto error_histogram;
38#endif
39
40 _leave(" = 0");
41 return 0;
42
43#ifdef CONFIG_FSCACHE_HISTOGRAM
44error_histogram:
45#endif
46#ifdef CONFIG_FSCACHE_STATS
47 remove_proc_entry("fs/fscache/stats", NULL);
48error_stats:
49#endif
50 remove_proc_entry("fs/fscache", NULL);
51error_dir:
52 _leave(" = -ENOMEM");
53 return -ENOMEM;
54}
55
56/*
57 * clean up the /proc/fs/fscache/ directory
58 */
59void fscache_proc_cleanup(void)
60{
61#ifdef CONFIG_FSCACHE_HISTOGRAM
62 remove_proc_entry("fs/fscache/histogram", NULL);
63#endif
64#ifdef CONFIG_FSCACHE_STATS
65 remove_proc_entry("fs/fscache/stats", NULL);
66#endif
67 remove_proc_entry("fs/fscache", NULL);
68}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
1/* FS-Cache statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * operation counters
20 */
21atomic_t fscache_n_op_pend;
22atomic_t fscache_n_op_run;
23atomic_t fscache_n_op_enqueue;
24atomic_t fscache_n_op_requeue;
25atomic_t fscache_n_op_deferred_release;
26atomic_t fscache_n_op_release;
27atomic_t fscache_n_op_gc;
28
29atomic_t fscache_n_attr_changed;
30atomic_t fscache_n_attr_changed_ok;
31atomic_t fscache_n_attr_changed_nobufs;
32atomic_t fscache_n_attr_changed_nomem;
33atomic_t fscache_n_attr_changed_calls;
34
35atomic_t fscache_n_allocs;
36atomic_t fscache_n_allocs_ok;
37atomic_t fscache_n_allocs_wait;
38atomic_t fscache_n_allocs_nobufs;
39atomic_t fscache_n_alloc_ops;
40atomic_t fscache_n_alloc_op_waits;
41
42atomic_t fscache_n_retrievals;
43atomic_t fscache_n_retrievals_ok;
44atomic_t fscache_n_retrievals_wait;
45atomic_t fscache_n_retrievals_nodata;
46atomic_t fscache_n_retrievals_nobufs;
47atomic_t fscache_n_retrievals_intr;
48atomic_t fscache_n_retrievals_nomem;
49atomic_t fscache_n_retrieval_ops;
50atomic_t fscache_n_retrieval_op_waits;
51
52atomic_t fscache_n_stores;
53atomic_t fscache_n_stores_ok;
54atomic_t fscache_n_stores_again;
55atomic_t fscache_n_stores_nobufs;
56atomic_t fscache_n_stores_oom;
57atomic_t fscache_n_store_ops;
58atomic_t fscache_n_store_calls;
59
60atomic_t fscache_n_marks;
61atomic_t fscache_n_uncaches;
62
63atomic_t fscache_n_acquires;
64atomic_t fscache_n_acquires_null;
65atomic_t fscache_n_acquires_no_cache;
66atomic_t fscache_n_acquires_ok;
67atomic_t fscache_n_acquires_nobufs;
68atomic_t fscache_n_acquires_oom;
69
70atomic_t fscache_n_updates;
71atomic_t fscache_n_updates_null;
72atomic_t fscache_n_updates_run;
73
74atomic_t fscache_n_relinquishes;
75atomic_t fscache_n_relinquishes_null;
76atomic_t fscache_n_relinquishes_waitcrt;
77
78atomic_t fscache_n_cookie_index;
79atomic_t fscache_n_cookie_data;
80atomic_t fscache_n_cookie_special;
81
82atomic_t fscache_n_object_alloc;
83atomic_t fscache_n_object_no_alloc;
84atomic_t fscache_n_object_lookups;
85atomic_t fscache_n_object_lookups_negative;
86atomic_t fscache_n_object_lookups_positive;
87atomic_t fscache_n_object_created;
88atomic_t fscache_n_object_avail;
89atomic_t fscache_n_object_dead;
90
91atomic_t fscache_n_checkaux_none;
92atomic_t fscache_n_checkaux_okay;
93atomic_t fscache_n_checkaux_update;
94atomic_t fscache_n_checkaux_obsolete;
95
96/*
97 * display the general statistics
98 */
99static int fscache_stats_show(struct seq_file *m, void *v)
100{
101 seq_puts(m, "FS-Cache statistics\n");
102
103 seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
104 atomic_read(&fscache_n_cookie_index),
105 atomic_read(&fscache_n_cookie_data),
106 atomic_read(&fscache_n_cookie_special));
107
108 seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
109 atomic_read(&fscache_n_object_alloc),
110 atomic_read(&fscache_n_object_no_alloc),
111 atomic_read(&fscache_n_object_avail),
112 atomic_read(&fscache_n_object_dead));
113 seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
114 atomic_read(&fscache_n_checkaux_none),
115 atomic_read(&fscache_n_checkaux_okay),
116 atomic_read(&fscache_n_checkaux_update),
117 atomic_read(&fscache_n_checkaux_obsolete));
118
119 seq_printf(m, "Pages : mrk=%u unc=%u\n",
120 atomic_read(&fscache_n_marks),
121 atomic_read(&fscache_n_uncaches));
122
123 seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
124 " oom=%u\n",
125 atomic_read(&fscache_n_acquires),
126 atomic_read(&fscache_n_acquires_null),
127 atomic_read(&fscache_n_acquires_no_cache),
128 atomic_read(&fscache_n_acquires_ok),
129 atomic_read(&fscache_n_acquires_nobufs),
130 atomic_read(&fscache_n_acquires_oom));
131
132 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
133 atomic_read(&fscache_n_object_lookups),
134 atomic_read(&fscache_n_object_lookups_negative),
135 atomic_read(&fscache_n_object_lookups_positive),
136 atomic_read(&fscache_n_object_created));
137
138 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
139 atomic_read(&fscache_n_updates),
140 atomic_read(&fscache_n_updates_null),
141 atomic_read(&fscache_n_updates_run));
142
143 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
144 atomic_read(&fscache_n_relinquishes),
145 atomic_read(&fscache_n_relinquishes_null),
146 atomic_read(&fscache_n_relinquishes_waitcrt));
147
148 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
149 atomic_read(&fscache_n_attr_changed),
150 atomic_read(&fscache_n_attr_changed_ok),
151 atomic_read(&fscache_n_attr_changed_nobufs),
152 atomic_read(&fscache_n_attr_changed_nomem),
153 atomic_read(&fscache_n_attr_changed_calls));
154
155 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
156 atomic_read(&fscache_n_allocs),
157 atomic_read(&fscache_n_allocs_ok),
158 atomic_read(&fscache_n_allocs_wait),
159 atomic_read(&fscache_n_allocs_nobufs));
160 seq_printf(m, "Allocs : ops=%u owt=%u\n",
161 atomic_read(&fscache_n_alloc_ops),
162 atomic_read(&fscache_n_alloc_op_waits));
163
164 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
165 " int=%u oom=%u\n",
166 atomic_read(&fscache_n_retrievals),
167 atomic_read(&fscache_n_retrievals_ok),
168 atomic_read(&fscache_n_retrievals_wait),
169 atomic_read(&fscache_n_retrievals_nodata),
170 atomic_read(&fscache_n_retrievals_nobufs),
171 atomic_read(&fscache_n_retrievals_intr),
172 atomic_read(&fscache_n_retrievals_nomem));
173 seq_printf(m, "Retrvls: ops=%u owt=%u\n",
174 atomic_read(&fscache_n_retrieval_ops),
175 atomic_read(&fscache_n_retrieval_op_waits));
176
177 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
178 atomic_read(&fscache_n_stores),
179 atomic_read(&fscache_n_stores_ok),
180 atomic_read(&fscache_n_stores_again),
181 atomic_read(&fscache_n_stores_nobufs),
182 atomic_read(&fscache_n_stores_oom));
183 seq_printf(m, "Stores : ops=%u run=%u\n",
184 atomic_read(&fscache_n_store_ops),
185 atomic_read(&fscache_n_store_calls));
186
187 seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
188 atomic_read(&fscache_n_op_pend),
189 atomic_read(&fscache_n_op_run),
190 atomic_read(&fscache_n_op_enqueue));
191 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
192 atomic_read(&fscache_n_op_deferred_release),
193 atomic_read(&fscache_n_op_release),
194 atomic_read(&fscache_n_op_gc));
195 return 0;
196}
197
198/*
199 * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
200 */
201static int fscache_stats_open(struct inode *inode, struct file *file)
202{
203 return single_open(file, fscache_stats_show, NULL);
204}
205
206const struct file_operations fscache_stats_fops = {
207 .owner = THIS_MODULE,
208 .open = fscache_stats_open,
209 .read = seq_read,
210 .llseek = seq_lseek,
211 .release = seq_release,
212};
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06da05261e04..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4e340fedf768..2b25133524a3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
-	req->out.argpages = 1;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	attr_ver = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 	inarg->flags = file ? file->f_flags : 0;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->in.argpages = 1;
 	req->in.numargs = 2;
 	if (fc->minor < 9)
 		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	size_t count = 0;
 	int err;
 
+	req->in.argpages = 1;
 	req->page_offset = offset;
 
 	do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned nbytes, int write)
+			       unsigned *nbytesp, int write)
 {
+	unsigned nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
 
-	/* This doesn't work with nfsd */
-	if (!current->mm)
-		return -EPERM;
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		return 0;
+	}
 
 	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
				0, req->pages, NULL);
 	up_read(&current->mm->mmap_sem);
 	if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	req->num_pages = npages;
 	req->page_offset = offset;
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+	*nbytesp = min(*nbytesp, nbytes);
+
 	return 0;
 }
 
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 
 	while (count) {
 		size_t nres;
-		size_t nbytes_limit = min(count, nmax);
-		size_t nbytes;
-		int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
-		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes_limit, nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes,
 					       current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
 
 	copy_highpage(tmp_page, page);
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_offset = 0;
@@ -1274,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	return generic_file_mmap(file, vma);
+}
+
 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
 {
@@ -1908,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
+	.mmap		= fuse_direct_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
 	.release	= fuse_release,
@@ -1917,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
-	/* no mmap and splice_read */
+	/* no splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops = {
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa763..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
 static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = HFS_SB(sb)->fs_ablocks;
 	buf->f_ffree = HFS_SB(sb)->free_ablocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFS_NAMELEN;
 
 	return 0;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 
 	opts->creator = HFSPLUS_DEF_CR_TYPE;
 	opts->type = HFSPLUS_DEF_CR_TYPE;
-	opts->umask = current->fs->umask;
+	opts->umask = current_umask();
 	opts->uid = current_uid();
 	opts->gid = current_gid();
 	opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
 	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
 
 	return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fecf402d7b8a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	lock_kernel();
 
 	/*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->sb_n_free;
 	buf->f_files = sbi->sb_dirband_size / 4;
 	buf->f_ffree = sbi->sb_n_free_dnodes;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = 254;
 
 	unlock_kernel();
@@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	uid = current_uid();
 	gid = current_gid();
-	umask = current->fs->umask;
+	umask = current_umask();
 	lowercase = 0;
 	conv = CONV_BINARY;
 	eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
 			       "errno = %d\n", err);
 			return err;
 		}
-		count = hppfs_read_file(hppfs->host_fd, buf, count);
+		err = hppfs_read_file(hppfs->host_fd, buf, count);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+			return err;
+		}
+		count = err;
 		if (count > 0)
 			*ppos += count;
 	}
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct linux_binprm;
+struct path;
 
 /*
  * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern int check_unsafe_exec(struct linux_binprm *);
 
 /*
  * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 13d2eddd0692..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -923,6 +923,7 @@ out_freesbi:
 static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = 0;
 	buf->f_files = ISOFS_SB(sb)->s_ninodes;
 	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
 	return (ret == -EIO);
 }
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+				   int write_op)
 {
 	int i;
 
 	for (i = 0; i < bufs; i++) {
 		wbuf[i]->b_end_io = end_buffer_write_sync;
 		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
+		submit_bh(write_op, wbuf[i]);
 	}
 }
 
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
  * Submit all the data buffers to disk
  */
 static int journal_submit_data_buffers(journal_t *journal,
-				       transaction_t *commit_transaction)
+				       transaction_t *commit_transaction,
+				       int write_op)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
 			BUFFER_TRACE(bh, "needs blocking lock");
 			spin_unlock(&journal->j_list_lock);
 			/* Write out all data to prevent deadlocks */
-			journal_do_submit_data(wbuf, bufs);
+			journal_do_submit_data(wbuf, bufs, write_op);
 			bufs = 0;
 			lock_buffer(bh);
 			spin_lock(&journal->j_list_lock);
@@ -256,7 +259,7 @@ write_out_data:
 			jbd_unlock_bh_state(bh);
 			if (bufs == journal->j_wbufsize) {
 				spin_unlock(&journal->j_list_lock);
-				journal_do_submit_data(wbuf, bufs);
+				journal_do_submit_data(wbuf, bufs, write_op);
 				bufs = 0;
 				goto write_out_data;
 			}
@@ -286,7 +289,7 @@ write_out_data:
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
-	journal_do_submit_data(wbuf, bufs);
+	journal_do_submit_data(wbuf, bufs, write_op);
 
 	return err;
 }
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
+	int write_op = WRITE;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -347,6 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
+	if (commit_transaction->t_synchronous_commit)
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -431,7 +442,8 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists. Data blocks go first.
	 */
-	err = journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction,
+					  write_op);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -660,7 +672,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(write_op, bh);
 			}
 			cond_resched();
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 		return NULL;
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
 	lock_buffer(bh);
 	memset(bh->b_data, 0, journal->j_blocksize);
 	set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		journal = NULL;
-		goto out;
+		goto out_err;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	journal->j_maxlen = len;
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
 	return journal;
+out_err:
+	kfree(journal);
+	return NULL;
 }
 
 /**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
 	return journal;
+out_err:
+	kfree(journal);
+	return NULL;
 }
810 824
811/* 825/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
 		}
 	}
 
+	if (handle->h_sync)
+		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 		set_buffer_uptodate(bh);
 		bh->b_end_io = journal_end_buffer_io_sync;
 
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 		if (ret) {
 			unlock_buffer(bh);
 			return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
 	size_t s;
 
 	size -= sizeof(struct jffs2_acl_header);
-	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
-	if (s < 0) {
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
 		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
 		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
 		return s / sizeof(struct jffs2_acl_entry) + 4;
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		return PTR_ERR(acl);
 
 	if (!acl) {
-		*i_mode &= ~current->fs->umask;
+		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
 			jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 {
 	struct jffs2_xattr_datum *xd;
-	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", xd);
 
-	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
 	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
 struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 {
 	struct jffs2_xattr_ref *ref;
-	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", ref);
 
-	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
 	ref->node = (void *)ref;
 	return ref;
 }
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e82..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 cleanup:
 		posix_acl_release(acl);
 	} else
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 
 	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
 			       inode->i_mode;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
  * possibly a read which collects the result - which is stored in a
  * file-local buffer.
  */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
 	struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
 EXPORT_SYMBOL(simple_transaction_read);
 EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
+			/*
+			 * If this is a blocking request for an
+			 * already pending lock request then we need
+			 * to put it back on lockd's block list
+			 */
+			if (wait)
+				break;
 			ret = nlm_lck_denied;
-			break;
+			goto out;
 		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
-	ret = nlm_lck_denied;
-	if (!wait)
-		goto out;
-
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 618865b3128b..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -321,15 +321,20 @@ out:
321 321
322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) 322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
323{ 323{
324 struct minix_sb_info *sbi = minix_sb(dentry->d_sb); 324 struct super_block *sb = dentry->d_sb;
325 buf->f_type = dentry->d_sb->s_magic; 325 struct minix_sb_info *sbi = minix_sb(sb);
326 buf->f_bsize = dentry->d_sb->s_blocksize; 326 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
327 buf->f_type = sb->s_magic;
328 buf->f_bsize = sb->s_blocksize;
327 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 329 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
328 buf->f_bfree = minix_count_free_blocks(sbi); 330 buf->f_bfree = minix_count_free_blocks(sbi);
329 buf->f_bavail = buf->f_bfree; 331 buf->f_bavail = buf->f_bfree;
330 buf->f_files = sbi->s_ninodes; 332 buf->f_files = sbi->s_ninodes;
331 buf->f_ffree = minix_count_free_inodes(sbi); 333 buf->f_ffree = minix_count_free_inodes(sbi);
332 buf->f_namelen = sbi->s_namelen; 334 buf->f_namelen = sbi->s_namelen;
335 buf->f_fsid.val[0] = (u32)id;
336 buf->f_fsid.val[1] = (u32)(id >> 32);
337
333 return 0; 338 return 0;
334} 339}
335 340
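
minix_statfs() now reports a filesystem id derived from the backing device: huge_encode_dev() yields a 64-bit value that is split into the two 32-bit halves of f_fsid. From userspace the new field is visible via statfs(2); a quick check, with a hypothetical mount point (__val is glibc's spelling of the fsid_t member):

	#include <stdio.h>
	#include <sys/statfs.h>

	int main(void)
	{
		struct statfs sfs;

		/* /mnt/minix is a hypothetical mount point. */
		if (statfs("/mnt/minix", &sfs) == 0)
			printf("fsid = %08x:%08x\n",
			       (unsigned)sfs.f_fsid.__val[0],
			       (unsigned)sfs.f_fsid.__val[1]);
		return 0;
	}
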
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
94 93
95static struct bio * 94static struct bio *
96mpage_alloc(struct block_device *bdev, 95mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
439 * just allocate full-size (16-page) BIOs. 438 * just allocate full-size (16-page) BIOs.
440 */ 439 */
441 440
442int __mpage_writepage(struct page *page, struct writeback_control *wbc, 441struct mpage_data {
442 struct bio *bio;
443 sector_t last_block_in_bio;
444 get_block_t *get_block;
445 unsigned use_writepage;
446};
447
448static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 449 void *data)
444{ 450{
445 struct mpage_data *mpd = data; 451 struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
648 mpd->bio = bio; 654 mpd->bio = bio;
649 return ret; 655 return ret;
650} 656}
651EXPORT_SYMBOL(__mpage_writepage);
652 657
653/** 658/**
654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
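
mpage_bio_submit() and __mpage_writepage() lose their EXPORT_SYMBOLs and become static, so mpage_writepages() is now the only way in; the mpage_data bookkeeping struct moves with them. A filesystem consumer is unaffected as long as it goes through the public helper, roughly as follows (example_get_block is hypothetical):

	static int example_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
	{
		/* mpage_writepages() walks the dirty pages and batches
		 * them into BIOs via the now-private __mpage_writepage(). */
		return mpage_writepages(mapping, wbc, example_get_block);
	}

	static const struct address_space_operations example_aops = {
		.writepages	= example_writepages,
		/* ... */
	};
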
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785d..b8433ebfae05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -1578,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1578 struct dentry *dir = nd->path.dentry; 1579 struct dentry *dir = nd->path.dentry;
1579 1580
1580 if (!IS_POSIXACL(dir->d_inode)) 1581 if (!IS_POSIXACL(dir->d_inode))
1581 mode &= ~current->fs->umask; 1582 mode &= ~current_umask();
1582 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1583 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1583 if (error) 1584 if (error)
1584 goto out_unlock; 1585 goto out_unlock;
@@ -1989,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1989 goto out_unlock; 1990 goto out_unlock;
1990 } 1991 }
1991 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1992 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1992 mode &= ~current->fs->umask; 1993 mode &= ~current_umask();
1993 error = may_mknod(mode); 1994 error = may_mknod(mode);
1994 if (error) 1995 if (error)
1995 goto out_dput; 1996 goto out_dput;
@@ -2067,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2067 goto out_unlock; 2068 goto out_unlock;
2068 2069
2069 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2070 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2070 mode &= ~current->fs->umask; 2071 mode &= ~current_umask();
2071 error = mnt_want_write(nd.path.mnt); 2072 error = mnt_want_write(nd.path.mnt);
2072 if (error) 2073 if (error)
2073 goto out_dput; 2074 goto out_dput;
@@ -2897,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink);
2897EXPORT_SYMBOL(vfs_unlink); 2898EXPORT_SYMBOL(vfs_unlink);
2898EXPORT_SYMBOL(dentry_unhash); 2899EXPORT_SYMBOL(dentry_unhash);
2899EXPORT_SYMBOL(generic_readlink); 2900EXPORT_SYMBOL(generic_readlink);
2900
2901/* to be mentioned only in INIT_TASK */
2902struct fs_struct init_fs = {
2903 .count = ATOMIC_INIT(1),
2904 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2905 .umask = 0022,
2906};
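
The init_fs definition removed here is not dropped from the kernel: the diffstat at the top of this merge adds fs/fs_struct.c, where the initial task's fs_struct presumably now lives in the same shape:

	/* Presumed new home, fs/fs_struct.c (not shown in this section);
	 * body identical to the lines removed above. */
	struct fs_struct init_fs = {
		.count	= ATOMIC_INIT(1),
		.lock	= __RW_LOCK_UNLOCKED(init_fs.lock),
		.umask	= 0022,
	};
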
diff --git a/fs/namespace.c b/fs/namespace.c
index 0a42e0e96027..c6f54e4c4290 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
27#include <linux/ramfs.h> 27#include <linux/ramfs.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/idr.h> 29#include <linux/idr.h>
30#include <linux/fs_struct.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/unistd.h> 32#include <asm/unistd.h>
32#include "pnode.h" 33#include "pnode.h"
@@ -2093,66 +2094,6 @@ out1:
2093} 2094}
2094 2095
2095/* 2096/*
2096 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
2097 * It can block. Requires the big lock held.
2098 */
2099void set_fs_root(struct fs_struct *fs, struct path *path)
2100{
2101 struct path old_root;
2102
2103 write_lock(&fs->lock);
2104 old_root = fs->root;
2105 fs->root = *path;
2106 path_get(path);
2107 write_unlock(&fs->lock);
2108 if (old_root.dentry)
2109 path_put(&old_root);
2110}
2111
2112/*
2113 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
2114 * It can block. Requires the big lock held.
2115 */
2116void set_fs_pwd(struct fs_struct *fs, struct path *path)
2117{
2118 struct path old_pwd;
2119
2120 write_lock(&fs->lock);
2121 old_pwd = fs->pwd;
2122 fs->pwd = *path;
2123 path_get(path);
2124 write_unlock(&fs->lock);
2125
2126 if (old_pwd.dentry)
2127 path_put(&old_pwd);
2128}
2129
2130static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2131{
2132 struct task_struct *g, *p;
2133 struct fs_struct *fs;
2134
2135 read_lock(&tasklist_lock);
2136 do_each_thread(g, p) {
2137 task_lock(p);
2138 fs = p->fs;
2139 if (fs) {
2140 atomic_inc(&fs->count);
2141 task_unlock(p);
2142 if (fs->root.dentry == old_root->dentry
2143 && fs->root.mnt == old_root->mnt)
2144 set_fs_root(fs, new_root);
2145 if (fs->pwd.dentry == old_root->dentry
2146 && fs->pwd.mnt == old_root->mnt)
2147 set_fs_pwd(fs, new_root);
2148 put_fs_struct(fs);
2149 } else
2150 task_unlock(p);
2151 } while_each_thread(g, p);
2152 read_unlock(&tasklist_lock);
2153}
2154
2155/*
2156 * pivot_root Semantics: 2097 * pivot_root Semantics:
2157 * Moves the root file system of the current process to the directory put_old, 2098 * Moves the root file system of the current process to the directory put_old,
2158 * makes new_root as the new root file system of the current process, and sets 2099 * makes new_root as the new root file system of the current process, and sets
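
set_fs_root(), set_fs_pwd() and chroot_fs_refs() likewise move out of fs/namespace.c; the <linux/fs_struct.h> include added at the top of this file suggests callers now pick the helpers up from there. Presumed declarations — the bodies match the code removed above:

	/* Presumed contents of <linux/fs_struct.h> after the move; note
	 * that chroot_fs_refs(), static here until now, needs a visible
	 * declaration so pivot_root() in this file can still call it. */
	extern void set_fs_root(struct fs_struct *fs, struct path *path);
	extern void set_fs_pwd(struct fs_struct *fs, struct path *path);
	extern void chroot_fs_refs(struct path *old_root, struct path *new_root);
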
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
84 <file:Documentation/filesystems/nfsroot.txt>. 84 <file:Documentation/filesystems/nfsroot.txt>.
85 85
86 Most people say N here. 86 Most people say N here.
87
88config NFS_FSCACHE
89 bool "Provide NFS client caching support (EXPERIMENTAL)"
90 depends on EXPERIMENTAL
91 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
92 help
93 Say Y here if you want NFS data to be cached locally on disc through
 94 the general filesystem cache manager.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 callback.o callback_xdr.o callback_proc.o \ 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_SYSCTL) += sysctl.o 17nfs-$(CONFIG_SYSCTL) += sysctl.o
18nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index aba38017bdef..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
45#include "delegation.h" 45#include "delegation.h"
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h"
48 49
49#define NFSDBG_FACILITY NFSDBG_CLIENT 50#define NFSDBG_FACILITY NFSDBG_CLIENT
50 51
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
154 if (!IS_ERR(cred)) 155 if (!IS_ERR(cred))
155 clp->cl_machine_cred = cred; 156 clp->cl_machine_cred = cred;
156 157
158 nfs_fscache_get_client_cookie(clp);
159
157 return clp; 160 return clp;
158 161
159error_3: 162error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
187 190
188 nfs4_shutdown_client(clp); 191 nfs4_shutdown_client(clp);
189 192
193 nfs_fscache_release_client_cookie(clp);
194
190 /* -EIO all pending I/O */ 195 /* -EIO all pending I/O */
191 if (!IS_ERR(clp->cl_rpcclient)) 196 if (!IS_ERR(clp->cl_rpcclient))
192 rpc_shutdown_client(clp->cl_rpcclient); 197 rpc_shutdown_client(clp->cl_rpcclient);
@@ -760,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
760 765
761 /* Initialise the client representation from the mount data */ 766 /* Initialise the client representation from the mount data */
762 server->flags = data->flags; 767 server->flags = data->flags;
768 server->options = data->options;
763 769
764 if (data->rsize) 770 if (data->rsize)
765 server->rsize = nfs_block_size(data->rsize, NULL); 771 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1148,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
1148 /* Initialise the client representation from the mount data */ 1154 /* Initialise the client representation from the mount data */
1149 server->flags = data->flags; 1155 server->flags = data->flags;
1150 server->caps |= NFS_CAP_ATOMIC_OPEN; 1156 server->caps |= NFS_CAP_ATOMIC_OPEN;
1157 server->options = data->options;
1151 1158
1152 /* Get a client record */ 1159 /* Get a client record */
1153 error = nfs4_set_client(server, 1160 error = nfs4_set_client(server,
@@ -1559,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1559 1566
1560 /* display header on line 1 */ 1567 /* display header on line 1 */
1561 if (v == &nfs_volume_list) { 1568 if (v == &nfs_volume_list) {
1562 seq_puts(m, "NV SERVER PORT DEV FSID\n"); 1569 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
1563 return 0; 1570 return 0;
1564 } 1571 }
1565 /* display one transport per line on subsequent lines */ 1572 /* display one transport per line on subsequent lines */
@@ -1573,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1573 (unsigned long long) server->fsid.major, 1580 (unsigned long long) server->fsid.major,
1574 (unsigned long long) server->fsid.minor); 1581 (unsigned long long) server->fsid.minor);
1575 1582
1576 seq_printf(m, "v%u %s %s %-7s %-17s\n", 1583 seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
1577 clp->rpc_ops->version, 1584 clp->rpc_ops->version,
1578 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1585 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1579 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), 1586 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1580 dev, 1587 dev,
1581 fsid); 1588 fsid,
1589 nfs_server_fscache_state(server));
1582 1590
1583 return 0; 1591 return 0;
1584} 1592}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0abf3f331f56..5a97bcfe03e5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
35#include "delegation.h" 35#include "delegation.h"
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h"
38 39
39#define NFSDBG_FACILITY NFSDBG_FILE 40#define NFSDBG_FACILITY NFSDBG_FILE
40 41
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
409 return copied; 410 return copied;
410} 411}
411 412
413/*
414 * Partially or wholly invalidate a page
415 * - Release the private state associated with a page if undergoing complete
416 * page invalidation
417 * - Called if either PG_private or PG_fscache is set on the page
418 * - Caller holds page lock
419 */
412static void nfs_invalidate_page(struct page *page, unsigned long offset) 420static void nfs_invalidate_page(struct page *page, unsigned long offset)
413{ 421{
414 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 422 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
417 return; 425 return;
418 /* Cancel any unstarted writes on this page */ 426 /* Cancel any unstarted writes on this page */
419 nfs_wb_page_cancel(page->mapping->host, page); 427 nfs_wb_page_cancel(page->mapping->host, page);
428
429 nfs_fscache_invalidate_page(page, page->mapping->host);
420} 430}
421 431
432/*
433 * Attempt to release the private state associated with a page
434 * - Called if either PG_private or PG_fscache is set on the page
435 * - Caller holds page lock
436 * - Return true (may release page) or false (may not)
437 */
422static int nfs_release_page(struct page *page, gfp_t gfp) 438static int nfs_release_page(struct page *page, gfp_t gfp)
423{ 439{
424 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 440 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
425 441
426 /* If PagePrivate() is set, then the page is not freeable */ 442 /* If PagePrivate() is set, then the page is not freeable */
427 return 0; 443 if (PagePrivate(page))
444 return 0;
445 return nfs_fscache_release_page(page, gfp);
428} 446}
429 447
448/*
449 * Attempt to clear the private state associated with a page when an error
450 * occurs that requires the cached contents of an inode to be written back or
451 * destroyed
 452 * - Called if either PG_private or PG_fscache is set on the page
453 * - Caller holds page lock
454 * - Return 0 if successful, -error otherwise
455 */
430static int nfs_launder_page(struct page *page) 456static int nfs_launder_page(struct page *page)
431{ 457{
432 struct inode *inode = page->mapping->host; 458 struct inode *inode = page->mapping->host;
459 struct nfs_inode *nfsi = NFS_I(inode);
433 460
434 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 461 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
435 inode->i_ino, (long long)page_offset(page)); 462 inode->i_ino, (long long)page_offset(page));
436 463
464 nfs_fscache_wait_on_page_write(nfsi, page);
437 return nfs_wb_page(inode, page); 465 return nfs_wb_page(inode, page);
438} 466}
439 467
@@ -451,6 +479,11 @@ const struct address_space_operations nfs_file_aops = {
451 .launder_page = nfs_launder_page, 479 .launder_page = nfs_launder_page,
452}; 480};
453 481
482/*
483 * Notification that a PTE pointing to an NFS page is about to be made
484 * writable, implying that someone is about to modify the page through a
485 * shared-writable mapping
486 */
454static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 487static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
455{ 488{
456 struct page *page = vmf->page; 489 struct page *page = vmf->page;
@@ -465,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
465 filp->f_mapping->host->i_ino, 498 filp->f_mapping->host->i_ino,
466 (long long)page_offset(page)); 499 (long long)page_offset(page));
467 500
501 /* make sure the cache has finished storing the page */
502 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
503
468 lock_page(page); 504 lock_page(page);
469 mapping = page->mapping; 505 mapping = page->mapping;
470 if (mapping != dentry->d_inode->i_mapping) 506 if (mapping != dentry->d_inode->i_mapping)
@@ -480,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
480 goto out_unlock; 516 goto out_unlock;
481 517
482 ret = nfs_updatepage(filp, page, 0, pagelen); 518 ret = nfs_updatepage(filp, page, 0, pagelen);
483 if (ret == 0)
484 ret = pagelen;
485out_unlock: 519out_unlock:
486 unlock_page(page); 520 unlock_page(page);
487 if (ret) 521 if (ret)
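
nfs_release_page() keeps refusing to free pages with attached private state but now defers PG_fscache pages to nfs_fscache_release_page(). For context, the VM-side consumer of that return value looks roughly like this (paraphrased from mm/filemap.c; not part of this diff):

	int try_to_release_page(struct page *page, gfp_t gfp_mask)
	{
		struct address_space *mapping = page->mapping;

		if (PageWriteback(page))
			return 0;

		/* a zero return from ->releasepage means "still busy"
		 * and the page stays in the page cache */
		if (mapping && mapping->a_ops->releasepage)
			return mapping->a_ops->releasepage(page, gfp_mask);
		return try_to_free_buffers(page);
	}
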
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
1/* NFS FS-Cache index structure definition
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19
20#include "internal.h"
21#include "fscache.h"
22
23#define NFSDBG_FACILITY NFSDBG_FSCACHE
24
25/*
26 * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
27 * the cookie for the top-level index object for NFS into here. The top-level
 28 * index can then have other cache objects inserted into it.
29 */
30struct fscache_netfs nfs_fscache_netfs = {
31 .name = "nfs",
32 .version = 0,
33};
34
35/*
36 * Register NFS for caching
37 */
38int nfs_fscache_register(void)
39{
40 return fscache_register_netfs(&nfs_fscache_netfs);
41}
42
43/*
44 * Unregister NFS for caching
45 */
46void nfs_fscache_unregister(void)
47{
48 fscache_unregister_netfs(&nfs_fscache_netfs);
49}
50
51/*
52 * Layout of the key for an NFS server cache object.
53 */
54struct nfs_server_key {
55 uint16_t nfsversion; /* NFS protocol version */
56 uint16_t family; /* address family */
57 uint16_t port; /* IP port */
58 union {
59 struct in_addr ipv4_addr; /* IPv4 address */
60 struct in6_addr ipv6_addr; /* IPv6 address */
61 } addr[0];
62};
63
64/*
65 * Generate a key to describe a server in the main NFS index
66 * - We return the length of the key, or 0 if we can't generate one
67 */
68static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
69 void *buffer, uint16_t bufmax)
70{
71 const struct nfs_client *clp = cookie_netfs_data;
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
73 const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key);
76
 77 memset(key, 0, len);
 78
 79 key->nfsversion = clp->rpc_ops->version;
 80 key->family = clp->cl_addr.ss_family;
81
82 switch (clp->cl_addr.ss_family) {
83 case AF_INET:
84 key->port = sin->sin_port;
85 key->addr[0].ipv4_addr = sin->sin_addr;
86 len += sizeof(key->addr[0].ipv4_addr);
87 break;
88
89 case AF_INET6:
90 key->port = sin6->sin6_port;
91 key->addr[0].ipv6_addr = sin6->sin6_addr;
92 len += sizeof(key->addr[0].ipv6_addr);
93 break;
94
95 default:
96 printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
97 clp->cl_addr.ss_family);
98 len = 0;
99 break;
100 }
101
102 return len;
103}
104
105/*
106 * Define the server object for FS-Cache. This is used to describe a server
107 * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
108 * server address parameters.
109 */
110const struct fscache_cookie_def nfs_fscache_server_index_def = {
111 .name = "NFS.server",
112 .type = FSCACHE_COOKIE_TYPE_INDEX,
113 .get_key = nfs_server_get_key,
114};
115
116/*
117 * Generate a key to describe a superblock key in the main NFS index
118 */
119static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
120 void *buffer, uint16_t bufmax)
121{
122 const struct nfs_fscache_key *key;
123 const struct nfs_server *nfss = cookie_netfs_data;
124 uint16_t len;
125
126 key = nfss->fscache_key;
127 len = sizeof(key->key) + key->key.uniq_len;
128 if (len > bufmax) {
129 len = 0;
130 } else {
131 memcpy(buffer, &key->key, sizeof(key->key));
132 memcpy(buffer + sizeof(key->key),
133 key->key.uniquifier, key->key.uniq_len);
134 }
135
136 return len;
137}
138
139/*
140 * Define the superblock object for FS-Cache. This is used to describe a
141 * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
142 * parameters that might cause a separate superblock.
143 */
144const struct fscache_cookie_def nfs_fscache_super_index_def = {
145 .name = "NFS.super",
146 .type = FSCACHE_COOKIE_TYPE_INDEX,
147 .get_key = nfs_super_get_key,
148};
149
150/*
151 * Definition of the auxiliary data attached to NFS inode storage objects
152 * within the cache.
153 *
154 * The contents of this struct are recorded in the on-disk local cache in the
155 * auxiliary data attached to the data storage object backing an inode. This
156 * permits coherency to be managed when a new inode binds to an already extant
157 * cache object.
158 */
159struct nfs_fscache_inode_auxdata {
160 struct timespec mtime;
161 struct timespec ctime;
162 loff_t size;
163 u64 change_attr;
164};
165
166/*
167 * Generate a key to describe an NFS inode in an NFS server's index
168 */
169static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
170 void *buffer, uint16_t bufmax)
171{
172 const struct nfs_inode *nfsi = cookie_netfs_data;
173 uint16_t nsize;
174
175 /* use the inode's NFS filehandle as the key */
176 nsize = nfsi->fh.size;
177 memcpy(buffer, nfsi->fh.data, nsize);
178 return nsize;
179}
180
181/*
182 * Get certain file attributes from the netfs data
183 * - This function can be absent for an index
184 * - Not permitted to return an error
185 * - The netfs data from the cookie being used as the source is presented
186 */
187static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
188 uint64_t *size)
189{
190 const struct nfs_inode *nfsi = cookie_netfs_data;
191
192 *size = nfsi->vfs_inode.i_size;
193}
194
195/*
196 * Get the auxiliary data from netfs data
197 * - This function can be absent if the index carries no state data
198 * - Should store the auxiliary data in the buffer
 199 * - Should return the amount of data stored
200 * - Not permitted to return an error
201 * - The netfs data from the cookie being used as the source is presented
202 */
203static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
204 void *buffer, uint16_t bufmax)
205{
206 struct nfs_fscache_inode_auxdata auxdata;
207 const struct nfs_inode *nfsi = cookie_netfs_data;
208
209 memset(&auxdata, 0, sizeof(auxdata));
210 auxdata.size = nfsi->vfs_inode.i_size;
211 auxdata.mtime = nfsi->vfs_inode.i_mtime;
212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr;
216
217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata);
219
220 memcpy(buffer, &auxdata, bufmax);
221 return bufmax;
222}
223
224/*
225 * Consult the netfs about the state of an object
226 * - This function can be absent if the index carries no state data
227 * - The netfs data from the cookie being used as the target is
228 * presented, as is the auxiliary data
229 */
230static
231enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
232 const void *data,
233 uint16_t datalen)
234{
235 struct nfs_fscache_inode_auxdata auxdata;
236 struct nfs_inode *nfsi = cookie_netfs_data;
237
238 if (datalen != sizeof(auxdata))
239 return FSCACHE_CHECKAUX_OBSOLETE;
240
241 memset(&auxdata, 0, sizeof(auxdata));
242 auxdata.size = nfsi->vfs_inode.i_size;
243 auxdata.mtime = nfsi->vfs_inode.i_mtime;
244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr;
248
249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE;
251
252 return FSCACHE_CHECKAUX_OKAY;
253}
254
255/*
256 * Indication from FS-Cache that the cookie is no longer cached
257 * - This function is called when the backing store currently caching a cookie
258 * is removed
259 * - The netfs should use this to clean up any markers indicating cached pages
260 * - This is mandatory for any object that may have data
261 */
262static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
263{
264 struct nfs_inode *nfsi = cookie_netfs_data;
265 struct pagevec pvec;
266 pgoff_t first;
267 int loop, nr_pages;
268
269 pagevec_init(&pvec, 0);
270 first = 0;
271
272 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
273
274 for (;;) {
275 /* grab a bunch of pages to unmark */
276 nr_pages = pagevec_lookup(&pvec,
277 nfsi->vfs_inode.i_mapping,
278 first,
279 PAGEVEC_SIZE - pagevec_count(&pvec));
280 if (!nr_pages)
281 break;
282
283 for (loop = 0; loop < nr_pages; loop++)
284 ClearPageFsCache(pvec.pages[loop]);
285
286 first = pvec.pages[nr_pages - 1]->index + 1;
287
288 pvec.nr = nr_pages;
289 pagevec_release(&pvec);
290 cond_resched();
291 }
292}
293
294/*
295 * Get an extra reference on a read context.
296 * - This function can be absent if the completion function doesn't require a
297 * context.
298 * - The read context is passed back to NFS in the event that a data read on the
299 * cache fails with EIO - in which case the server must be contacted to
300 * retrieve the data, which requires the read context for security.
301 */
302static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
303{
304 get_nfs_open_context(context);
305}
306
307/*
308 * Release an extra reference on a read context.
309 * - This function can be absent if the completion function doesn't require a
310 * context.
311 */
312static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
313{
314 if (context)
315 put_nfs_open_context(context);
316}
317
318/*
319 * Define the inode object for FS-Cache. This is used to describe an inode
320 * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
321 * an inode.
322 *
323 * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
324 * held in the cache auxiliary data for the data storage object with those in
325 * the inode struct in memory.
326 */
327const struct fscache_cookie_def nfs_fscache_inode_object_def = {
328 .name = "NFS.fh",
329 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
330 .get_key = nfs_fscache_inode_get_key,
331 .get_attr = nfs_fscache_inode_get_attr,
332 .get_aux = nfs_fscache_inode_get_aux,
333 .check_aux = nfs_fscache_inode_check_aux,
334 .now_uncached = nfs_fscache_inode_now_uncached,
335 .get_context = nfs_fh_get_context,
336 .put_context = nfs_fh_put_context,
337};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
1/* NFS filesystem cache interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19#include <linux/seq_file.h>
20
21#include "internal.h"
22#include "iostat.h"
23#include "fscache.h"
24
25#define NFSDBG_FACILITY NFSDBG_FSCACHE
26
27static struct rb_root nfs_fscache_keys = RB_ROOT;
28static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
29
30/*
31 * Get the per-client index cookie for an NFS client if the appropriate mount
32 * flag was set
33 * - We always try and get an index cookie for the client, but get filehandle
34 * cookies on a per-superblock basis, depending on the mount flags
35 */
36void nfs_fscache_get_client_cookie(struct nfs_client *clp)
37{
38 /* create a cache index for looking up filehandles */
39 clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
40 &nfs_fscache_server_index_def,
41 clp);
42 dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
43 clp, clp->fscache);
44}
45
46/*
47 * Dispose of a per-client cookie
48 */
49void nfs_fscache_release_client_cookie(struct nfs_client *clp)
50{
51 dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
52 clp, clp->fscache);
53
54 fscache_relinquish_cookie(clp->fscache, 0);
55 clp->fscache = NULL;
56}
57
58/*
59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us.
61 */
62void nfs_fscache_get_super_cookie(struct super_block *sb,
63 struct nfs_parsed_mount_data *data)
64{
65 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen;
70
71 ulen = strlen(uniq);
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key)
74 return;
75
76 key->nfs_client = nfss->nfs_client;
77 key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
78 key->key.nfs_server.flags = nfss->flags;
79 key->key.nfs_server.rsize = nfss->rsize;
80 key->key.nfs_server.wsize = nfss->wsize;
81 key->key.nfs_server.acregmin = nfss->acregmin;
82 key->key.nfs_server.acregmax = nfss->acregmax;
83 key->key.nfs_server.acdirmin = nfss->acdirmin;
84 key->key.nfs_server.acdirmax = nfss->acdirmax;
85 key->key.nfs_server.fsid = nfss->fsid;
86 key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
87
88 key->key.uniq_len = ulen;
89 memcpy(key->key.uniquifier, uniq, ulen);
90
91 spin_lock(&nfs_fscache_keys_lock);
92 p = &nfs_fscache_keys.rb_node;
93 parent = NULL;
94 while (*p) {
95 parent = *p;
96 xkey = rb_entry(parent, struct nfs_fscache_key, node);
97
98 if (key->nfs_client < xkey->nfs_client)
99 goto go_left;
100 if (key->nfs_client > xkey->nfs_client)
101 goto go_right;
102
103 diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
104 if (diff < 0)
105 goto go_left;
106 if (diff > 0)
107 goto go_right;
108
109 if (key->key.uniq_len == 0)
110 goto non_unique;
111 diff = memcmp(key->key.uniquifier,
112 xkey->key.uniquifier,
113 key->key.uniq_len);
114 if (diff < 0)
115 goto go_left;
116 if (diff > 0)
117 goto go_right;
118 goto non_unique;
119
120 go_left:
121 p = &(*p)->rb_left;
122 continue;
123 go_right:
124 p = &(*p)->rb_right;
125 }
126
127 rb_link_node(&key->node, parent, p);
128 rb_insert_color(&key->node, &nfs_fscache_keys);
129 spin_unlock(&nfs_fscache_keys_lock);
130 nfss->fscache_key = key;
131
132 /* create a cache index for looking up filehandles */
133 nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
134 &nfs_fscache_super_index_def,
135 nfss);
136 dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
137 nfss, nfss->fscache);
138 return;
139
140non_unique:
141 spin_unlock(&nfs_fscache_keys_lock);
142 kfree(key);
143 nfss->fscache_key = NULL;
144 nfss->fscache = NULL;
145 printk(KERN_WARNING "NFS:"
146 " Cache request denied due to non-unique superblock keys\n");
147}
148
149/*
150 * release a per-superblock cookie
151 */
152void nfs_fscache_release_super_cookie(struct super_block *sb)
153{
154 struct nfs_server *nfss = NFS_SB(sb);
155
156 dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
157 nfss, nfss->fscache);
158
159 fscache_relinquish_cookie(nfss->fscache, 0);
160 nfss->fscache = NULL;
161
162 if (nfss->fscache_key) {
163 spin_lock(&nfs_fscache_keys_lock);
164 rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
165 spin_unlock(&nfs_fscache_keys_lock);
166 kfree(nfss->fscache_key);
167 nfss->fscache_key = NULL;
168 }
169}
170
171/*
172 * Initialise the per-inode cache cookie pointer for an NFS inode.
173 */
174void nfs_fscache_init_inode_cookie(struct inode *inode)
175{
176 NFS_I(inode)->fscache = NULL;
177 if (S_ISREG(inode->i_mode))
178 set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
179}
180
181/*
182 * Get the per-inode cache cookie for an NFS inode.
183 */
184static void nfs_fscache_enable_inode_cookie(struct inode *inode)
185{
186 struct super_block *sb = inode->i_sb;
187 struct nfs_inode *nfsi = NFS_I(inode);
188
189 if (nfsi->fscache || !NFS_FSCACHE(inode))
190 return;
191
192 if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
193 nfsi->fscache = fscache_acquire_cookie(
194 NFS_SB(sb)->fscache,
195 &nfs_fscache_inode_object_def,
196 nfsi);
197
198 dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
199 sb, nfsi, nfsi->fscache);
200 }
201}
202
203/*
204 * Release a per-inode cookie.
205 */
206void nfs_fscache_release_inode_cookie(struct inode *inode)
207{
208 struct nfs_inode *nfsi = NFS_I(inode);
209
210 dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
211 nfsi, nfsi->fscache);
212
213 fscache_relinquish_cookie(nfsi->fscache, 0);
214 nfsi->fscache = NULL;
215}
216
217/*
218 * Retire a per-inode cookie, destroying the data attached to it.
219 */
220void nfs_fscache_zap_inode_cookie(struct inode *inode)
221{
222 struct nfs_inode *nfsi = NFS_I(inode);
223
224 dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
225 nfsi, nfsi->fscache);
226
227 fscache_relinquish_cookie(nfsi->fscache, 1);
228 nfsi->fscache = NULL;
229}
230
231/*
232 * Turn off the cache with regard to a per-inode cookie if opened for writing,
233 * invalidating all the pages in the page cache relating to the associated
234 * inode to clear the per-page caching.
235 */
236static void nfs_fscache_disable_inode_cookie(struct inode *inode)
237{
238 clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
239
240 if (NFS_I(inode)->fscache) {
241 dfprintk(FSCACHE,
242 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
243
244 /* Need to invalidate any mapped pages that were read in before
245 * turning off the cache.
246 */
247 if (inode->i_mapping && inode->i_mapping->nrpages)
248 invalidate_inode_pages2(inode->i_mapping);
249
250 nfs_fscache_zap_inode_cookie(inode);
251 }
252}
253
254/*
255 * wait_on_bit() sleep function for uninterruptible waiting
256 */
257static int nfs_fscache_wait_bit(void *flags)
258{
259 schedule();
260 return 0;
261}
262
263/*
264 * Lock against someone else trying to also acquire or relinquish a cookie
265 */
266static inline void nfs_fscache_inode_lock(struct inode *inode)
267{
268 struct nfs_inode *nfsi = NFS_I(inode);
269
270 while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
271 wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
272 nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
273}
274
275/*
276 * Unlock cookie management lock
277 */
278static inline void nfs_fscache_inode_unlock(struct inode *inode)
279{
280 struct nfs_inode *nfsi = NFS_I(inode);
281
282 smp_mb__before_clear_bit();
283 clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
286}
287
288/*
289 * Decide if we should enable or disable local caching for this inode.
290 * - For now, with NFS, only regular files that are open read-only will be able
291 * to use the cache.
292 * - May be invoked multiple times in parallel by parallel nfs_open() functions.
293 */
294void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
295{
296 if (NFS_FSCACHE(inode)) {
297 nfs_fscache_inode_lock(inode);
298 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
299 nfs_fscache_disable_inode_cookie(inode);
300 else
301 nfs_fscache_enable_inode_cookie(inode);
302 nfs_fscache_inode_unlock(inode);
303 }
304}
305
306/*
307 * Replace a per-inode cookie due to revalidation detecting a file having
308 * changed on the server.
309 */
310void nfs_fscache_reset_inode_cookie(struct inode *inode)
311{
312 struct nfs_inode *nfsi = NFS_I(inode);
313 struct nfs_server *nfss = NFS_SERVER(inode);
314 struct fscache_cookie *old = nfsi->fscache;
315
316 nfs_fscache_inode_lock(inode);
317 if (nfsi->fscache) {
318 /* retire the current fscache cache and get a new one */
319 fscache_relinquish_cookie(nfsi->fscache, 1);
320
321 nfsi->fscache = fscache_acquire_cookie(
322 nfss->nfs_client->fscache,
323 &nfs_fscache_inode_object_def,
324 nfsi);
325
326 dfprintk(FSCACHE,
327 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
328 nfss, nfsi, old, nfsi->fscache);
329 }
330 nfs_fscache_inode_unlock(inode);
331}
332
333/*
334 * Release the caching state associated with a page, if the page isn't busy
335 * interacting with the cache.
336 * - Returns true (can release page) or false (page busy).
337 */
338int nfs_fscache_release_page(struct page *page, gfp_t gfp)
339{
340 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
341 struct fscache_cookie *cookie = nfsi->fscache;
342
343 BUG_ON(!cookie);
344
345 if (fscache_check_page_write(cookie, page)) {
346 if (!(gfp & __GFP_WAIT))
347 return 0;
348 fscache_wait_on_page_write(cookie, page);
349 }
350
351 if (PageFsCache(page)) {
352 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
353 cookie, page, nfsi);
354
355 fscache_uncache_page(cookie, page);
356 nfs_add_fscache_stats(page->mapping->host,
357 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
358 }
359
360 return 1;
361}
362
363/*
364 * Release the caching state associated with a page if undergoing complete page
365 * invalidation.
366 */
367void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
368{
369 struct nfs_inode *nfsi = NFS_I(inode);
370 struct fscache_cookie *cookie = nfsi->fscache;
371
372 BUG_ON(!cookie);
373
374 dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
375 cookie, page, nfsi);
376
377 fscache_wait_on_page_write(cookie, page);
378
379 BUG_ON(!PageLocked(page));
380 fscache_uncache_page(cookie, page);
381 nfs_add_fscache_stats(page->mapping->host,
382 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
383}
384
385/*
386 * Handle completion of a page being read from the cache.
387 * - Called in process (keventd) context.
388 */
389static void nfs_readpage_from_fscache_complete(struct page *page,
390 void *context,
391 int error)
392{
393 dfprintk(FSCACHE,
394 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
395 page, context, error);
396
397 /* if the read completes with an error, we just unlock the page and let
398 * the VM reissue the readpage */
399 if (!error) {
400 SetPageUptodate(page);
401 unlock_page(page);
402 } else {
403 error = nfs_readpage_async(context, page->mapping->host, page);
404 if (error)
405 unlock_page(page);
406 }
407}
408
409/*
410 * Retrieve a page from fscache
411 */
412int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
413 struct inode *inode, struct page *page)
414{
415 int ret;
416
417 dfprintk(FSCACHE,
418 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
419 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
420
421 ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
422 page,
423 nfs_readpage_from_fscache_complete,
424 ctx,
425 GFP_KERNEL);
426
427 switch (ret) {
428 case 0: /* read BIO submitted (page in fscache) */
429 dfprintk(FSCACHE,
430 "NFS: readpage_from_fscache: BIO submitted\n");
431 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
432 return ret;
433
434 case -ENOBUFS: /* inode not in cache */
435 case -ENODATA: /* page not in cache */
436 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
437 dfprintk(FSCACHE,
438 "NFS: readpage_from_fscache %d\n", ret);
439 return 1;
440
441 default:
442 dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
443 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
444 }
445 return ret;
446}
447
448/*
449 * Retrieve a set of pages from fscache
450 */
451int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
452 struct inode *inode,
453 struct address_space *mapping,
454 struct list_head *pages,
455 unsigned *nr_pages)
456{
457 int ret, npages = *nr_pages;
458
459 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
460 NFS_I(inode)->fscache, npages, inode);
461
462 ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
463 mapping, pages, nr_pages,
464 nfs_readpage_from_fscache_complete,
465 ctx,
466 mapping_gfp_mask(mapping));
467 if (*nr_pages < npages)
468 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
469 npages);
470 if (*nr_pages > 0)
471 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
472 *nr_pages);
473
474 switch (ret) {
475 case 0: /* read submitted to the cache for all pages */
476 BUG_ON(!list_empty(pages));
477 BUG_ON(*nr_pages != 0);
478 dfprintk(FSCACHE,
479 "NFS: nfs_getpages_from_fscache: submitted\n");
480
481 return ret;
482
483 case -ENOBUFS: /* some pages aren't cached and can't be */
484 case -ENODATA: /* some pages aren't cached */
485 dfprintk(FSCACHE,
486 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
487 return 1;
488
489 default:
490 dfprintk(FSCACHE,
491 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
492 }
493
494 return ret;
495}
496
497/*
498 * Store a newly fetched page in fscache
499 * - PG_fscache must be set on the page
500 */
501void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
502{
503 int ret;
504
505 dfprintk(FSCACHE,
506 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
507 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
508
509 ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
510 dfprintk(FSCACHE,
511 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
512 page, page->index, page->flags, ret);
513
514 if (ret != 0) {
515 fscache_uncache_page(NFS_I(inode)->fscache, page);
516 nfs_add_fscache_stats(inode,
517 NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
518 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
519 } else {
520 nfs_add_fscache_stats(inode,
521 NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
522 }
523}
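
Together with the fs/nfs/read.c hunks later in this diff, the cached read path reduces to: consult fscache first, fall back to the wire on a miss, and push the page into the cache once it is uptodate. Condensed from the code in this merge:

	/* nfs_readpage(), condensed: */
	if (!IS_SYNC(inode)) {
		/* 0 means a read BIO was submitted from the cache;
		 * 1 or a -ve error means read from the server instead */
		error = nfs_readpage_from_fscache(ctx, inode, page);
		if (error == 0)
			goto out;
	}
	error = nfs_readpage_async(ctx, inode, page);

	/* nfs_readpage_release(), condensed — populate the cache on
	 * success: */
	if (PageUptodate(req->wb_page))
		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
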
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
1/* NFS filesystem cache interface definitions
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _NFS_FSCACHE_H
13#define _NFS_FSCACHE_H
14
15#include <linux/nfs_fs.h>
16#include <linux/nfs_mount.h>
17#include <linux/nfs4_mount.h>
18#include <linux/fscache.h>
19
20#ifdef CONFIG_NFS_FSCACHE
21
22/*
23 * set of NFS FS-Cache objects that form a superblock key
24 */
25struct nfs_fscache_key {
26 struct rb_node node;
27 struct nfs_client *nfs_client; /* the server */
28
29 /* the elements of the unique key - as used by nfs_compare_super() and
30 * nfs_compare_mount_options() to distinguish superblocks */
31 struct {
32 struct {
33 unsigned long s_flags; /* various flags
34 * (& NFS_MS_MASK) */
35 } super;
36
37 struct {
38 struct nfs_fsid fsid;
39 int flags;
40 unsigned int rsize; /* read size */
41 unsigned int wsize; /* write size */
42 unsigned int acregmin; /* attr cache timeouts */
43 unsigned int acregmax;
44 unsigned int acdirmin;
45 unsigned int acdirmax;
46 } nfs_server;
47
48 struct {
49 rpc_authflavor_t au_flavor;
50 } rpc_auth;
51
52 /* uniquifier - can be used if nfs_server.flags includes
53 * NFS_MOUNT_UNSHARED */
54 u8 uniq_len;
55 char uniquifier[0];
56 } key;
57};
58
59/*
60 * fscache-index.c
61 */
62extern struct fscache_netfs nfs_fscache_netfs;
63extern const struct fscache_cookie_def nfs_fscache_server_index_def;
64extern const struct fscache_cookie_def nfs_fscache_super_index_def;
65extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
66
67extern int nfs_fscache_register(void);
68extern void nfs_fscache_unregister(void);
69
70/*
71 * fscache.c
72 */
73extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75
76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *);
78extern void nfs_fscache_release_super_cookie(struct super_block *);
79
80extern void nfs_fscache_init_inode_cookie(struct inode *);
81extern void nfs_fscache_release_inode_cookie(struct inode *);
82extern void nfs_fscache_zap_inode_cookie(struct inode *);
83extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
84extern void nfs_fscache_reset_inode_cookie(struct inode *);
85
86extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
87extern int nfs_fscache_release_page(struct page *, gfp_t);
88
89extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
90 struct inode *, struct page *);
91extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
92 struct inode *, struct address_space *,
93 struct list_head *, unsigned *);
94extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
95
96/*
97 * wait for a page to complete writing to the cache
98 */
99static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
100 struct page *page)
101{
102 if (PageFsCache(page))
103 fscache_wait_on_page_write(nfsi->fscache, page);
104}
105
106/*
107 * release the caching state associated with a page if undergoing complete page
108 * invalidation
109 */
110static inline void nfs_fscache_invalidate_page(struct page *page,
111 struct inode *inode)
112{
113 if (PageFsCache(page))
114 __nfs_fscache_invalidate_page(page, inode);
115}
116
117/*
118 * Retrieve a page from an inode data storage object.
119 */
120static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
121 struct inode *inode,
122 struct page *page)
123{
124 if (NFS_I(inode)->fscache)
125 return __nfs_readpage_from_fscache(ctx, inode, page);
126 return -ENOBUFS;
127}
128
129/*
130 * Retrieve a set of pages from an inode data storage object.
131 */
132static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
133 struct inode *inode,
134 struct address_space *mapping,
135 struct list_head *pages,
136 unsigned *nr_pages)
137{
138 if (NFS_I(inode)->fscache)
139 return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
140 nr_pages);
141 return -ENOBUFS;
142}
143
144/*
145 * Store a page newly fetched from the server in an inode data storage object
146 * in the cache.
147 */
148static inline void nfs_readpage_to_fscache(struct inode *inode,
149 struct page *page,
150 int sync)
151{
152 if (PageFsCache(page))
153 __nfs_readpage_to_fscache(inode, page, sync);
154}
155
156/*
157 * indicate the client caching state as readable text
158 */
159static inline const char *nfs_server_fscache_state(struct nfs_server *server)
160{
161 if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
162 return "yes";
163 return "no ";
164}
165
166
167#else /* CONFIG_NFS_FSCACHE */
168static inline int nfs_fscache_register(void) { return 0; }
169static inline void nfs_fscache_unregister(void) {}
170
171static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
172static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173
174static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb,
176 struct nfs_parsed_mount_data *data)
177{
178}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
180
181static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
182static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
183static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
184static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
185 struct file *filp) {}
186static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
187
188static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
189{
190 return 1; /* True: may release page */
191}
192static inline void nfs_fscache_invalidate_page(struct page *page,
193 struct inode *inode) {}
194static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
195 struct page *page) {}
196
197static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
198 struct inode *inode,
199 struct page *page)
200{
201 return -ENOBUFS;
202}
203static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
204 struct inode *inode,
205 struct address_space *mapping,
206 struct list_head *pages,
207 unsigned *nr_pages)
208{
209 return -ENOBUFS;
210}
211static inline void nfs_readpage_to_fscache(struct inode *inode,
212 struct page *page, int sync) {}
213
214static inline const char *nfs_server_fscache_state(struct nfs_server *server)
215{
216 return "no ";
217}
218
219#endif /* CONFIG_NFS_FSCACHE */
220#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a834d1d850b7..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "delegation.h" 46#include "delegation.h"
47#include "iostat.h" 47#include "iostat.h"
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -121,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
121 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 BUG_ON(!list_empty(&NFS_I(inode)->open_files));
122 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
123 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
125 nfs_fscache_release_inode_cookie(inode);
124} 126}
125 127
126/** 128/**
@@ -355,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
355 nfsi->attrtimeo_timestamp = now; 357 nfsi->attrtimeo_timestamp = now;
356 nfsi->access_cache = RB_ROOT; 358 nfsi->access_cache = RB_ROOT;
357 359
360 nfs_fscache_init_inode_cookie(inode);
361
358 unlock_new_inode(inode); 362 unlock_new_inode(inode);
359 } else 363 } else
360 nfs_refresh_inode(inode, fattr); 364 nfs_refresh_inode(inode, fattr);
@@ -686,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 ctx->mode = filp->f_mode; 690 ctx->mode = filp->f_mode;
687 nfs_file_set_open_context(filp, ctx); 691 nfs_file_set_open_context(filp, ctx);
688 put_nfs_open_context(ctx); 692 put_nfs_open_context(ctx);
693 nfs_fscache_set_inode_cookie(inode, filp);
689 return 0; 694 return 0;
690} 695}
691 696
@@ -786,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
786 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 791 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
787 spin_unlock(&inode->i_lock); 792 spin_unlock(&inode->i_lock);
788 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 793 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
794 nfs_fscache_reset_inode_cookie(inode);
789 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 795 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
790 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 796 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
791 return 0; 797 return 0;
@@ -1030,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1030 spin_lock(&inode->i_lock); 1036 spin_lock(&inode->i_lock);
1031 status = nfs_refresh_inode_locked(inode, fattr); 1037 status = nfs_refresh_inode_locked(inode, fattr);
1032 spin_unlock(&inode->i_lock); 1038 spin_unlock(&inode->i_lock);
1039
1033 return status; 1040 return status;
1034} 1041}
1035 1042
@@ -1436,6 +1443,10 @@ static int __init init_nfs_fs(void)
1436{ 1443{
1437 int err; 1444 int err;
1438 1445
1446 err = nfs_fscache_register();
1447 if (err < 0)
1448 goto out7;
1449
1439 err = nfsiod_start(); 1450 err = nfsiod_start();
1440 if (err) 1451 if (err)
1441 goto out6; 1452 goto out6;
@@ -1488,6 +1499,8 @@ out4:
1488out5: 1499out5:
1489 nfsiod_stop(); 1500 nfsiod_stop();
1490out6: 1501out6:
1502 nfs_fscache_unregister();
1503out7:
1491 return err; 1504 return err;
1492} 1505}
1493 1506
@@ -1498,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
1498 nfs_destroy_readpagecache(); 1511 nfs_destroy_readpagecache();
1499 nfs_destroy_inodecache(); 1512 nfs_destroy_inodecache();
1500 nfs_destroy_nfspagecache(); 1513 nfs_destroy_nfspagecache();
1514 nfs_fscache_unregister();
1501#ifdef CONFIG_PROC_FS 1515#ifdef CONFIG_PROC_FS
1502 rpc_proc_unregister("nfs"); 1516 rpc_proc_unregister("nfs");
1503#endif 1517#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2041f68ff1cc..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6#include <linux/security.h> 6#include <linux/security.h>
7 7
8#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
9
8struct nfs_string; 10struct nfs_string;
9 11
10/* Maximum number of readahead requests 12/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
37 int acregmin, acregmax, 39 int acregmin, acregmax,
38 acdirmin, acdirmax; 40 acdirmin, acdirmax;
39 int namlen; 41 int namlen;
42 unsigned int options;
40 unsigned int bsize; 43 unsigned int bsize;
41 unsigned int auth_flavor_len; 44 unsigned int auth_flavor_len;
42 rpc_authflavor_t auth_flavors[1]; 45 rpc_authflavor_t auth_flavors[1];
43 char *client_address; 46 char *client_address;
47 char *fscache_uniq;
44 48
45 struct { 49 struct {
46 struct sockaddr_storage address; 50 struct sockaddr_storage address;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
16 16
17struct nfs_iostats { 17struct nfs_iostats {
18 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
19#ifdef CONFIG_NFS_FSCACHE
20 unsigned long long fscache[__NFSIOS_FSCACHEMAX];
21#endif
19 unsigned long events[__NFSIOS_COUNTSMAX]; 22 unsigned long events[__NFSIOS_COUNTSMAX];
20} ____cacheline_aligned; 23} ____cacheline_aligned;
21 24
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 60 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
58} 61}
59 62
63#ifdef CONFIG_NFS_FSCACHE
64static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat,
66 unsigned long addend)
67{
68 struct nfs_iostats *iostats;
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched();
75}
76#endif
77
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 78static inline struct nfs_iostats *nfs_alloc_iostats(void)
61{ 79{
62 return alloc_percpu(struct nfs_iostats); 80 return alloc_percpu(struct nfs_iostats);
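
The fscache counters reuse the existing lockless per-cpu scheme: nfs_add_fscache_stats() bumps the current CPU's slot, and whoever reports the numbers sums across CPUs. A sketch of the read side, mirroring how the byte and event counters are aggregated elsewhere in the client (the function name is invented):

	#ifdef CONFIG_NFS_FSCACHE
	static unsigned long long nfs_sum_fscache_stat(struct nfs_server *server,
					enum nfs_stat_fscachecounters stat)
	{
		unsigned long long total = 0;
		int cpu;

		/* no locking: each slot is only ever written by its own
		 * CPU, so a reader sees a slightly stale but sane sum */
		for_each_possible_cpu(cpu)
			total += per_cpu_ptr(server->io_stats, cpu)->fscache[stat];
		return total;
	}
	#endif
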
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index b82fe6847f14..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		data->arg.create.verifier[1] = current->pid;
 	}
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	for (;;) {
 		status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 
 	dprintk("NFS call mkdir %s\n", dentry->d_name.name);
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
 			MAJOR(rdev), MINOR(rdev));
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 97bacccff579..a4d242680299 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1501,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		attr.ia_mode = nd->intent.open.create_mode;
 		attr.ia_valid = ATTR_MODE;
 		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current->fs->umask;
+			attr.ia_mode &= ~current_umask();
 	} else {
 		attr.ia_valid = 0;
 		BUG_ON(nd->intent.open.flags & O_CREAT);
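
The current_umask() conversion above changes only where the umask comes from; the arithmetic stays the same: the requested mode is masked by the complement of the umask. A small standalone C illustration of that mode arithmetic (the values are examples only):

#include <stdio.h>

int main(void)
{
	unsigned int requested = 0666;	/* mode the caller asked for */
	unsigned int umask_val = 0022;	/* process umask */
	unsigned int effective = requested & ~umask_val;

	/* prints: 0666 & ~0022 = 0644 */
	printf("%04o & ~%04o = %04o\n", requested, umask_val, effective);
	return 0;
}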
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
 
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 	}
 }
 
-static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		struct page *page)
 {
 	LIST_HEAD(one_request);
 	struct nfs_page	*new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 static void nfs_readpage_release(struct nfs_page *req)
 {
+	struct inode *d_inode = req->wb_context->path.dentry->d_inode;
+
+	if (PageUptodate(req->wb_page))
+		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
 	unlock_page(req->wb_page);
 
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
 	} else
 		ctx = get_nfs_open_context(nfs_file_open_context(file));
 
+	if (!IS_SYNC(inode)) {
+		error = nfs_readpage_from_fscache(ctx, inode, page);
+		if (error == 0)
+			goto out;
+	}
+
 	error = nfs_readpage_async(ctx, inode, page);
 
+out:
 	put_nfs_open_context(ctx);
 	return error;
 out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 		return -EBADF;
 	} else
 		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+	/* attempt to read as many of the pages as possible from the cache
+	 * - this returns -ENOBUFS immediately if the cookie is negative
+	 */
+	ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+					 pages, &nr_pages);
+	if (ret == 0)
+		goto read_complete; /* all pages were read */
+
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	nfs_pageio_complete(&pgio);
 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
 	put_nfs_open_context(desc.ctx);
 out:
 	return ret;
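
The nfs_readpage() change above gives the read path a "cache first, network on miss" shape. A hedged userspace sketch of that control flow; read_from_cache() and read_from_server() are stand-ins invented for illustration, not real NFS or FS-Cache APIs:

#include <errno.h>
#include <stdio.h>

static int read_from_cache(int key, char *buf, size_t len)
{
	(void)key; (void)buf; (void)len;
	return -ENOBUFS;	/* simulate "no cookie / page not cached" */
}

static int read_from_server(int key, char *buf, size_t len)
{
	snprintf(buf, len, "page %d from server", key);
	return 0;
}

static int read_page(int key, char *buf, size_t len)
{
	int err = read_from_cache(key, buf, len);
	if (err == 0)
		return 0;			/* cache hit: done */
	return read_from_server(key, buf, len);	/* miss: fall back */
}

int main(void)
{
	char buf[64];
	if (read_page(7, buf, sizeof(buf)) == 0)
		printf("%s\n", buf);
	return 0;
}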
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0942fcbbad3c..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
60#include "delegation.h" 60#include "delegation.h"
61#include "iostat.h" 61#include "iostat.h"
62#include "internal.h" 62#include "internal.h"
63#include "fscache.h"
63 64
64#define NFSDBG_FACILITY NFSDBG_VFS 65#define NFSDBG_FACILITY NFSDBG_VFS
65 66
@@ -76,6 +77,7 @@ enum {
76 Opt_rdirplus, Opt_nordirplus, 77 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 78 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport, 79 Opt_resvport, Opt_noresvport,
80 Opt_fscache, Opt_nofscache,
79 81
80 /* Mount options that take integer arguments */ 82 /* Mount options that take integer arguments */
81 Opt_port, 83 Opt_port,
@@ -93,6 +95,7 @@ enum {
93 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 95 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
94 Opt_addr, Opt_mountaddr, Opt_clientaddr, 96 Opt_addr, Opt_mountaddr, Opt_clientaddr,
95 Opt_lookupcache, 97 Opt_lookupcache,
98 Opt_fscache_uniq,
96 99
97 /* Special mount options */ 100 /* Special mount options */
98 Opt_userspace, Opt_deprecated, Opt_sloppy, 101 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
132 { Opt_nosharecache, "nosharecache" }, 135 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" }, 136 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" }, 137 { Opt_noresvport, "noresvport" },
138 { Opt_fscache, "fsc" },
139 { Opt_fscache_uniq, "fsc=%s" },
140 { Opt_nofscache, "nofsc" },
135 141
136 { Opt_port, "port=%u" }, 142 { Opt_port, "port=%u" },
137 { Opt_rsize, "rsize=%u" }, 143 { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
563 if (clp->rpc_ops->version == 4) 569 if (clp->rpc_ops->version == 4)
564 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); 570 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
565#endif 571#endif
572 if (nfss->options & NFS_OPTION_FSCACHE)
573 seq_printf(m, ",fsc");
566} 574}
567 575
568/* 576/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
641 totals.events[i] += stats->events[i]; 649 totals.events[i] += stats->events[i];
642 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 650 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
643 totals.bytes[i] += stats->bytes[i]; 651 totals.bytes[i] += stats->bytes[i];
652#ifdef CONFIG_NFS_FSCACHE
653 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
654 totals.fscache[i] += stats->fscache[i];
655#endif
644 656
645 preempt_enable(); 657 preempt_enable();
646 } 658 }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, "\n\tbytes:\t");
 	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 		seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+	if (nfss->options & NFS_OPTION_FSCACHE) {
+		seq_printf(m, "\n\tfsc:\t");
+		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+			seq_printf(m, "%Lu ", totals.fscache[i]);
+	}
+#endif
 	seq_printf(m, "\n");
 
 	rpc_print_iostats(m, nfss->client);
@@ -1044,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_noresvport:
 			mnt->flags |= NFS_MOUNT_NORESVPORT;
 			break;
+		case Opt_fscache:
+			mnt->options |= NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_nofscache:
+			mnt->options &= ~NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (!string)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * options that take numeric values
@@ -1191,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
-			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1221,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
+			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1870,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
 	nfs_initialise_sb(sb);
 }
 
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
-
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
 {
 	const struct nfs_server *a = s->s_fs_info;
@@ -2036,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_fill_super(s, data);
+		nfs_fscache_get_super_cookie(s, data);
 	}
 
 	mntroot = nfs_get_root(s, mntfh);
@@ -2056,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 out:
 	kfree(data->nfs_server.hostname);
 	kfree(data->mount_server.hostname);
+	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	kfree(mntfh);
@@ -2083,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
 
 	bdi_unregister(&server->backing_dev_info);
 	kill_anon_super(s);
+	nfs_fscache_release_super_cookie(s);
 	nfs_free_server(server);
 }
 
@@ -2390,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
+		nfs_fscache_get_super_cookie(s, data);
 	}
 
 	mntroot = nfs4_get_root(s, mntfh);
@@ -2411,6 +2450,7 @@ out:
 	kfree(data->client_address);
 	kfree(data->nfs_server.export_path);
 	kfree(data->nfs_server.hostname);
+	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	kfree(mntfh);
@@ -2437,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
 	kill_anon_super(sb);
 
 	nfs4_renewd_prepare_shutdown(server);
+	nfs_fscache_release_super_cookie(sb);
 	nfs_free_server(server);
 }
 
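
The "fsc" / "fsc=%s" / "nofsc" handling above is order-sensitive: a later option overrides an earlier one, and the uniquifier string is owned (and freed) by the parsed-options structure. A userspace sketch of that override behaviour, using invented names rather than the kernel's match_table machinery:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct parsed_opts {
	int fscache;
	char *fscache_uniq;
};

static char *xstrdup(const char *s)	/* portable strdup stand-in */
{
	char *d = malloc(strlen(s) + 1);
	if (d)
		strcpy(d, s);
	return d;
}

static void parse_one(struct parsed_opts *o, const char *opt)
{
	if (strcmp(opt, "fsc") == 0) {
		o->fscache = 1;
		free(o->fscache_uniq);		/* plain fsc drops any uniquifier */
		o->fscache_uniq = NULL;
	} else if (strncmp(opt, "fsc=", 4) == 0) {
		o->fscache = 1;
		free(o->fscache_uniq);
		o->fscache_uniq = xstrdup(opt + 4);
	} else if (strcmp(opt, "nofsc") == 0) {
		o->fscache = 0;
		free(o->fscache_uniq);
		o->fscache_uniq = NULL;
	}
}

int main(void)
{
	struct parsed_opts o = { 0, NULL };
	const char *opts[] = { "fsc", "fsc=alpha", "nofsc", "fsc=beta" };

	for (size_t i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		parse_one(&o, opts[i]);	/* last option wins */
	printf("fsc=%d uniq=%s\n", o.fscache,
	       o.fscache_uniq ? o.fscache_uniq : "(none)");
	free(o.fscache_uniq);
	return 0;
}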
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
 config NFSD
 	tristate "NFS server support"
 	depends on INET
+	depends on FILE_LOCKING
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/magic.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 					struct nfsd3_writeres *resp)
 {
 	__be32	nfserr;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+				   &cnt,
 				   &resp->committed);
-	resp->count = argp->count;
+	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }
 
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 	struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
 
 	/* Note that we don't care for remote fs's here */
-	if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) {
+	if (sb->s_magic == MSDOS_SUPER_MAGIC) {
 		resp->f_properties = NFS3_FSF_BILLYBOY;
 	}
 	resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 		resp->p_link_max = EXT2_LINK_MAX;
 		resp->p_name_max = EXT2_NAME_LEN;
 		break;
-	case 0x4d44:		/* MSDOS_SUPER_MAGIC */
+	case MSDOS_SUPER_MAGIC:
 		resp->p_case_insensitive = 1;
 		resp->p_case_preserving = 0;
 		break;
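
The &cnt conversion above means the WRITE handler now passes the byte count by pointer, so the reply can report how much was actually written (for instance after a short write) instead of echoing the request. A minimal sketch of that by-reference count convention; do_write() is a stand-in invented here, not nfsd_write():

#include <stdio.h>

static int do_write(const char *buf, unsigned long *cnt)
{
	(void)buf;
	if (*cnt > 4096)
		*cnt = 4096;	/* simulate a short write */
	return 0;
}

int main(void)
{
	unsigned long cnt = 8192;	/* bytes requested */
	if (do_write("data", &cnt) == 0)
		printf("wrote %lu bytes\n", cnt);	/* caller sees 4096 */
	return 0;
}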
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
 encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 {
 	__be32 *p;
-	int len = cb_rec->cbr_fhlen;
+	int len = cb_rec->cbr_fh.fh_size;
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
-	WRITEMEM(cb_rec->cbr_fhval, len);
+	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
 	return 0;
 }
 
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cb_set an atomic? */
 
-static int do_probe_callback(void *data)
+static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
 {
-	struct nfs4_client *clp = data;
 	struct sockaddr_in	addr;
 	struct nfs4_callback	*cb = &clp->cl_callback;
 	struct rpc_timeout	timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name	= clp->cl_principal,
 	};
-	struct rpc_message msg = {
-		.rpc_proc	= &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp	= clp,
-	};
 	struct rpc_clnt *client;
-	int status;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
-		status = nfserr_cb_path_down;
-		goto out_err;
-	}
+	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+		return ERR_PTR(-EINVAL);
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
 
 	/* Create RPC client */
 	client = rpc_create(&args);
+	if (IS_ERR(client))
+		dprintk("NFSD: couldn't create callback client: %ld\n",
+			PTR_ERR(client));
+	return client;
+
+}
+
+static int do_probe_callback(void *data)
+{
+	struct nfs4_client *clp = data;
+	struct nfs4_callback *cb = &clp->cl_callback;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp	= clp,
+	};
+	struct rpc_clnt *client;
+	int status;
+
+	client = setup_callback_client(clp);
 	if (IS_ERR(client)) {
-		dprintk("NFSD: couldn't create callback client\n");
 		status = PTR_ERR(client);
+		dprintk("NFSD: couldn't create callback client: %d\n",
+			status);
 		goto out_err;
 	}
 
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
 out_release_client:
 	rpc_shutdown_client(client);
 out_err:
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
+	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
		(int)clp->cl_name.len, clp->cl_name.data, status);
 	put_nfs4_client(clp);
-	return status;
+	return 0;
 }
 
 /*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 
 /*
  * called with dp->dl_count inc'ed.
- * nfs4_lock_state() may or may not have been called.
 */
 void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
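
setup_callback_client() above reports failure through the returned pointer itself, via the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention. A simplified userspace re-creation of that convention (the kernel reserves only the top 4095 address values for this; the sketch below assumes the same bound and that the pointer/long casts round-trip, which is implementation-defined in portable C):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *err_ptr(long err) { return (void *)err; }
static long ptr_err(const void *p) { return (long)p; }
static int is_err(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *make_client(int ok)
{
	static int dummy_client;
	if (!ok)
		return err_ptr(-EINVAL);	/* failure travels in the pointer */
	return &dummy_client;
}

int main(void)
{
	void *client = make_client(0);
	if (is_err(client))
		printf("client setup failed: %ld\n", ptr_err(client));
	return 0;
}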
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	open->op_truncate = 0;
 
 	if (open->op_create) {
+		/* FIXME: check session persistence and pnfs flags.
+		 * The nfsv4.1 spec requires the following semantics:
+		 *
+		 * Persistent   | pNFS   | Server REQUIRED | Client Allowed
+		 * Reply Cache  | server |                 |
+		 * -------------+--------+-----------------+--------------------
+		 * no           | no     | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 *              |        |                 | (SHOULD)
+		 *              |        | and EXCLUSIVE4  | or EXCLUSIVE4
+		 *              |        |                 | (SHOULD NOT)
+		 * no           | yes    | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 * yes          | no     | GUARDED4        | GUARDED4
+		 * yes          | yes    | GUARDED4        | GUARDED4
+		 */
+
 		/*
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
 		 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 				(u32 *)open->op_verf.data,
 				&open->op_truncate, &created);
 
-		/* If we ever decide to use different attrs to store the
-		 * verifier in nfsd_create_v3, then we'll need to change this
+		/*
+		 * Following rfc 3530 14.2.16, use the returned bitmask
+		 * to indicate which attributes we used to store the
+		 * verifier:
 		 */
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
-			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 					FATTR4_WORD1_TIME_MODIFY);
 	} else {
 		status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 			goto out;
 
 		set_change_info(&open->op_cinfo, current_fh);
-
-		/* set reply cache */
 		fh_dup2(current_fh, &resfh);
-		open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
-		memcpy(open->op_stateowner->so_replay.rp_openfh,
-				&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
+		/* set reply cache */
+		fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+				&resfh.fh_handle);
 		if (!created)
 			status = do_open_permission(rqstp, current_fh, open,
 						NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-			&current_fh->fh_handle.fh_base,
-			current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	return status;
 }
 
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+	struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)session->se_sessionid.data;
+
+	clid->cl_boot = sid->clientid.cl_boot;
+	clid->cl_id = sid->clientid.cl_id;
+}
 
 static __be32
 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	   struct nfsd4_open *open)
 {
 	__be32 status;
+	struct nfsd4_compoundres *resp;
+
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	if (nfsd4_has_session(cstate))
+		copy_clientid(&open->op_clientid, cstate->session);
+
 	nfs4_lock_state();
 
 	/* check seqid for replay. set nfs4_owner */
-	status = nfsd4_process_open1(open);
+	resp = rqstp->rq_resp;
+	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
 		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
 		fh_put(&cstate->current_fh);
-		cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
-		memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
-				rp->rp_openfh_len);
+		fh_copy_shallow(&cstate->current_fh.fh_handle,
+				&rp->rp_openfh);
 		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 		if (status)
 			dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-			status = nfserr_inval;
-			if (open->op_create)
-				goto out;
-			/* fall through */
 		case NFS4_OPEN_CLAIM_NULL:
 			/*
 			 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	getattr->ga_fhp = &cstate->current_fh;
 	return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check stateid */
-	if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-				&read->rd_stateid,
-				CHECK_FH | RD_STATE, &read->rd_filp))) {
+	if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
+				RD_STATE, &read->rd_filp))) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
 	}
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
-		status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
+		status = nfs4_preprocess_stateid_op(cstate,
+			&setattr->sa_stateid, WR_STATE, NULL);
 		nfs4_unlock_state();
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file *filp = NULL;
 	u32 *p;
 	__be32 status = nfs_ok;
+	unsigned long cnt;
 
 	/* no need to check permission - this will be done in nfsd_write() */
 
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid,
-					CHECK_FH | WR_STATE, &filp);
+	status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
 	if (filp)
 		get_file(filp);
 	nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	write->wr_bytes_written = write->wr_buflen;
+	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
 	p = (u32 *)write->wr_verifier.data;
 	*p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status = nfsd_write(rqstp, &cstate->current_fh, filp,
 			write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-			write->wr_buflen, &write->wr_how_written);
+			&cnt, &write->wr_how_written);
 	if (filp)
 		fput(filp);
 
+	write->wr_bytes_written = cnt;
+
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
 	return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0)
-	    || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+	    || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+	    || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
 		return nfserr_attrnotsupp;
 	if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
 	    || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out_kfree;
 
-	p = buf + 3;
+	/* skip bitmap */
+	p = buf + 1 + ntohl(buf[0]);
 	status = nfserr_not_same;
 	if (ntohl(*p++) != verify->ve_attrlen)
 		goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 	nfsdstats.nfs4_opcount[opnum]++;
 }
 
-static void cstate_free(struct nfsd4_compound_state *cstate)
-{
-	if (cstate == NULL)
-		return;
-	fh_put(&cstate->current_fh);
-	fh_put(&cstate->save_fh);
-	BUG_ON(cstate->replay_owner);
-	kfree(cstate);
-}
-
-static struct nfsd4_compound_state *cstate_alloc(void)
-{
-	struct nfsd4_compound_state *cstate;
-
-	cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
-	if (cstate == NULL)
-		return NULL;
-	fh_init(&cstate->current_fh, NFS4_FHSIZE);
-	fh_init(&cstate->save_fh, NFS4_FHSIZE);
-	cstate->replay_owner = NULL;
-	return cstate;
-}
-
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+enum nfsd4_op_flags {
+	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
+	ALLOWED_ON_ABSENT_FS = 2 << 0,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 3 << 0,	/* ops required first in compound */
+};
 
 struct nfsd4_operation {
 	nfsd4op_func op_func;
 	u32 op_flags;
-/* Most ops require a valid current filehandle; a few don't: */
-#define ALLOWED_WITHOUT_FH 1
-/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
-#define ALLOWED_ON_ABSENT_FS 2
 	char *op_name;
 };
 
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
+ * This is a replay of a compound for which no cache entry pages
+ * were used. Encode the sequence operation, and if cachethis is FALSE
+ * encode the uncache rep error on the next operation.
+ */
+static __be32
+nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
+			  struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op *op;
+
+	dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
+		resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+
+	/* Encode the replayed sequence operation */
+	BUG_ON(resp->opcnt != 1);
+	op = &args->ops[resp->opcnt - 1];
+	nfsd4_encode_operation(resp, op);
+
+	/*return nfserr_retry_uncached_rep in next operation. */
+	if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
+		op = &args->ops[resp->opcnt++];
+		op->status = nfserr_retry_uncached_rep;
+		nfsd4_encode_operation(resp, op);
+	}
+	return op->status;
+}
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules.
+ *
+ * TODO:
+ * - enforce NFS4ERR_NOT_ONLY_OP,
+ * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ */
+static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+{
+	if (args->minorversion && args->opcnt > 0) {
+		struct nfsd4_op *op = &args->ops[0];
+		return (op->status == nfserr_op_illegal) ||
+		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
+	}
+	return true;
+}
+
+/*
  * COMPOUND call.
 */
 static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 {
 	struct nfsd4_op	*op;
 	struct nfsd4_operation *opdesc;
-	struct nfsd4_compound_state *cstate = NULL;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int slack_bytes;
 	__be32 status;
 
 	resp->xbuf = &rqstp->rq_res;
-	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+	resp->p = rqstp->rq_res.head[0].iov_base +
+		rqstp->rq_res.head[0].iov_len;
 	resp->tagp = resp->p;
 	/* reserve space for: taglen, tag, and opcnt */
 	resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->tag = args->tag;
 	resp->opcnt = 0;
 	resp->rqstp = rqstp;
+	resp->cstate.minorversion = args->minorversion;
+	resp->cstate.replay_owner = NULL;
+	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
+	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+	/* Use the deferral mechanism only for NFSv4.0 compounds */
+	rqstp->rq_usedeferral = (args->minorversion == 0);
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
 	 */
 	status = nfserr_minor_vers_mismatch;
-	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	status = nfserr_resource;
-	cstate = cstate_alloc();
-	if (cstate == NULL)
-		goto out;
+	if (!nfs41_op_ordering_ok(args)) {
+		op = &args->ops[0];
+		op->status = nfserr_sequence_pos;
+		goto encode_op;
+	}
 
 	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
 			resp->opcnt, args->opcnt, op->opnum,
 			nfsd4_op_name(op->opnum));
-
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		BUG_ON(op->status == nfs_ok);
 
 encode_op:
+		/* Only from SEQUENCE or CREATE_SESSION */
+		if (resp->cstate.status == nfserr_replay_cache) {
+			dprintk("%s NFS4.1 replay from cache\n", __func__);
+			if (nfsd4_not_cached(resp))
+				status = nfsd4_enc_uncached_replay(args, resp);
+			else
+				status = op->status;
+			goto out;
+		}
 		if (op->status == nfserr_replay_me) {
 			op->replay = &cstate->replay_owner->so_replay;
 			nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
 
 		nfsd4_increment_op_stats(op->opnum);
 	}
+	if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
+		dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
+		status = nfserr_jukebox;
+	}
 
-	cstate_free(cstate);
+	resp->cstate.status = status;
+	fh_put(&resp->cstate.current_fh);
+	fh_put(&resp->cstate.save_fh);
+	BUG_ON(resp->cstate.replay_owner);
 out:
 	nfsd4_release_compoundargs(args);
+	/* Reset deferral mechanism for RPC deferrals */
+	rqstp->rq_usedeferral = 1;
 	dprintk("nfsv4 compound returned %d\n", ntohl(status));
 	return status;
 }
 
-static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
+static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
 		.op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_name = "OP_PUTFH",
 	},
 	[OP_PUTPUBFH] = {
-		/* unsupported, just for future reference: */
+		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_PUTPUBFH",
 	},
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_RELEASE_LOCKOWNER",
 	},
+
+	/* NFSv4.1 operations */
+	[OP_EXCHANGE_ID] = {
+		.op_func = (nfsd4op_func)nfsd4_exchange_id,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_EXCHANGE_ID",
+	},
+	[OP_CREATE_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_create_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_CREATE_SESSION",
+	},
+	[OP_DESTROY_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_destroy_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_DESTROY_SESSION",
+	},
+	[OP_SEQUENCE] = {
+		.op_func = (nfsd4op_func)nfsd4_sequence,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_SEQUENCE",
+	},
 };
 
 static const char *nfsd4_op_name(unsigned opnum)
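
nfsd4_ops[] is a designated-initializer dispatch table: the opcode indexes the array, and op_flags is consulted before the handler runs (for instance ALLOWED_AS_FIRST_OP in the NFSv4.1 ordering check above). A self-contained C sketch of the pattern, with opcodes, flags and handlers invented for illustration:

#include <stdio.h>

enum { OP_A, OP_B, OP_MAX };
#define ALLOWED_AS_FIRST_OP 0x1

typedef int (*op_func)(void);

struct operation {
	op_func func;
	unsigned int flags;
	const char *name;
};

static int do_a(void) { return 0; }
static int do_b(void) { return 0; }

static const struct operation ops[] = {
	[OP_A] = { do_a, ALLOWED_AS_FIRST_OP, "OP_A" },
	[OP_B] = { do_b, 0, "OP_B" },
};

static int dispatch_first(int opnum)
{
	const struct operation *op = &ops[opnum];
	if (!(op->flags & ALLOWED_AS_FIRST_OP)) {
		printf("%s may not start a compound\n", op->name);
		return -1;	/* analogous to nfserr_sequence_pos */
	}
	return op->func();	/* flags checked before dispatch */
}

int main(void)
{
	dispatch_first(OP_B);	/* rejected */
	dispatch_first(OP_A);	/* runs */
	return 0;
}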
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
 
 typedef int (recdir_func)(struct dentry *, struct dentry *);
 
-struct dentry_list {
-	struct dentry *dentry;
+struct name_list {
+	char name[HEXDIR_LEN];
 	struct list_head list;
 };
 
-struct dentry_list_arg {
-	struct list_head dentries;
-	struct dentry *parent;
-};
-
 static int
-nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct dentry_list_arg *dla = arg;
-	struct list_head *dentries = &dla->dentries;
-	struct dentry *parent = dla->parent;
-	struct dentry *dentry;
-	struct dentry_list *child;
+	struct list_head *names = arg;
+	struct name_list *entry;
 
-	if (name && isdotent(name, namlen))
+	if (namlen != HEXDIR_LEN - 1)
 		return 0;
-	dentry = lookup_one_len(name, parent, namlen);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	child = kmalloc(sizeof(*child), GFP_KERNEL);
-	if (child == NULL)
+	entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
+	if (entry == NULL)
 		return -ENOMEM;
-	child->dentry = dentry;
-	list_add(&child->list, dentries);
+	memcpy(entry->name, name, HEXDIR_LEN - 1);
+	entry->name[HEXDIR_LEN - 1] = '\0';
+	list_add(&entry->list, names);
 	return 0;
 }
 
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 {
 	const struct cred *original_cred;
 	struct file *filp;
-	struct dentry_list_arg dla = {
-		.parent = dir,
-	};
-	struct list_head *dentries = &dla.dentries;
-	struct dentry_list *child;
+	LIST_HEAD(names);
+	struct name_list *entry;
+	struct dentry *dentry;
 	int status;
 
 	if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
-	INIT_LIST_HEAD(dentries);
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
-	INIT_LIST_HEAD(dentries);
-	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+	status = vfs_readdir(filp, nfsd4_build_namelist, &names);
 	fput(filp);
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		status = f(dir, child->dentry);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+
+		dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+		if (IS_ERR(dentry)) {
+			status = PTR_ERR(dentry);
+			goto out;
+		}
+		status = f(dir, dentry);
+		dput(dentry);
 		if (status)
 			goto out;
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 out:
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 	nfs4_reset_creds(original_cred);
 	return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
 {
 	int status;
 
-	if (nfs4_has_reclaimed_state(child->d_name.name))
+	/* note: we currently use this path only for minorversion 0 */
+	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
 	status = nfsd4_clear_clid_dir(parent, child);
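
The namelist rewrite above splits the recovery-directory scan into two phases: the readdir callback only copies fixed-width names, and the dentry lookups happen afterwards, outside the readdir context. A userspace sketch of the same two-phase shape; the list node layout and names below are illustrative only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_LEN 33	/* HEXDIR_LEN-style fixed-width name + NUL */

struct name_entry {
	char name[NAME_LEN];
	struct name_entry *next;
};

static int collect_name(struct name_entry **head, const char *name)
{
	struct name_entry *e = malloc(sizeof(*e));
	if (!e)
		return -1;
	snprintf(e->name, sizeof(e->name), "%s", name);
	e->next = *head;	/* phase 1: just record the name */
	*head = e;
	return 0;
}

int main(void)
{
	struct name_entry *head = NULL, *e;
	const char *dirents[] = { "client1", "client2" };

	for (size_t i = 0; i < 2; i++)
		collect_name(&head, dirents[i]);
	while ((e = head) != NULL) {	/* phase 2: process, then free */
		printf("processing %s\n", e->name);
		head = e->next;
		free(e);
	}
	return 0;
}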
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
 static u32 nfs4_init;
 static stateid_t zerostateid;		/* bits all 0 */
 static stateid_t onestateid;		/* bits all 1 */
+static u64 current_sessionid = 1;
 
 #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
 /* forward declarations */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 static void nfs4_set_recdir(char *recdir);
 
-/* Locking:
- *
- * client_mutex:
- * 	protects clientid_hashtbl[], clientstr_hashtbl[],
- * 	unconfstr_hashtbl[], uncofid_hashtbl[].
- */
+/* Locking: */
+
+/* Currently used for almost all code touching nfsv4 state: */
 static DEFINE_MUTEX(client_mutex);
 
+/*
+ * Currently used for the del_recall_lru and file hash table. In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(recall_lock);
+
 static struct kmem_cache *stateowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-/* forward declarations */
-static void release_stateowner(struct nfs4_stateowner *sop);
-static void release_stateid(struct nfs4_stateid *stp, int flags);
-
-/*
- * Delegation state
- */
-
-/* recall_lock protects the del_recall_lru */
-static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
-static void
-free_nfs4_file(struct kref *kref)
-{
-	struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
-	list_del(&fp->fi_hash);
-	iput(fp->fi_inode);
-	kmem_cache_free(file_slab, fp);
-}
-
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	kref_put(&fi->fi_ref, free_nfs4_file);
+	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+		list_del(&fi->fi_hash);
+		spin_unlock(&recall_lock);
+		iput(fi->fi_inode);
+		kmem_cache_free(file_slab, fi);
+	}
 }
 
 static inline void
 get_nfs4_file(struct nfs4_file *fi)
 {
-	kref_get(&fi->fi_ref);
+	atomic_inc(&fi->fi_ref);
 }
 
 static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
-	dp->dl_fhlen = current_fh->fh_handle.fh_size;
-	memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
-	       current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
423 * slot usage to make rooom for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
466 /* Set the max response cached size our default which is
467 * a multiple of PAGE_SIZE and small */
+	session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	fchan->maxresp_cached = session->se_fmaxresp_cached;
+
+	/* Use the client's maxops if possible */
+	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+	session->se_fmaxops = fchan->maxops;
+
+	/* try to use the client requested number of slots */
+	if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+
+	/* FIXME: Error means no more DRC pages so the server should
+	 * recover pages from existing sessions. For now fail session
+	 * creation.
+	 */
+	status = set_forechannel_maxreqs(fchan);
+
+	session->se_fnumslots = fchan->maxreqs;
+	return status;
+}
+
+static int
+alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
+		   struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new, tmp;
+	int idx, status = nfserr_resource, slotsize;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* FIXME: For now, we just accept the client back channel attributes. */
+	status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+	if (status)
+		goto out;
+
+	/* allocate struct nfsd4_session and slot table in one piece */
+	slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
+	if (!new)
+		goto out;
+
+	memcpy(new, &tmp, sizeof(*new));
+
+	new->se_client = clp;
+	gen_sessionid(new);
+	idx = hash_sessionid(&new->se_sessionid);
+	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+
+	new->se_flags = cses->flags;
+	kref_init(&new->se_ref);
+	spin_lock(&sessionid_lock);
+	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&sessionid_lock);
+
+	status = nfs_ok;
+out:
+	return status;
+}
+
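+/*
+ * Note on the pattern above: the forechannel attributes are negotiated
+ * into the stack-local tmp first, so a failure in init_forechannel_attrs
+ * costs no allocation; only once tmp.se_fnumslots is known can the
+ * session and its slot table be sized as a single kzalloc, after which
+ * tmp is copied into the new session.
+ */
+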
+/* caller must hold sessionid_lock */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_session *elem;
+	int idx;
+
+	dump_sessionid(__func__, sessionid);
+	idx = hash_sessionid(sessionid);
+	dprintk("%s: idx is %d\n", __func__, idx);
+	/* Search in the appropriate list */
+	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+		dump_sessionid("list traversal", &elem->se_sessionid);
+		if (!memcmp(elem->se_sessionid.data, sessionid->data,
+			    NFS4_MAX_SESSIONID_LEN)) {
+			return elem;
+		}
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+/* caller must hold sessionid_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	list_del(&ses->se_hash);
+	list_del(&ses->se_perclnt);
+}
+
+static void
+release_session(struct nfsd4_session *ses)
+{
+	spin_lock(&sessionid_lock);
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+	nfsd4_put_session(ses);
+}
+
+static void nfsd4_release_respages(struct page **respages, short resused);
+
+void
+free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int i;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	for (i = 0; i < ses->se_fnumslots; i++) {
+		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
+		nfsd4_release_respages(e->ce_respages, e->ce_resused);
+	}
+	kfree(ses->se_slots);
+	kfree(ses);
+}
+
 static inline void
 renew_client(struct nfs4_client *clp)
 {
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
 {
 	if (clid->cl_boot == boot_time)
 		return 0;
-	dprintk("NFSD stale clientid (%08x/%08x)\n",
-		clid->cl_boot, clid->cl_id);
+	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+		clid->cl_boot, clid->cl_id, boot_time);
 	return 1;
 }
 
@@ -376,6 +649,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
+			       clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
 	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_stateowner(sop);
+		release_openowner(sop);
+	}
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				 se_perclnt);
+		release_session(ses);
 	}
 	put_nfs4_client(clp);
 }
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	return clp;
 }
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
 	return NULL;
 }
 
+/*
+ * Return 1 iff clp's clientid establishment method matches the
+ * use_exchange_id parameter. Matching is based on the fact that at
+ * least one of the EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags
+ * must be set for v4.1.
+ *
+ * FIXME: we need to unify the clientid namespaces for nfsv4.x
+ * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
+ * and SET_CLIENTID{,_CONFIRM}
+ */
+static inline int
+match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+{
+	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+	return use_exchange_id == has_exchange_flags;
+}
+
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+			     bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+			       bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
@@ -685,6 +987,534 @@ out_err:
 	return;
 }
 
+void
+nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+
+	resp->cstate.statp = statp;
+}
+
+/*
+ * Dereference the result pages.
+ */
+static void
+nfsd4_release_respages(struct page **respages, short resused)
+{
+	int i;
+
+	dprintk("--> %s\n", __func__);
+	for (i = 0; i < resused; i++) {
+		if (!respages[i])
+			continue;
+		put_page(respages[i]);
+		respages[i] = NULL;
+	}
+}
+
+static void
+nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		topages[i] = frompages[i];
+		if (!topages[i])
+			continue;
+		get_page(topages[i]);
+	}
+}
+
+/*
+ * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
+ * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
+ * length of the XDR response is less than se_fmaxresp_cached
+ * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for
+ * part of the reply (e.g. readdir).
+ *
+ * Store the base and length of the rq_req.head[0] page
+ * of the NFSv4.1 data, just past the rpc header.
+ */
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct nfsd4_op *op = &args->ops[resp->opcnt];
+	struct kvec *resv = &rqstp->rq_res.head[0];
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/* Don't cache a failed OP_SEQUENCE. */
+	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
+		return;
+
+	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
+	entry->ce_opcnt = resp->opcnt;
+	entry->ce_status = resp->cstate.status;
+
+	/*
+	 * Don't need a page to cache just the sequence operation - the slot
+	 * does this for us!
+	 */
+
+	if (nfsd4_not_cached(resp)) {
+		entry->ce_resused = 0;
+		entry->ce_rpchdrlen = 0;
+		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
+			resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		return;
+	}
+	entry->ce_resused = rqstp->rq_resused;
+	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
+		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
+	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
+			 entry->ce_resused);
+	entry->ce_datav.iov_base = resp->cstate.statp;
+	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]));
+	/* Current request rpc header length */
+	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]);
+}
+
+/*
+ * We keep the rpc header, but take the nfs reply from the replycache.
+ */
+static int
+nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
+			struct nfsd4_cache_entry *entry)
+{
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct kvec *resv = &resp->rqstp->rq_res.head[0];
+	int len;
+
+	/* Current request rpc header length */
+	len = (char *)resp->cstate.statp -
+			(char *)page_address(rqstp->rq_respages[0]);
+	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
+		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
+			entry->ce_datav.iov_len);
+		return 0;
+	}
+	/* copy the cached reply nfsd data past the current rpc header */
+	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
+	       entry->ce_datav.iov_len);
+	resv->iov_len = len + entry->ce_datav.iov_len;
+	return 1;
+}
+
+/*
+ * Keep the first page of the replay. Copy the NFSv4.1 data from the first
+ * cached page. Replace any further replay pages from the cache.
+ */
+__be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	__be32 status;
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/*
+	 * If this is just the sequence operation, we did not keep
+	 * a page in the cache entry because we can just use the
+	 * slot info stored in struct nfsd4_sequence that was checked
+	 * against the slot in nfsd4_sequence().
+	 *
+	 * This occurs when seq->cachethis is FALSE, or when the client
+	 * session inactivity timer fires and a solo sequence operation
+	 * is sent (lease renewal).
+	 */
+	if (seq && nfsd4_not_cached(resp)) {
+		seq->maxslots = resp->cstate.session->se_fnumslots;
+		return nfs_ok;
+	}
+
+	if (!nfsd41_copy_replay_data(resp, entry)) {
+		/*
+		 * Not enough room to use the replay rpc header, send the
+		 * cached header. Release all the allocated result pages.
+		 */
+		svc_free_res_pages(resp->rqstp);
+		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
+				 entry->ce_resused);
+	} else {
+		/* Release all but the first allocated result page */
+
+		resp->rqstp->rq_resused--;
+		svc_free_res_pages(resp->rqstp);
+
+		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
+				 &entry->ce_respages[1],
+				 entry->ce_resused - 1);
+	}
+
+	resp->rqstp->rq_resused = entry->ce_resused;
+	resp->opcnt = entry->ce_opcnt;
+	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
+	status = entry->ce_status;
+
+	return status;
+}
+
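+/*
+ * To summarize the two replay paths above: when the cached nfs reply
+ * fits in the first page behind the current rpc header, that header is
+ * kept and only the remaining result pages are swapped for cached ones;
+ * otherwise every result page is released and the cached pages, which
+ * carry the original header, are reused wholesale.
+ */
+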
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+	/* pNFS is not supported */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+		  struct nfsd4_compound_state *cstate,
+		  struct nfsd4_exchange_id *exid)
+{
+	struct nfs4_client *unconf, *conf, *new;
+	int status;
+	unsigned int strhashval;
+	char dname[HEXDIR_LEN];
+	nfs4_verifier verf = exid->verifier;
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		" ip_addr=%u flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		ip_addr, exid->flags, exid->spa_how);
+
+	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+		return nfserr_inval;
+
+	/* Currently only support SP4_NONE */
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
+	default:
+		BUG();			/* checked by xdr code */
+	case SP4_MACH_CRED:
+		return nfserr_serverfault;	/* no excuse :-/ */
+	}
+
+	status = nfs4_make_rec_clidname(dname, &exid->clname);
+
+	if (status)
+		goto error;
+
+	strhashval = clientstr_hashval(dname);
+
+	nfs4_lock_state();
+	status = nfs_ok;
+
+	conf = find_confirmed_client_by_str(dname, strhashval, true);
+	if (conf) {
+		if (!same_verf(&verf, &conf->cl_verifier)) {
+			/* 18.35.4 case 8 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* Client reboot: destroy old state */
+			expire_client(conf);
+			goto out_new;
+		}
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			/* 18.35.4 case 9 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_perm;
+				goto out;
+			}
+			expire_client(conf);
+			goto out_new;
+		}
+		if (ip_addr != conf->cl_addr &&
+		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
+			/* Client collision. 18.35.4 case 3 */
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+		/*
+		 * Set bit when the owner id and verifier map to an already
+		 * confirmed client id (18.35.3).
+		 */
+		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+
+		/*
+		 * Falling into 18.35.4 case 2, possible router replay.
+		 * Leave confirmed record intact and return same result.
+		 */
+		copy_verf(conf, &verf);
+		new = conf;
+		goto out_copy;
+	} else {
+		/* 18.35.4 case 7 */
+		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+			status = nfserr_noent;
+			goto out;
+		}
+	}
+
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
+	if (unconf) {
+		/*
+		 * Possible retry or client restart. Per 18.35.4 case 4,
+		 * a new unconfirmed record should be generated regardless
+		 * of whether any properties have changed.
+		 */
+		expire_client(unconf);
+	}
+
+out_new:
+	/* Normal case */
+	new = create_client(exid->clname, dname);
+	if (new == NULL) {
+		status = nfserr_resource;
+		goto out;
+	}
+
+	copy_verf(new, &verf);
+	copy_cred(&new->cl_cred, &rqstp->rq_cred);
+	new->cl_addr = ip_addr;
+	gen_clid(new);
+	gen_confirm(new);
+	add_to_unconfirmed(new, strhashval);
+out_copy:
+	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
+	exid->clientid.cl_id = new->cl_clientid.cl_id;
+
+	new->cl_slot.sl_seqid = 0;
+	exid->seqid = 1;
+	nfsd4_set_ex_flags(new, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		new->cl_slot.sl_seqid, new->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	nfs4_unlock_state();
+error:
+	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
+	return status;
+}
+
+static int
+check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+{
+	dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
+		slot->sl_seqid);
+
+	/* The slot is in use, and no response has been sent. */
+	if (slot->sl_inuse) {
+		if (seqid == slot->sl_seqid)
+			return nfserr_jukebox;
+		else
+			return nfserr_seq_misordered;
+	}
+	/* Normal */
+	if (likely(seqid == slot->sl_seqid + 1))
+		return nfs_ok;
+	/* Replay */
+	if (seqid == slot->sl_seqid)
+		return nfserr_replay_cache;
+	/* Wraparound */
+	if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+		return nfs_ok;
+	/* Misordered replay or misordered new request */
+	return nfserr_seq_misordered;
+}
+
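+/*
+ * In other words: while sl_inuse is set, a duplicate of the in-flight
+ * seqid draws nfserr_jukebox and any other value is misordered.  Once
+ * the slot is free, sl_seqid + 1 is the normal next request, a repeat
+ * of sl_seqid is answered from the reply cache, seqid 1 right after an
+ * sl_seqid of 0xffffffff is the wraparound case, and everything else
+ * is misordered.
+ */
+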
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_create_session *cr_ses)
+{
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *conf, *unconf;
+	struct nfsd4_slot *slot = NULL;
+	int status = 0;
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&cr_ses->clientid);
+	conf = find_confirmed_client(&cr_ses->clientid);
+
+	if (conf) {
+		slot = &conf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status == nfserr_replay_cache) {
+			dprintk("Got a create_session replay! seqid= %d\n",
+				slot->sl_seqid);
+			cstate->slot = slot;
+			cstate->status = status;
+			/* Return the cached reply status */
+			status = nfsd4_replay_cache_entry(resp, NULL);
+			goto out;
+		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+			status = nfserr_seq_misordered;
+			dprintk("Sequence misordered!\n");
+			dprintk("Expected seqid= %d but got seqid= %d\n",
+				slot->sl_seqid, cr_ses->seqid);
+			goto out;
+		}
+		conf->cl_slot.sl_seqid++;
+	} else if (unconf) {
+		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+		    (ip_addr != unconf->cl_addr)) {
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+
+		slot = &unconf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status) {
+			/* an unconfirmed replay returns misordered */
+			status = nfserr_seq_misordered;
+			goto out;
+		}
+
+		slot->sl_seqid++; /* from 0 to 1 */
+		move_to_confirmed(unconf);
+
+		/*
+		 * We do not support RDMA or persistent sessions
+		 */
+		cr_ses->flags &= ~SESSION4_PERSIST;
+		cr_ses->flags &= ~SESSION4_RDMA;
+
+		conf = unconf;
+	} else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	status = alloc_init_session(rqstp, conf, cr_ses);
+	if (status)
+		goto out;
+
+	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	cr_ses->seqid = slot->sl_seqid;
+
+	slot->sl_inuse = true;
+	cstate->slot = slot;
+	/* Ensure a page is used for the cache */
+	slot->sl_cache_entry.ce_cachethis = 1;
+out:
+	nfs4_unlock_state();
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_destroy_session *sessionid)
+{
+	struct nfsd4_session *ses;
+	u32 status = nfserr_badsession;
+
+	/* Notes:
+	 * - The confirmed nfs4_client->cl_sessionid holds the destroyed
+	 *   sessionid
+	 * - Should we return nfserr_back_chan_busy if waiting for
+	 *   callbacks on to-be-destroyed session?
+	 * - Do we need to clear any callback info from previous session?
+	 */
+
+	dump_sessionid(__func__, &sessionid->sessionid);
+	spin_lock(&sessionid_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+	if (!ses) {
+		spin_unlock(&sessionid_lock);
+		goto out;
+	}
+
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+
+	/* wait for callbacks */
+	shutdown_callback_client(ses->se_client);
+	nfsd4_put_session(ses);
+	status = nfs_ok;
+out:
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+	       struct nfsd4_compound_state *cstate,
+	       struct nfsd4_sequence *seq)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_session *session;
+	struct nfsd4_slot *slot;
+	int status;
+
+	if (resp->opcnt != 1)
+		return nfserr_sequence_pos;
+
+	spin_lock(&sessionid_lock);
+	status = nfserr_badsession;
+	session = find_in_sessionid_hashtbl(&seq->sessionid);
+	if (!session)
+		goto out;
+
+	status = nfserr_badslot;
+	if (seq->slotid >= session->se_fnumslots)
+		goto out;
+
+	slot = &session->se_slots[seq->slotid];
+	dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+	status = check_slot_seqid(seq->seqid, slot);
+	if (status == nfserr_replay_cache) {
+		cstate->slot = slot;
+		cstate->session = session;
+		/* Return the cached reply status and set cstate->status
+		 * for nfsd4_svc_encode_compoundres processing */
+		status = nfsd4_replay_cache_entry(resp, seq);
+		cstate->status = nfserr_replay_cache;
+		goto replay_cache;
+	}
+	if (status)
+		goto out;
+
+	/* Success! bump slot seqid */
+	slot->sl_inuse = true;
+	slot->sl_seqid = seq->seqid;
+	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+	/* Always set the cache entry cachethis for solo sequence */
+	if (nfsd4_is_solo_sequence(resp))
+		slot->sl_cache_entry.ce_cachethis = 1;
+
+	cstate->slot = slot;
+	cstate->session = session;
+
+replay_cache:
+	/* Renew the clientid on success and on replay.
+	 * Hold a session reference until done processing the compound:
+	 * nfsd4_put_session called only if the cstate slot is set.
+	 */
+	renew_client(session->se_client);
+	nfsd4_get_session(session);
+out:
+	spin_unlock(&sessionid_lock);
+	dprintk("%s: return %d\n", __func__, ntohl(status));
+	return status;
+}
+
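+/*
+ * Slot lifecycle note: a successful new SEQUENCE marks its slot
+ * sl_inuse and records seq->cachethis, so a retransmission arriving
+ * before the reply has been cached draws nfserr_jukebox from
+ * check_slot_seqid above.  The session reference taken here is dropped
+ * when the compound is done: nfsd4_put_session is called only if the
+ * cstate slot was set.
+ */
+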
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	strhashval = clientstr_hashval(dname);
 
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval);
+	conf = find_confirmed_client_by_str(dname, strhashval, false);
 	if (conf) {
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
-		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
-				|| conf->cl_addr != sin->sin_addr.s_addr) {
-			dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
-				&conf->cl_addr);
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			dprintk("NFSD: setclientid: string in use by client"
+				" at %pI4\n", &conf->cl_addr);
 			goto out;
 		}
 	}
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * has a description of SETCLIENTID request processing consisting
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
-	unconf = find_unconfirmed_client_by_str(dname, strhashval);
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			unsigned int hash =
 				clientstr_hashval(unconf->cl_recdir);
 			conf = find_confirmed_client_by_str(unconf->cl_recdir,
-							    hash);
+							    hash, false);
 			if (conf) {
 				nfsd4_remove_clid_dir(conf);
 				expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
 
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
-		kref_init(&fp->fi_ref);
+		atomic_set(&fp->fi_ref, 1);
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
+		spin_lock(&recall_lock);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	return sop;
 }
 
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
-	struct nfs4_stateowner *lock_sop;
-
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners); */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_stateowner(lock_sop);
-	}
-}
-
-static void
-unhash_stateowner(struct nfs4_stateowner *sop)
-{
-	struct nfs4_stateid *stp;
-
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	if (sop->so_is_open_owner)
-		list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_entry(sop->so_stateids.next,
-				struct nfs4_stateid, st_perstateowner);
-		if (sop->so_is_open_owner)
-			release_stateid(stp, OPEN_STATE);
-		else
-			release_stateid(stp, LOCK_STATE);
-	}
-}
-
-static void
-release_stateowner(struct nfs4_stateowner *sop)
-{
-	unhash_stateowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
-}
-
 static inline void
 init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
+		  &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-release_stateid(struct nfs4_stateid *stp, int flags)
-{
-	struct file *filp = stp->st_vfs_file;
-
-	list_del(&stp->st_hash);
-	list_del(&stp->st_perfile);
-	list_del(&stp->st_perstateowner);
-	if (flags & OPEN_STATE) {
-		release_stateid_lockowners(stp);
-		stp->st_vfs_file = NULL;
-		nfsd_close(filp);
-	} else if (flags & LOCK_STATE)
-		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
-	put_nfs4_file(stp->st_file);
-	kmem_cache_free(stateid_slab, stp);
-}
-
-static void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 	struct nfs4_file *fp;
 
+	spin_lock(&recall_lock);
 	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
+			spin_unlock(&recall_lock);
 			return fp;
 		}
 	}
+	spin_unlock(&recall_lock);
 	return NULL;
 }
 
-static inline int access_valid(u32 x)
+static inline int access_valid(u32 x, u32 minorversion)
 {
-	if (x < NFS4_SHARE_ACCESS_READ)
+	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
 		return 0;
-	if (x > NFS4_SHARE_ACCESS_BOTH)
+	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	x &= ~NFS4_SHARE_ACCESS_MASK;
+	if (minorversion && x) {
+		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
+			return 0;
+		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
			return 0;
+		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
+	}
+	if (x)
 		return 0;
 	return 1;
 }
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 
 
 __be32
-nfsd4_process_open1(struct nfsd4_open *open)
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+		    struct nfsd4_open *open)
 {
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
 			return nfserr_expired;
 		goto renew;
 	}
+	/* When sessions are used, skip open sequenceid processing */
+	if (nfsd4_has_session(cstate))
+		goto renew;
 	if (!sop->so_confirmed) {
 		/* Replace unconfirmed owners without checking for replay. */
 		clp = sop->so_client;
-		release_stateowner(sop);
+		release_openowner(sop);
 		open->op_stateowner = NULL;
 		goto renew;
 	}
@@ -1709,6 +2498,7 @@ out:
 __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	__be32 status;
 
 	status = nfserr_inval;
-	if (!access_valid(open->op_share_access)
+	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
 			|| !deny_valid(open->op_share_deny))
 		goto out;
 	/*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		init_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
-			release_stateid(stp, OPEN_STATE);
+			release_open_stateid(stp);
 			goto out;
 		}
+		if (nfsd4_has_session(&resp->cstate))
+			update_stateid(&stp->st_stateid);
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_stateowner->so_confirmed = 1;
+
 	/*
 	 * Attempt to hand out a delegation. No error return, because the
 	 * OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
 	 * To finish the open response, we just need to set the rflags.
 	 */
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed)
+	if (!open->op_stateowner->so_confirmed &&
+	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
 			sop->so_id);
-		release_stateowner(sop);
+		release_openowner(sop);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
 static inline __be32
 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 {
-	/* Trying to call delegreturn with a special stateid? Yuch: */
-	if (!(flags & (RD_STATE | WR_STATE)))
-		return nfserr_bad_stateid;
-	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
 	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existence of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
  * that are not able to provide mandatory locking.
  */
 static inline int
-io_during_grace_disallowed(struct inode *inode, int flags)
+grace_disallows_io(struct inode *inode)
 {
-	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& mandatory_lock(inode);
+	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref)
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
 {
+	/*
+	 * When sessions are used the stateid generation number is ignored
+	 * when it is zero.
+	 */
+	if ((flags & HAS_SESSION) && in->si_generation == 0)
+		goto out;
+
 	/* If the client sends us a stateid from the future, it's buggy: */
 	if (in->si_generation > ref->si_generation)
 		return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
 	 */
 	if (in->si_generation < ref->si_generation)
 		return nfserr_old_stateid;
+out:
 	return nfs_ok;
 }
 
+static int is_delegation_stateid(stateid_t *stateid)
+{
+	return stateid->si_fileid == 0;
+}
+
 /*
 * Checks for stateid operations
 */
 __be32
-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+			   stateid_t *stateid, int flags, struct file **filpp)
 {
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	stateid_t *stidp;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	__be32 status;
 
-	dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
-		stateid->si_boot, stateid->si_stateownerid,
-		stateid->si_fileid, stateid->si_generation);
 	if (filpp)
 		*filpp = NULL;
 
-	if (io_during_grace_disallowed(ino, flags))
+	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	/* STALE STATEID */
 	status = nfserr_stale_stateid;
 	if (STALE_STATEID(stateid))
 		goto out;
 
-	/* BAD STATEID */
 	status = nfserr_bad_stateid;
-	if (!stateid->si_fileid) { /* delegation stateid */
-		if(!(dp = find_delegation_stateid(ino, stateid))) {
-			dprintk("NFSD: delegation stateid not found\n");
+	if (is_delegation_stateid(stateid)) {
+		dp = find_delegation_stateid(ino, stateid);
+		if (!dp)
 			goto out;
-		}
-		stidp = &dp->dl_stateid;
+		status = check_stateid_generation(stateid, &dp->dl_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_delegmode(dp, flags);
+		if (status)
+			goto out;
+		renew_client(dp->dl_client);
+		if (filpp)
+			*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
-		if (!(stp = find_stateid(stateid, flags))) {
-			dprintk("NFSD: open or lock stateid not found\n");
+		stp = find_stateid(stateid, flags);
+		if (!stp)
 			goto out;
-		}
-		if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
+		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
 			goto out;
-		stidp = &stp->st_stateid;
-	}
-	status = check_stateid_generation(stateid, stidp);
-	if (status)
-		goto out;
-	if (stp) {
-		if ((status = nfs4_check_openmode(stp,flags)))
+		status = check_stateid_generation(stateid, &stp->st_stateid,
						  flags);
+		if (status)
 			goto out;
+		status = nfs4_check_openmode(stp, flags);
+		if (status)
+			goto out;
 		renew_client(stp->st_stateowner->so_client);
 		if (filpp)
 			*filpp = stp->st_vfs_file;
-	} else {
-		if ((status = nfs4_check_delegmode(dp, flags)))
-			goto out;
-		renew_client(dp->dl_client);
-		if (flags & DELEG_RET)
-			unhash_delegation(dp);
-		if (filpp)
-			*filpp = dp->dl_vfs_file;
 	}
 	status = nfs_ok;
 out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
  * Checks for sequence id mutating operations.
  */
 static __be32
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+			 stateid_t *stateid, int flags,
+			 struct nfs4_stateowner **sopp,
+			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
 	struct nfs4_stateowner *sop;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
 
 	dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 
 	if (STALE_STATEID(stateid))
 		return nfserr_stale_stateid;
+
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	/*
 	 * We return BAD_STATEID if filehandle doesn't match stateid,
 	 * the confirmed flag is incorrectly set, or the generation
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	if (lock->lk_is_new) {
 		if (!sop->so_is_open_owner)
 			return nfserr_bad_stateid;
-		if (!same_clid(&clp->cl_clientid, lockclid))
-			return nfserr_bad_stateid;
+		if (!(flags & HAS_SESSION) &&
+		    !same_clid(&clp->cl_clientid, lockclid))
+			return nfserr_bad_stateid;
 		/* stp is the open stateid */
 		status = nfs4_check_openmode(stp, lkflg);
 		if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	 * For the moment, we ignore the possibility of
 	 * generation number wraparound.
 	 */
-	if (seqid != sop->so_seqid)
+	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
 		goto check_replay;
 
 	if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		       " confirmed yet!\n");
 		return nfserr_bad_stateid;
 	}
-	status = check_stateid_generation(stateid, &stp->st_stateid);
+	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
 	if (status)
 		return status;
 	renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
 					CONFIRM | OPEN_STATE,
 					&oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access)
+	if (!access_valid(od->od_share_access, cstate->minorversion)
 			|| !deny_valid(od->od_share_deny))
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					od->od_seqid,
 					&od->od_stateid,
 					OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					close->cl_seqid,
 					&close->cl_stateid,
 					OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
 
 	/* release_stateid() calls nfsd_close() if needed */
-	release_stateid(stp, OPEN_STATE);
+	release_open_stateid(stp);
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_delegreturn *dr)
 {
+	struct nfs4_delegation *dp;
+	stateid_t *stateid = &dr->dr_stateid;
+	struct inode *inode;
 	__be32 status;
+	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
-		goto out;
+		return status;
+	inode = cstate->current_fh.fh_dentry->d_inode;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&dr->dr_stateid, DELEG_RET, NULL);
-	nfs4_unlock_state();
+	status = nfserr_bad_stateid;
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		goto out;
+	status = nfserr_stale_stateid;
+	if (STALE_STATEID(stateid))
+		goto out;
+	status = nfserr_bad_stateid;
+	if (!is_delegation_stateid(stateid))
+		goto out;
+	dp = find_delegation_stateid(inode, stateid);
+	if (!dp)
+		goto out;
+	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	if (status)
+		goto out;
+	renew_client(dp->dl_client);
+
+	unhash_delegation(dp);
 out:
+	nfs4_unlock_state();
+
 	return status;
 }
 
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_file *fp;
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lock->lk_new_clientid))
+	if (!nfsd4_has_session(cstate) &&
+	    STALE_CLIENTID(&lock->lk_new_clientid))
 		goto out;
 
 	/* validate and update open stateid and open seqid */
-	status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	status = nfs4_preprocess_seqid_op(cstate,
 			        lock->lk_new_open_seqid,
 		                &lock->lk_new_open_stateid,
 				OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
 				       &lock->lk_old_lock_stateid,
 				       LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 out:
 	if (status && lock->lk_is_new && lock_sop)
-		release_stateowner(lock_sop);
+		release_lockowner(lock_sop);
 	if (lock->lk_replay_owner) {
 		nfs4_get_stateowner(lock->lk_replay_owner);
 		cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lockt->lt_clientid))
+	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
 		goto out;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					locku->lu_seqid,
 					&locku->lu_stateid,
 					LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		/* unhash_stateowner deletes so_perclient only
 		 * for openowners. */
 		list_del(&sop->so_perclient);
-		release_stateowner(sop);
+		release_lockowner(sop);
 	}
 out:
 	nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
 }
 
 int
-nfs4_has_reclaimed_state(const char *name)
+nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
 {
 	unsigned int strhashval = clientstr_hashval(name);
 	struct nfs4_client *clp;
 
-	clp = find_confirmed_client_by_str(name, strhashval);
+	clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
 	return clp ? 1 : 0;
 }
 
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
 		INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
 		INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
 	}
+	for (i = 0; i < SESSION_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&sessionid_hashtbl[i]);
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
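NFS4_CREATE_EXCLUSIVE4_1 (rejected above when minorversion is 0) carries both the classic exclusive-create verifier and a createattrs fattr, which is why it is decoded against nfsd41_ex_attrmask rather than the ordinary writeable mask. Reduced to predicates, the wire content of the four createhow4 arms (names local to this sketch):

    enum createmode { CM_UNCHECKED, CM_GUARDED, CM_EXCLUSIVE, CM_EXCLUSIVE4_1 };

    /* which fields follow the mode discriminant on the wire */
    static int carries_fattr(enum createmode m)
    {
        return m == CM_UNCHECKED || m == CM_GUARDED || m == CM_EXCLUSIVE4_1;
    }

    static int carries_verifier(enum createmode m)
    {
        return m == CM_EXCLUSIVE || m == CM_EXCLUSIVE4_1;
    }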
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 for (i = 0; i < dummy; ++i)
1223 READ32(dummy);
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
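The decoders above skip variable-length fields they deliberately ignore (state-protection algorithm lists, implementation ids, GSS handles) with the recurring pattern READ_BUF(len); p += XDR_QUADLEN(len). XDR pads opaque data to a 4-byte boundary, and XDR_QUADLEN converts a byte count into the number of 32-bit words actually consumed. A compilable restatement of the idiom:

    #include <stdint.h>

    #define XDR_QUADLEN(n)  (((n) + 3) >> 2)  /* bytes -> 4-byte XDR units, rounded up */

    /* skip one XDR opaque<>: a 4-byte length followed by padded data
     * (host-order length here; the wire is big-endian) */
    static const uint32_t *xdr_skip_opaque(const uint32_t *p)
    {
        uint32_t len = *p++;

        return p + XDR_QUADLEN(len);
    }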
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
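With a second decoder table registered for minorversion 1, dispatch reduces to picking the table for the compound's minorversion and falling back to the notsupp stub for holes or out-of-range opcodes; the nops field exists so the compound decoder can bounds-check before indexing. A minimal model of that shape (all names local to the sketch):

    typedef int (*dec_fn)(void *argp);

    static int dec_notsupp(void *argp) { (void)argp; return -1; }
    static int dec_access(void *argp)  { (void)argp; return 0; }

    static dec_fn v40_ops[] = { dec_access /* , ... */ };
    static dec_fn v41_ops[] = { dec_access /* , ... */ };

    struct mv_ops { dec_fn *decoders; unsigned nops; };

    static struct mv_ops mv_table[] = {
        [0] = { v40_ops, sizeof(v40_ops) / sizeof(v40_ops[0]) },
        [1] = { v41_ops, sizeof(v41_ops) / sizeof(v41_ops[0]) },
    };

    static int decode_op(unsigned minor, unsigned opnum, void *argp)
    {
        if (minor >= sizeof(mv_table) / sizeof(mv_table[0]))
            return -1;                            /* unknown minorversion */
        if (opnum >= mv_table[minor].nops || !mv_table[minor].decoders[opnum])
            return dec_notsupp(argp);             /* hole or out of range */
        return mv_table[minor].decoders[opnum](argp);
    }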
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
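The encoder now emits the shortest bitmap that covers the highest nonzero word: three words when bmval2 is set, otherwise two, otherwise one, matching what the decode side accepts. The same logic as a standalone helper (host-order for brevity; the kernel's WRITE32 handles byte order):

    #include <stdint.h>

    /* write a bitmap4 with the minimum word count; returns the new cursor */
    static uint32_t *encode_bitmap3(uint32_t *p, uint32_t w0, uint32_t w1, uint32_t w2)
    {
        uint32_t len = w2 ? 3 : (w1 ? 2 : 1);

        *p++ = len;
        *p++ = w0;
        if (len >= 2)
            *p++ = w1;
        if (len == 3)
            *p++ = w2;
        return p;
    }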
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2943}
2573 2944
2574static __be32 2945static __be32
2946nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2947 struct nfsd4_exchange_id *exid)
2948{
2949 ENCODE_HEAD;
2950 char *major_id;
2951 char *server_scope;
2952 int major_id_sz;
2953 int server_scope_sz;
2954 uint64_t minor_id = 0;
2955
2956 if (nfserr)
2957 return nfserr;
2958
2959 major_id = utsname()->nodename;
2960 major_id_sz = strlen(major_id);
2961 server_scope = utsname()->nodename;
2962 server_scope_sz = strlen(server_scope);
2963
2964 RESERVE_SPACE(
2965 8 /* eir_clientid */ +
2966 4 /* eir_sequenceid */ +
2967 4 /* eir_flags */ +
2968 4 /* spr_how (SP4_NONE) */ +
2969 8 /* so_minor_id */ +
2970 4 /* so_major_id.len */ +
2971 (XDR_QUADLEN(major_id_sz) * 4) +
2972 4 /* eir_server_scope.len */ +
2973 (XDR_QUADLEN(server_scope_sz) * 4) +
2974 4 /* eir_server_impl_id.count (0) */);
2975
2976 WRITEMEM(&exid->clientid, 8);
2977 WRITE32(exid->seqid);
2978 WRITE32(exid->flags);
2979
2980 /* state_protect4_r. Currently only support SP4_NONE */
2981 BUG_ON(exid->spa_how != SP4_NONE);
2982 WRITE32(exid->spa_how);
2983
2984 /* The server_owner struct */
2985 WRITE64(minor_id); /* Minor id */
2986 /* major id */
2987 WRITE32(major_id_sz);
2988 WRITEMEM(major_id, major_id_sz);
2989
2990 /* Server scope */
2991 WRITE32(server_scope_sz);
2992 WRITEMEM(server_scope, server_scope_sz);
2993
2994 /* Implementation id */
2995 WRITE32(0); /* zero length nfs_impl_id4 array */
2996 ADJUST_ARGS();
2997 return 0;
2998}
2999
3000static __be32
3001nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3002 struct nfsd4_create_session *sess)
3003{
3004 ENCODE_HEAD;
3005
3006 if (nfserr)
3007 return nfserr;
3008
3009 RESERVE_SPACE(24);
3010 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3011 WRITE32(sess->seqid);
3012 WRITE32(sess->flags);
3013 ADJUST_ARGS();
3014
3015 RESERVE_SPACE(28);
3016 WRITE32(0); /* headerpadsz */
3017 WRITE32(sess->fore_channel.maxreq_sz);
3018 WRITE32(sess->fore_channel.maxresp_sz);
3019 WRITE32(sess->fore_channel.maxresp_cached);
3020 WRITE32(sess->fore_channel.maxops);
3021 WRITE32(sess->fore_channel.maxreqs);
3022 WRITE32(sess->fore_channel.nr_rdma_attrs);
3023 ADJUST_ARGS();
3024
3025 if (sess->fore_channel.nr_rdma_attrs) {
3026 RESERVE_SPACE(4);
3027 WRITE32(sess->fore_channel.rdma_attrs);
3028 ADJUST_ARGS();
3029 }
3030
3031 RESERVE_SPACE(28);
3032 WRITE32(0); /* headerpadsz */
3033 WRITE32(sess->back_channel.maxreq_sz);
3034 WRITE32(sess->back_channel.maxresp_sz);
3035 WRITE32(sess->back_channel.maxresp_cached);
3036 WRITE32(sess->back_channel.maxops);
3037 WRITE32(sess->back_channel.maxreqs);
3038 WRITE32(sess->back_channel.nr_rdma_attrs);
3039 ADJUST_ARGS();
3040
3041 if (sess->back_channel.nr_rdma_attrs) {
3042 RESERVE_SPACE(4);
3043 WRITE32(sess->back_channel.rdma_attrs);
3044 ADJUST_ARGS();
3045 }
3046 return 0;
3047}
3048
3049static __be32
3050nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3051 struct nfsd4_destroy_session *destroy_session)
3052{
3053 return nfserr;
3054}
3055
3056__be32
3057nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3058 struct nfsd4_sequence *seq)
3059{
3060 ENCODE_HEAD;
3061
3062 if (nfserr)
3063 return nfserr;
3064
3065 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3066 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3067 WRITE32(seq->seqid);
3068 WRITE32(seq->slotid);
3069 WRITE32(seq->maxslots);
3070 /*
3071 * FIXME: for now:
3072 * target_maxslots = maxslots
3073 * status_flags = 0
3074 */
3075 WRITE32(seq->maxslots);
3076 WRITE32(0);
3077
3078 ADJUST_ARGS();
3079 return 0;
3080}
3081
3082static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3083nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3084{
2577 return nfserr; 3085 return nfserr;
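The RESERVE_SPACE call in nfsd4_encode_exchange_id sums the fixed-size reply fields plus two XDR-padded strings (major id and server scope, both taken from the node name). The same arithmetic as a checkable function, reusing the XDR_QUADLEN rounding rule:

    #include <stddef.h>

    #define XDR_QUADLEN(n)  (((n) + 3) >> 2)

    /* bytes needed for an EXCHANGE_ID reply with SP4_NONE and an empty
     * nfs_impl_id4 array, mirroring the RESERVE_SPACE sum above */
    static size_t exid_reply_bytes(size_t major_id_sz, size_t server_scope_sz)
    {
        return 8                                      /* eir_clientid */
             + 4                                      /* eir_sequenceid */
             + 4                                      /* eir_flags */
             + 4                                      /* spr_how (SP4_NONE) */
             + 8                                      /* so_minor_id */
             + 4 + 4 * XDR_QUADLEN(major_id_sz)       /* so_major_id */
             + 4 + 4 * XDR_QUADLEN(server_scope_sz)   /* eir_server_scope */
             + 4;                                     /* eir_server_impl_id count (0) */
    }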
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3087
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3088typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3089
 3090/*
 3091 * Note: the nfsd4_enc_ops vector is shared by v4.0 and v4.1,
 3092 * since obsolete ops are already filtered out during decoding,
 3093 * so there is no need to do it again here.
 3094 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3095static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3096 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3097 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3130 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3131 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3132 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3133
3134 /* NFSv4.1 operations */
3135 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3136 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3137 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3138 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3139 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3140 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3141 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3142 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3143 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3144 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3145 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3147 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3148 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3149 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3152 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3153 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3154};
2621 3155
3156/*
3157 * Calculate the total amount of memory that the compound response has taken
3158 * after encoding the current operation.
3159 *
3160 * pad: add on 8 bytes for the next operation's op_code and status so that
3161 * there is room to cache a failure on the next operation.
3162 *
3163 * Compare this length to the session se_fmaxresp_cached.
3164 *
3165 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3166 * will be at least a page and will therefore hold the xdr_buf head.
3167 */
3168static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3169{
3170 int status = 0;
3171 struct xdr_buf *xb = &resp->rqstp->rq_res;
3172 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3173 struct nfsd4_session *session = NULL;
3174 struct nfsd4_slot *slot = resp->cstate.slot;
3175 u32 length, tlen = 0, pad = 8;
3176
3177 if (!nfsd4_has_session(&resp->cstate))
3178 return status;
3179
3180 session = resp->cstate.session;
3181 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3182 return status;
3183
3184 if (resp->opcnt >= args->opcnt)
3185 pad = 0; /* this is the last operation */
3186
3187 if (xb->page_len == 0) {
3188 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3189 } else {
3190 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3191 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3192
3193 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3194 }
3195 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3196 length, xb->page_len, tlen, pad);
3197
3198 if (length <= session->se_fmaxresp_cached)
3199 return status;
3200 else
3201 return nfserr_rep_too_big_to_cache;
3202}
3203
2622void 3204void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3205nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3206{
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3217 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3218 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3219 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3220 /* nfsd4_check_drc_limit guarantees enough room for error status */
3221 if (!op->status && nfsd4_check_drc_limit(resp))
3222 op->status = nfserr_rep_too_big_to_cache;
2638status: 3223status:
2639 /* 3224 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3225 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3320 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3321 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3322 BUG_ON(iov->iov_len > PAGE_SIZE);
3323 if (nfsd4_has_session(&resp->cstate)) {
3324 if (resp->cstate.status == nfserr_replay_cache &&
3325 !nfsd4_not_cached(resp)) {
3326 iov->iov_len = resp->cstate.iovlen;
3327 } else {
3328 nfsd4_store_cache_entry(resp);
3329 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3330 resp->cstate.slot->sl_inuse = 0;
3331 }
3332 if (resp->cstate.session)
3333 nfsd4_put_session(resp->cstate.session);
3334 }
2738 return 1; 3335 return 1;
2739} 3336}
2740 3337
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
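__write_versions now accepts "+4.1"/"-4.1" style tokens alongside the bare major numbers: a '.' after the major number routes the token to nfsd_minorversion(), minor versions below major 4 are rejected, and minor 0 is rejected because plain "4" already controls it. A userspace sketch of the token grammar (strtol standing in for simple_strtol):

    #include <stdlib.h>

    /* parse one versions token: [+|-]major[.minor]; returns 0 on success */
    static int parse_vers_token(const char *tok, long *major,
                                unsigned long *minor, int *enable)
    {
        char *end;

        *enable = (*tok != '-');
        if (*tok == '+' || *tok == '-')
            tok++;

        *major = strtol(tok, &end, 0);
        *minor = 0;
        if (*end == '.') {
            if (*major < 4)
                return -1;          /* only NFSv4 has minor versions */
            *minor = strtoul(end + 1, NULL, 0);
            if (*minor == 0)
                return -1;          /* 4.0 is toggled by plain "4" */
        }
        return 0;
    }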
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1248 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1249 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1251 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1252 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1253 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index bc3567bab8c4..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
 226 * Each session guarantees a negotiated per-slot memory cache for replies,
 227 * which in turn consumes memory beyond what a v2/v3/v4.0 server needs. A
 228 * dedicated NFSv4.1 server might want to use more memory for a DRC than a
 229 * machine running multiple services.
230 *
231 * Impose a hard limit on the number of pages for the DRC which varies
 232 * according to the machine's free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
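NFSD_DRC_SIZE_SHIFT of 7 caps the session reply cache at 1/128 (about 0.8%) of the free buffer pages at service-creation time. A back-of-envelope check, assuming 4 KiB pages:

    #include <stdio.h>

    #define NFSD_DRC_SIZE_SHIFT 7    /* cap = free buffer pages / 128 */

    int main(void)
    {
        unsigned long free_pages = 262144;    /* 1 GiB of 4 KiB pages */
        unsigned long drc_pages = free_pages >> NFSD_DRC_SIZE_SHIFT;

        printf("DRC cap: %lu pages (%lu KiB)\n", drc_pages, drc_pages * 4);
        /* prints: DRC cap: 2048 pages (8192 KiB) */
        return 0;
    }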
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
234 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
235 280
236 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
237 return err; 282 return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
375 return error; 420 return error;
376} 421}
377 422
378static inline void
379update_thread_usage(int busy_threads)
380{
381 unsigned long prev_call;
382 unsigned long diff;
383 int decile;
384
385 spin_lock(&nfsd_call_lock);
386 prev_call = nfsd_last_call;
387 nfsd_last_call = jiffies;
388 decile = busy_threads*10/nfsdstats.th_cnt;
389 if (decile>0 && decile <= 10) {
390 diff = nfsd_last_call - prev_call;
391 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
392 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
393 if (decile == 10)
394 nfsdstats.th_fullcnt++;
395 }
396 spin_unlock(&nfsd_call_lock);
397}
398 423
399/* 424/*
400 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -403,7 +428,6 @@ static int
403nfsd(void *vrqstp) 428nfsd(void *vrqstp)
404{ 429{
405 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 430 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
406 struct fs_struct *fsp;
407 int err, preverr = 0; 431 int err, preverr = 0;
408 432
409 /* Lock module and set up kernel thread */ 433 /* Lock module and set up kernel thread */
@@ -412,13 +436,11 @@ nfsd(void *vrqstp)
412 /* At this point, the thread shares current->fs 436 /* At this point, the thread shares current->fs
413 * with the init process. We need to create files with a 437 * with the init process. We need to create files with a
414 * umask of 0 instead of init's umask. */ 438 * umask of 0 instead of init's umask. */
415 fsp = copy_fs_struct(current->fs); 439 if (unshare_fs_struct() < 0) {
416 if (!fsp) {
417 printk("Unable to start nfsd thread: out of memory\n"); 440 printk("Unable to start nfsd thread: out of memory\n");
418 goto out; 441 goto out;
419 } 442 }
420 exit_fs(current); 443
421 current->fs = fsp;
422 current->fs->umask = 0; 444 current->fs->umask = 0;
423 445
424 /* 446 /*
@@ -463,8 +485,6 @@ nfsd(void *vrqstp)
463 continue; 485 continue;
464 } 486 }
465 487
466 update_thread_usage(atomic_read(&nfsd_busy));
467 atomic_inc(&nfsd_busy);
468 488
469 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
470 exp_readlock(); 490 exp_readlock();
@@ -473,8 +493,6 @@ nfsd(void *vrqstp)
473 493
474 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
475 exp_readunlock(); 495 exp_readunlock();
476 update_thread_usage(atomic_read(&nfsd_busy));
477 atomic_dec(&nfsd_busy);
478 } 496 }
479 497
480 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -542,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
542 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
543 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
544 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
545 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
546 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
547 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -573,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
573 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
574 return 1; 596 return 1;
575} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 366 }
367 367
368 /* Revoke setuid/setgid on chown */ 368 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 369 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 370 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
371 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 372 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 373 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 374 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 961static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 962nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 963 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 964 unsigned long *cnt, int *stablep)
964{ 965{
965 struct svc_export *exp; 966 struct svc_export *exp;
966 struct dentry *dentry; 967 struct dentry *dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 975 err = nfserr_perm;
975 976
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 977 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 978 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 979 goto out;
979#endif 980#endif
980 981
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1010 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1011 set_fs(oldfs);
1011 if (host_err >= 0) { 1012 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1013 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1014 fsnotify_modify(file->f_path.dentry);
1014 } 1015 }
1015 1016
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1054 } 1055 }
1055 1056
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1057 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1058 if (host_err >= 0) {
1058 err = 0; 1059 err = 0;
1059 else 1060 *cnt = host_err;
1061 } else
1060 err = nfserrno(host_err); 1062 err = nfserrno(host_err);
1061out: 1063out:
1062 return err; 1064 return err;
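Passing cnt by pointer lets nfsd_vfs_write report the byte count vfs_writev actually returned, so the io_write statistics and the caller's reply no longer assume the whole request was written. A userspace analogue of the in/out convention:

    #include <sys/uio.h>
    #include <errno.h>

    /* cnt is in/out: bytes requested on entry, bytes written on success */
    static int write_vec(int fd, const struct iovec *vec, int vlen,
                         unsigned long *cnt)
    {
        ssize_t n = writev(fd, vec, vlen);

        if (n < 0)
            return -errno;
        *cnt = (unsigned long)n;    /* may be shorter than requested */
        return 0;
    }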
@@ -1098,7 +1100,7 @@ out:
1098 */ 1100 */
1099__be32 1101__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1102nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1103 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1104 int *stablep)
1103{ 1105{
1104 __be32 err = 0; 1106 __be32 err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1181 return 0;
1180} 1182}
1181 1183
 1184/* An HPUX client sometimes creates a file in mode 000 and sets its size to 0.
 1185 * Setting the size to 0 may fail on some file systems, because the
 1186 * permission check requires WRITE permission while the mode is 000.
 1187 * We ignore the resize (to 0) on a freshly created file, since its size
 1188 * is already 0 after creation.
 1189 *
 1190 * Call this only after vfs_create() has been called.
 1191 */
1192static void
1193nfsd_check_ignore_resizing(struct iattr *iap)
1194{
1195 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1196 iap->ia_valid &= ~ATTR_SIZE;
1197}
1198
1182/* 1199/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1200 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1201 * not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1291 switch (type) {
1275 case S_IFREG: 1292 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1293 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1294 if (!host_err)
1295 nfsd_check_ignore_resizing(iap);
1277 break; 1296 break;
1278 case S_IFDIR: 1297 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1298 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1446 /* setattr will sync the child (or not) */
1428 } 1447 }
1429 1448
1449 nfsd_check_ignore_resizing(iap);
1450
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1451 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1452 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1453 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 000000000000..df3e62c1ddc5
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 000000000000..d69e6ae59251
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
1/*
2 * alloc.c - NILFS dat/inode allocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include <linux/fs.h>
28#include <linux/bitops.h>
29#include "mdt.h"
30#include "alloc.h"
31
32
33static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{
36 return (1UL << inode->i_blkbits) /
37 sizeof(struct nilfs_palloc_group_desc);
38}
39
40static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode)
42{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44}
45
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
49
50 mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
51 if (!mi->mi_bgl)
52 return -ENOMEM;
53
54 bgl_lock_init(mi->mi_bgl);
55
56 nilfs_mdt_set_entry_size(inode, entry_size, 0);
57
58 mi->mi_blocks_per_group =
59 DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
60 mi->mi_entries_per_block) + 1;
61 /* Number of blocks in a group including entry blocks and
62 a bitmap block */
63 mi->mi_blocks_per_desc_block =
64 nilfs_palloc_groups_per_desc_block(inode) *
65 mi->mi_blocks_per_group + 1;
66 /* Number of blocks per descriptor including the
67 descriptor block */
68 return 0;
69}
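nilfs_palloc_init_blockgroup derives the allocator geometry from the block size: the groups_count computation above implies one bitmap bit per entry (so 8 x blocksize entries per group), each group needs its entry blocks plus one bitmap block, and a descriptor block fronts each run of groups. A worked example, assuming 4 KiB blocks, a hypothetical 128-byte entry size, and a group descriptor holding only the pg_nfrees word seen here (4 bytes):

    #include <stdio.h>

    int main(void)
    {
        unsigned long blkbits = 12;                /* 4 KiB blocks */
        unsigned long blksize = 1UL << blkbits;
        unsigned long entry_sz = 128;              /* assumed entry size */

        unsigned long entries_per_group = blksize * 8;  /* bits in one bitmap block */
        unsigned long entries_per_block = blksize / entry_sz;
        unsigned long groups_per_desc = blksize / 4;    /* 4-byte group descriptor */
        unsigned long blocks_per_group =
            (entries_per_group + entries_per_block - 1) / entries_per_block + 1;
        unsigned long blocks_per_desc_block =
            groups_per_desc * blocks_per_group + 1;

        printf("%lu entries/group, %lu blocks/group, %lu blocks/desc block\n",
               entries_per_group, blocks_per_group, blocks_per_desc_block);
        /* 32768 entries/group, 1025 blocks/group, 1049601 blocks/desc block */
        return 0;
    }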
70
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset)
73{
74 __u64 group = nr;
75
76 *offset = do_div(group, nilfs_palloc_entries_per_group(inode));
77 return group;
78}
79
80static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{
83 unsigned long desc_block =
84 group / nilfs_palloc_groups_per_desc_block(inode);
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86}
87
88static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{
91 unsigned long desc_offset =
92 group % nilfs_palloc_groups_per_desc_block(inode);
93 return nilfs_palloc_desc_blkoff(inode, group) + 1 +
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95}
96
97static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc)
100{
101 unsigned long nfree;
102
103 spin_lock(nilfs_mdt_bgl_lock(inode, group));
104 nfree = le32_to_cpu(desc->pg_nfrees);
105 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
106 return nfree;
107}
108
109static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group,
112 struct nilfs_palloc_group_desc *desc,
113 u32 n)
114{
115 spin_lock(nilfs_mdt_bgl_lock(inode, group));
116 le32_add_cpu(&desc->pg_nfrees, n);
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118}
119
120static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{
123 unsigned long group, group_offset;
124
125 group = nilfs_palloc_group(inode, nr, &group_offset);
126
127 return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129}
130
131static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr)
133{
134 struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
135 unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
136 __le32 nfrees;
137
138 nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
139 while (n-- > 0) {
140 desc->pg_nfrees = nfrees;
141 desc++;
142 }
143}
144
145static int nilfs_palloc_get_desc_block(struct inode *inode,
146 unsigned long group,
147 int create, struct buffer_head **bhp)
148{
149 return nilfs_mdt_get_block(inode,
150 nilfs_palloc_desc_blkoff(inode, group),
151 create, nilfs_palloc_desc_block_init, bhp);
152}
153
154static int nilfs_palloc_get_bitmap_block(struct inode *inode,
155 unsigned long group,
156 int create, struct buffer_head **bhp)
157{
158 return nilfs_mdt_get_block(inode,
159 nilfs_palloc_bitmap_blkoff(inode, group),
160 create, NULL, bhp);
161}
162
163int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
164 int create, struct buffer_head **bhp)
165{
166 return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
167 create, NULL, bhp);
168}
169
170static struct nilfs_palloc_group_desc *
171nilfs_palloc_block_get_group_desc(const struct inode *inode,
172 unsigned long group,
173 const struct buffer_head *bh, void *kaddr)
174{
175 return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
176 group % nilfs_palloc_groups_per_desc_block(inode);
177}
178
179static unsigned char *
180nilfs_palloc_block_get_bitmap(const struct inode *inode,
181 const struct buffer_head *bh, void *kaddr)
182{
183 return (unsigned char *)(kaddr + bh_offset(bh));
184}
185
186void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
187 const struct buffer_head *bh, void *kaddr)
188{
189 unsigned long entry_offset, group_offset;
190
191 nilfs_palloc_group(inode, nr, &group_offset);
192 entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
193
194 return kaddr + bh_offset(bh) +
195 entry_offset * NILFS_MDT(inode)->mi_entry_size;
196}
197
198static int nilfs_palloc_find_available_slot(struct inode *inode,
199 unsigned long group,
200 unsigned long target,
201 unsigned char *bitmap,
202 int bsize) /* size in bits */
203{
204 int curr, pos, end, i;
205
206 if (target > 0) {
207 end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
208 if (end > bsize)
209 end = bsize;
210 pos = nilfs_find_next_zero_bit(bitmap, end, target);
211 if (pos < end &&
212 !nilfs_set_bit_atomic(
213 nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
214 return pos;
215 } else
216 end = 0;
217
218 for (i = 0, curr = end;
219 i < bsize;
220 i += BITS_PER_LONG, curr += BITS_PER_LONG) {
221 /* wrap around */
222 if (curr >= bsize)
223 curr = 0;
224 while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
225 != ~0UL) {
226 end = curr + BITS_PER_LONG;
227 if (end > bsize)
228 end = bsize;
229 pos = nilfs_find_next_zero_bit(bitmap, end, curr);
230 if ((pos < end) &&
231 !nilfs_set_bit_atomic(
232 nilfs_mdt_bgl_lock(inode, group), pos,
233 bitmap))
234 return pos;
235 }
236 }
237 return -ENOSPC;
238}
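nilfs_palloc_find_available_slot starts at the caller's preferred bit, wraps around the group, skips all-ones words quickly, and claims the first zero bit with an atomic test-and-set under the block-group lock. Stripped of the locking and the word-at-a-time fast path, the search reduces to this non-atomic sketch:

    #include <limits.h>

    #define BITS_PER_ULONG  (sizeof(unsigned long) * CHAR_BIT)

    static int test_and_set_bit_ul(unsigned long *map, int pos)
    {
        unsigned long mask = 1UL << (pos % BITS_PER_ULONG);
        unsigned long *word = map + pos / BITS_PER_ULONG;
        int was_set = (*word & mask) != 0;

        *word |= mask;
        return was_set;
    }

    /* scan bsize bits from target, wrapping; claim and return the first
     * zero bit, or -1 when the group is full (-ENOSPC above) */
    static int find_and_claim(unsigned long *map, int bsize, int target)
    {
        int i;

        for (i = 0; i < bsize; i++) {
            int pos = (target + i) % bsize;

            if (!test_and_set_bit_ul(map, pos))
                return pos;
        }
        return -1;
    }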
239
240static unsigned long
241nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
242 unsigned long curr, unsigned long max)
243{
244 return min_t(unsigned long,
245 nilfs_palloc_groups_per_desc_block(inode) -
246 curr % nilfs_palloc_groups_per_desc_block(inode),
247 max - curr + 1);
248}
249
250int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
251 struct nilfs_palloc_req *req)
252{
253 struct buffer_head *desc_bh, *bitmap_bh;
254 struct nilfs_palloc_group_desc *desc;
255 unsigned char *bitmap;
256 void *desc_kaddr, *bitmap_kaddr;
257 unsigned long group, maxgroup, ngroups;
258 unsigned long group_offset, maxgroup_offset;
259 unsigned long n, entries_per_group, groups_per_desc_block;
260 unsigned long i, j;
261 int pos, ret;
262
263 ngroups = nilfs_palloc_groups_count(inode);
264 maxgroup = ngroups - 1;
265 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
266 entries_per_group = nilfs_palloc_entries_per_group(inode);
267 groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
268
269 for (i = 0; i < ngroups; i += n) {
270 if (group >= ngroups) {
271 /* wrap around */
272 group = 0;
273 maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
274 &maxgroup_offset) - 1;
275 }
276 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
277 if (ret < 0)
278 return ret;
279 desc_kaddr = kmap(desc_bh->b_page);
280 desc = nilfs_palloc_block_get_group_desc(
281 inode, group, desc_bh, desc_kaddr);
282 n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
283 maxgroup);
284 for (j = 0; j < n; j++, desc++, group++) {
285 if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
286 > 0) {
287 ret = nilfs_palloc_get_bitmap_block(
288 inode, group, 1, &bitmap_bh);
289 if (ret < 0)
290 goto out_desc;
291 bitmap_kaddr = kmap(bitmap_bh->b_page);
292 bitmap = nilfs_palloc_block_get_bitmap(
293 inode, bitmap_bh, bitmap_kaddr);
294 pos = nilfs_palloc_find_available_slot(
295 inode, group, group_offset, bitmap,
296 entries_per_group);
297 if (pos >= 0) {
298 /* found a free entry */
299 nilfs_palloc_group_desc_add_entries(
300 inode, group, desc, -1);
301 req->pr_entry_nr =
302 entries_per_group * group + pos;
303 kunmap(desc_bh->b_page);
304 kunmap(bitmap_bh->b_page);
305
306 req->pr_desc_bh = desc_bh;
307 req->pr_bitmap_bh = bitmap_bh;
308 return 0;
309 }
310 kunmap(bitmap_bh->b_page);
311 brelse(bitmap_bh);
312 }
313
314 group_offset = 0;
315 }
316
317 kunmap(desc_bh->b_page);
318 brelse(desc_bh);
319 }
320
321 /* no entries left */
322 return -ENOSPC;
323
324 out_desc:
325 kunmap(desc_bh->b_page);
326 brelse(desc_bh);
327 return ret;
328}
329
330void nilfs_palloc_commit_alloc_entry(struct inode *inode,
331 struct nilfs_palloc_req *req)
332{
333 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
334 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
335 nilfs_mdt_mark_dirty(inode);
336
337 brelse(req->pr_bitmap_bh);
338 brelse(req->pr_desc_bh);
339}
340
341void nilfs_palloc_commit_free_entry(struct inode *inode,
342 struct nilfs_palloc_req *req)
343{
344 struct nilfs_palloc_group_desc *desc;
345 unsigned long group, group_offset;
346 unsigned char *bitmap;
347 void *desc_kaddr, *bitmap_kaddr;
348
349 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
350 desc_kaddr = kmap(req->pr_desc_bh->b_page);
351 desc = nilfs_palloc_block_get_group_desc(inode, group,
352 req->pr_desc_bh, desc_kaddr);
353 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
354 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
355 bitmap_kaddr);
356
357 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
358 group_offset, bitmap))
359 printk(KERN_WARNING "%s: entry number %llu already freed\n",
360 __func__, (unsigned long long)req->pr_entry_nr);
361
362 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
363
364 kunmap(req->pr_bitmap_bh->b_page);
365 kunmap(req->pr_desc_bh->b_page);
366
367 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
368 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
369 nilfs_mdt_mark_dirty(inode);
370
371 brelse(req->pr_bitmap_bh);
372 brelse(req->pr_desc_bh);
373}
374
375void nilfs_palloc_abort_alloc_entry(struct inode *inode,
376 struct nilfs_palloc_req *req)
377{
378 struct nilfs_palloc_group_desc *desc;
379 void *desc_kaddr, *bitmap_kaddr;
380 unsigned char *bitmap;
381 unsigned long group, group_offset;
382
383 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
384 desc_kaddr = kmap(req->pr_desc_bh->b_page);
385 desc = nilfs_palloc_block_get_group_desc(inode, group,
386 req->pr_desc_bh, desc_kaddr);
387 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
388 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
389 bitmap_kaddr);
390 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
391 group_offset, bitmap))
392 printk(KERN_WARNING "%s: entry number %llu already freed\n",
393 __func__, (unsigned long long)req->pr_entry_nr);
394
395 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
396
397 kunmap(req->pr_bitmap_bh->b_page);
398 kunmap(req->pr_desc_bh->b_page);
399
400 brelse(req->pr_bitmap_bh);
401 brelse(req->pr_desc_bh);
402
403 req->pr_entry_nr = 0;
404 req->pr_bitmap_bh = NULL;
405 req->pr_desc_bh = NULL;
406}
407
408int nilfs_palloc_prepare_free_entry(struct inode *inode,
409 struct nilfs_palloc_req *req)
410{
411 struct buffer_head *desc_bh, *bitmap_bh;
412 unsigned long group, group_offset;
413 int ret;
414
415 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
416 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
417 if (ret < 0)
418 return ret;
419 ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
420 if (ret < 0) {
421 brelse(desc_bh);
422 return ret;
423 }
424
425 req->pr_desc_bh = desc_bh;
426 req->pr_bitmap_bh = bitmap_bh;
427 return 0;
428}
429
430void nilfs_palloc_abort_free_entry(struct inode *inode,
431 struct nilfs_palloc_req *req)
432{
433 brelse(req->pr_bitmap_bh);
434 brelse(req->pr_desc_bh);
435
436 req->pr_entry_nr = 0;
437 req->pr_bitmap_bh = NULL;
438 req->pr_desc_bh = NULL;
439}
440
441static int
442nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
443{
444 __u64 first, last;
445
446 first = group * nilfs_palloc_entries_per_group(inode);
447 last = first + nilfs_palloc_entries_per_group(inode) - 1;
448 return (nr >= first) && (nr <= last);
449}
450
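/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  nilfs_palloc_freev() frees @nitems entries listed
 * in @entry_nrs; each pass of the outer loop clears one run of entries
 * that share a block group, so the descriptor and bitmap blocks are read
 * and dirtied once per group rather than once per entry.  Callers
 * therefore benefit from passing entry numbers grouped by block group.
 */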
451int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
452{
453 struct buffer_head *desc_bh, *bitmap_bh;
454 struct nilfs_palloc_group_desc *desc;
455 unsigned char *bitmap;
456 void *desc_kaddr, *bitmap_kaddr;
457 unsigned long group, group_offset;
458 int i, j, n, ret;
459
460 for (i = 0; i < nitems; i += n) {
461 group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
462 ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
463 if (ret < 0)
464 return ret;
465 ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
466 &bitmap_bh);
467 if (ret < 0) {
468 brelse(desc_bh);
469 return ret;
470 }
471 desc_kaddr = kmap(desc_bh->b_page);
472 desc = nilfs_palloc_block_get_group_desc(
473 inode, group, desc_bh, desc_kaddr);
474 bitmap_kaddr = kmap(bitmap_bh->b_page);
475 bitmap = nilfs_palloc_block_get_bitmap(
476 inode, bitmap_bh, bitmap_kaddr);
477 for (j = i, n = 0;
478 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
479 entry_nrs[j]);
480 j++, n++) {
481 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
482 if (!nilfs_clear_bit_atomic(
483 nilfs_mdt_bgl_lock(inode, group),
484 group_offset, bitmap)) {
485 printk(KERN_WARNING
486 "%s: entry number %llu already freed\n",
487 __func__,
488 (unsigned long long)entry_nrs[j]);
489 }
490 }
491 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
492
493 kunmap(bitmap_bh->b_page);
494 kunmap(desc_bh->b_page);
495
496 nilfs_mdt_mark_buffer_dirty(desc_bh);
497 nilfs_mdt_mark_buffer_dirty(bitmap_bh);
498 nilfs_mdt_mark_dirty(inode);
499
500 brelse(bitmap_bh);
501 brelse(desc_bh);
502 }
503 return 0;
504}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 000000000000..4ace5475c2c7
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
1/*
2 * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#ifndef _NILFS_ALLOC_H
26#define _NILFS_ALLOC_H
27
28#include <linux/types.h>
29#include <linux/buffer_head.h>
30#include <linux/fs.h>
31
32static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode)
34{
35 return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BIT) */);
36}
37
38int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
39int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
40 struct buffer_head **);
41void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *);
43
44/**
45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
49 * @pr_entry_bh: buffer head of the buffer containing translation entries
50 */
51struct nilfs_palloc_req {
52 __u64 pr_entry_nr;
53 struct buffer_head *pr_desc_bh;
54 struct buffer_head *pr_bitmap_bh;
55 struct buffer_head *pr_entry_bh;
56};
57
58int nilfs_palloc_prepare_alloc_entry(struct inode *,
59 struct nilfs_palloc_req *);
60void nilfs_palloc_commit_alloc_entry(struct inode *,
61 struct nilfs_palloc_req *);
62void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
63void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
64int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
65void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
66int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
67
68#define nilfs_set_bit_atomic ext2_set_bit_atomic
69#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
70#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
71
72#endif /* _NILFS_ALLOC_H */
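/*
 * Editor's note: the block below is an illustrative sketch added for this
 * writeup, not part of the original patch.  The declarations above form a
 * prepare/commit/abort triple; a hypothetical caller allocating a single
 * entry (the helper name is an assumption for illustration; pr_entry_nr
 * doubles as the search hint, as the prepare routine shows) might look
 * like this:
 */
static int nilfs_palloc_alloc_entry_sketch(struct inode *inode, __u64 *nrp)
{
	struct nilfs_palloc_req req = { .pr_entry_nr = 0 };
	int err;

	err = nilfs_palloc_prepare_alloc_entry(inode, &req);
	if (err)
		return err;	/* e.g. -ENOSPC when every group is full */
	/*
	 * The entry block for req.pr_entry_nr would be initialized here;
	 * on failure, nilfs_palloc_abort_alloc_entry(inode, &req) rolls
	 * the reservation back instead of committing it.
	 */
	nilfs_palloc_commit_alloc_entry(inode, &req);
	*nrp = req.pr_entry_nr;
	return 0;
}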
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 000000000000..24638e059bf3
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,783 @@
1/*
2 * bmap.c - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include "nilfs.h"
27#include "bmap.h"
28#include "sb.h"
29#include "btnode.h"
30#include "mdt.h"
31#include "dat.h"
32#include "alloc.h"
33
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp)
36{
37 __u64 ptr;
38 int ret;
39
40 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0)
43 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
46 if (ret < 0)
47 goto out;
48 *ptrp = ptr;
49 }
50
51 out:
52 up_read(&bmap->b_sem);
53 return ret;
54}
55
56
57/**
58 * nilfs_bmap_lookup - find a record
59 * @bmap: bmap
60 * @key: key
61 * @recp: pointer to record
62 *
63 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
64 * @bmap.
65 *
66 * Return Value: On success, 0 is returned and the record associated with @key
67 * is stored in the place pointed to by @recp. On error, one of the following
68 * negative error codes is returned.
69 *
70 * %-EIO - I/O error.
71 *
72 * %-ENOMEM - Insufficient amount of memory available.
73 *
74 * %-ENOENT - A record associated with @key does not exist.
75 */
76int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
77 unsigned long key,
78 unsigned long *recp)
79{
80 __u64 ptr;
81 int ret;
82
83 /* XXX: use macro for level 1 */
84 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
85 if (recp != NULL)
86 *recp = ptr;
87 return ret;
88}
89
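/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  When the per-type bop_check_insert hook reports
 * that one more record would overflow the current representation (a
 * positive return value), nilfs_bmap_do_insert() gathers the existing
 * key/pointer pairs and converts the bmap from the small direct form to
 * the large B-tree form while inserting, setting the NILFS_BMAP_LARGE
 * flag on success.
 */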
90static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
91{
92 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
93 __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
94 int ret, n;
95
96 if (bmap->b_ops->bop_check_insert != NULL) {
97 ret = bmap->b_ops->bop_check_insert(bmap, key);
98 if (ret > 0) {
99 n = bmap->b_ops->bop_gather_data(
100 bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
101 if (n < 0)
102 return n;
103 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n,
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108
109 return ret;
110 } else if (ret < 0)
111 return ret;
112 }
113
114 return bmap->b_ops->bop_insert(bmap, key, ptr);
115}
116
117/**
118 * nilfs_bmap_insert - insert a new key-record pair into a bmap
119 * @bmap: bmap
120 * @key: key
121 * @rec: record
122 *
123 * Description: nilfs_bmap_insert() inserts the new key-record pair specified
124 * by @key and @rec into @bmap.
125 *
126 * Return Value: On success, 0 is returned. On error, one of the following
127 * negative error codes is returned.
128 *
129 * %-EIO - I/O error.
130 *
131 * %-ENOMEM - Insufficient amount of memory available.
132 *
133 * %-EEXIST - A record associated with @key already exists.
134 */
135int nilfs_bmap_insert(struct nilfs_bmap *bmap,
136 unsigned long key,
137 unsigned long rec)
138{
139 int ret;
140
141 down_write(&bmap->b_sem);
142 ret = nilfs_bmap_do_insert(bmap, key, rec);
143 up_write(&bmap->b_sem);
144 return ret;
145}
146
147static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
148{
149 __u64 keys[NILFS_BMAP_LARGE_LOW + 1];
150 __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
151 int ret, n;
152
153 if (bmap->b_ops->bop_check_delete != NULL) {
154 ret = bmap->b_ops->bop_check_delete(bmap, key);
155 if (ret > 0) {
156 n = bmap->b_ops->bop_gather_data(
157 bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
158 if (n < 0)
159 return n;
160 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n,
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165
166 return ret;
167 } else if (ret < 0)
168 return ret;
169 }
170
171 return bmap->b_ops->bop_delete(bmap, key);
172}
173
174int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
175{
176 __u64 lastkey;
177 int ret;
178
179 down_read(&bmap->b_sem);
180 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
181 if (!ret)
182 *key = lastkey;
183 up_read(&bmap->b_sem);
184 return ret;
185}
186
187/**
188 * nilfs_bmap_delete - delete a key-record pair from a bmap
189 * @bmap: bmap
190 * @key: key
191 *
192 * Description: nilfs_bmap_delete() deletes the key-record pair specified by
193 * @key from @bmap.
194 *
195 * Return Value: On success, 0 is returned. On error, one of the following
196 * negative error codes is returned.
197 *
198 * %-EIO - I/O error.
199 *
200 * %-ENOMEM - Insufficient amount of memory available.
201 *
202 * %-ENOENT - A record associated with @key does not exist.
203 */
204int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
205{
206 int ret;
207
208 down_write(&bmap->b_sem);
209 ret = nilfs_bmap_do_delete(bmap, key);
210 up_write(&bmap->b_sem);
211 return ret;
212}
213
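/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  Truncation walks backwards: it repeatedly looks up
 * the current last key and deletes it until the last key drops below
 * @key.  -ENOENT from bop_last_key means the map has run empty and is
 * treated as success.
 */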
214static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
215{
216 __u64 lastkey;
217 int ret;
218
219 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
220 if (ret < 0) {
221 if (ret == -ENOENT)
222 ret = 0;
223 return ret;
224 }
225
226 while (key <= lastkey) {
227 ret = nilfs_bmap_do_delete(bmap, lastkey);
228 if (ret < 0)
229 return ret;
230 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
231 if (ret < 0) {
232 if (ret == -ENOENT)
233 ret = 0;
234 return ret;
235 }
236 }
237 return 0;
238}
239
240/**
241 * nilfs_bmap_truncate - truncate a bmap to a specified key
242 * @bmap: bmap
243 * @key: key
244 *
245 * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
246 * greater than or equal to @key from @bmap.
247 *
248 * Return Value: On success, 0 is returned. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-EIO - I/O error.
252 *
253 * %-ENOMEM - Insufficient amount of memory available.
254 */
255int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
256{
257 int ret;
258
259 down_write(&bmap->b_sem);
260 ret = nilfs_bmap_do_truncate(bmap, key);
261 up_write(&bmap->b_sem);
262 return ret;
263}
264
265/**
266 * nilfs_bmap_clear - free resources a bmap holds
267 * @bmap: bmap
268 *
269 * Description: nilfs_bmap_clear() frees resources associated with @bmap.
270 */
271void nilfs_bmap_clear(struct nilfs_bmap *bmap)
272{
273 down_write(&bmap->b_sem);
274 if (bmap->b_ops->bop_clear != NULL)
275 bmap->b_ops->bop_clear(bmap);
276 up_write(&bmap->b_sem);
277}
278
279/**
280 * nilfs_bmap_propagate - propagate dirty state
281 * @bmap: bmap
282 * @bh: buffer head
283 *
284 * Description: nilfs_bmap_propagate() marks the buffers that directly or
285 * indirectly refer to the block specified by @bh dirty.
286 *
287 * Return Value: On success, 0 is returned. On error, one of the following
288 * negative error codes is returned.
289 *
290 * %-EIO - I/O error.
291 *
292 * %-ENOMEM - Insufficient amount of memory available.
293 */
294int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
295{
296 int ret;
297
298 down_write(&bmap->b_sem);
299 ret = bmap->b_ops->bop_propagate(bmap, bh);
300 up_write(&bmap->b_sem);
301 return ret;
302}
303
304/**
305 * nilfs_bmap_lookup_dirty_buffers - collect dirty buffers held under a bmap
306 * @bmap: bmap
307 * @listp: pointer to buffer head list
308 */
309void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
310 struct list_head *listp)
311{
312 if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
313 bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
314}
315
316/**
317 * nilfs_bmap_assign - assign a new block number to a block
318 * @bmap: bmap
319 * @bh: pointer to buffer head
320 * @blocknr: block number
321 * @binfo: block information
322 *
323 * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
324 * buffer specified by @bh.
325 *
326 * Return Value: On success, 0 is returned and the buffer head of a newly
327 * created buffer and the block information associated with the buffer are
328 * stored in the place pointed to by @bh and @binfo, respectively. On error, one
329 * of the following negative error codes is returned.
330 *
331 * %-EIO - I/O error.
332 *
333 * %-ENOMEM - Insufficient amount of memory available.
334 */
335int nilfs_bmap_assign(struct nilfs_bmap *bmap,
336 struct buffer_head **bh,
337 unsigned long blocknr,
338 union nilfs_binfo *binfo)
339{
340 int ret;
341
342 down_write(&bmap->b_sem);
343 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
344 up_write(&bmap->b_sem);
345 return ret;
346}
347
348/**
349 * nilfs_bmap_mark - mark block dirty
350 * @bmap: bmap
351 * @key: key
352 * @level: level
353 *
354 * Description: nilfs_bmap_mark() marks the block specified by @key and @level
355 * as dirty.
356 *
357 * Return Value: On success, 0 is returned. On error, one of the following
358 * negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 */
364int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
365{
366 int ret;
367
368 if (bmap->b_ops->bop_mark == NULL)
369 return 0;
370
371 down_write(&bmap->b_sem);
372 ret = bmap->b_ops->bop_mark(bmap, key, level);
373 up_write(&bmap->b_sem);
374 return ret;
375}
376
377/**
378 * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
379 * @bmap: bmap
380 *
381 * Description: nilfs_bmap_test_and_clear_dirty() is the atomic operation to
382 * test and clear the dirty state of @bmap.
383 *
384 * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
385 */
386int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
387{
388 int ret;
389
390 down_write(&bmap->b_sem);
391 ret = nilfs_bmap_dirty(bmap);
392 nilfs_bmap_clear_dirty(bmap);
393 up_write(&bmap->b_sem);
394 return ret;
395}
396
397
398/*
399 * Internal use only
400 */
401
402void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
403{
404 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
405 if (NILFS_MDT(bmap->b_inode))
406 nilfs_mdt_mark_dirty(bmap->b_inode);
407 else
408 mark_inode_dirty(bmap->b_inode);
409}
410
411void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
412{
413 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
414 if (NILFS_MDT(bmap->b_inode))
415 nilfs_mdt_mark_dirty(bmap->b_inode);
416 else
417 mark_inode_dirty(bmap->b_inode);
418}
419
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
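/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  nilfs_bmap_data_get_key() recovers the file-block
 * key of a data buffer from its page-cache position: the page index
 * yields the first block of the page, and walking the page's buffer ring
 * counts the buffer's offset within the page.
 */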
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh)
454{
455 struct buffer_head *pbh;
456 __u64 key;
457
458 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
459 bmap->b_inode->i_blkbits);
460 for (pbh = page_buffers(bh->b_page); pbh != bh;
461 pbh = pbh->b_this_page, key++);
462
463 return key;
464}
465
466__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
467{
468 __s64 diff;
469
470 diff = key - bmap->b_last_allocated_key;
471 if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
472 (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
473 (bmap->b_last_allocated_ptr + diff > 0))
474 return bmap->b_last_allocated_ptr + diff;
475 else
476 return NILFS_BMAP_INVALID_PTR;
477}
478
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
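/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  The helper below spreads the allocation targets of
 * different inodes across a DAT block group by striping on the inode
 * number, dividing each group into NILFS_BMAP_GROUP_DIV slices.
 */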
484#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{
487 struct inode *dat = nilfs_bmap_get_dat(bmap);
488 unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
489 unsigned long group = bmap->b_inode->i_ino / entries_per_group;
490
491 return group * entries_per_group +
492 (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
493 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494}
495
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req)
498{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500}
501
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512}
513
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
515 union nilfs_bmap_ptr_req *req)
516{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
518}
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
529 union nilfs_bmap_ptr_req *req)
530{
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
532}
533
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req)
536{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538}
539
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
550}
551
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req)
554{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556}
557
558int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
559 sector_t blocknr)
560{
561 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
562}
563
564int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
565{
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567}
568
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq)
572{
573 int ret;
574
575 ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
576 if (ret < 0)
577 return ret;
578 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
579 if (ret < 0)
580 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
581
582 return ret;
583}
584
585void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
586 union nilfs_bmap_ptr_req *oldreq,
587 union nilfs_bmap_ptr_req *newreq)
588{
589 bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
590 bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
591}
592
593void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
594 union nilfs_bmap_ptr_req *oldreq,
595 union nilfs_bmap_ptr_req *newreq)
596{
597 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
598 bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
599}
600
601static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
602 __u64 *ptrp)
603{
604 sector_t blocknr;
605 int ret;
606
607 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
608 if (ret < 0)
609 return ret;
610 if (ptrp != NULL)
611 *ptrp = blocknr;
612 return 0;
613}
614
615static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
616 union nilfs_bmap_ptr_req *req)
617{
618 /* ignore target ptr */
619 req->bpr_ptr = bmap->b_last_allocated_ptr++;
620 return 0;
621}
622
623static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
624 union nilfs_bmap_ptr_req *req)
625{
626 /* do nothing */
627}
628
629static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
630 union nilfs_bmap_ptr_req *req)
631{
632 bmap->b_last_allocated_ptr--;
633}
634
635static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
636 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
637 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
638 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
639 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
640 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
641 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
642 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
643 .bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
644 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
645
646 .bpop_translate = nilfs_bmap_translate_v,
647};
648
649static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
650 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
651 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
652 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
653 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
654 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
655 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
656 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
657 .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
658 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
659
660 .bpop_translate = nilfs_bmap_translate_v,
661};
662
663static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
664 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
665 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
666 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
667 .bpop_prepare_start_ptr = NULL,
668 .bpop_commit_start_ptr = NULL,
669 .bpop_abort_start_ptr = NULL,
670 .bpop_prepare_end_ptr = NULL,
671 .bpop_commit_end_ptr = NULL,
672 .bpop_abort_end_ptr = NULL,
673
674 .bpop_translate = NULL,
675};
676
677static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
678 .bpop_prepare_alloc_ptr = NULL,
679 .bpop_commit_alloc_ptr = NULL,
680 .bpop_abort_alloc_ptr = NULL,
681 .bpop_prepare_start_ptr = NULL,
682 .bpop_commit_start_ptr = NULL,
683 .bpop_abort_start_ptr = NULL,
684 .bpop_prepare_end_ptr = NULL,
685 .bpop_commit_end_ptr = NULL,
686 .bpop_abort_end_ptr = NULL,
687
688 .bpop_translate = NULL,
689};
690
691/**
692 * nilfs_bmap_read - read a bmap from an inode
693 * @bmap: bmap
694 * @raw_inode: on-disk inode
695 *
696 * Description: nilfs_bmap_read() initializes @bmap from @raw_inode (zero-filling the bmap data when @raw_inode is NULL).
697 *
698 * Return Value: On success, 0 is returned. On error, the following negative
699 * error code is returned.
700 *
701 * %-ENOMEM - Insufficient amount of memory available.
702 */
703int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
704{
705 if (raw_inode == NULL)
706 memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
707 else
708 memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
709
710 init_rwsem(&bmap->b_sem);
711 bmap->b_state = 0;
712 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
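	/*
	 * Editor's note: descriptive comment added for this writeup, not
	 * part of the original patch.  The pointer-operation table is
	 * chosen per inode: the DAT uses the "p" ops, which hand out
	 * pointers from a plain counter instead of going through the DAT
	 * itself; the cpfile and sufile use the "vmdt" ops, identical to
	 * the "v" ops except that commit_end calls nilfs_dat_commit_end()
	 * with its last argument set to 1; every other inode uses the
	 * DAT-backed "v" ops.
	 */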
713 switch (bmap->b_inode->i_ino) {
714 case NILFS_DAT_INO:
715 bmap->b_pops = &nilfs_bmap_ptr_ops_p;
716 bmap->b_last_allocated_key = 0; /* XXX: use macro */
717 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
718 break;
719 case NILFS_CPFILE_INO:
720 case NILFS_SUFILE_INO:
721 bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
722 bmap->b_last_allocated_key = 0; /* XXX: use macro */
723 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
724 break;
725 default:
726 bmap->b_pops = &nilfs_bmap_ptr_ops_v;
727 bmap->b_last_allocated_key = 0; /* XXX: use macro */
728 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
729 break;
730 }
731
732 return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
733 nilfs_btree_init(bmap,
734 NILFS_BMAP_LARGE_LOW,
735 NILFS_BMAP_LARGE_HIGH) :
736 nilfs_direct_init(bmap,
737 NILFS_BMAP_SMALL_LOW,
738 NILFS_BMAP_SMALL_HIGH);
739}
740
741/**
742 * nilfs_bmap_write - write back a bmap to an inode
743 * @bmap: bmap
744 * @raw_inode: on-disk inode
745 *
746 * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
747 */
748void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
749{
750 down_write(&bmap->b_sem);
751 memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
752 NILFS_INODE_BMAP_SIZE * sizeof(__le64));
753 if (bmap->b_inode->i_ino == NILFS_DAT_INO)
754 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
755
756 up_write(&bmap->b_sem);
757}
758
759void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
760{
761 memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
762 init_rwsem(&bmap->b_sem);
763 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
764 bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
765 bmap->b_last_allocated_key = 0;
766 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
767 bmap->b_state = 0;
768 nilfs_btree_init_gc(bmap);
769}
770
771void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
772{
773 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
774 init_rwsem(&gcbmap->b_sem);
775 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
776}
777
778void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
779{
780 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
781 init_rwsem(&bmap->b_sem);
782 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
783}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
1/*
2 * bmap.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_H
24#define _NILFS_BMAP_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "alloc.h"
31
32#define NILFS_BMAP_INVALID_PTR 0
33
34#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
35#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
36#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
37#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
38
39#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
40
41
42struct nilfs_bmap;
43
44/**
45 * union nilfs_bmap_ptr_req - request for bmap ptr
46 * @bpr_ptr: bmap pointer
47 * @bpr_req: request for persistent allocator
48 */
49union nilfs_bmap_ptr_req {
50 __u64 bpr_ptr;
51 struct nilfs_palloc_req bpr_req;
52};
53
54/**
55 * struct nilfs_bmap_stats - bmap statistics
56 * @bs_nblocks: number of blocks created or deleted
57 */
58struct nilfs_bmap_stats {
59 unsigned int bs_nblocks;
60};
61
62/**
63 * struct nilfs_bmap_operations - bmap operation table
64 */
65struct nilfs_bmap_operations {
66 int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
67 int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
68 int (*bop_delete)(struct nilfs_bmap *, __u64);
69 void (*bop_clear)(struct nilfs_bmap *);
70
71 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
72 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
73 struct list_head *);
74
75 int (*bop_assign)(struct nilfs_bmap *,
76 struct buffer_head **,
77 sector_t,
78 union nilfs_binfo *);
79 int (*bop_mark)(struct nilfs_bmap *, __u64, int);
80
81 /* The following functions are internal use only. */
82 int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
83 int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
84 int (*bop_check_delete)(struct nilfs_bmap *, __u64);
85 int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
86};
87
88
89/**
90 * struct nilfs_bmap_ptr_operations - bmap ptr operation table
91 */
92struct nilfs_bmap_ptr_operations {
93 int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
94 union nilfs_bmap_ptr_req *);
95 void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
96 union nilfs_bmap_ptr_req *);
97 void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
98 union nilfs_bmap_ptr_req *);
99 int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
100 union nilfs_bmap_ptr_req *);
101 void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
102 union nilfs_bmap_ptr_req *,
103 sector_t);
104 void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
105 union nilfs_bmap_ptr_req *);
106 int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
107 union nilfs_bmap_ptr_req *);
108 void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
109 union nilfs_bmap_ptr_req *);
110 void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
111 union nilfs_bmap_ptr_req *);
112
113 int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
114};
115
116
117#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
118#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
119#define NILFS_BMAP_NEW_PTR_INIT \
120 (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
121
122static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
123{
124 return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
125}
126
127
128/**
129 * struct nilfs_bmap - bmap structure
130 * @b_u: raw data
131 * @b_sem: semaphore
132 * @b_inode: owner of bmap
133 * @b_ops: bmap operation table
134 * @b_pops: bmap ptr operation table
135 * @b_low: low watermark of conversion
136 * @b_high: high watermark of conversion
137 * @b_last_allocated_key: last allocated key for data block
138 * @b_last_allocated_ptr: last allocated ptr for data block
139 * @b_state: state
140 */
141struct nilfs_bmap {
142 union {
143 __u8 u_flags;
144 __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
145 } b_u;
146 struct rw_semaphore b_sem;
147 struct inode *b_inode;
148 const struct nilfs_bmap_operations *b_ops;
149 const struct nilfs_bmap_ptr_operations *b_pops;
150 __u64 b_low;
151 __u64 b_high;
152 __u64 b_last_allocated_key;
153 __u64 b_last_allocated_ptr;
154 int b_state;
155};
156
157/* state */
158#define NILFS_BMAP_DIRTY 0x00000001
159
160
161int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
162int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
163void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
164int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
165int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
166int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
167int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
168int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
169void nilfs_bmap_clear(struct nilfs_bmap *);
170int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
171void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
172int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
173 unsigned long, union nilfs_binfo *);
174int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
175int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
176
177void nilfs_bmap_init_gc(struct nilfs_bmap *);
178void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
179void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
180
181
182/*
183 * Internal use only
184 */
185
186int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
187int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
188
189
190__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
191 const struct buffer_head *);
192
193__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
194__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
195
196int nilfs_bmap_prepare_update(struct nilfs_bmap *,
197 union nilfs_bmap_ptr_req *,
198 union nilfs_bmap_ptr_req *);
199void nilfs_bmap_commit_update(struct nilfs_bmap *,
200 union nilfs_bmap_ptr_req *,
201 union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_update(struct nilfs_bmap *,
203 union nilfs_bmap_ptr_req *,
204 union nilfs_bmap_ptr_req *);
205
206void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
207void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
208
209
210int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
211 struct buffer_head **);
212void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
213int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
214 struct buffer_head **);
215void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
216
217
218/* Assume that bmap semaphore is locked. */
219static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
220{
221 return !!(bmap->b_state & NILFS_BMAP_DIRTY);
222}
223
224/* Assume that bmap semaphore is locked. */
225static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
226{
227 bmap->b_state |= NILFS_BMAP_DIRTY;
228}
229
230/* Assume that bmap semaphore is locked. */
231static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
232{
233 bmap->b_state &= ~NILFS_BMAP_DIRTY;
234}
235
236
237#define NILFS_BMAP_LARGE 0x1
238
239#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN
240#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX
241#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX
242#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX
243
244#endif /* _NILFS_BMAP_H */
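/*
 * Editor's note: the block below is an illustrative sketch added for this
 * writeup, not part of the original patch.  The internal
 * prepare/commit/abort update helpers declared above are meant to bracket
 * the rewiring of a pointer like a small transaction; a hypothetical
 * wrapper (the name is an assumption for illustration) might look like
 * this:
 */
static int nilfs_bmap_update_ptr_sketch(struct nilfs_bmap *bmap,
					union nilfs_bmap_ptr_req *oldreq,
					union nilfs_bmap_ptr_req *newreq)
{
	int err;

	err = nilfs_bmap_prepare_update(bmap, oldreq, newreq);
	if (err)
		return err;
	/*
	 * The caller would switch the affected slot over to
	 * newreq->bpr_ptr here; on failure it would call
	 * nilfs_bmap_abort_update(bmap, oldreq, newreq) instead.
	 */
	nilfs_bmap_commit_update(bmap, oldreq, newreq);
	return 0;
}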
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 000000000000..d41509bff47b
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * union nilfs_bmap_union - in-core bmap representations
32 * @bi_bmap: bmap structure
33 * @bi_direct: direct map structure
34 * @bi_btree: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
1/*
2 * btnode.c - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * This file was originally written by Seiji Kihara <kihara@osrg.net>
21 * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
22 * stabilization and simplification.
23 *
24 */
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/mm.h>
29#include <linux/backing-dev.h>
30#include "nilfs.h"
31#include "mdt.h"
32#include "dat.h"
33#include "page.h"
34#include "btnode.h"
35
36
37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list);
42 spin_lock_init(&btnc->private_lock);
43
44 spin_lock_init(&btnc->i_mmap_lock);
45 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47}
48
49static struct address_space_operations def_btnode_aops;
50
51void nilfs_btnode_cache_init(struct address_space *btnc)
52{
53 btnc->host = NULL; /* can safely set to host inode ? */
54 btnc->flags = 0;
55 mapping_set_gfp_mask(btnc, GFP_NOFS);
56 btnc->assoc_mapping = NULL;
57 btnc->backing_dev_info = &default_backing_dev_info;
58 btnc->a_ops = &def_btnode_aops;
59}
60
61void nilfs_btnode_cache_clear(struct address_space *btnc)
62{
63 invalidate_mapping_pages(btnc, 0, -1);
64 truncate_inode_pages(btnc, 0);
65}
66
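/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  nilfs_btnode_submit_block() grabs (or creates) the
 * node buffer for @blocknr and, unless the buffer is already valid,
 * submits a read.  When @pblocknr is zero and the cache is not the DAT's,
 * @blocknr is treated as a virtual block number and translated through
 * the DAT first; -EEXIST is used internally to signal a cache hit that
 * needs no I/O.
 */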
67int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
68 sector_t pblocknr, struct buffer_head **pbh,
69 int newblk)
70{
71 struct buffer_head *bh;
72 struct inode *inode = NILFS_BTNC_I(btnc);
73 int err;
74
75 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
76 if (unlikely(!bh))
77 return -ENOMEM;
78
79 err = -EEXIST; /* internal code */
80 if (newblk) {
81 if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
82 buffer_dirty(bh))) {
83 brelse(bh);
84 BUG();
85 }
86 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
87 bh->b_blocknr = blocknr;
88 set_buffer_mapped(bh);
89 set_buffer_uptodate(bh);
90 goto found;
91 }
92
93 if (buffer_uptodate(bh) || buffer_dirty(bh))
94 goto found;
95
96 if (pblocknr == 0) {
97 pblocknr = blocknr;
98 if (inode->i_ino != NILFS_DAT_INO) {
99 struct inode *dat =
100 nilfs_dat_inode(NILFS_I_NILFS(inode));
101
102 /* blocknr is a virtual block number */
103 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
104 if (unlikely(err)) {
105 brelse(bh);
106 goto out_locked;
107 }
108 }
109 }
110 lock_buffer(bh);
111 if (buffer_uptodate(bh)) {
112 unlock_buffer(bh);
113 err = -EEXIST; /* internal code */
114 goto found;
115 }
116 set_buffer_mapped(bh);
117 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
118 bh->b_blocknr = pblocknr; /* set block address for read */
119 bh->b_end_io = end_buffer_read_sync;
120 get_bh(bh);
121 submit_bh(READ, bh);
122 bh->b_blocknr = blocknr; /* set back to the given block address */
123 err = 0;
124found:
125 *pbh = bh;
126
127out_locked:
128 unlock_page(bh->b_page);
129 page_cache_release(bh->b_page);
130 return err;
131}
132
133int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
134 sector_t pblocknr, struct buffer_head **pbh, int newblk)
135{
136 struct buffer_head *bh;
137 int err;
138
139 err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
140 if (err == -EEXIST) /* internal code (cache hit) */
141 return 0;
142 if (unlikely(err))
143 return err;
144
145 bh = *pbh;
146 wait_on_buffer(bh);
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151 return 0;
152}
153
154/**
155 * nilfs_btnode_delete - delete B-tree node buffer
156 * @bh: buffer to be deleted
157 *
158 * nilfs_btnode_delete() invalidates the specified buffer and deletes the
159 * page containing the buffer if the page is no longer busy.
160 */
161void nilfs_btnode_delete(struct buffer_head *bh)
162{
163 struct address_space *mapping;
164 struct page *page = bh->b_page;
165 pgoff_t index = page_index(page);
166 int still_dirty;
167
168 page_cache_get(page);
169 lock_page(page);
170 wait_on_page_writeback(page);
171
172 nilfs_forget_buffer(bh);
173 still_dirty = PageDirty(page);
174 mapping = page->mapping;
175 unlock_page(page);
176 page_cache_release(page);
177
178 if (!still_dirty && mapping)
179 invalidate_inode_pages2_range(mapping, index, index);
180}
181
182/**
183 * nilfs_btnode_prepare_change_key
184 * prepare to move the contents of the block at the old key to the new key.
185 * The old buffer is not removed, but may be reused as the new buffer.
186 * May return -ENOMEM on memory allocation failure, or -EIO on a disk
187 * read error.
188 */
189int nilfs_btnode_prepare_change_key(struct address_space *btnc,
190 struct nilfs_btnode_chkey_ctxt *ctxt)
191{
192 struct buffer_head *obh, *nbh;
193 struct inode *inode = NILFS_BTNC_I(btnc);
194 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
195 int err;
196
197 if (oldkey == newkey)
198 return 0;
199
200 obh = ctxt->bh;
201 ctxt->newbh = NULL;
202
203 if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
204 lock_page(obh->b_page);
205 /*
206 * We cannot call radix_tree_preload for the kernels older
207 * than 2.6.23, because it is not exported for modules.
208 */
209 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
210 if (err)
211 goto failed_unlock;
212 /* BUG_ON(oldkey != obh->b_page->index); */
213 if (unlikely(oldkey != obh->b_page->index))
214 NILFS_PAGE_BUG(obh->b_page,
215 "invalid oldkey %lld (newkey=%lld)",
216 (unsigned long long)oldkey,
217 (unsigned long long)newkey);
218
219retry:
220 spin_lock_irq(&btnc->tree_lock);
221 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
222 spin_unlock_irq(&btnc->tree_lock);
223 /*
224 * Note: page->index will not change to newkey until
225 * nilfs_btnode_commit_change_key() is called.
226 * To protect the page in this intermediate state, the page lock
227 * is held.
228 */
229 radix_tree_preload_end();
230 if (!err)
231 return 0;
232 else if (err != -EEXIST)
233 goto failed_unlock;
234
235 err = invalidate_inode_pages2_range(btnc, newkey, newkey);
236 if (!err)
237 goto retry;
238 /* fallback to copy mode */
239 unlock_page(obh->b_page);
240 }
241
242 err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
243 if (likely(!err)) {
244 BUG_ON(nbh == obh);
245 ctxt->newbh = nbh;
246 }
247 return err;
248
249 failed_unlock:
250 unlock_page(obh->b_page);
251 return err;
252}
253
254/**
255 * nilfs_btnode_commit_change_key
256 * commit the change_key operation prepared by prepare_change_key().
257 */
258void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 struct nilfs_btnode_chkey_ctxt *ctxt)
260{
261 struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
262 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
263 struct page *opage;
264
265 if (oldkey == newkey)
266 return;
267
268 if (nbh == NULL) { /* blocksize == pagesize */
269 opage = obh->b_page;
270 if (unlikely(oldkey != opage->index))
271 NILFS_PAGE_BUG(opage,
272 "invalid oldkey %lld (newkey=%lld)",
273 (unsigned long long)oldkey,
274 (unsigned long long)newkey);
275 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
276 BUG();
277
278 spin_lock_irq(&btnc->tree_lock);
279 radix_tree_delete(&btnc->page_tree, oldkey);
280 radix_tree_tag_set(&btnc->page_tree, newkey,
281 PAGECACHE_TAG_DIRTY);
282 spin_unlock_irq(&btnc->tree_lock);
283
284 opage->index = obh->b_blocknr = newkey;
285 unlock_page(opage);
286 } else {
287 nilfs_copy_buffer(nbh, obh);
288 nilfs_btnode_mark_dirty(nbh);
289
290 nbh->b_blocknr = newkey;
291 ctxt->bh = nbh;
292 nilfs_btnode_delete(obh); /* will decrement bh->b_count */
293 }
294}
295
296/**
297 * nilfs_btnode_abort_change_key
298 * abort the change_key operation prepared by prepare_change_key().
299 */
300void nilfs_btnode_abort_change_key(struct address_space *btnc,
301 struct nilfs_btnode_chkey_ctxt *ctxt)
302{
303 struct buffer_head *nbh = ctxt->newbh;
304 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
305
306 if (oldkey == newkey)
307 return;
308
309 if (nbh == NULL) { /* blocksize == pagesize */
310 spin_lock_irq(&btnc->tree_lock);
311 radix_tree_delete(&btnc->page_tree, newkey);
312 spin_unlock_irq(&btnc->tree_lock);
313 unlock_page(ctxt->bh->b_page);
314 } else
315 brelse(nbh);
316}
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 000000000000..35faa86444a7
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
1/*
2 * btnode.h - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_BTNODE_H
25#define _NILFS_BTNODE_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/backing-dev.h>
31
32
33struct nilfs_btnode_chkey_ctxt {
34 __u64 oldkey;
35 __u64 newkey;
36 struct buffer_head *bh;
37 struct buffer_head *newbh;
38};
39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *);
42void nilfs_btnode_cache_clear(struct address_space *);
43int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
44 struct buffer_head **, int);
45int nilfs_btnode_get(struct address_space *, __u64, sector_t,
46 struct buffer_head **, int);
47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *);
50void nilfs_btnode_commit_change_key(struct address_space *,
51 struct nilfs_btnode_chkey_ctxt *);
52void nilfs_btnode_abort_change_key(struct address_space *,
53 struct nilfs_btnode_chkey_ctxt *);
54
55#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
56
57
58#endif /* _NILFS_BTNODE_H */
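/*
 * Editor's note: the block below is an illustrative sketch added for this
 * writeup, not part of the original patch.  Re-keying a cached B-tree
 * node follows the same prepare/commit/abort discipline as the
 * allocators; a hypothetical caller (the helper name is an assumption for
 * illustration) might look like this:
 */
static int nilfs_btnode_rekey_sketch(struct address_space *btnc,
				     struct buffer_head *bh,
				     __u64 oldkey, __u64 newkey)
{
	struct nilfs_btnode_chkey_ctxt ctxt = {
		.oldkey = oldkey,
		.newkey = newkey,
		.bh = bh,
	};
	int err;

	err = nilfs_btnode_prepare_change_key(btnc, &ctxt);
	if (err)
		return err;	/* -ENOMEM or -EIO */
	nilfs_btnode_commit_change_key(btnc, &ctxt);
	/* ctxt.bh now refers to the buffer cached under @newkey */
	return 0;
}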
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 000000000000..6b37a2767293
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
1/*
2 * btree.c - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/slab.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include <linux/pagevec.h>
27#include "nilfs.h"
28#include "page.h"
29#include "btnode.h"
30#include "btree.h"
31#include "alloc.h"
32
33/**
34 * struct nilfs_btree_path - A path on which B-tree operations are executed
35 * @bp_bh: buffer head of node block
36 * @bp_sib_bh: buffer head of sibling node block
37 * @bp_index: index of child node
38 * @bp_oldreq: ptr end request for old ptr
39 * @bp_newreq: ptr alloc request for new ptr
 * @bp_ctxt: change-key context used when a node buffer moves in the cache
40 * @bp_op: rebalance operation
41 */
42struct nilfs_btree_path {
43 struct buffer_head *bp_bh;
44 struct buffer_head *bp_sib_bh;
45 int bp_index;
46 union nilfs_bmap_ptr_req bp_oldreq;
47 union nilfs_bmap_ptr_req bp_newreq;
48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
49 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
50 int, __u64 *, __u64 *);
51};
52
53/*
54 * B-tree path operations
55 */
56
57static struct kmem_cache *nilfs_btree_path_cache;
58
59int __init nilfs_btree_path_cache_init(void)
60{
61 nilfs_btree_path_cache =
62 kmem_cache_create("nilfs2_btree_path_cache",
63 sizeof(struct nilfs_btree_path) *
64 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
65 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
66}
67
68void nilfs_btree_path_cache_destroy(void)
69{
70 kmem_cache_destroy(nilfs_btree_path_cache);
71}
72
73static inline struct nilfs_btree_path *
74nilfs_btree_alloc_path(const struct nilfs_btree *btree)
75{
76 return (struct nilfs_btree_path *)
77 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78}
79
80static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
81 struct nilfs_btree_path *path)
82{
83 kmem_cache_free(nilfs_btree_path_cache, path);
84}
85
86static void nilfs_btree_init_path(const struct nilfs_btree *btree,
87 struct nilfs_btree_path *path)
88{
89 int level;
90
91 for (level = NILFS_BTREE_LEVEL_DATA;
92 level < NILFS_BTREE_LEVEL_MAX;
93 level++) {
94 path[level].bp_bh = NULL;
95 path[level].bp_sib_bh = NULL;
96 path[level].bp_index = 0;
97 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
98 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
99 path[level].bp_op = NULL;
100 }
101}
102
103static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
104 struct nilfs_btree_path *path)
105{
106 int level;
107
108 for (level = NILFS_BTREE_LEVEL_DATA;
109 level < NILFS_BTREE_LEVEL_MAX;
110 level++) {
111 if (path[level].bp_bh != NULL) {
112 nilfs_bmap_put_block(&btree->bt_bmap,
113 path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124}
125
126
127/*
128 * B-tree node operations
129 */
130
131static inline int
132nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
133 const struct nilfs_btree_node *node)
134{
135 return node->bn_flags;
136}
137
138static inline void
139nilfs_btree_node_set_flags(struct nilfs_btree *btree,
140 struct nilfs_btree_node *node,
141 int flags)
142{
143 node->bn_flags = flags;
144}
145
146static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
147 const struct nilfs_btree_node *node)
148{
149 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
150}
151
152static inline int
153nilfs_btree_node_get_level(const struct nilfs_btree *btree,
154 const struct nilfs_btree_node *node)
155{
156 return node->bn_level;
157}
158
159static inline void
160nilfs_btree_node_set_level(struct nilfs_btree *btree,
161 struct nilfs_btree_node *node,
162 int level)
163{
164 node->bn_level = level;
165}
166
167static inline int
168nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
169 const struct nilfs_btree_node *node)
170{
171 return le16_to_cpu(node->bn_nchildren);
172}
173
174static inline void
175nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
176 struct nilfs_btree_node *node,
177 int nchildren)
178{
179 node->bn_nchildren = cpu_to_le16(nchildren);
180}
181
182static inline int
183nilfs_btree_node_size(const struct nilfs_btree *btree)
184{
185 return 1 << btree->bt_bmap.b_inode->i_blkbits;
186}
187
188static inline int
189nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
190 const struct nilfs_btree_node *node)
191{
192 return nilfs_btree_node_root(btree, node) ?
193 NILFS_BTREE_ROOT_NCHILDREN_MIN :
194 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
195}
196
197static inline int
198nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
199 const struct nilfs_btree_node *node)
200{
201 return nilfs_btree_node_root(btree, node) ?
202 NILFS_BTREE_ROOT_NCHILDREN_MAX :
203 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
204}
205
206static inline __le64 *
207nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
208 const struct nilfs_btree_node *node)
209{
210 return (__le64 *)((char *)(node + 1) +
211 (nilfs_btree_node_root(btree, node) ?
212 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
213}
214
215static inline __le64 *
216nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
217 const struct nilfs_btree_node *node)
218{
219 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
220 nilfs_btree_node_nchildren_max(btree, node));
221}
222
223static inline __u64
224nilfs_btree_node_get_key(const struct nilfs_btree *btree,
225 const struct nilfs_btree_node *node, int index)
226{
227 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
228 index));
229}
230
231static inline void
232nilfs_btree_node_set_key(struct nilfs_btree *btree,
233 struct nilfs_btree_node *node, int index, __u64 key)
234{
235 *(nilfs_btree_node_dkeys(btree, node) + index) =
236 nilfs_bmap_key_to_dkey(key);
237}
238
239static inline __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
241 const struct nilfs_btree_node *node,
242 int index)
243{
244 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
245 index));
246}
247
248static inline void
249nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
250 struct nilfs_btree_node *node,
251 int index,
252 __u64 ptr)
253{
254 *(nilfs_btree_node_dptrs(btree, node) + index) =
255 nilfs_bmap_ptr_to_dptr(ptr);
256}
257
258static void nilfs_btree_node_init(struct nilfs_btree *btree,
259 struct nilfs_btree_node *node,
260 int flags, int level, int nchildren,
261 const __u64 *keys, const __u64 *ptrs)
262{
263 __le64 *dkeys;
264 __le64 *dptrs;
265 int i;
266
267 nilfs_btree_node_set_flags(btree, node, flags);
268 nilfs_btree_node_set_level(btree, node, level);
269 nilfs_btree_node_set_nchildren(btree, node, nchildren);
270
271 dkeys = nilfs_btree_node_dkeys(btree, node);
272 dptrs = nilfs_btree_node_dptrs(btree, node);
273 for (i = 0; i < nchildren; i++) {
274 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
275 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
276 }
277}
278
279/* Assume the buffer heads corresponding to left and right are locked. */
280static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
281 struct nilfs_btree_node *left,
282 struct nilfs_btree_node *right,
283 int n)
284{
285 __le64 *ldkeys, *rdkeys;
286 __le64 *ldptrs, *rdptrs;
287 int lnchildren, rnchildren;
288
289 ldkeys = nilfs_btree_node_dkeys(btree, left);
290 ldptrs = nilfs_btree_node_dptrs(btree, left);
291 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
292
293 rdkeys = nilfs_btree_node_dkeys(btree, right);
294 rdptrs = nilfs_btree_node_dptrs(btree, right);
295 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
296
297 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
298 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
299 memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
300 memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
301
302 lnchildren += n;
303 rnchildren -= n;
304 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
305 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
306}
307
308/* Assume that the buffer heads corresponding to left and right are locked. */
309static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
310 struct nilfs_btree_node *left,
311 struct nilfs_btree_node *right,
312 int n)
313{
314 __le64 *ldkeys, *rdkeys;
315 __le64 *ldptrs, *rdptrs;
316 int lnchildren, rnchildren;
317
318 ldkeys = nilfs_btree_node_dkeys(btree, left);
319 ldptrs = nilfs_btree_node_dptrs(btree, left);
320 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
321
322 rdkeys = nilfs_btree_node_dkeys(btree, right);
323 rdptrs = nilfs_btree_node_dptrs(btree, right);
324 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
325
326 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
327 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
328 memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
329 memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
330
331 lnchildren -= n;
332 rnchildren += n;
333 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
334 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
335}
336
337/* Assume that the buffer head corresponding to node is locked. */
338static void nilfs_btree_node_insert(struct nilfs_btree *btree,
339 struct nilfs_btree_node *node,
340 __u64 key, __u64 ptr, int index)
341{
342 __le64 *dkeys;
343 __le64 *dptrs;
344 int nchildren;
345
346 dkeys = nilfs_btree_node_dkeys(btree, node);
347 dptrs = nilfs_btree_node_dptrs(btree, node);
348 nchildren = nilfs_btree_node_get_nchildren(btree, node);
349 if (index < nchildren) {
350 memmove(dkeys + index + 1, dkeys + index,
351 (nchildren - index) * sizeof(*dkeys));
352 memmove(dptrs + index + 1, dptrs + index,
353 (nchildren - index) * sizeof(*dptrs));
354 }
355 dkeys[index] = nilfs_bmap_key_to_dkey(key);
356 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
357 nchildren++;
358 nilfs_btree_node_set_nchildren(btree, node, nchildren);
359}
360
361/* Assume that the buffer head corresponding to node is locked. */
362static void nilfs_btree_node_delete(struct nilfs_btree *btree,
363 struct nilfs_btree_node *node,
364 __u64 *keyp, __u64 *ptrp, int index)
365{
366 __u64 key;
367 __u64 ptr;
368 __le64 *dkeys;
369 __le64 *dptrs;
370 int nchildren;
371
372 dkeys = nilfs_btree_node_dkeys(btree, node);
373 dptrs = nilfs_btree_node_dptrs(btree, node);
374 key = nilfs_bmap_dkey_to_key(dkeys[index]);
375 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
376 nchildren = nilfs_btree_node_get_nchildren(btree, node);
377 if (keyp != NULL)
378 *keyp = key;
379 if (ptrp != NULL)
380 *ptrp = ptr;
381
382 if (index < nchildren - 1) {
383 memmove(dkeys + index, dkeys + index + 1,
384 (nchildren - index - 1) * sizeof(*dkeys));
385 memmove(dptrs + index, dptrs + index + 1,
386 (nchildren - index - 1) * sizeof(*dptrs));
387 }
388 nchildren--;
389 nilfs_btree_node_set_nchildren(btree, node, nchildren);
390}
391
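/*
 * Binary-search @node for @key.  Returns nonzero iff the key was found
 * exactly.  *indexp receives the matching slot on success; otherwise it
 * is adjusted to the child slot to descend into (internal nodes) or to
 * the insertion point (lowest node level).
 */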
392static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
393 const struct nilfs_btree_node *node,
394 __u64 key, int *indexp)
395{
396 __u64 nkey;
397 int index, low, high, s;
398
399 /* binary search */
400 low = 0;
401 high = nilfs_btree_node_get_nchildren(btree, node) - 1;
402 index = 0;
403 s = 0;
404 while (low <= high) {
405 index = (low + high) / 2;
406 nkey = nilfs_btree_node_get_key(btree, node, index);
407 if (nkey == key) {
408 s = 0;
409 goto out;
410 } else if (nkey < key) {
411 low = index + 1;
412 s = -1;
413 } else {
414 high = index - 1;
415 s = 1;
416 }
417 }
418
419 /* adjust index */
420 if (nilfs_btree_node_get_level(btree, node) >
421 NILFS_BTREE_LEVEL_NODE_MIN) {
422 if ((s > 0) && (index > 0))
423 index--;
424 } else if (s < 0)
425 index++;
426
427 out:
428 *indexp = index;
429
430 return s == 0;
431}
432
433static inline struct nilfs_btree_node *
434nilfs_btree_get_root(const struct nilfs_btree *btree)
435{
436 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
437}
438
439static inline struct nilfs_btree_node *
440nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
441 const struct nilfs_btree_path *path,
442 int level)
443{
444 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
445}
446
447static inline struct nilfs_btree_node *
448nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
449 const struct nilfs_btree_path *path,
450 int level)
451{
452 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
453}
454
455static inline int nilfs_btree_height(const struct nilfs_btree *btree)
456{
457 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
458 + 1;
459}
460
461static inline struct nilfs_btree_node *
462nilfs_btree_get_node(const struct nilfs_btree *btree,
463 const struct nilfs_btree_path *path,
464 int level)
465{
466 return (level == nilfs_btree_height(btree) - 1) ?
467 nilfs_btree_get_root(btree) :
468 nilfs_btree_get_nonroot_node(btree, path, level);
469}
470
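/*
 * Walk from the root down to @minlevel, recording the buffer head and
 * child index visited at each level in @path.  Returns 0 and the final
 * pointer via *ptrp when @key exists, -ENOENT otherwise; even on
 * -ENOENT the recorded path marks where an insertion would go.
 */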
471static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
472 struct nilfs_btree_path *path,
473 __u64 key, __u64 *ptrp, int minlevel)
474{
475 struct nilfs_btree_node *node;
476 __u64 ptr;
477 int level, index, found, ret;
478
479 node = nilfs_btree_get_root(btree);
480 level = nilfs_btree_node_get_level(btree, node);
481 if ((level < minlevel) ||
482 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
483 return -ENOENT;
484
485 found = nilfs_btree_node_lookup(btree, node, key, &index);
486 ptr = nilfs_btree_node_get_ptr(btree, node, index);
487 path[level].bp_bh = NULL;
488 path[level].bp_index = index;
489
490 for (level--; level >= minlevel; level--) {
491 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
492 &path[level].bp_bh);
493 if (ret < 0)
494 return ret;
495 node = nilfs_btree_get_nonroot_node(btree, path, level);
496 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
497 if (!found)
498 found = nilfs_btree_node_lookup(btree, node, key,
499 &index);
500 else
501 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(btree, node))
503 ptr = nilfs_btree_node_get_ptr(btree, node, index);
504 else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR;
508 }
509 path[level].bp_index = index;
510 }
511 if (!found)
512 return -ENOENT;
513
514 if (ptrp != NULL)
515 *ptrp = ptr;
516
517 return 0;
518}
519
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
521 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp)
523{
524 struct nilfs_btree_node *node;
525 __u64 ptr;
526 int index, level, ret;
527
528 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
530 if (index < 0)
531 return -ENOENT;
532 level = nilfs_btree_node_get_level(btree, node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index);
534 path[level].bp_bh = NULL;
535 path[level].bp_index = index;
536
537 for (level--; level > 0; level--) {
538 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
539 &path[level].bp_bh);
540 if (ret < 0)
541 return ret;
542 node = nilfs_btree_get_nonroot_node(btree, path, level);
543 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
544 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index);
546 path[level].bp_index = index;
547 }
548
549 if (keyp != NULL)
550 *keyp = nilfs_btree_node_get_key(btree, node, index);
551 if (ptrp != NULL)
552 *ptrp = ptr;
553
554 return 0;
555}
556
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
558 __u64 key, int level, __u64 *ptrp)
559{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret;
564
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(btree);
567 if (path == NULL)
568 return -ENOMEM;
569 nilfs_btree_init_path(btree, path);
570
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575
576 nilfs_btree_clear_path(btree, path);
577 nilfs_btree_free_path(btree, path);
578
579 return ret;
580}
581
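/*
 * When the smallest key of a node changes, the copy of that key held by
 * each ancestor at index 0 must be updated as well; the loop stops at
 * the first ancestor where the changed entry is not the leftmost one.
 */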
582static void nilfs_btree_promote_key(struct nilfs_btree *btree,
583 struct nilfs_btree_path *path,
584 int level, __u64 key)
585{
586 if (level < nilfs_btree_height(btree) - 1) {
587 do {
588 lock_buffer(path[level].bp_bh);
589 nilfs_btree_node_set_key(
590 btree,
591 nilfs_btree_get_nonroot_node(
592 btree, path, level),
593 path[level].bp_index, key);
594 if (!buffer_dirty(path[level].bp_bh))
595 nilfs_btnode_mark_dirty(path[level].bp_bh);
596 unlock_buffer(path[level].bp_bh);
597 } while ((path[level].bp_index == 0) &&
598 (++level < nilfs_btree_height(btree) - 1));
599 }
600
601 /* root */
602 if (level == nilfs_btree_height(btree) - 1) {
603 nilfs_btree_node_set_key(btree,
604 nilfs_btree_get_root(btree),
605 path[level].bp_index, key);
606 }
607}
608
609static void nilfs_btree_do_insert(struct nilfs_btree *btree,
610 struct nilfs_btree_path *path,
611 int level, __u64 *keyp, __u64 *ptrp)
612{
613 struct nilfs_btree_node *node;
614
615 if (level < nilfs_btree_height(btree) - 1) {
616 lock_buffer(path[level].bp_bh);
617 node = nilfs_btree_get_nonroot_node(btree, path, level);
618 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
619 path[level].bp_index);
620 if (!buffer_dirty(path[level].bp_bh))
621 nilfs_btnode_mark_dirty(path[level].bp_bh);
622 unlock_buffer(path[level].bp_bh);
623
624 if (path[level].bp_index == 0)
625 nilfs_btree_promote_key(btree, path, level + 1,
626 nilfs_btree_node_get_key(
627 btree, node, 0));
628 } else {
629 node = nilfs_btree_get_root(btree);
630 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
631 path[level].bp_index);
632 }
633}
634
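/*
 * Insertion into a full node with a non-full left sibling: shift
 * children left so the pair is balanced (counting the pending entry),
 * then insert.  If the insertion point is among the shifted children,
 * the path is redirected to the left sibling first.
 */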
635static void nilfs_btree_carry_left(struct nilfs_btree *btree,
636 struct nilfs_btree_path *path,
637 int level, __u64 *keyp, __u64 *ptrp)
638{
639 struct nilfs_btree_node *node, *left;
640 int nchildren, lnchildren, n, move;
641
642 lock_buffer(path[level].bp_bh);
643 lock_buffer(path[level].bp_sib_bh);
644
645 node = nilfs_btree_get_nonroot_node(btree, path, level);
646 left = nilfs_btree_get_sib_node(btree, path, level);
647 nchildren = nilfs_btree_node_get_nchildren(btree, node);
648 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
649 move = 0;
650
651 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
652 if (n > path[level].bp_index) {
653 /* move insert point */
654 n--;
655 move = 1;
656 }
657
658 nilfs_btree_node_move_left(btree, left, node, n);
659
660 if (!buffer_dirty(path[level].bp_bh))
661 nilfs_btnode_mark_dirty(path[level].bp_bh);
662 if (!buffer_dirty(path[level].bp_sib_bh))
663 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
664
665 unlock_buffer(path[level].bp_bh);
666 unlock_buffer(path[level].bp_sib_bh);
667
668 nilfs_btree_promote_key(btree, path, level + 1,
669 nilfs_btree_node_get_key(btree, node, 0));
670
671 if (move) {
672 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
673 path[level].bp_bh = path[level].bp_sib_bh;
674 path[level].bp_sib_bh = NULL;
675 path[level].bp_index += lnchildren;
676 path[level + 1].bp_index--;
677 } else {
678 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
679 path[level].bp_sib_bh = NULL;
680 path[level].bp_index -= n;
681 }
682
683 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
684}
685
686static void nilfs_btree_carry_right(struct nilfs_btree *btree,
687 struct nilfs_btree_path *path,
688 int level, __u64 *keyp, __u64 *ptrp)
689{
690 struct nilfs_btree_node *node, *right;
691 int nchildren, rnchildren, n, move;
692
693 lock_buffer(path[level].bp_bh);
694 lock_buffer(path[level].bp_sib_bh);
695
696 node = nilfs_btree_get_nonroot_node(btree, path, level);
697 right = nilfs_btree_get_sib_node(btree, path, level);
698 nchildren = nilfs_btree_node_get_nchildren(btree, node);
699 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
700 move = 0;
701
702 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
703 if (n > nchildren - path[level].bp_index) {
704 /* move insert point */
705 n--;
706 move = 1;
707 }
708
709 nilfs_btree_node_move_right(btree, node, right, n);
710
711 if (!buffer_dirty(path[level].bp_bh))
712 nilfs_btnode_mark_dirty(path[level].bp_bh);
713 if (!buffer_dirty(path[level].bp_sib_bh))
714 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
715
716 unlock_buffer(path[level].bp_bh);
717 unlock_buffer(path[level].bp_sib_bh);
718
719 path[level + 1].bp_index++;
720 nilfs_btree_promote_key(btree, path, level + 1,
721 nilfs_btree_node_get_key(btree, right, 0));
722 path[level + 1].bp_index--;
723
724 if (move) {
725 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
726 path[level].bp_bh = path[level].bp_sib_bh;
727 path[level].bp_sib_bh = NULL;
728 path[level].bp_index -=
729 nilfs_btree_node_get_nchildren(btree, node);
730 path[level + 1].bp_index++;
731 } else {
732 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
733 path[level].bp_sib_bh = NULL;
734 }
735
736 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
737}
738
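/*
 * Classic node split: move the upper half of a full node into the new
 * sibling, insert into whichever half now covers the insertion index,
 * and hand the sibling's first key plus the newly reserved block
 * pointer up to the parent level via *keyp / *ptrp.
 */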
739static void nilfs_btree_split(struct nilfs_btree *btree,
740 struct nilfs_btree_path *path,
741 int level, __u64 *keyp, __u64 *ptrp)
742{
743 struct nilfs_btree_node *node, *right;
744 __u64 newkey;
745 __u64 newptr;
746 int nchildren, n, move;
747
748 lock_buffer(path[level].bp_bh);
749 lock_buffer(path[level].bp_sib_bh);
750
751 node = nilfs_btree_get_nonroot_node(btree, path, level);
752 right = nilfs_btree_get_sib_node(btree, path, level);
753 nchildren = nilfs_btree_node_get_nchildren(btree, node);
754 move = 0;
755
756 n = (nchildren + 1) / 2;
757 if (n > nchildren - path[level].bp_index) {
758 n--;
759 move = 1;
760 }
761
762 nilfs_btree_node_move_right(btree, node, right, n);
763
764 if (!buffer_dirty(path[level].bp_bh))
765 nilfs_btnode_mark_dirty(path[level].bp_bh);
766 if (!buffer_dirty(path[level].bp_sib_bh))
767 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
768
769 unlock_buffer(path[level].bp_bh);
770 unlock_buffer(path[level].bp_sib_bh);
771
772 newkey = nilfs_btree_node_get_key(btree, right, 0);
773 newptr = path[level].bp_newreq.bpr_ptr;
774
775 if (move) {
776 path[level].bp_index -=
777 nilfs_btree_node_get_nchildren(btree, node);
778 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
779 path[level].bp_index);
780
781 *keyp = nilfs_btree_node_get_key(btree, right, 0);
782 *ptrp = path[level].bp_newreq.bpr_ptr;
783
784 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
785 path[level].bp_bh = path[level].bp_sib_bh;
786 path[level].bp_sib_bh = NULL;
787 } else {
788 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
789
790 *keyp = nilfs_btree_node_get_key(btree, right, 0);
791 *ptrp = path[level].bp_newreq.bpr_ptr;
792
793 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
794 path[level].bp_sib_bh = NULL;
795 }
796
797 path[level + 1].bp_index++;
798}
799
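/*
 * Grow the tree by one level: every child of the root is moved into a
 * freshly allocated block, which becomes the root's only child, and the
 * root's level is raised by one.
 */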
800static void nilfs_btree_grow(struct nilfs_btree *btree,
801 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp)
803{
804 struct nilfs_btree_node *root, *child;
805 int n;
806
807 lock_buffer(path[level].bp_sib_bh);
808
809 root = nilfs_btree_get_root(btree);
810 child = nilfs_btree_get_sib_node(btree, path, level);
811
812 n = nilfs_btree_node_get_nchildren(btree, root);
813
814 nilfs_btree_node_move_right(btree, root, child, n);
815 nilfs_btree_node_set_level(btree, root, level + 1);
816
817 if (!buffer_dirty(path[level].bp_sib_bh))
818 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
819
820 unlock_buffer(path[level].bp_sib_bh);
821
822 path[level].bp_bh = path[level].bp_sib_bh;
823 path[level].bp_sib_bh = NULL;
824
825 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
826
827 *keyp = nilfs_btree_node_get_key(btree, child, 0);
828 *ptrp = path[level].bp_newreq.bpr_ptr;
829}
830
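/*
 * Allocation hint: prefer the pointer just before the insertion point
 * (the left neighbour at the lowest node level), falling back to the
 * parent node's own pointer, so that newly allocated blocks tend to
 * land near related ones.
 */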
831static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
832 const struct nilfs_btree_path *path)
833{
834 struct nilfs_btree_node *node;
835 int level;
836
837 if (path == NULL)
838 return NILFS_BMAP_INVALID_PTR;
839
840 /* left sibling */
841 level = NILFS_BTREE_LEVEL_NODE_MIN;
842 if (path[level].bp_index > 0) {
843 node = nilfs_btree_get_node(btree, path, level);
844 return nilfs_btree_node_get_ptr(btree, node,
845 path[level].bp_index - 1);
846 }
847
848 /* parent */
849 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
850 if (level <= nilfs_btree_height(btree) - 1) {
851 node = nilfs_btree_get_node(btree, path, level);
852 return nilfs_btree_node_get_ptr(btree, node,
853 path[level].bp_index);
854 }
855
856 return NILFS_BMAP_INVALID_PTR;
857}
858
859static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
860 const struct nilfs_btree_path *path,
861 __u64 key)
862{
863 __u64 ptr;
864
865 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
866 if (ptr != NILFS_BMAP_INVALID_PTR)
867 /* sequential access */
868 return ptr;
869 else {
870 ptr = nilfs_btree_find_near(btree, path);
871 if (ptr != NILFS_BMAP_INVALID_PTR)
872 /* near */
873 return ptr;
874 }
875 /* block group */
876 return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
877}
878
879static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
880 __u64 ptr)
881{
882 btree->bt_bmap.b_last_allocated_key = key;
883 btree->bt_bmap.b_last_allocated_ptr = ptr;
884}
885
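/*
 * First phase of the two-phase insert: walk up from the leaf reserving
 * new pointers and choosing a fixup per level -- plain insert when the
 * node has room, carry into a non-full left or right sibling, split
 * into a new block, or, at the root, grow the tree -- without modifying
 * any node yet.  Commit replays the recorded bp_op handlers in order;
 * on error every reservation made so far is aborted.
 */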
886static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
887 struct nilfs_btree_path *path,
888 int *levelp, __u64 key, __u64 ptr,
889 struct nilfs_bmap_stats *stats)
890{
891 struct buffer_head *bh;
892 struct nilfs_btree_node *node, *parent, *sib;
893 __u64 sibptr;
894 int pindex, level, ret;
895
896 stats->bs_nblocks = 0;
897 level = NILFS_BTREE_LEVEL_DATA;
898
899 /* allocate a new ptr for data block */
900 if (btree->bt_ops->btop_find_target != NULL)
901 path[level].bp_newreq.bpr_ptr =
902 btree->bt_ops->btop_find_target(btree, path, key);
903
904 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
905 &btree->bt_bmap, &path[level].bp_newreq);
906 if (ret < 0)
907 goto err_out_data;
908
909 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
910 level < nilfs_btree_height(btree) - 1;
911 level++) {
912 node = nilfs_btree_get_nonroot_node(btree, path, level);
913 if (nilfs_btree_node_get_nchildren(btree, node) <
914 nilfs_btree_node_nchildren_max(btree, node)) {
915 path[level].bp_op = nilfs_btree_do_insert;
916 stats->bs_nblocks++;
917 goto out;
918 }
919
920 parent = nilfs_btree_get_node(btree, path, level + 1);
921 pindex = path[level + 1].bp_index;
922
923 /* left sibling */
924 if (pindex > 0) {
925 sibptr = nilfs_btree_node_get_ptr(btree, parent,
926 pindex - 1);
927 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
928 &bh);
929 if (ret < 0)
930 goto err_out_child_node;
931 sib = (struct nilfs_btree_node *)bh->b_data;
932 if (nilfs_btree_node_get_nchildren(btree, sib) <
933 nilfs_btree_node_nchildren_max(btree, sib)) {
934 path[level].bp_sib_bh = bh;
935 path[level].bp_op = nilfs_btree_carry_left;
936 stats->bs_nblocks++;
937 goto out;
938 } else
939 nilfs_bmap_put_block(&btree->bt_bmap, bh);
940 }
941
942 /* right sibling */
943 if (pindex <
944 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
945 sibptr = nilfs_btree_node_get_ptr(btree, parent,
946 pindex + 1);
947 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
948 &bh);
949 if (ret < 0)
950 goto err_out_child_node;
951 sib = (struct nilfs_btree_node *)bh->b_data;
952 if (nilfs_btree_node_get_nchildren(btree, sib) <
953 nilfs_btree_node_nchildren_max(btree, sib)) {
954 path[level].bp_sib_bh = bh;
955 path[level].bp_op = nilfs_btree_carry_right;
956 stats->bs_nblocks++;
957 goto out;
958 } else
959 nilfs_bmap_put_block(&btree->bt_bmap, bh);
960 }
961
962 /* split */
963 path[level].bp_newreq.bpr_ptr =
964 path[level - 1].bp_newreq.bpr_ptr + 1;
965 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
966 &btree->bt_bmap, &path[level].bp_newreq);
967 if (ret < 0)
968 goto err_out_child_node;
969 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
970 path[level].bp_newreq.bpr_ptr,
971 &bh);
972 if (ret < 0)
973 goto err_out_curr_node;
974
975 stats->bs_nblocks++;
976
977 lock_buffer(bh);
978 nilfs_btree_node_init(btree,
979 (struct nilfs_btree_node *)bh->b_data,
980 0, level, 0, NULL, NULL);
981 unlock_buffer(bh);
982 path[level].bp_sib_bh = bh;
983 path[level].bp_op = nilfs_btree_split;
984 }
985
986 /* root */
987 node = nilfs_btree_get_root(btree);
988 if (nilfs_btree_node_get_nchildren(btree, node) <
989 nilfs_btree_node_nchildren_max(btree, node)) {
990 path[level].bp_op = nilfs_btree_do_insert;
991 stats->bs_nblocks++;
992 goto out;
993 }
994
995 /* grow */
996 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
997 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
998 &btree->bt_bmap, &path[level].bp_newreq);
999 if (ret < 0)
1000 goto err_out_child_node;
1001 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
1002 path[level].bp_newreq.bpr_ptr, &bh);
1003 if (ret < 0)
1004 goto err_out_curr_node;
1005
1006 lock_buffer(bh);
1007 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1008 0, level, 0, NULL, NULL);
1009 unlock_buffer(bh);
1010 path[level].bp_sib_bh = bh;
1011 path[level].bp_op = nilfs_btree_grow;
1012
1013 level++;
1014 path[level].bp_op = nilfs_btree_do_insert;
1015
1016 /* a newly-created node block and a data block are added */
1017 stats->bs_nblocks += 2;
1018
1019 /* success */
1020 out:
1021 *levelp = level;
1022 return ret;
1023
1024 /* error */
1025 err_out_curr_node:
1026 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1027 &path[level].bp_newreq);
1028 err_out_child_node:
1029 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1030 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1031 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
1032 &btree->bt_bmap, &path[level].bp_newreq);
1033
1034 }
1035
1036 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1037 &path[level].bp_newreq);
1038 err_out_data:
1039 *levelp = level;
1040 stats->bs_nblocks = 0;
1041 return ret;
1042}
1043
1044static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1045 struct nilfs_btree_path *path,
1046 int maxlevel, __u64 key, __u64 ptr)
1047{
1048 int level;
1049
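	/* ptr must be a pointer to a buffer head. */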
1050 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1051 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1052 if (btree->bt_ops->btop_set_target != NULL)
1053 btree->bt_ops->btop_set_target(btree, key, ptr);
1054
1055 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1056 if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
1057 btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
1058 &btree->bt_bmap, &path[level - 1].bp_newreq);
1059 }
1060 path[level].bp_op(btree, path, level, &key, &ptr);
1061 }
1062
1063 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1064 nilfs_bmap_set_dirty(&btree->bt_bmap);
1065}
1066
1067static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1068{
1069 struct nilfs_btree *btree;
1070 struct nilfs_btree_path *path;
1071 struct nilfs_bmap_stats stats;
1072 int level, ret;
1073
1074 btree = (struct nilfs_btree *)bmap;
1075 path = nilfs_btree_alloc_path(btree);
1076 if (path == NULL)
1077 return -ENOMEM;
1078 nilfs_btree_init_path(btree, path);
1079
1080 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1081 NILFS_BTREE_LEVEL_NODE_MIN);
1082 if (ret != -ENOENT) {
1083 if (ret == 0)
1084 ret = -EEXIST;
1085 goto out;
1086 }
1087
1088 ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
1089 if (ret < 0)
1090 goto out;
1091 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1092 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1093
1094 out:
1095 nilfs_btree_clear_path(btree, path);
1096 nilfs_btree_free_path(btree, path);
1097 return ret;
1098}
1099
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1101 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp)
1103{
1104 struct nilfs_btree_node *node;
1105
1106 if (level < nilfs_btree_height(btree) - 1) {
1107 lock_buffer(path[level].bp_bh);
1108 node = nilfs_btree_get_nonroot_node(btree, path, level);
1109 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1110 path[level].bp_index);
1111 if (!buffer_dirty(path[level].bp_bh))
1112 nilfs_btnode_mark_dirty(path[level].bp_bh);
1113 unlock_buffer(path[level].bp_bh);
1114 if (path[level].bp_index == 0)
1115 nilfs_btree_promote_key(btree, path, level + 1,
1116 nilfs_btree_node_get_key(btree, node, 0));
1117 } else {
1118 node = nilfs_btree_get_root(btree);
1119 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1120 path[level].bp_index);
1121 }
1122}
1123
1124static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1125 struct nilfs_btree_path *path,
1126 int level, __u64 *keyp, __u64 *ptrp)
1127{
1128 struct nilfs_btree_node *node, *left;
1129 int nchildren, lnchildren, n;
1130
1131 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1132
1133 lock_buffer(path[level].bp_bh);
1134 lock_buffer(path[level].bp_sib_bh);
1135
1136 node = nilfs_btree_get_nonroot_node(btree, path, level);
1137 left = nilfs_btree_get_sib_node(btree, path, level);
1138 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1139 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
1140
1141 n = (nchildren + lnchildren) / 2 - nchildren;
1142
1143 nilfs_btree_node_move_right(btree, left, node, n);
1144
1145 if (!buffer_dirty(path[level].bp_bh))
1146 nilfs_btnode_mark_dirty(path[level].bp_bh);
1147 if (!buffer_dirty(path[level].bp_sib_bh))
1148 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1149
1150 unlock_buffer(path[level].bp_bh);
1151 unlock_buffer(path[level].bp_sib_bh);
1152
1153 nilfs_btree_promote_key(btree, path, level + 1,
1154 nilfs_btree_node_get_key(btree, node, 0));
1155
1156 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1157 path[level].bp_sib_bh = NULL;
1158 path[level].bp_index += n;
1159}
1160
1161static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1162 struct nilfs_btree_path *path,
1163 int level, __u64 *keyp, __u64 *ptrp)
1164{
1165 struct nilfs_btree_node *node, *right;
1166 int nchildren, rnchildren, n;
1167
1168 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1169
1170 lock_buffer(path[level].bp_bh);
1171 lock_buffer(path[level].bp_sib_bh);
1172
1173 node = nilfs_btree_get_nonroot_node(btree, path, level);
1174 right = nilfs_btree_get_sib_node(btree, path, level);
1175 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1176 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
1177
1178 n = (nchildren + rnchildren) / 2 - nchildren;
1179
1180 nilfs_btree_node_move_left(btree, node, right, n);
1181
1182 if (!buffer_dirty(path[level].bp_bh))
1183 nilfs_btnode_mark_dirty(path[level].bp_bh);
1184 if (!buffer_dirty(path[level].bp_sib_bh))
1185 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1186
1187 unlock_buffer(path[level].bp_bh);
1188 unlock_buffer(path[level].bp_sib_bh);
1189
1190 path[level + 1].bp_index++;
1191 nilfs_btree_promote_key(btree, path, level + 1,
1192 nilfs_btree_node_get_key(btree, right, 0));
1193 path[level + 1].bp_index--;
1194
1195 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1196 path[level].bp_sib_bh = NULL;
1197}
1198
1199static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1200 struct nilfs_btree_path *path,
1201 int level, __u64 *keyp, __u64 *ptrp)
1202{
1203 struct nilfs_btree_node *node, *left;
1204 int n;
1205
1206 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1207
1208 lock_buffer(path[level].bp_bh);
1209 lock_buffer(path[level].bp_sib_bh);
1210
1211 node = nilfs_btree_get_nonroot_node(btree, path, level);
1212 left = nilfs_btree_get_sib_node(btree, path, level);
1213
1214 n = nilfs_btree_node_get_nchildren(btree, node);
1215
1216 nilfs_btree_node_move_left(btree, left, node, n);
1217
1218 if (!buffer_dirty(path[level].bp_sib_bh))
1219 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1220
1221 unlock_buffer(path[level].bp_bh);
1222 unlock_buffer(path[level].bp_sib_bh);
1223
1224 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1225 path[level].bp_bh = path[level].bp_sib_bh;
1226 path[level].bp_sib_bh = NULL;
1227 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
1228}
1229
1230static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 struct nilfs_btree_path *path,
1232 int level, __u64 *keyp, __u64 *ptrp)
1233{
1234 struct nilfs_btree_node *node, *right;
1235 int n;
1236
1237 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1238
1239 lock_buffer(path[level].bp_bh);
1240 lock_buffer(path[level].bp_sib_bh);
1241
1242 node = nilfs_btree_get_nonroot_node(btree, path, level);
1243 right = nilfs_btree_get_sib_node(btree, path, level);
1244
1245 n = nilfs_btree_node_get_nchildren(btree, right);
1246
1247 nilfs_btree_node_move_left(btree, node, right, n);
1248
1249 if (!buffer_dirty(path[level].bp_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_bh);
1251
1252 unlock_buffer(path[level].bp_bh);
1253 unlock_buffer(path[level].bp_sib_bh);
1254
1255 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1256 path[level].bp_sib_bh = NULL;
1257 path[level + 1].bp_index++;
1258}
1259
1260static void nilfs_btree_shrink(struct nilfs_btree *btree,
1261 struct nilfs_btree_path *path,
1262 int level, __u64 *keyp, __u64 *ptrp)
1263{
1264 struct nilfs_btree_node *root, *child;
1265 int n;
1266
1267 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1268
1269 lock_buffer(path[level].bp_bh);
1270 root = nilfs_btree_get_root(btree);
1271 child = nilfs_btree_get_nonroot_node(btree, path, level);
1272
1273 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1274 nilfs_btree_node_set_level(btree, root, level);
1275 n = nilfs_btree_node_get_nchildren(btree, child);
1276 nilfs_btree_node_move_left(btree, root, child, n);
1277 unlock_buffer(path[level].bp_bh);
1278
1279 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1280 path[level].bp_bh = NULL;
1281}
1282
1283
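/*
 * Deletion mirror of the insert preparation: per level, delete in place
 * if the node stays above the minimum, otherwise borrow entries from a
 * roomy sibling or concatenate with it; when the root's last remaining
 * child becomes small enough, the tree is shrunk by one level.
 */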
1284static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1285 struct nilfs_btree_path *path,
1286 int *levelp,
1287 struct nilfs_bmap_stats *stats)
1288{
1289 struct buffer_head *bh;
1290 struct nilfs_btree_node *node, *parent, *sib;
1291 __u64 sibptr;
1292 int pindex, level, ret;
1293
1294 ret = 0;
1295 stats->bs_nblocks = 0;
1296 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1297 level < nilfs_btree_height(btree) - 1;
1298 level++) {
1299 node = nilfs_btree_get_nonroot_node(btree, path, level);
1300 path[level].bp_oldreq.bpr_ptr =
1301 nilfs_btree_node_get_ptr(btree, node,
1302 path[level].bp_index);
1303 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1304 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1305 &btree->bt_bmap, &path[level].bp_oldreq);
1306 if (ret < 0)
1307 goto err_out_child_node;
1308 }
1309
1310 if (nilfs_btree_node_get_nchildren(btree, node) >
1311 nilfs_btree_node_nchildren_min(btree, node)) {
1312 path[level].bp_op = nilfs_btree_do_delete;
1313 stats->bs_nblocks++;
1314 goto out;
1315 }
1316
1317 parent = nilfs_btree_get_node(btree, path, level + 1);
1318 pindex = path[level + 1].bp_index;
1319
1320 if (pindex > 0) {
1321 /* left sibling */
1322 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1323 pindex - 1);
1324 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1325 &bh);
1326 if (ret < 0)
1327 goto err_out_curr_node;
1328 sib = (struct nilfs_btree_node *)bh->b_data;
1329 if (nilfs_btree_node_get_nchildren(btree, sib) >
1330 nilfs_btree_node_nchildren_min(btree, sib)) {
1331 path[level].bp_sib_bh = bh;
1332 path[level].bp_op = nilfs_btree_borrow_left;
1333 stats->bs_nblocks++;
1334 goto out;
1335 } else {
1336 path[level].bp_sib_bh = bh;
1337 path[level].bp_op = nilfs_btree_concat_left;
1338 stats->bs_nblocks++;
1339 /* continue; */
1340 }
1341 } else if (pindex <
1342 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
1343 /* right sibling */
1344 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1345 pindex + 1);
1346 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1347 &bh);
1348 if (ret < 0)
1349 goto err_out_curr_node;
1350 sib = (struct nilfs_btree_node *)bh->b_data;
1351 if (nilfs_btree_node_get_nchildren(btree, sib) >
1352 nilfs_btree_node_nchildren_min(btree, sib)) {
1353 path[level].bp_sib_bh = bh;
1354 path[level].bp_op = nilfs_btree_borrow_right;
1355 stats->bs_nblocks++;
1356 goto out;
1357 } else {
1358 path[level].bp_sib_bh = bh;
1359 path[level].bp_op = nilfs_btree_concat_right;
1360 stats->bs_nblocks++;
1361 /* continue; */
1362 }
1363 } else {
1364 /* no siblings */
1365 /* the only child of the root node */
1366 WARN_ON(level != nilfs_btree_height(btree) - 2);
1367 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
1368 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1369 path[level].bp_op = nilfs_btree_shrink;
1370 stats->bs_nblocks += 2;
1371 } else {
1372 path[level].bp_op = nilfs_btree_do_delete;
1373 stats->bs_nblocks++;
1374 }
1375
1376 goto out;
1377
1378 }
1379 }
1380
1381 node = nilfs_btree_get_root(btree);
1382 path[level].bp_oldreq.bpr_ptr =
1383 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1384 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1385 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1386 &btree->bt_bmap, &path[level].bp_oldreq);
1387 if (ret < 0)
1388 goto err_out_child_node;
1389 }
1390 /* child of the root node is deleted */
1391 path[level].bp_op = nilfs_btree_do_delete;
1392 stats->bs_nblocks++;
1393
1394 /* success */
1395 out:
1396 *levelp = level;
1397 return ret;
1398
1399 /* error */
1400 err_out_curr_node:
1401 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1402 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1403 &btree->bt_bmap, &path[level].bp_oldreq);
1404 err_out_child_node:
1405 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1406 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1407 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1408 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1409 &btree->bt_bmap, &path[level].bp_oldreq);
1410 }
1411 *levelp = level;
1412 stats->bs_nblocks = 0;
1413 return ret;
1414}
1415
1416static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1417 struct nilfs_btree_path *path,
1418 int maxlevel)
1419{
1420 int level;
1421
1422 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1423 if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
1424 btree->bt_bmap.b_pops->bpop_commit_end_ptr(
1425 &btree->bt_bmap, &path[level].bp_oldreq);
1426 path[level].bp_op(btree, path, level, NULL, NULL);
1427 }
1428
1429 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1430 nilfs_bmap_set_dirty(&btree->bt_bmap);
1431}
1432
1433static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1434
1435{
1436 struct nilfs_btree *btree;
1437 struct nilfs_btree_path *path;
1438 struct nilfs_bmap_stats stats;
1439 int level, ret;
1440
1441 btree = (struct nilfs_btree *)bmap;
1442 path = nilfs_btree_alloc_path(btree);
1443 if (path == NULL)
1444 return -ENOMEM;
1445 nilfs_btree_init_path(btree, path);
1446 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1447 NILFS_BTREE_LEVEL_NODE_MIN);
1448 if (ret < 0)
1449 goto out;
1450
1451 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
1452 if (ret < 0)
1453 goto out;
1454 nilfs_btree_commit_delete(btree, path, level);
1455 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1456
1457out:
1458 nilfs_btree_clear_path(btree, path);
1459 nilfs_btree_free_path(btree, path);
1460 return ret;
1461}
1462
1463static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1464{
1465 struct nilfs_btree *btree;
1466 struct nilfs_btree_path *path;
1467 int ret;
1468
1469 btree = (struct nilfs_btree *)bmap;
1470 path = nilfs_btree_alloc_path(btree);
1471 if (path == NULL)
1472 return -ENOMEM;
1473 nilfs_btree_init_path(btree, path);
1474
1475 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1476
1477 nilfs_btree_clear_path(btree, path);
1478 nilfs_btree_free_path(btree, path);
1479
1480 return ret;
1481}
1482
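/*
 * Decide whether deleting @key would leave the mapping small enough to
 * convert back to a direct bmap: true when @key is the current maximum
 * key and the next largest key fits below the low-key threshold.
 */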
1483static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1484{
1485 struct buffer_head *bh;
1486 struct nilfs_btree *btree;
1487 struct nilfs_btree_node *root, *node;
1488 __u64 maxkey, nextmaxkey;
1489 __u64 ptr;
1490 int nchildren, ret;
1491
1492 btree = (struct nilfs_btree *)bmap;
1493 root = nilfs_btree_get_root(btree);
1494 switch (nilfs_btree_height(btree)) {
1495 case 2:
1496 bh = NULL;
1497 node = root;
1498 break;
1499 case 3:
1500 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1501 if (nchildren > 1)
1502 return 0;
1503 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1504 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1505 if (ret < 0)
1506 return ret;
1507 node = (struct nilfs_btree_node *)bh->b_data;
1508 break;
1509 default:
1510 return 0;
1511 }
1512
1513 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1514 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
1515 nextmaxkey = (nchildren > 1) ?
1516 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
1517 if (bh != NULL)
1518 nilfs_bmap_put_block(bmap, bh);
1519
1520 return (maxkey == key) && (nextmaxkey < bmap->b_low);
1521}
1522
1523static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1524 __u64 *keys, __u64 *ptrs, int nitems)
1525{
1526 struct buffer_head *bh;
1527 struct nilfs_btree *btree;
1528 struct nilfs_btree_node *node, *root;
1529 __le64 *dkeys;
1530 __le64 *dptrs;
1531 __u64 ptr;
1532 int nchildren, i, ret;
1533
1534 btree = (struct nilfs_btree *)bmap;
1535 root = nilfs_btree_get_root(btree);
1536 switch (nilfs_btree_height(btree)) {
1537 case 2:
1538 bh = NULL;
1539 node = root;
1540 break;
1541 case 3:
1542 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1543 WARN_ON(nchildren > 1);
1544 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1545 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1546 if (ret < 0)
1547 return ret;
1548 node = (struct nilfs_btree_node *)bh->b_data;
1549 break;
1550 default:
1551 node = NULL;
1552 return -EINVAL;
1553 }
1554
1555 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1556 if (nchildren < nitems)
1557 nitems = nchildren;
1558 dkeys = nilfs_btree_node_dkeys(btree, node);
1559 dptrs = nilfs_btree_node_dptrs(btree, node);
1560 for (i = 0; i < nitems; i++) {
1561 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1562 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
1563 }
1564
1565 if (bh != NULL)
1566 nilfs_bmap_put_block(bmap, bh);
1567
1568 return nitems;
1569}
1570
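/*
 * Conversion from a direct mapping reserves one pointer (@dreq) for the
 * new data block and, when the old entries no longer fit in the root
 * node, a second one (@nreq) for a level-1 child node block.
 */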
1571static int
1572nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1573 union nilfs_bmap_ptr_req *dreq,
1574 union nilfs_bmap_ptr_req *nreq,
1575 struct buffer_head **bhp,
1576 struct nilfs_bmap_stats *stats)
1577{
1578 struct buffer_head *bh;
1579 struct nilfs_btree *btree;
1580 int ret;
1581
1582 btree = (struct nilfs_btree *)bmap;
1583 stats->bs_nblocks = 0;
1584
1585 /* for data */
1586 /* cannot find near ptr */
1587 if (btree->bt_ops->btop_find_target != NULL)
1588 dreq->bpr_ptr
1589 = btree->bt_ops->btop_find_target(btree, NULL, key);
1590 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
1591 if (ret < 0)
1592 return ret;
1593
1594 *bhp = NULL;
1595 stats->bs_nblocks++;
1596 if (nreq != NULL) {
1597 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1598 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
1599 if (ret < 0)
1600 goto err_out_dreq;
1601
1602 ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
1603 if (ret < 0)
1604 goto err_out_nreq;
1605
1606 *bhp = bh;
1607 stats->bs_nblocks++;
1608 }
1609
1610 /* success */
1611 return 0;
1612
1613 /* error */
1614 err_out_nreq:
1615 bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
1616 err_out_dreq:
1617 bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
1618 stats->bs_nblocks = 0;
1619 return ret;
1620
1621}
1622
1623static void
1624nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1625 __u64 key, __u64 ptr,
1626 const __u64 *keys, const __u64 *ptrs,
1627 int n, __u64 low, __u64 high,
1628 union nilfs_bmap_ptr_req *dreq,
1629 union nilfs_bmap_ptr_req *nreq,
1630 struct buffer_head *bh)
1631{
1632 struct nilfs_btree *btree;
1633 struct nilfs_btree_node *node;
1634 __u64 tmpptr;
1635
1636 /* free resources */
1637 if (bmap->b_ops->bop_clear != NULL)
1638 bmap->b_ops->bop_clear(bmap);
1639
1640 /* ptr must be a pointer to a buffer head. */
1641 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1642
1643 /* convert and insert */
1644 btree = (struct nilfs_btree *)bmap;
1645 nilfs_btree_init(bmap, low, high);
1646 if (nreq != NULL) {
1647 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
1648 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1649 bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
1650 }
1651
1652 /* create child node at level 1 */
1653 lock_buffer(bh);
1654 node = (struct nilfs_btree_node *)bh->b_data;
1655 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1656 nilfs_btree_node_insert(btree, node,
1657 key, dreq->bpr_ptr, n);
1658 if (!buffer_dirty(bh))
1659 nilfs_btnode_mark_dirty(bh);
1660 if (!nilfs_bmap_dirty(bmap))
1661 nilfs_bmap_set_dirty(bmap);
1662
1663 unlock_buffer(bh);
1664 nilfs_bmap_put_block(bmap, bh);
1665
1666 /* create root node at level 2 */
1667 node = nilfs_btree_get_root(btree);
1668 tmpptr = nreq->bpr_ptr;
1669 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1670 2, 1, &keys[0], &tmpptr);
1671 } else {
1672 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
1673 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1674
1675 /* create root node at level 1 */
1676 node = nilfs_btree_get_root(btree);
1677 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1678 1, n, keys, ptrs);
1679 nilfs_btree_node_insert(btree, node,
1680 key, dreq->bpr_ptr, n);
1681 if (!nilfs_bmap_dirty(bmap))
1682 nilfs_bmap_set_dirty(bmap);
1683 }
1684
1685 if (btree->bt_ops->btop_set_target != NULL)
1686 btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
1687}
1688
1689/**
1690 * nilfs_btree_convert_and_insert - convert a direct bmap to a B-tree and insert a new entry
1691 * @bmap: bmap whose mapping is converted
1692 * @key: key of the entry being inserted
1693 * @ptr: buffer head of the new block, cast to __u64
1694 * @keys: array of keys already held by the direct mapping
1695 * @ptrs: array of pointers already held by the direct mapping
1696 * @n: number of entries in @keys and @ptrs
1697 * @low: lower bound of the key range
1698 * @high: upper bound of the key range
1699 */
1700int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1701 __u64 key, __u64 ptr,
1702 const __u64 *keys, const __u64 *ptrs,
1703 int n, __u64 low, __u64 high)
1704{
1705 struct buffer_head *bh;
1706 union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1707 struct nilfs_bmap_stats stats;
1708 int ret;
1709
1710 if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1711 di = &dreq;
1712 ni = NULL;
1713 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1714 1 << bmap->b_inode->i_blkbits)) {
1715 di = &dreq;
1716 ni = &nreq;
1717 } else {
1718 di = NULL;
1719 ni = NULL;
1720 BUG();
1721 }
1722
1723 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
1724 &stats);
1725 if (ret < 0)
1726 return ret;
1727 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
1728 low, high, di, ni, bh);
1729 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1730 return 0;
1731}
1732
1733static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1734 struct nilfs_btree_path *path,
1735 int level,
1736 struct buffer_head *bh)
1737{
1738 while ((++level < nilfs_btree_height(btree) - 1) &&
1739 !buffer_dirty(path[level].bp_bh))
1740 nilfs_btnode_mark_dirty(path[level].bp_bh);
1741
1742 return 0;
1743}
1744
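/*
 * Virtual-block update: reserve a new virtual block number for a node
 * being rewritten (requesting the old pointer plus one) and, for btnode
 * buffers, stage the page-cache key change; the commit and abort
 * helpers below finish or roll back both steps.
 */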
1745static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1746 struct nilfs_btree_path *path,
1747 int level)
1748{
1749 struct nilfs_btree_node *parent;
1750 int ret;
1751
1752 parent = nilfs_btree_get_node(btree, path, level + 1);
1753 path[level].bp_oldreq.bpr_ptr =
1754 nilfs_btree_node_get_ptr(btree, parent,
1755 path[level + 1].bp_index);
1756 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1757 ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
1758 &path[level].bp_oldreq,
1759 &path[level].bp_newreq);
1760 if (ret < 0)
1761 return ret;
1762
1763 if (buffer_nilfs_node(path[level].bp_bh)) {
1764 path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1765 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1766 path[level].bp_ctxt.bh = path[level].bp_bh;
1767 ret = nilfs_btnode_prepare_change_key(
1768 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1769 &path[level].bp_ctxt);
1770 if (ret < 0) {
1771 nilfs_bmap_abort_update(&btree->bt_bmap,
1772 &path[level].bp_oldreq,
1773 &path[level].bp_newreq);
1774 return ret;
1775 }
1776 }
1777
1778 return 0;
1779}
1780
1781static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1782 struct nilfs_btree_path *path,
1783 int level)
1784{
1785 struct nilfs_btree_node *parent;
1786
1787 nilfs_bmap_commit_update(&btree->bt_bmap,
1788 &path[level].bp_oldreq,
1789 &path[level].bp_newreq);
1790
1791 if (buffer_nilfs_node(path[level].bp_bh)) {
1792 nilfs_btnode_commit_change_key(
1793 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1794 &path[level].bp_ctxt);
1795 path[level].bp_bh = path[level].bp_ctxt.bh;
1796 }
1797 set_buffer_nilfs_volatile(path[level].bp_bh);
1798
1799 parent = nilfs_btree_get_node(btree, path, level + 1);
1800 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
1801 path[level].bp_newreq.bpr_ptr);
1802}
1803
1804static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1805 struct nilfs_btree_path *path,
1806 int level)
1807{
1808 nilfs_bmap_abort_update(&btree->bt_bmap,
1809 &path[level].bp_oldreq,
1810 &path[level].bp_newreq);
1811 if (buffer_nilfs_node(path[level].bp_bh))
1812 nilfs_btnode_abort_change_key(
1813 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1814 &path[level].bp_ctxt);
1815}
1816
1817static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1818 struct nilfs_btree_path *path,
1819 int minlevel,
1820 int *maxlevelp)
1821{
1822 int level, ret;
1823
1824 level = minlevel;
1825 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1826 ret = nilfs_btree_prepare_update_v(btree, path, level);
1827 if (ret < 0)
1828 return ret;
1829 }
1830 while ((++level < nilfs_btree_height(btree) - 1) &&
1831 !buffer_dirty(path[level].bp_bh)) {
1832
1833 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1834 ret = nilfs_btree_prepare_update_v(btree, path, level);
1835 if (ret < 0)
1836 goto out;
1837 }
1838
1839 /* success */
1840 *maxlevelp = level - 1;
1841 return 0;
1842
1843 /* error */
1844 out:
1845 while (--level > minlevel)
1846 nilfs_btree_abort_update_v(btree, path, level);
1847 if (!buffer_nilfs_volatile(path[level].bp_bh))
1848 nilfs_btree_abort_update_v(btree, path, level);
1849 return ret;
1850}
1851
1852static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1853 struct nilfs_btree_path *path,
1854 int minlevel,
1855 int maxlevel,
1856 struct buffer_head *bh)
1857{
1858 int level;
1859
1860 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1861 nilfs_btree_commit_update_v(btree, path, minlevel);
1862
1863 for (level = minlevel + 1; level <= maxlevel; level++)
1864 nilfs_btree_commit_update_v(btree, path, level);
1865}
1866
1867static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1868 struct nilfs_btree_path *path,
1869 int level,
1870 struct buffer_head *bh)
1871{
1872 int maxlevel, ret;
1873 struct nilfs_btree_node *parent;
1874 __u64 ptr;
1875
1876 get_bh(bh);
1877 path[level].bp_bh = bh;
1878 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
1879 if (ret < 0)
1880 goto out;
1881
1882 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1883 parent = nilfs_btree_get_node(btree, path, level + 1);
1884 ptr = nilfs_btree_node_get_ptr(btree, parent,
1885 path[level + 1].bp_index);
1886 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
1887 if (ret < 0)
1888 goto out;
1889 }
1890
1891 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
1892
1893 out:
1894 brelse(path[level].bp_bh);
1895 path[level].bp_bh = NULL;
1896 return ret;
1897}
1898
1899static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 struct buffer_head *bh)
1901{
1902 struct nilfs_btree *btree;
1903 struct nilfs_btree_path *path;
1904 struct nilfs_btree_node *node;
1905 __u64 key;
1906 int level, ret;
1907
1908 WARN_ON(!buffer_dirty(bh));
1909
1910 btree = (struct nilfs_btree *)bmap;
1911 path = nilfs_btree_alloc_path(btree);
1912 if (path == NULL)
1913 return -ENOMEM;
1914 nilfs_btree_init_path(btree, path);
1915
1916 if (buffer_nilfs_node(bh)) {
1917 node = (struct nilfs_btree_node *)bh->b_data;
1918 key = nilfs_btree_node_get_key(btree, node, 0);
1919 level = nilfs_btree_node_get_level(btree, node);
1920 } else {
1921 key = nilfs_bmap_data_get_key(bmap, bh);
1922 level = NILFS_BTREE_LEVEL_DATA;
1923 }
1924
1925 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
1926 if (ret < 0) {
1927 if (unlikely(ret == -ENOENT))
1928 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
1929 __func__, (unsigned long long)key, level);
1930 goto out;
1931 }
1932
1933 ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
1934
1935 out:
1936 nilfs_btree_clear_path(btree, path);
1937 nilfs_btree_free_path(btree, path);
1938
1939 return ret;
1940}
1941
1942static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
1943 struct buffer_head *bh)
1944{
1945 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
1946}
1947
1948static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1949 struct list_head *lists,
1950 struct buffer_head *bh)
1951{
1952 struct list_head *head;
1953 struct buffer_head *cbh;
1954 struct nilfs_btree_node *node, *cnode;
1955 __u64 key, ckey;
1956 int level;
1957
1958 get_bh(bh);
1959 node = (struct nilfs_btree_node *)bh->b_data;
1960 key = nilfs_btree_node_get_key(btree, node, 0);
1961 level = nilfs_btree_node_get_level(btree, node);
1962 list_for_each(head, &lists[level]) {
1963 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1964 cnode = (struct nilfs_btree_node *)cbh->b_data;
1965 ckey = nilfs_btree_node_get_key(btree, cnode, 0);
1966 if (key < ckey)
1967 break;
1968 }
1969 list_add_tail(&bh->b_assoc_buffers, head);
1970}
1971
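/*
 * Collect the dirty btnode buffers from the page cache, bucketed by
 * B-tree level and kept sorted by first key within each level, then
 * splice the buckets onto @listp from the lowest level up so that
 * children precede their parents.
 */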
1972static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1973 struct list_head *listp)
1974{
1975 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1976 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1977 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1978 struct pagevec pvec;
1979 struct buffer_head *bh, *head;
1980 pgoff_t index = 0;
1981 int level, i;
1982
1983 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1984 level < NILFS_BTREE_LEVEL_MAX;
1985 level++)
1986 INIT_LIST_HEAD(&lists[level]);
1987
1988 pagevec_init(&pvec, 0);
1989
1990 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
1991 PAGEVEC_SIZE)) {
1992 for (i = 0; i < pagevec_count(&pvec); i++) {
1993 bh = head = page_buffers(pvec.pages[i]);
1994 do {
1995 if (buffer_dirty(bh))
1996 nilfs_btree_add_dirty_buffer(btree,
1997 lists, bh);
1998 } while ((bh = bh->b_this_page) != head);
1999 }
2000 pagevec_release(&pvec);
2001 cond_resched();
2002 }
2003
2004 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2005 level < NILFS_BTREE_LEVEL_MAX;
2006 level++)
2007 list_splice(&lists[level], listp->prev);
2008}
2009
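/*
 * At segment construction time the real disk block number becomes
 * known: record it in the parent's pointer slot (renaming the btnode in
 * the page cache when the buffer is a B-tree node block) and emit the
 * on-disk binfo describing the block.
 */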
2010static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2011 struct nilfs_btree_path *path,
2012 int level,
2013 struct buffer_head **bh,
2014 sector_t blocknr,
2015 union nilfs_binfo *binfo)
2016{
2017 struct nilfs_btree_node *parent;
2018 __u64 key;
2019 __u64 ptr;
2020 int ret;
2021
2022 parent = nilfs_btree_get_node(btree, path, level + 1);
2023 ptr = nilfs_btree_node_get_ptr(btree, parent,
2024 path[level + 1].bp_index);
2025 if (buffer_nilfs_node(*bh)) {
2026 path[level].bp_ctxt.oldkey = ptr;
2027 path[level].bp_ctxt.newkey = blocknr;
2028 path[level].bp_ctxt.bh = *bh;
2029 ret = nilfs_btnode_prepare_change_key(
2030 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2031 &path[level].bp_ctxt);
2032 if (ret < 0)
2033 return ret;
2034 nilfs_btnode_commit_change_key(
2035 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2036 &path[level].bp_ctxt);
2037 *bh = path[level].bp_ctxt.bh;
2038 }
2039
2040 nilfs_btree_node_set_ptr(btree, parent,
2041 path[level + 1].bp_index, blocknr);
2042
2043 key = nilfs_btree_node_get_key(btree, parent,
2044 path[level + 1].bp_index);
2045 /* on-disk format */
2046 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2047 binfo->bi_dat.bi_level = level;
2048
2049 return 0;
2050}
2051
2052static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2053 struct nilfs_btree_path *path,
2054 int level,
2055 struct buffer_head **bh,
2056 sector_t blocknr,
2057 union nilfs_binfo *binfo)
2058{
2059 struct nilfs_btree_node *parent;
2060 __u64 key;
2061 __u64 ptr;
2062 union nilfs_bmap_ptr_req req;
2063 int ret;
2064
2065 parent = nilfs_btree_get_node(btree, path, level + 1);
2066 ptr = nilfs_btree_node_get_ptr(btree, parent,
2067 path[level + 1].bp_index);
2068 req.bpr_ptr = ptr;
2069 ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
2070 &req);
2071 if (ret < 0)
2072 return ret;
2073 btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
2074 &req, blocknr);
2075
2076 key = nilfs_btree_node_get_key(btree, parent,
2077 path[level + 1].bp_index);
2078 /* on-disk format */
2079 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2080 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2081
2082 return 0;
2083}
2084
2085static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2086 struct buffer_head **bh,
2087 sector_t blocknr,
2088 union nilfs_binfo *binfo)
2089{
2090 struct nilfs_btree *btree;
2091 struct nilfs_btree_path *path;
2092 struct nilfs_btree_node *node;
2093 __u64 key;
2094 int level, ret;
2095
2096 btree = (struct nilfs_btree *)bmap;
2097 path = nilfs_btree_alloc_path(btree);
2098 if (path == NULL)
2099 return -ENOMEM;
2100 nilfs_btree_init_path(btree, path);
2101
2102 if (buffer_nilfs_node(*bh)) {
2103 node = (struct nilfs_btree_node *)(*bh)->b_data;
2104 key = nilfs_btree_node_get_key(btree, node, 0);
2105 level = nilfs_btree_node_get_level(btree, node);
2106 } else {
2107 key = nilfs_bmap_data_get_key(bmap, *bh);
2108 level = NILFS_BTREE_LEVEL_DATA;
2109 }
2110
2111 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
2112 if (ret < 0) {
2113 WARN_ON(ret == -ENOENT);
2114 goto out;
2115 }
2116
2117 ret = btree->bt_ops->btop_assign(btree, path, level, bh,
2118 blocknr, binfo);
2119
2120 out:
2121 nilfs_btree_clear_path(btree, path);
2122 nilfs_btree_free_path(btree, path);
2123
2124 return ret;
2125}
2126
2127static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2128 struct buffer_head **bh,
2129 sector_t blocknr,
2130 union nilfs_binfo *binfo)
2131{
2132 struct nilfs_btree *btree;
2133 struct nilfs_btree_node *node;
2134 __u64 key;
2135 int ret;
2136
2137 btree = (struct nilfs_btree *)bmap;
2138 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
2139 if (ret < 0)
2140 return ret;
2141
2142 if (buffer_nilfs_node(*bh)) {
2143 node = (struct nilfs_btree_node *)(*bh)->b_data;
2144 key = nilfs_btree_node_get_key(btree, node, 0);
2145 } else
2146 key = nilfs_bmap_data_get_key(bmap, *bh);
2147
2148 /* on-disk format */
2149 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2150 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2151
2152 return 0;
2153}
2154
2155static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2156{
2157 struct buffer_head *bh;
2158 struct nilfs_btree *btree;
2159 struct nilfs_btree_path *path;
2160 __u64 ptr;
2161 int ret;
2162
2163 btree = (struct nilfs_btree *)bmap;
2164 path = nilfs_btree_alloc_path(btree);
2165 if (path == NULL)
2166 return -ENOMEM;
2167 nilfs_btree_init_path(btree, path);
2168
2169 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2170 if (ret < 0) {
2171 WARN_ON(ret == -ENOENT);
2172 goto out;
2173 }
2174 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
2175 if (ret < 0) {
2176 WARN_ON(ret == -ENOENT);
2177 goto out;
2178 }
2179
2180 if (!buffer_dirty(bh))
2181 nilfs_btnode_mark_dirty(bh);
2182 nilfs_bmap_put_block(&btree->bt_bmap, bh);
2183 if (!nilfs_bmap_dirty(&btree->bt_bmap))
2184 nilfs_bmap_set_dirty(&btree->bt_bmap);
2185
2186 out:
2187 nilfs_btree_clear_path(btree, path);
2188 nilfs_btree_free_path(btree, path);
2189 return ret;
2190}
2191
2192static const struct nilfs_bmap_operations nilfs_btree_ops = {
2193 .bop_lookup = nilfs_btree_lookup,
2194 .bop_insert = nilfs_btree_insert,
2195 .bop_delete = nilfs_btree_delete,
2196 .bop_clear = NULL,
2197
2198 .bop_propagate = nilfs_btree_propagate,
2199
2200 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2201
2202 .bop_assign = nilfs_btree_assign,
2203 .bop_mark = nilfs_btree_mark,
2204
2205 .bop_last_key = nilfs_btree_last_key,
2206 .bop_check_insert = NULL,
2207 .bop_check_delete = nilfs_btree_check_delete,
2208 .bop_gather_data = nilfs_btree_gather_data,
2209};
2210
2211static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2212 .bop_lookup = NULL,
2213 .bop_insert = NULL,
2214 .bop_delete = NULL,
2215 .bop_clear = NULL,
2216
2217 .bop_propagate = nilfs_btree_propagate_gc,
2218
2219 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2220
2221 .bop_assign = nilfs_btree_assign_gc,
2222 .bop_mark = NULL,
2223
2224 .bop_last_key = NULL,
2225 .bop_check_insert = NULL,
2226 .bop_check_delete = NULL,
2227 .bop_gather_data = NULL,
2228};
2229
2230static const struct nilfs_btree_operations nilfs_btree_ops_v = {
2231 .btop_find_target = nilfs_btree_find_target_v,
2232 .btop_set_target = nilfs_btree_set_target_v,
2233 .btop_propagate = nilfs_btree_propagate_v,
2234 .btop_assign = nilfs_btree_assign_v,
2235};
2236
2237static const struct nilfs_btree_operations nilfs_btree_ops_p = {
2238 .btop_find_target = NULL,
2239 .btop_set_target = NULL,
2240 .btop_propagate = nilfs_btree_propagate_p,
2241 .btop_assign = nilfs_btree_assign_p,
2242};
2243
2244int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
2245{
2246 struct nilfs_btree *btree;
2247
2248 btree = (struct nilfs_btree *)bmap;
2249 bmap->b_ops = &nilfs_btree_ops;
2250 bmap->b_low = low;
2251 bmap->b_high = high;
2252 switch (bmap->b_inode->i_ino) {
2253 case NILFS_DAT_INO:
2254 btree->bt_ops = &nilfs_btree_ops_p;
2255 break;
2256 default:
2257 btree->bt_ops = &nilfs_btree_ops_v;
2258 break;
2259 }
2260
2261 return 0;
2262}
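/*
 * Note: the DAT file is the only B-tree whose pointers are raw disk
 * block numbers, so it gets the "p" (physical) operations; every other
 * inode stores virtual block numbers that the DAT translates, hence the
 * "v" ops. That is why the switch above keys on NILFS_DAT_INO.
 */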
2263
2264void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2265{
2266 bmap->b_low = NILFS_BMAP_LARGE_LOW;
2267 bmap->b_high = NILFS_BMAP_LARGE_HIGH;
2268 bmap->b_ops = &nilfs_btree_ops_gc;
2269}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 000000000000..4766deb52fb1
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
1/*
2 * btree.h - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BTREE_H
24#define _NILFS_BTREE_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/list.h>
29#include <linux/nilfs2_fs.h>
30#include "btnode.h"
31#include "bmap.h"
32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/**
37 * struct nilfs_btree_operations - B-tree operation table
38 */
39struct nilfs_btree_operations {
40 __u64 (*btop_find_target)(const struct nilfs_btree *,
41 const struct nilfs_btree_path *, __u64);
42 void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
43
44 struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
45
46 int (*btop_propagate)(struct nilfs_btree *,
47 struct nilfs_btree_path *,
48 int,
49 struct buffer_head *);
50 int (*btop_assign)(struct nilfs_btree *,
51 struct nilfs_btree_path *,
52 int,
53 struct buffer_head **,
54 sector_t,
55 union nilfs_binfo *);
56};
57
58/**
59 * struct nilfs_btree_node - B-tree node
60 * @bn_flags: flags
61 * @bn_level: level
62 * @bn_nchildren: number of children
63 * @bn_pad: padding
64 */
65struct nilfs_btree_node {
66 __u8 bn_flags;
67 __u8 bn_level;
68 __le16 bn_nchildren;
69 __le32 bn_pad;
70};
71
72/* flags */
73#define NILFS_BTREE_NODE_ROOT 0x01
74
75/* level */
76#define NILFS_BTREE_LEVEL_DATA 0
77#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
78#define NILFS_BTREE_LEVEL_MAX 14
79
80/**
81 * struct nilfs_btree - B-tree structure
82 * @bt_bmap: bmap base structure
83 * @bt_ops: B-tree operation table
84 */
85struct nilfs_btree {
86 struct nilfs_bmap bt_bmap;
87
88 /* B-tree-specific members */
89 const struct nilfs_btree_operations *bt_ops;
90};
91
92
93#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
94#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
95 ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
96 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
97#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0
98#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64))
99#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \
100 (((nodesize) - sizeof(struct nilfs_btree_node) - \
101 NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \
102 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
103#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \
104 ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
105#define NILFS_BTREE_KEY_MIN ((__u64)0)
106#define NILFS_BTREE_KEY_MAX (~(__u64)0)
107
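/*
 * Worked example (illustrative node size, not part of the original
 * header): for a hypothetical nodesize of 4096 bytes,
 * sizeof(struct nilfs_btree_node) == 8 and the extra pad is 8, so
 *
 *	NILFS_BTREE_NODE_NCHILDREN_MAX(4096) == (4096 - 8 - 8) / 16 == 255
 *	NILFS_BTREE_NODE_NCHILDREN_MIN(4096) == (255 - 1) / 2 + 1 == 128
 *
 * i.e. a 4 KiB node holds at most 255 (dkey, dptr) pairs and is kept
 * at least about half full.
 */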
108
109int nilfs_btree_path_cache_init(void);
110void nilfs_btree_path_cache_destroy(void);
111int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
112int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
113 const __u64 *, const __u64 *,
114 int, __u64, __u64);
115void nilfs_btree_init_gc(struct nilfs_bmap *);
116
117#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000000..e90b60dfced9
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,925 @@
1/*
2 * cpfile.c - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "cpfile.h"
31
32
33static inline unsigned long
34nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
35{
36 return NILFS_MDT(cpfile)->mi_entries_per_block;
37}
38
39/* block number from the beginning of the file */
40static unsigned long
41nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
42{
43 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
44 do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
45 return (unsigned long)tcno;
46}
47
48/* offset in block */
49static unsigned long
50nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
51{
52 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
53 return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
54}
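/*
 * Example with illustrative values (assuming mi_first_entry_offset == 1
 * and 8 checkpoints per block): for cno == 10, tcno == 10 + 1 - 1 == 10,
 * so nilfs_cpfile_get_blkoff() yields 10 / 8 == 1 and
 * nilfs_cpfile_get_offset() yields 10 % 8 == 2; checkpoint 10 lives in
 * slot 2 of file block 1.
 */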
55
56static unsigned long
57nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
58 __u64 curr,
59 __u64 max)
60{
61 return min_t(__u64,
62 nilfs_cpfile_checkpoints_per_block(cpfile) -
63 nilfs_cpfile_get_offset(cpfile, curr),
64 max - curr);
65}
66
67static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
68 __u64 cno)
69{
70 return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
71}
72
73static unsigned int
74nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
75 struct buffer_head *bh,
76 void *kaddr,
77 unsigned int n)
78{
79 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
80 unsigned int count;
81
82 count = le32_to_cpu(cp->cp_checkpoints_count) + n;
83 cp->cp_checkpoints_count = cpu_to_le32(count);
84 return count;
85}
86
87static unsigned int
88nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
89 struct buffer_head *bh,
90 void *kaddr,
91 unsigned int n)
92{
93 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
94 unsigned int count;
95
96 WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
97 count = le32_to_cpu(cp->cp_checkpoints_count) - n;
98 cp->cp_checkpoints_count = cpu_to_le32(count);
99 return count;
100}
101
102static inline struct nilfs_cpfile_header *
103nilfs_cpfile_block_get_header(const struct inode *cpfile,
104 struct buffer_head *bh,
105 void *kaddr)
106{
107 return kaddr + bh_offset(bh);
108}
109
110static struct nilfs_checkpoint *
111nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
112 struct buffer_head *bh,
113 void *kaddr)
114{
115 return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
116 NILFS_MDT(cpfile)->mi_entry_size;
117}
118
119static void nilfs_cpfile_block_init(struct inode *cpfile,
120 struct buffer_head *bh,
121 void *kaddr)
122{
123 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
124 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
125 int n = nilfs_cpfile_checkpoints_per_block(cpfile);
126
127 while (n-- > 0) {
128 nilfs_checkpoint_set_invalid(cp);
129 cp = (void *)cp + cpsz;
130 }
131}
132
133static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
134 struct buffer_head **bhp)
135{
136 return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
137}
138
139static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
140 __u64 cno,
141 int create,
142 struct buffer_head **bhp)
143{
144 return nilfs_mdt_get_block(cpfile,
145 nilfs_cpfile_get_blkoff(cpfile, cno),
146 create, nilfs_cpfile_block_init, bhp);
147}
148
149static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
150 __u64 cno)
151{
152 return nilfs_mdt_delete_block(cpfile,
153 nilfs_cpfile_get_blkoff(cpfile, cno));
154}
155
156/**
157 * nilfs_cpfile_get_checkpoint - get a checkpoint
158 * @cpfile: inode of checkpoint file
159 * @cno: checkpoint number
160 * @create: create flag
161 * @cpp: pointer to a checkpoint
162 * @bhp: pointer to a buffer head
163 *
164 * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
165 * specified by @cno. A new checkpoint will be created if @cno is the current
166 * checkpoint number and @create is nonzero.
167 *
168 * Return Value: On success, 0 is returned, and the checkpoint and the
169 * buffer head of the buffer on which the checkpoint is located are stored in
170 * the places pointed to by @cpp and @bhp, respectively. On error, one of the
171 * following negative error codes is returned.
172 *
173 * %-EIO - I/O error.
174 *
175 * %-ENOMEM - Insufficient amount of memory available.
176 *
177 * %-ENOENT - No such checkpoint.
178 *
179 * %-EINVAL - invalid checkpoint.
180 */
181int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
182 __u64 cno,
183 int create,
184 struct nilfs_checkpoint **cpp,
185 struct buffer_head **bhp)
186{
187 struct buffer_head *header_bh, *cp_bh;
188 struct nilfs_cpfile_header *header;
189 struct nilfs_checkpoint *cp;
190 void *kaddr;
191 int ret;
192
193 if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
194 (cno < nilfs_mdt_cno(cpfile) && create)))
195 return -EINVAL;
196
197 down_write(&NILFS_MDT(cpfile)->mi_sem);
198
199 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
200 if (ret < 0)
201 goto out_sem;
202 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
203 if (ret < 0)
204 goto out_header;
205 kaddr = kmap(cp_bh->b_page);
206 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
207 if (nilfs_checkpoint_invalid(cp)) {
208 if (!create) {
209 kunmap(cp_bh->b_page);
210 brelse(cp_bh);
211 ret = -ENOENT;
212 goto out_header;
213 }
214 /* a newly-created checkpoint */
215 nilfs_checkpoint_clear_invalid(cp);
216 if (!nilfs_cpfile_is_in_first(cpfile, cno))
217 nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
218 kaddr, 1);
219 nilfs_mdt_mark_buffer_dirty(cp_bh);
220
221 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
222 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
223 kaddr);
224 le64_add_cpu(&header->ch_ncheckpoints, 1);
225 kunmap_atomic(kaddr, KM_USER0);
226 nilfs_mdt_mark_buffer_dirty(header_bh);
227 nilfs_mdt_mark_dirty(cpfile);
228 }
229
230 if (cpp != NULL)
231 *cpp = cp;
232 *bhp = cp_bh;
233
234 out_header:
235 brelse(header_bh);
236
237 out_sem:
238 up_write(&NILFS_MDT(cpfile)->mi_sem);
239 return ret;
240}
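/*
 * Usage sketch (hypothetical caller): the checkpoint returned above is
 * kmapped, so every successful call must be paired with
 * nilfs_cpfile_put_checkpoint() on the same buffer head:
 *
 *	struct nilfs_checkpoint *cp;
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
 *	if (!err) {
 *		... examine or update *cp here ...
 *		nilfs_cpfile_put_checkpoint(cpfile, cno, bh);
 *	}
 */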
241
242/**
243 * nilfs_cpfile_put_checkpoint - put a checkpoint
244 * @cpfile: inode of checkpoint file
245 * @cno: checkpoint number
246 * @bh: buffer head
247 *
248 * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
249 * specified by @cno. @bh must be the buffer head which has been returned by
250 * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
251 */
252void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
253 struct buffer_head *bh)
254{
255 kunmap(bh->b_page);
256 brelse(bh);
257}
258
259/**
260 * nilfs_cpfile_delete_checkpoints - delete checkpoints
261 * @cpfile: inode of checkpoint file
262 * @start: start checkpoint number
263 * @end: end checkpoint number
264 *
265 * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
266 * the period from @start to @end, excluding @end itself. The checkpoints
267 * which have already been deleted are ignored.
268 *
269 * Return Value: On success, 0 is returned. On error, one of the following
270 * negative error codes is returned.
271 *
272 * %-EIO - I/O error.
273 *
274 * %-ENOMEM - Insufficient amount of memory available.
275 *
276 * %-EINVAL - invalid checkpoints.
277 */
278int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
279 __u64 start,
280 __u64 end)
281{
282 struct buffer_head *header_bh, *cp_bh;
283 struct nilfs_cpfile_header *header;
284 struct nilfs_checkpoint *cp;
285 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
286 __u64 cno;
287 void *kaddr;
288 unsigned long tnicps;
289 int ret, ncps, nicps, count, i;
290
291 if (unlikely(start == 0 || start > end)) {
292 printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
293 "[%llu, %llu)\n", __func__,
294 (unsigned long long)start, (unsigned long long)end);
295 return -EINVAL;
296 }
297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem);
303
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
305 if (ret < 0)
306 goto out_sem;
307 tnicps = 0;
308
309 for (cno = start; cno < end; cno += ncps) {
310 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
311 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
312 if (ret < 0) {
313 if (ret != -ENOENT)
314 goto out_sem;
315 /* skip hole */
316 ret = 0;
317 continue;
318 }
319
320 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
321 cp = nilfs_cpfile_block_get_checkpoint(
322 cpfile, cno, cp_bh, kaddr);
323 nicps = 0;
324 for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
325 WARN_ON(nilfs_checkpoint_snapshot(cp));
326 if (!nilfs_checkpoint_invalid(cp)) {
327 nilfs_checkpoint_set_invalid(cp);
328 nicps++;
329 }
330 }
331 if (nicps > 0) {
332 tnicps += nicps;
333 nilfs_mdt_mark_buffer_dirty(cp_bh);
334 nilfs_mdt_mark_dirty(cpfile);
335 if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
336 (count = nilfs_cpfile_block_sub_valid_checkpoints(
337 cpfile, cp_bh, kaddr, nicps)) == 0) {
338 /* make hole */
339 kunmap_atomic(kaddr, KM_USER0);
340 brelse(cp_bh);
341 ret = nilfs_cpfile_delete_checkpoint_block(
342 cpfile, cno);
343 if (ret == 0)
344 continue;
345 printk(KERN_ERR "%s: cannot delete block\n",
346 __func__);
347 goto out_sem;
348 }
349 }
350
351 kunmap_atomic(kaddr, KM_USER0);
352 brelse(cp_bh);
353 }
354
355 if (tnicps > 0) {
356 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
357 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
358 kaddr);
359 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
360 nilfs_mdt_mark_buffer_dirty(header_bh);
361 nilfs_mdt_mark_dirty(cpfile);
362 kunmap_atomic(kaddr, KM_USER0);
363 }
364 brelse(header_bh);
365
366 out_sem:
367 up_write(&NILFS_MDT(cpfile)->mi_sem);
368 return ret;
369}
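/*
 * Example (illustrative numbers): the range is half-open, so deleting
 * checkpoints 5 through 9 inclusive is
 *
 *	err = nilfs_cpfile_delete_checkpoints(cpfile, 5, 10);
 *
 * Checkpoints inside the range that were already deleted are skipped.
 */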
370
371static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
372 struct nilfs_checkpoint *cp,
373 struct nilfs_cpinfo *ci)
374{
375 ci->ci_flags = le32_to_cpu(cp->cp_flags);
376 ci->ci_cno = le64_to_cpu(cp->cp_cno);
377 ci->ci_create = le64_to_cpu(cp->cp_create);
378 ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
379 ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
380 ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
381 ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
382}
383
384static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
385 struct nilfs_cpinfo *ci, size_t nci)
386{
387 struct nilfs_checkpoint *cp;
388 struct buffer_head *bh;
389 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
390 __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
391 void *kaddr;
392 int n, ret;
393 int ncps, i;
394
395 if (cno == 0)
396 return -ENOENT; /* checkpoint number 0 is invalid */
397 down_read(&NILFS_MDT(cpfile)->mi_sem);
398
399 for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
400 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
401 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
402 if (ret < 0) {
403 if (ret != -ENOENT)
404 goto out;
405 continue; /* skip hole */
406 }
407
408 kaddr = kmap_atomic(bh->b_page, KM_USER0);
409 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
410 for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
411 if (!nilfs_checkpoint_invalid(cp))
412 nilfs_cpfile_checkpoint_to_cpinfo(
413 cpfile, cp, &ci[n++]);
414 }
415 kunmap_atomic(kaddr, KM_USER0);
416 brelse(bh);
417 }
418
419 ret = n;
420 if (n > 0)
421 *cnop = ci[n - 1].ci_cno + 1;
422
423 out:
424 up_read(&NILFS_MDT(cpfile)->mi_sem);
425 return ret;
426}
427
428static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
429 struct nilfs_cpinfo *ci, size_t nci)
430{
431 struct buffer_head *bh;
432 struct nilfs_cpfile_header *header;
433 struct nilfs_checkpoint *cp;
434 __u64 curr = *cnop, next;
435 unsigned long curr_blkoff, next_blkoff;
436 void *kaddr;
437 int n = 0, ret;
438
439 down_read(&NILFS_MDT(cpfile)->mi_sem);
440
441 if (curr == 0) {
442 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
443 if (ret < 0)
444 goto out;
445 kaddr = kmap_atomic(bh->b_page, KM_USER0);
446 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
447 curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
448 kunmap_atomic(kaddr, KM_USER0);
449 brelse(bh);
450 if (curr == 0) {
451 ret = 0;
452 goto out;
453 }
454 } else if (unlikely(curr == ~(__u64)0)) {
455 ret = 0;
456 goto out;
457 }
458
459 curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
460 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
461 if (unlikely(ret < 0)) {
462 if (ret == -ENOENT)
463 ret = 0; /* No snapshots (started from a hole block) */
464 goto out;
465 }
466 kaddr = kmap_atomic(bh->b_page, KM_USER0);
467 while (n < nci) {
468 cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
469 curr = ~(__u64)0; /* Terminator */
470 if (unlikely(nilfs_checkpoint_invalid(cp) ||
471 !nilfs_checkpoint_snapshot(cp)))
472 break;
473 nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
474 next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
475 if (next == 0)
476			break; /* reached the end of the snapshot list */
477
478 next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
479 if (curr_blkoff != next_blkoff) {
480 kunmap_atomic(kaddr, KM_USER0);
481 brelse(bh);
482 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
483 0, &bh);
484 if (unlikely(ret < 0)) {
485 WARN_ON(ret == -ENOENT);
486 goto out;
487 }
488 kaddr = kmap_atomic(bh->b_page, KM_USER0);
489 }
490 curr = next;
491 curr_blkoff = next_blkoff;
492 }
493 kunmap_atomic(kaddr, KM_USER0);
494 brelse(bh);
495 *cnop = curr;
496 ret = n;
497
498 out:
499 up_read(&NILFS_MDT(cpfile)->mi_sem);
500 return ret;
501}
502
503/**
504 * nilfs_cpfile_get_cpinfo - get information on checkpoints
505 * @cpfile: inode of checkpoint file
506 * @cnop: place to pass a start checkpoint number and to receive the next one
507 * @mode: NILFS_CHECKPOINT or NILFS_SNAPSHOT
508 * @ci: array of nilfs_cpinfo structures to be filled
509 * @nci: maximum number of entries that fit in @ci
510 */
511ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
512 struct nilfs_cpinfo *ci, size_t nci)
513{
514 switch (mode) {
515 case NILFS_CHECKPOINT:
516 return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
517 case NILFS_SNAPSHOT:
518 return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
519 default:
520 return -EINVAL;
521 }
522}
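/*
 * Usage sketch (hypothetical caller, illustrative buffer size): @cnop
 * acts as a cursor, so all checkpoints can be walked in batches:
 *
 *	struct nilfs_cpinfo ci[16];
 *	__u64 cno = 1;
 *	ssize_t n;
 *
 *	while ((n = nilfs_cpfile_get_cpinfo(cpfile, &cno, NILFS_CHECKPOINT,
 *					    ci, 16)) > 0) {
 *		... consume ci[0] .. ci[n - 1] ...
 *	}
 */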
523
524/**
525 * nilfs_cpfile_delete_checkpoint - delete a checkpoint
526 * @cpfile: inode of checkpoint file
527 * @cno: checkpoint number to be deleted
528 */
529int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
530{
531 struct nilfs_cpinfo ci;
532 __u64 tcno = cno;
533 ssize_t nci;
534 int ret;
535
536 nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
537 if (nci < 0)
538 return nci;
539 else if (nci == 0 || ci.ci_cno != cno)
540 return -ENOENT;
541
542	/* cannot delete the latest checkpoint or snapshots */
543 ret = nilfs_cpinfo_snapshot(&ci);
544 if (ret < 0)
545 return ret;
546 else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
547 return -EPERM;
548
549 return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
550}
551
552static struct nilfs_snapshot_list *
553nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
554 __u64 cno,
555 struct buffer_head *bh,
556 void *kaddr)
557{
558 struct nilfs_cpfile_header *header;
559 struct nilfs_checkpoint *cp;
560 struct nilfs_snapshot_list *list;
561
562 if (cno != 0) {
563 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
564 list = &cp->cp_snapshot_list;
565 } else {
566 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
567 list = &header->ch_snapshot_list;
568 }
569 return list;
570}
571
572static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
573{
574 struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
575 struct nilfs_cpfile_header *header;
576 struct nilfs_checkpoint *cp;
577 struct nilfs_snapshot_list *list;
578 __u64 curr, prev;
579 unsigned long curr_blkoff, prev_blkoff;
580 void *kaddr;
581 int ret;
582
583 if (cno == 0)
584 return -ENOENT; /* checkpoint number 0 is invalid */
585 down_write(&NILFS_MDT(cpfile)->mi_sem);
586
587 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
588 if (ret < 0)
589 goto out_sem;
590 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
591 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
592 if (nilfs_checkpoint_invalid(cp)) {
593 ret = -ENOENT;
594 kunmap_atomic(kaddr, KM_USER0);
595 goto out_cp;
596 }
597 if (nilfs_checkpoint_snapshot(cp)) {
598 ret = 0;
599 kunmap_atomic(kaddr, KM_USER0);
600 goto out_cp;
601 }
602 kunmap_atomic(kaddr, KM_USER0);
603
604 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
605 if (ret < 0)
606 goto out_cp;
607 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
608 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
609 list = &header->ch_snapshot_list;
610 curr_bh = header_bh;
611 get_bh(curr_bh);
612 curr = 0;
613 curr_blkoff = 0;
614 prev = le64_to_cpu(list->ssl_prev);
615 while (prev > cno) {
616 prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
617 curr = prev;
618 if (curr_blkoff != prev_blkoff) {
619 kunmap_atomic(kaddr, KM_USER0);
620 brelse(curr_bh);
621 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
622 0, &curr_bh);
623 if (ret < 0)
624 goto out_header;
625 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
626 }
627 curr_blkoff = prev_blkoff;
628 cp = nilfs_cpfile_block_get_checkpoint(
629 cpfile, curr, curr_bh, kaddr);
630 list = &cp->cp_snapshot_list;
631 prev = le64_to_cpu(list->ssl_prev);
632 }
633 kunmap_atomic(kaddr, KM_USER0);
634
635 if (prev != 0) {
636 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
637 &prev_bh);
638 if (ret < 0)
639 goto out_curr;
640 } else {
641 prev_bh = header_bh;
642 get_bh(prev_bh);
643 }
644
645 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
646 list = nilfs_cpfile_block_get_snapshot_list(
647 cpfile, curr, curr_bh, kaddr);
648 list->ssl_prev = cpu_to_le64(cno);
649 kunmap_atomic(kaddr, KM_USER0);
650
651 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
652 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
653 cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
654 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
655 nilfs_checkpoint_set_snapshot(cp);
656 kunmap_atomic(kaddr, KM_USER0);
657
658 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
659 list = nilfs_cpfile_block_get_snapshot_list(
660 cpfile, prev, prev_bh, kaddr);
661 list->ssl_next = cpu_to_le64(cno);
662 kunmap_atomic(kaddr, KM_USER0);
663
664 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
665 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
666 le64_add_cpu(&header->ch_nsnapshots, 1);
667 kunmap_atomic(kaddr, KM_USER0);
668
669 nilfs_mdt_mark_buffer_dirty(prev_bh);
670 nilfs_mdt_mark_buffer_dirty(curr_bh);
671 nilfs_mdt_mark_buffer_dirty(cp_bh);
672 nilfs_mdt_mark_buffer_dirty(header_bh);
673 nilfs_mdt_mark_dirty(cpfile);
674
675 brelse(prev_bh);
676
677 out_curr:
678 brelse(curr_bh);
679
680 out_header:
681 brelse(header_bh);
682
683 out_cp:
684 brelse(cp_bh);
685
686 out_sem:
687 up_write(&NILFS_MDT(cpfile)->mi_sem);
688 return ret;
689}
690
691static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
692{
693 struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
694 struct nilfs_cpfile_header *header;
695 struct nilfs_checkpoint *cp;
696 struct nilfs_snapshot_list *list;
697 __u64 next, prev;
698 void *kaddr;
699 int ret;
700
701 if (cno == 0)
702 return -ENOENT; /* checkpoint number 0 is invalid */
703 down_write(&NILFS_MDT(cpfile)->mi_sem);
704
705 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
706 if (ret < 0)
707 goto out_sem;
708 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
709 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
710 if (nilfs_checkpoint_invalid(cp)) {
711 ret = -ENOENT;
712 kunmap_atomic(kaddr, KM_USER0);
713 goto out_cp;
714 }
715 if (!nilfs_checkpoint_snapshot(cp)) {
716 ret = 0;
717 kunmap_atomic(kaddr, KM_USER0);
718 goto out_cp;
719 }
720
721 list = &cp->cp_snapshot_list;
722 next = le64_to_cpu(list->ssl_next);
723 prev = le64_to_cpu(list->ssl_prev);
724 kunmap_atomic(kaddr, KM_USER0);
725
726 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
727 if (ret < 0)
728 goto out_cp;
729 if (next != 0) {
730 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
731 &next_bh);
732 if (ret < 0)
733 goto out_header;
734 } else {
735 next_bh = header_bh;
736 get_bh(next_bh);
737 }
738 if (prev != 0) {
739 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
740 &prev_bh);
741 if (ret < 0)
742 goto out_next;
743 } else {
744 prev_bh = header_bh;
745 get_bh(prev_bh);
746 }
747
748 kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
749 list = nilfs_cpfile_block_get_snapshot_list(
750 cpfile, next, next_bh, kaddr);
751 list->ssl_prev = cpu_to_le64(prev);
752 kunmap_atomic(kaddr, KM_USER0);
753
754 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
755 list = nilfs_cpfile_block_get_snapshot_list(
756 cpfile, prev, prev_bh, kaddr);
757 list->ssl_next = cpu_to_le64(next);
758 kunmap_atomic(kaddr, KM_USER0);
759
760 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
761 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
762 cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
763 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
764 nilfs_checkpoint_clear_snapshot(cp);
765 kunmap_atomic(kaddr, KM_USER0);
766
767 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
768 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
769 le64_add_cpu(&header->ch_nsnapshots, -1);
770 kunmap_atomic(kaddr, KM_USER0);
771
772 nilfs_mdt_mark_buffer_dirty(next_bh);
773 nilfs_mdt_mark_buffer_dirty(prev_bh);
774 nilfs_mdt_mark_buffer_dirty(cp_bh);
775 nilfs_mdt_mark_buffer_dirty(header_bh);
776 nilfs_mdt_mark_dirty(cpfile);
777
778 brelse(prev_bh);
779
780 out_next:
781 brelse(next_bh);
782
783 out_header:
784 brelse(header_bh);
785
786 out_cp:
787 brelse(cp_bh);
788
789 out_sem:
790 up_write(&NILFS_MDT(cpfile)->mi_sem);
791 return ret;
792}
793
794/**
795 * nilfs_cpfile_is_snapshot - determine whether a checkpoint is a snapshot
796 * @cpfile: inode of checkpoint file
797 * @cno: checkpoint number
798 *
799 * Description: nilfs_cpfile_is_snapshot() tests if @cno is a snapshot.
800 *
801 * Return Value: On success, 1 is returned if the checkpoint specified by
802 * @cno is a snapshot, or 0 if not. On error, one of the following negative
803 * error codes is returned.
804 *
805 * %-EIO - I/O error.
806 *
807 * %-ENOMEM - Insufficient amount of memory available.
808 *
809 * %-ENOENT - No such checkpoint.
810 */
811int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
812{
813 struct buffer_head *bh;
814 struct nilfs_checkpoint *cp;
815 void *kaddr;
816 int ret;
817
818 if (cno == 0)
819 return -ENOENT; /* checkpoint number 0 is invalid */
820 down_read(&NILFS_MDT(cpfile)->mi_sem);
821
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
823 if (ret < 0)
824 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh);
830
831 out:
832 up_read(&NILFS_MDT(cpfile)->mi_sem);
833 return ret;
834}
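/*
 * Example (hypothetical caller): note the tristate return, so the error
 * check must precede the boolean test:
 *
 *	ret = nilfs_cpfile_is_snapshot(cpfile, cno);
 *	if (ret < 0)
 *		return ret;
 *	if (ret)
 *		... @cno is a snapshot ...
 */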
835
836/**
837 * nilfs_cpfile_change_cpmode - change checkpoint mode
838 * @cpfile: inode of checkpoint file
839 * @cno: checkpoint number
840 * @mode: new mode of checkpoint
841 *
842 * Description: nilfs_cpfile_change_cpmode() changes the mode of the
843 * checkpoint specified by @cno. @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT.
844 *
845 * Return Value: On success, 0 is returned. On error, one of the following
846 * negative error codes is returned.
847 *
848 * %-EIO - I/O error.
849 *
850 * %-ENOMEM - Insufficient amount of memory available.
851 *
852 * %-ENOENT - No such checkpoint.
853 */
854int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
855{
856 struct the_nilfs *nilfs;
857 int ret;
858
859 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
860
861 switch (mode) {
862 case NILFS_CHECKPOINT:
863 /*
864 * Check for protecting existing snapshot mounts:
865 * bd_mount_sem is used to make this operation atomic and
866 * exclusive with a new mount job. Though it doesn't cover
867 * umount, it's enough for the purpose.
868 */
869 down(&nilfs->ns_bdev->bd_mount_sem);
870 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
871 /* Current implementation does not have to protect
872 plain read-only mounts since they are exclusive
873 with a read/write mount and are protected from the
874 cleaner. */
875 ret = -EBUSY;
876 } else
877 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
878 up(&nilfs->ns_bdev->bd_mount_sem);
879 return ret;
880 case NILFS_SNAPSHOT:
881 return nilfs_cpfile_set_snapshot(cpfile, cno);
882 default:
883 return -EINVAL;
884 }
885}
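/*
 * Example (hypothetical caller): turning checkpoint @cno into a snapshot
 * and back again:
 *
 *	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_SNAPSHOT);
 *	...
 *	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_CHECKPOINT);
 *
 * The second call fails with -EBUSY while the snapshot is mounted.
 */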
886
887/**
888 * nilfs_cpfile_get_stat - get checkpoint statistics
889 * @cpfile: inode of checkpoint file
890 * @cpstat: pointer to a structure of checkpoint statistics
891 *
892 * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
893 *
894 * Return Value: On success, 0 is returned, and checkpoint statistics are
895 * stored in the place pointed to by @cpstat. On error, one of the following
896 * negative error codes is returned.
897 *
898 * %-EIO - I/O error.
899 *
900 * %-ENOMEM - Insufficient amount of memory available.
901 */
902int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
903{
904 struct buffer_head *bh;
905 struct nilfs_cpfile_header *header;
906 void *kaddr;
907 int ret;
908
909 down_read(&NILFS_MDT(cpfile)->mi_sem);
910
911 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
912 if (ret < 0)
913 goto out_sem;
914 kaddr = kmap_atomic(bh->b_page, KM_USER0);
915 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
916 cpstat->cs_cno = nilfs_mdt_cno(cpfile);
917 cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
918 cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
919 kunmap_atomic(kaddr, KM_USER0);
920 brelse(bh);
921
922 out_sem:
923 up_read(&NILFS_MDT(cpfile)->mi_sem);
924 return ret;
925}
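/*
 * Usage sketch (hypothetical caller):
 *
 *	struct nilfs_cpstat cpstat;
 *
 *	if (!nilfs_cpfile_get_stat(cpfile, &cpstat))
 *		printk(KERN_DEBUG "cno=%llu ncps=%llu nsss=%llu\n",
 *		       (unsigned long long)cpstat.cs_cno,
 *		       (unsigned long long)cpstat.cs_ncps,
 *		       (unsigned long long)cpstat.cs_nsss);
 */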
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000000..1a8a1008c342
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
1/*
2 * cpfile.h - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_CPFILE_H
24#define _NILFS_CPFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **,
35 struct buffer_head **);
36void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
37int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
38int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
39int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
40int nilfs_cpfile_is_snapshot(struct inode *, __u64);
41int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
42ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
43 struct nilfs_cpinfo *, size_t);
44
45#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 000000000000..bb8a5818e7f1
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
1/*
2 * dat.c - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/types.h>
24#include <linux/buffer_head.h>
25#include <linux/string.h>
26#include <linux/errno.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "dat.h"
31
32
33#define NILFS_CNO_MIN ((__u64)1)
34#define NILFS_CNO_MAX (~(__u64)0)
35
36static int nilfs_dat_prepare_entry(struct inode *dat,
37 struct nilfs_palloc_req *req, int create)
38{
39 return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
40 create, &req->pr_entry_bh);
41}
42
43static void nilfs_dat_commit_entry(struct inode *dat,
44 struct nilfs_palloc_req *req)
45{
46 nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
47 nilfs_mdt_mark_dirty(dat);
48 brelse(req->pr_entry_bh);
49}
50
51static void nilfs_dat_abort_entry(struct inode *dat,
52 struct nilfs_palloc_req *req)
53{
54 brelse(req->pr_entry_bh);
55}
56
57int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
58{
59 int ret;
60
61 ret = nilfs_palloc_prepare_alloc_entry(dat, req);
62 if (ret < 0)
63 return ret;
64
65 ret = nilfs_dat_prepare_entry(dat, req, 1);
66 if (ret < 0)
67 nilfs_palloc_abort_alloc_entry(dat, req);
68
69 return ret;
70}
71
72void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
73{
74 struct nilfs_dat_entry *entry;
75 void *kaddr;
76
77 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
78 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
79 req->pr_entry_bh, kaddr);
80 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
81 entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
82 entry->de_blocknr = cpu_to_le64(0);
83 kunmap_atomic(kaddr, KM_USER0);
84
85 nilfs_palloc_commit_alloc_entry(dat, req);
86 nilfs_dat_commit_entry(dat, req);
87}
88
89void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
90{
91 nilfs_dat_abort_entry(dat, req);
92 nilfs_palloc_abort_alloc_entry(dat, req);
93}
94
95int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
96{
97 int ret;
98
99 ret = nilfs_palloc_prepare_free_entry(dat, req);
100 if (ret < 0)
101 return ret;
102 ret = nilfs_dat_prepare_entry(dat, req, 0);
103 if (ret < 0) {
104 nilfs_palloc_abort_free_entry(dat, req);
105 return ret;
106 }
107 return 0;
108}
109
110void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
111{
112 struct nilfs_dat_entry *entry;
113 void *kaddr;
114
115 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
116 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
117 req->pr_entry_bh, kaddr);
118 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
119 entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
120 entry->de_blocknr = cpu_to_le64(0);
121 kunmap_atomic(kaddr, KM_USER0);
122
123 nilfs_dat_commit_entry(dat, req);
124 nilfs_palloc_commit_free_entry(dat, req);
125}
126
127void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
128{
129 nilfs_dat_abort_entry(dat, req);
130 nilfs_palloc_abort_free_entry(dat, req);
131}
132
133int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
134{
135 int ret;
136
137 ret = nilfs_dat_prepare_entry(dat, req, 0);
138 WARN_ON(ret == -ENOENT);
139 return ret;
140}
141
142void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
143 sector_t blocknr)
144{
145 struct nilfs_dat_entry *entry;
146 void *kaddr;
147
148 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
149 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
150 req->pr_entry_bh, kaddr);
151 entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
152 if (entry->de_blocknr != cpu_to_le64(0) ||
153 entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
154 printk(KERN_CRIT
155 "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
156 __func__, (unsigned long long)req->pr_entry_nr,
157 (unsigned long long)le64_to_cpu(entry->de_start),
158 (unsigned long long)le64_to_cpu(entry->de_end),
159 (unsigned long long)le64_to_cpu(entry->de_blocknr));
160 }
161 entry->de_blocknr = cpu_to_le64(blocknr);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 nilfs_dat_commit_entry(dat, req);
165}
166
167void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
168{
169 nilfs_dat_abort_entry(dat, req);
170}
171
172int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
173{
174 struct nilfs_dat_entry *entry;
175 __u64 start;
176 sector_t blocknr;
177 void *kaddr;
178 int ret;
179
180 ret = nilfs_dat_prepare_entry(dat, req, 0);
181 if (ret < 0) {
182 WARN_ON(ret == -ENOENT);
183 return ret;
184 }
185
186 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
187 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
188 req->pr_entry_bh, kaddr);
189 start = le64_to_cpu(entry->de_start);
190 blocknr = le64_to_cpu(entry->de_blocknr);
191 kunmap_atomic(kaddr, KM_USER0);
192
193 if (blocknr == 0) {
194 ret = nilfs_palloc_prepare_free_entry(dat, req);
195 if (ret < 0) {
196 nilfs_dat_abort_entry(dat, req);
197 return ret;
198 }
199 }
200
201 return 0;
202}
203
204void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
205 int dead)
206{
207 struct nilfs_dat_entry *entry;
208 __u64 start, end;
209 sector_t blocknr;
210 void *kaddr;
211
212 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
213 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
214 req->pr_entry_bh, kaddr);
215 end = start = le64_to_cpu(entry->de_start);
216 if (!dead) {
217 end = nilfs_mdt_cno(dat);
218 WARN_ON(start > end);
219 }
220 entry->de_end = cpu_to_le64(end);
221 blocknr = le64_to_cpu(entry->de_blocknr);
222 kunmap_atomic(kaddr, KM_USER0);
223
224 if (blocknr == 0)
225 nilfs_dat_commit_free(dat, req);
226 else
227 nilfs_dat_commit_entry(dat, req);
228}
229
230void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
231{
232 struct nilfs_dat_entry *entry;
233 __u64 start;
234 sector_t blocknr;
235 void *kaddr;
236
237 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
238 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
239 req->pr_entry_bh, kaddr);
240 start = le64_to_cpu(entry->de_start);
241 blocknr = le64_to_cpu(entry->de_blocknr);
242 kunmap_atomic(kaddr, KM_USER0);
243
244 if (start == nilfs_mdt_cno(dat) && blocknr == 0)
245 nilfs_palloc_abort_free_entry(dat, req);
246 nilfs_dat_abort_entry(dat, req);
247}
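/*
 * The start/end updates above follow a prepare/commit/abort protocol.
 * A hypothetical caller assigning a disk block to a virtual block looks
 * like this (write_ok and new_blocknr are illustrative names):
 *
 *	struct nilfs_palloc_req req = { .pr_entry_nr = vblocknr };
 *
 *	err = nilfs_dat_prepare_start(dat, &req);
 *	if (!err) {
 *		if (write_ok)
 *			nilfs_dat_commit_start(dat, &req, new_blocknr);
 *		else
 *			nilfs_dat_abort_start(dat, &req);
 *	}
 */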
248
249/**
250 * nilfs_dat_mark_dirty - mark a DAT entry dirty
251 * @dat: DAT file inode
252 * @vblocknr: virtual block number
253 *
254 * Description: nilfs_dat_mark_dirty() marks the entry of @vblocknr dirty.
255 *
256 * Return Value: On success, 0 is returned. On error, one of the following
257 * negative error codes is returned.
258 *
259 * %-EIO - I/O error.
260 *
261 * %-ENOMEM - Insufficient amount of memory available.
262 */
263int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
264{
265 struct nilfs_palloc_req req;
266 int ret;
267
268 req.pr_entry_nr = vblocknr;
269 ret = nilfs_dat_prepare_entry(dat, &req, 0);
270 if (ret == 0)
271 nilfs_dat_commit_entry(dat, &req);
272 return ret;
273}
274
275/**
276 * nilfs_dat_freev - free virtual block numbers
277 * @dat: DAT file inode
278 * @vblocknrs: array of virtual block numbers
279 * @nitems: number of virtual block numbers
280 *
281 * Description: nilfs_dat_freev() frees the virtual block numbers specified by
282 * @vblocknrs and @nitems.
283 *
284 * Return Value: On success, 0 is returned. On error, one of the following
285 * negative error codes is returned.
286 *
287 * %-EIO - I/O error.
288 *
289 * %-ENOMEM - Insufficient amount of memory available.
290 *
291 * %-ENOENT - The virtual block numbers have not been allocated.
292 */
293int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
294{
295 return nilfs_palloc_freev(dat, vblocknrs, nitems);
296}
297
298/**
299 * nilfs_dat_move - change a block number
300 * @dat: DAT file inode
301 * @vblocknr: virtual block number
302 * @blocknr: block number
303 *
304 * Description: nilfs_dat_move() changes the block number associated with
305 * @vblocknr to @blocknr.
306 *
307 * Return Value: On success, 0 is returned. On error, one of the following
308 * negative error codes is returned.
309 *
310 * %-EIO - I/O error.
311 *
312 * %-ENOMEM - Insufficient amount of memory available.
313 */
314int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
315{
316 struct buffer_head *entry_bh;
317 struct nilfs_dat_entry *entry;
318 void *kaddr;
319 int ret;
320
321 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
322 if (ret < 0)
323 return ret;
324 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
325 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
326 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
327 printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
328 (unsigned long long)vblocknr,
329 (unsigned long long)le64_to_cpu(entry->de_start),
330 (unsigned long long)le64_to_cpu(entry->de_end));
331 kunmap_atomic(kaddr, KM_USER0);
332 brelse(entry_bh);
333 return -EINVAL;
334 }
335 WARN_ON(blocknr == 0);
336 entry->de_blocknr = cpu_to_le64(blocknr);
337 kunmap_atomic(kaddr, KM_USER0);
338
339 nilfs_mdt_mark_buffer_dirty(entry_bh);
340 nilfs_mdt_mark_dirty(dat);
341
342 brelse(entry_bh);
343
344 return 0;
345}
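/*
 * Example (hypothetical caller): after the garbage collector copies the
 * block behind vblocknr to a new location, the DAT entry is repointed:
 *
 *	err = nilfs_dat_move(dat, vblocknr, new_blocknr);
 *
 * (new_blocknr is illustrative; it must be a valid, nonzero disk block.)
 */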
346
347/**
348 * nilfs_dat_translate - translate a virtual block number to a block number
349 * @dat: DAT file inode
350 * @vblocknr: virtual block number
351 * @blocknrp: pointer to a block number
352 *
353 * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
354 * to the corresponding block number.
355 *
356 * Return Value: On success, 0 is returned and the block number associated
357 * with @vblocknr is stored in the place pointed to by @blocknrp. On error, one
358 * of the following negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 *
364 * %-ENOENT - A block number associated with @vblocknr does not exist.
365 */
366int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
367{
368 struct buffer_head *entry_bh;
369 struct nilfs_dat_entry *entry;
370 sector_t blocknr;
371 void *kaddr;
372 int ret;
373
374 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
375 if (ret < 0)
376 return ret;
377
378 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
379 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
380 blocknr = le64_to_cpu(entry->de_blocknr);
381 if (blocknr == 0) {
382 ret = -ENOENT;
383 goto out;
384 }
385 if (blocknrp != NULL)
386 *blocknrp = blocknr;
387
388 out:
389 kunmap_atomic(kaddr, KM_USER0);
390 brelse(entry_bh);
391 return ret;
392}
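/*
 * Example (hypothetical caller): resolving a virtual block number before
 * issuing I/O:
 *
 *	sector_t pbn;
 *
 *	err = nilfs_dat_translate(dat, vblocknr, &pbn);
 *	if (!err)
 *		... read or write physical block pbn ...
 *	else if (err == -ENOENT)
 *		... no disk block is assigned to vblocknr ...
 */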
393
394ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
395 size_t nvi)
396{
397 struct buffer_head *entry_bh;
398 struct nilfs_dat_entry *entry;
399 __u64 first, last;
400 void *kaddr;
401 unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
402 int i, j, n, ret;
403
404 for (i = 0; i < nvi; i += n) {
405 ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
406 0, &entry_bh);
407 if (ret < 0)
408 return ret;
409 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
410		/* first and last virtual block numbers in this block */
411 first = vinfo[i].vi_vblocknr;
412 do_div(first, entries_per_block);
413 first *= entries_per_block;
414 last = first + entries_per_block - 1;
415 for (j = i, n = 0;
416 j < nvi && vinfo[j].vi_vblocknr >= first &&
417 vinfo[j].vi_vblocknr <= last;
418 j++, n++) {
419 entry = nilfs_palloc_block_get_entry(
420 dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
421 vinfo[j].vi_start = le64_to_cpu(entry->de_start);
422 vinfo[j].vi_end = le64_to_cpu(entry->de_end);
423 vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
424 }
425 kunmap_atomic(kaddr, KM_USER0);
426 brelse(entry_bh);
427 }
428
429 return nvi;
430}
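/*
 * Usage sketch (hypothetical caller, illustrative values): @vinfo is
 * pre-filled with the virtual block numbers to query, preferably in
 * ascending order so entries sharing an entry block are resolved with a
 * single block read:
 *
 *	struct nilfs_vinfo vinfo[2] = {
 *		{ .vi_vblocknr = 10 },
 *		{ .vi_vblocknr = 11 },
 *	};
 *
 *	n = nilfs_dat_get_vinfo(dat, vinfo, 2);
 *	... on success n == 2 and vi_start, vi_end and vi_blocknr are set ...
 */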
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 000000000000..d9560654a4b7
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
1/*
2 * dat.h - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DAT_H
24#define _NILFS_DAT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/fs.h>
29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31
32struct nilfs_palloc_req;
33
34int nilfs_dat_translate(struct inode *, __u64, sector_t *);
35
36int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
37void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
38void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
46
47int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t);
49int nilfs_dat_move(struct inode *, __u64, sector_t);
50ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
51
52#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 000000000000..54100acc1102
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
1/*
2 * dir.c - NILFS directory entry operations
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
21 */
22/*
23 * linux/fs/ext2/dir.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/dir.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * ext2 directory handling functions
37 *
38 * Big-endian to little-endian byte-swapping/bitmaps by
39 * David S. Miller (davem@caip.rutgers.edu), 1995
40 *
41 * All code that works with directory layout had been switched to pagecache
42 * and moved here. AV
43 */
44
45#include <linux/pagemap.h>
46#include <linux/smp_lock.h>
47#include "nilfs.h"
48#include "page.h"
49
50/*
51 * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
52 * more robust, but we have what we have
53 */
54static inline unsigned nilfs_chunk_size(struct inode *inode)
55{
56 return inode->i_sb->s_blocksize;
57}
58
59static inline void nilfs_put_page(struct page *page)
60{
61 kunmap(page);
62 page_cache_release(page);
63}
64
65static inline unsigned long dir_pages(struct inode *inode)
66{
67 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
68}
69
70/*
71 * Return the offset into page `page_nr' of the last valid
72 * byte in that page, plus one.
73 */
74static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
75{
76 unsigned last_byte = inode->i_size;
77
78 last_byte -= page_nr << PAGE_CACHE_SHIFT;
79 if (last_byte > PAGE_CACHE_SIZE)
80 last_byte = PAGE_CACHE_SIZE;
81 return last_byte;
82}
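/*
 * Example (illustrative sizes): with i_size == 10000 and 4 KiB pages,
 * dir_pages() == 3; nilfs_last_byte(inode, 0) and nilfs_last_byte(inode, 1)
 * both return PAGE_CACHE_SIZE, while nilfs_last_byte(inode, 2) returns
 * 10000 - 8192 == 1808.
 */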
83
84static int nilfs_prepare_chunk_uninterruptible(struct page *page,
85 struct address_space *mapping,
86 unsigned from, unsigned to)
87{
88 loff_t pos = page_offset(page) + from;
89 return block_write_begin(NULL, mapping, pos, to - from,
90 AOP_FLAG_UNINTERRUPTIBLE, &page,
91 NULL, nilfs_get_block);
92}
93
94static int nilfs_prepare_chunk(struct page *page,
95 struct address_space *mapping,
96 unsigned from, unsigned to)
97{
98 loff_t pos = page_offset(page) + from;
99 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
100 NULL, nilfs_get_block);
101}
102
103static int nilfs_commit_chunk(struct page *page,
104 struct address_space *mapping,
105 unsigned from, unsigned to)
106{
107 struct inode *dir = mapping->host;
108 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
109 loff_t pos = page_offset(page) + from;
110 unsigned len = to - from;
111 unsigned nr_dirty, copied;
112 int err;
113
114 nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
115 copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
116 if (pos + copied > dir->i_size) {
117 i_size_write(dir, pos + copied);
118 mark_inode_dirty(dir);
119 }
120 if (IS_DIRSYNC(dir))
121 nilfs_set_transaction_flag(NILFS_TI_SYNC);
122 err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
123 unlock_page(page);
124 return err;
125}
126
127static void nilfs_check_page(struct page *page)
128{
129 struct inode *dir = page->mapping->host;
130 struct super_block *sb = dir->i_sb;
131 unsigned chunk_size = nilfs_chunk_size(dir);
132 char *kaddr = page_address(page);
133 unsigned offs, rec_len;
134 unsigned limit = PAGE_CACHE_SIZE;
135 struct nilfs_dir_entry *p;
136 char *error;
137
138 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
139 limit = dir->i_size & ~PAGE_CACHE_MASK;
140 if (limit & (chunk_size - 1))
141 goto Ebadsize;
142 if (!limit)
143 goto out;
144 }
145 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
146 p = (struct nilfs_dir_entry *)(kaddr + offs);
147 rec_len = le16_to_cpu(p->rec_len);
148
149 if (rec_len < NILFS_DIR_REC_LEN(1))
150 goto Eshort;
151 if (rec_len & 3)
152 goto Ealign;
153 if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
154 goto Enamelen;
155 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
156 goto Espan;
157 }
158 if (offs != limit)
159 goto Eend;
160out:
161 SetPageChecked(page);
162 return;
163
164 /* Too bad, we had an error */
165
166Ebadsize:
167 nilfs_error(sb, "nilfs_check_page",
168 "size of directory #%lu is not a multiple of chunk size",
169 dir->i_ino
170 );
171 goto fail;
172Eshort:
173 error = "rec_len is smaller than minimal";
174 goto bad_entry;
175Ealign:
176 error = "unaligned directory entry";
177 goto bad_entry;
178Enamelen:
179 error = "rec_len is too small for name_len";
180 goto bad_entry;
181Espan:
182 error = "directory entry across blocks";
183bad_entry:
184 nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
185 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
186 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
187 (unsigned long) le64_to_cpu(p->inode),
188 rec_len, p->name_len);
189 goto fail;
190Eend:
191 p = (struct nilfs_dir_entry *)(kaddr + offs);
192 nilfs_error(sb, "nilfs_check_page",
193		    "entry in directory #%lu spans the page boundary, "
194		    "offset=%lu, inode=%lu",
195 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
196 (unsigned long) le64_to_cpu(p->inode));
197fail:
198 SetPageChecked(page);
199 SetPageError(page);
200}
201
202static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
203{
204 struct address_space *mapping = dir->i_mapping;
205 struct page *page = read_cache_page(mapping, n,
206 (filler_t *)mapping->a_ops->readpage, NULL);
207 if (!IS_ERR(page)) {
208 wait_on_page_locked(page);
209 kmap(page);
210 if (!PageUptodate(page))
211 goto fail;
212 if (!PageChecked(page))
213 nilfs_check_page(page);
214 if (PageError(page))
215 goto fail;
216 }
217 return page;
218
219fail:
220 nilfs_put_page(page);
221 return ERR_PTR(-EIO);
222}
223
224/*
225 * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
226 *
227 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
228 */
229static int
230nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
231{
232 if (len != de->name_len)
233 return 0;
234 if (!de->inode)
235 return 0;
236 return !memcmp(name, de->name, len);
237}
238
239/*
240 * p is at least 6 bytes before the end of page
241 */
242static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
243{
244 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
245}
246
247static unsigned char
248nilfs_filetype_table[NILFS_FT_MAX] = {
249 [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
250 [NILFS_FT_REG_FILE] = DT_REG,
251 [NILFS_FT_DIR] = DT_DIR,
252 [NILFS_FT_CHRDEV] = DT_CHR,
253 [NILFS_FT_BLKDEV] = DT_BLK,
254 [NILFS_FT_FIFO] = DT_FIFO,
255 [NILFS_FT_SOCK] = DT_SOCK,
256 [NILFS_FT_SYMLINK] = DT_LNK,
257};
258
259#define S_SHIFT 12
260static unsigned char
261nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
262 [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
263 [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
264 [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
265 [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
266 [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
267 [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
268 [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
269};
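/*
 * Worked example: a regular file has (mode & S_IFMT) == S_IFREG
 * (0100000 octal, 0x8000), so (mode & S_IFMT) >> S_SHIFT == 8 and
 * nilfs_type_by_mode[8] == NILFS_FT_REG_FILE; nilfs_filetype_table
 * maps that back to DT_REG when readdir fills in d_type.
 */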
270
271static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
272{
273 mode_t mode = inode->i_mode;
274
275 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
276}
277
278static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
279{
280 loff_t pos = filp->f_pos;
281 struct inode *inode = filp->f_dentry->d_inode;
282 struct super_block *sb = inode->i_sb;
283 unsigned int offset = pos & ~PAGE_CACHE_MASK;
284 unsigned long n = pos >> PAGE_CACHE_SHIFT;
285 unsigned long npages = dir_pages(inode);
286/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
287 unsigned char *types = NULL;
288 int ret;
289
290 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
291 goto success;
292
293 types = nilfs_filetype_table;
294
295 for ( ; n < npages; n++, offset = 0) {
296 char *kaddr, *limit;
297 struct nilfs_dir_entry *de;
298 struct page *page = nilfs_get_page(inode, n);
299
300 if (IS_ERR(page)) {
301 nilfs_error(sb, __func__, "bad page in #%lu",
302 inode->i_ino);
303 filp->f_pos += PAGE_CACHE_SIZE - offset;
304 ret = -EIO;
305 goto done;
306 }
307 kaddr = page_address(page);
308 de = (struct nilfs_dir_entry *)(kaddr + offset);
309 limit = kaddr + nilfs_last_byte(inode, n) -
310 NILFS_DIR_REC_LEN(1);
311 for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
312 if (de->rec_len == 0) {
313 nilfs_error(sb, __func__,
314 "zero-length directory entry");
315 ret = -EIO;
316 nilfs_put_page(page);
317 goto done;
318 }
319 if (de->inode) {
320 int over;
321 unsigned char d_type = DT_UNKNOWN;
322
323 if (types && de->file_type < NILFS_FT_MAX)
324 d_type = types[de->file_type];
325
326 offset = (char *)de - kaddr;
327 over = filldir(dirent, de->name, de->name_len,
328 (n<<PAGE_CACHE_SHIFT) | offset,
329 le64_to_cpu(de->inode), d_type);
330 if (over) {
331 nilfs_put_page(page);
332 goto success;
333 }
334 }
335 filp->f_pos += le16_to_cpu(de->rec_len);
336 }
337 nilfs_put_page(page);
338 }
339
340success:
341 ret = 0;
342done:
343 return ret;
344}
345
346/*
347 * nilfs_find_entry()
348 *
349 * finds an entry in the specified directory with the wanted name. It
350 * returns the page in which the entry was found, and the entry itself
351 * (as a parameter - res_dir). Page is returned mapped and unlocked.
352 * Entry is guaranteed to be valid.
353 */
354struct nilfs_dir_entry *
355nilfs_find_entry(struct inode *dir, struct dentry *dentry,
356 struct page **res_page)
357{
358 const char *name = dentry->d_name.name;
359 int namelen = dentry->d_name.len;
360 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
361 unsigned long start, n;
362 unsigned long npages = dir_pages(dir);
363 struct page *page = NULL;
364 struct nilfs_inode_info *ei = NILFS_I(dir);
365 struct nilfs_dir_entry *de;
366
367	/* OFFSET_CACHE */
368	*res_page = NULL;
369
370	if (npages == 0)
371		goto out;
372
373 start = ei->i_dir_start_lookup;
374 if (start >= npages)
375 start = 0;
376 n = start;
377 do {
378 char *kaddr;
379 page = nilfs_get_page(dir, n);
380 if (!IS_ERR(page)) {
381 kaddr = page_address(page);
382 de = (struct nilfs_dir_entry *)kaddr;
383 kaddr += nilfs_last_byte(dir, n) - reclen;
384 while ((char *) de <= kaddr) {
385 if (de->rec_len == 0) {
386 nilfs_error(dir->i_sb, __func__,
387 "zero-length directory entry");
388 nilfs_put_page(page);
389 goto out;
390 }
391 if (nilfs_match(namelen, name, de))
392 goto found;
393 de = nilfs_next_entry(de);
394 }
395 nilfs_put_page(page);
396 }
397 if (++n >= npages)
398 n = 0;
399 /* next page is past the blocks we've got */
400 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
401 nilfs_error(dir->i_sb, __func__,
402				    "dir %lu size %lld exceeds block count %llu",
403 dir->i_ino, dir->i_size,
404 (unsigned long long)dir->i_blocks);
405 goto out;
406 }
407 } while (n != start);
408out:
409 return NULL;
410
411found:
412 *res_page = page;
413 ei->i_dir_start_lookup = n;
414 return de;
415}
416
417struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
418{
419 struct page *page = nilfs_get_page(dir, 0);
420 struct nilfs_dir_entry *de = NULL;
421
422 if (!IS_ERR(page)) {
423 de = nilfs_next_entry(
424 (struct nilfs_dir_entry *)page_address(page));
425 *p = page;
426 }
427 return de;
428}
429
430ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
431{
432 ino_t res = 0;
433 struct nilfs_dir_entry *de;
434 struct page *page;
435
436 de = nilfs_find_entry(dir, dentry, &page);
437 if (de) {
438 res = le64_to_cpu(de->inode);
439 kunmap(page);
440 page_cache_release(page);
441 }
442 return res;
443}
444
445/* Releases the page */
446void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
447 struct page *page, struct inode *inode)
448{
449 unsigned from = (char *) de - (char *) page_address(page);
450 unsigned to = from + le16_to_cpu(de->rec_len);
451 struct address_space *mapping = page->mapping;
452 int err;
453
454 lock_page(page);
455 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
456 BUG_ON(err);
457 de->inode = cpu_to_le64(inode->i_ino);
458 nilfs_set_de_type(de, inode);
459 err = nilfs_commit_chunk(page, mapping, from, to);
460 nilfs_put_page(page);
461 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
462/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
463 mark_inode_dirty(dir);
464}
465
466/*
467 * Parent is locked.
468 */
469int nilfs_add_link(struct dentry *dentry, struct inode *inode)
470{
471 struct inode *dir = dentry->d_parent->d_inode;
472 const char *name = dentry->d_name.name;
473 int namelen = dentry->d_name.len;
474 unsigned chunk_size = nilfs_chunk_size(dir);
475 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
476 unsigned short rec_len, name_len;
477 struct page *page = NULL;
478 struct nilfs_dir_entry *de;
479 unsigned long npages = dir_pages(dir);
480 unsigned long n;
481 char *kaddr;
482 unsigned from, to;
483 int err;
484
485 /*
486 * We take care of directory expansion in the same loop.
487 * This code plays outside i_size, so it locks the page
488 * to protect that region.
489 */
490 for (n = 0; n <= npages; n++) {
491 char *dir_end;
492
493 page = nilfs_get_page(dir, n);
494 err = PTR_ERR(page);
495 if (IS_ERR(page))
496 goto out;
497 lock_page(page);
498 kaddr = page_address(page);
499 dir_end = kaddr + nilfs_last_byte(dir, n);
500 de = (struct nilfs_dir_entry *)kaddr;
501 kaddr += PAGE_CACHE_SIZE - reclen;
502 while ((char *)de <= kaddr) {
503 if ((char *)de == dir_end) {
504 /* We hit i_size */
505 name_len = 0;
506 rec_len = chunk_size;
507 de->rec_len = cpu_to_le16(chunk_size);
508 de->inode = 0;
509 goto got_it;
510 }
511 if (de->rec_len == 0) {
512 nilfs_error(dir->i_sb, __func__,
513 "zero-length directory entry");
514 err = -EIO;
515 goto out_unlock;
516 }
517 err = -EEXIST;
518 if (nilfs_match(namelen, name, de))
519 goto out_unlock;
520 name_len = NILFS_DIR_REC_LEN(de->name_len);
521 rec_len = le16_to_cpu(de->rec_len);
522 if (!de->inode && rec_len >= reclen)
523 goto got_it;
524 if (rec_len >= name_len + reclen)
525 goto got_it;
526 de = (struct nilfs_dir_entry *)((char *)de + rec_len);
527 }
528 unlock_page(page);
529 nilfs_put_page(page);
530 }
531 BUG();
532 return -EINVAL;
533
534got_it:
535 from = (char *)de - (char *)page_address(page);
536 to = from + rec_len;
537 err = nilfs_prepare_chunk(page, page->mapping, from, to);
538 if (err)
539 goto out_unlock;
540 if (de->inode) {
541 struct nilfs_dir_entry *de1;
542
543 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
544 de1->rec_len = cpu_to_le16(rec_len - name_len);
545 de->rec_len = cpu_to_le16(name_len);
546 de = de1;
547 }
548 de->name_len = namelen;
549 memcpy(de->name, name, namelen);
550 de->inode = cpu_to_le64(inode->i_ino);
551 nilfs_set_de_type(de, inode);
552 err = nilfs_commit_chunk(page, page->mapping, from, to);
553 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
554/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
555 mark_inode_dirty(dir);
556 /* OFFSET_CACHE */
557out_put:
558 nilfs_put_page(page);
559out:
560 return err;
561out_unlock:
562 unlock_page(page);
563 goto out_put;
564}
565
566/*
567 * nilfs_delete_entry deletes a directory entry by merging it with the
568 * previous entry. Page is up-to-date. Releases the page.
569 */
570int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
571{
572 struct address_space *mapping = page->mapping;
573 struct inode *inode = mapping->host;
574 char *kaddr = page_address(page);
575 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
576 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
577 struct nilfs_dir_entry *pde = NULL;
578 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
579 int err;
580
581 while ((char *)de < (char *)dir) {
582 if (de->rec_len == 0) {
583 nilfs_error(inode->i_sb, __func__,
584 "zero-length directory entry");
585 err = -EIO;
586 goto out;
587 }
588 pde = de;
589 de = nilfs_next_entry(de);
590 }
591 if (pde)
592 from = (char *)pde - (char *)page_address(page);
593 lock_page(page);
594 err = nilfs_prepare_chunk(page, mapping, from, to);
595 BUG_ON(err);
596 if (pde)
597 pde->rec_len = cpu_to_le16(to - from);
598 dir->inode = 0;
599 err = nilfs_commit_chunk(page, mapping, from, to);
600 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
601/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
602 mark_inode_dirty(inode);
603out:
604 nilfs_put_page(page);
605 return err;
606}
607
608/*
609 * Set the first fragment of directory.
610 */
611int nilfs_make_empty(struct inode *inode, struct inode *parent)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page = grab_cache_page(mapping, 0);
615 unsigned chunk_size = nilfs_chunk_size(inode);
616 struct nilfs_dir_entry *de;
617 int err;
618 void *kaddr;
619
620 if (!page)
621 return -ENOMEM;
622
623 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
624 if (unlikely(err)) {
625 unlock_page(page);
626 goto fail;
627 }
628 kaddr = kmap_atomic(page, KM_USER0);
629 memset(kaddr, 0, chunk_size);
630 de = (struct nilfs_dir_entry *)kaddr;
631 de->name_len = 1;
632 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
633 memcpy(de->name, ".\0\0", 4);
634 de->inode = cpu_to_le64(inode->i_ino);
635 nilfs_set_de_type(de, inode);
636
637 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
638 de->name_len = 2;
639 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
640 de->inode = cpu_to_le64(parent->i_ino);
641 memcpy(de->name, "..\0", 4);
642 nilfs_set_de_type(de, inode);
643 kunmap_atomic(kaddr, KM_USER0);
644 err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
645fail:
646 page_cache_release(page);
647 return err;
648}
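/*
 * Resulting layout of the first chunk, assuming the usual ext2-style
 * record length formula where NILFS_DIR_REC_LEN(1) == 16:
 *
 *	offset  0: "."  rec_len = 16, inode = this directory
 *	offset 16: ".." rec_len = chunk_size - 16, inode = parent
 *
 * The ".." record absorbs the remainder of the chunk, so later
 * nilfs_add_link() calls can split it to place new entries.
 */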
649
650/*
651 * routine to check that the specified directory is empty (for rmdir)
652 */
653int nilfs_empty_dir(struct inode *inode)
654{
655 struct page *page = NULL;
656 unsigned long i, npages = dir_pages(inode);
657
658 for (i = 0; i < npages; i++) {
659 char *kaddr;
660 struct nilfs_dir_entry *de;
661
662 page = nilfs_get_page(inode, i);
663 if (IS_ERR(page))
664 continue;
665
666 kaddr = page_address(page);
667 de = (struct nilfs_dir_entry *)kaddr;
668 kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
669
670 while ((char *)de <= kaddr) {
671 if (de->rec_len == 0) {
672 nilfs_error(inode->i_sb, __func__,
673 "zero-length directory entry "
674 "(kaddr=%p, de=%p)\n", kaddr, de);
675 goto not_empty;
676 }
677 if (de->inode != 0) {
678 /* check for . and .. */
679 if (de->name[0] != '.')
680 goto not_empty;
681 if (de->name_len > 2)
682 goto not_empty;
683 if (de->name_len < 2) {
684 if (de->inode !=
685 cpu_to_le64(inode->i_ino))
686 goto not_empty;
687 } else if (de->name[1] != '.')
688 goto not_empty;
689 }
690 de = nilfs_next_entry(de);
691 }
692 nilfs_put_page(page);
693 }
694 return 1;
695
696not_empty:
697 nilfs_put_page(page);
698 return 0;
699}
700
701struct file_operations nilfs_dir_operations = {
702 .llseek = generic_file_llseek,
703 .read = generic_read_dir,
704 .readdir = nilfs_readdir,
705 .unlocked_ioctl = nilfs_ioctl,
706#ifdef CONFIG_COMPAT
707 .compat_ioctl = nilfs_ioctl,
708#endif /* CONFIG_COMPAT */
709 .fsync = nilfs_sync_file,
710
711};
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 000000000000..c6379e482781
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
1/*
2 * direct.c - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/errno.h>
24#include "nilfs.h"
25#include "page.h"
26#include "direct.h"
27#include "alloc.h"
28
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{
31 return (__le64 *)
32 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
33}
34
35static inline __u64
36nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
37{
38 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
39}
40
41static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
42 __u64 key, __u64 ptr)
43{
44 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
45}
46
47static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
48 __u64 key, int level, __u64 *ptrp)
49{
50 struct nilfs_direct *direct;
51 __u64 ptr;
52
53 direct = (struct nilfs_direct *)bmap;
54 if ((key > NILFS_DIRECT_KEY_MAX) ||
55 (level != 1) || /* XXX: use macro for level 1 */
56 ((ptr = nilfs_direct_get_ptr(direct, key)) ==
57 NILFS_BMAP_INVALID_PTR))
58 return -ENOENT;
59
60 if (ptrp != NULL)
61 *ptrp = ptr;
62 return 0;
63}
64
65static __u64
66nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
67{
68 __u64 ptr;
69
70 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
71 if (ptr != NILFS_BMAP_INVALID_PTR)
72 /* sequential access */
73 return ptr;
74 else
75 /* block group */
76 return nilfs_bmap_find_target_in_group(&direct->d_bmap);
77}
78
79static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
80 __u64 key, __u64 ptr)
81{
82 direct->d_bmap.b_last_allocated_key = key;
83 direct->d_bmap.b_last_allocated_ptr = ptr;
84}
85
86static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
87 __u64 key,
88 union nilfs_bmap_ptr_req *req,
89 struct nilfs_bmap_stats *stats)
90{
91 int ret;
92
93 if (direct->d_ops->dop_find_target != NULL)
94 req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
95 ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
96 req);
97 if (ret < 0)
98 return ret;
99
100 stats->bs_nblocks = 1;
101 return 0;
102}
103
104static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
105 union nilfs_bmap_ptr_req *req,
106 __u64 key, __u64 ptr)
107{
108 struct buffer_head *bh;
109
110 /* ptr must be a pointer to a buffer head. */
111 bh = (struct buffer_head *)((unsigned long)ptr);
112 set_buffer_nilfs_volatile(bh);
113
114 if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
115 direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
116 &direct->d_bmap, req);
117 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
118
119 if (!nilfs_bmap_dirty(&direct->d_bmap))
120 nilfs_bmap_set_dirty(&direct->d_bmap);
121
122 if (direct->d_ops->dop_set_target != NULL)
123 direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
124}
125
126static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
127{
128 struct nilfs_direct *direct;
129 union nilfs_bmap_ptr_req req;
130 struct nilfs_bmap_stats stats;
131 int ret;
132
133 direct = (struct nilfs_direct *)bmap;
134 if (key > NILFS_DIRECT_KEY_MAX)
135 return -ENOENT;
136 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
137 return -EEXIST;
138
139 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
140 if (ret < 0)
141 return ret;
142 nilfs_direct_commit_insert(direct, &req, key, ptr);
143 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
144
145 return 0;
146}
147
148static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
149 union nilfs_bmap_ptr_req *req,
150 __u64 key,
151 struct nilfs_bmap_stats *stats)
152{
153 int ret;
154
155 if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
156 req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
157 ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
158 &direct->d_bmap, req);
159 if (ret < 0)
160 return ret;
161 }
162
163 stats->bs_nblocks = 1;
164 return 0;
165}
166
167static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
168 union nilfs_bmap_ptr_req *req,
169 __u64 key)
170{
171 if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
172 direct->d_bmap.b_pops->bpop_commit_end_ptr(
173 &direct->d_bmap, req);
174 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
175}
176
177static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
178{
179 struct nilfs_direct *direct;
180 union nilfs_bmap_ptr_req req;
181 struct nilfs_bmap_stats stats;
182 int ret;
183
184 direct = (struct nilfs_direct *)bmap;
185 if ((key > NILFS_DIRECT_KEY_MAX) ||
186 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
187 return -ENOENT;
188
189 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
190 if (ret < 0)
191 return ret;
192 nilfs_direct_commit_delete(direct, &req, key);
193 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
194
195 return 0;
196}
197
198static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
199{
200 struct nilfs_direct *direct;
201 __u64 key, lastkey;
202
203 direct = (struct nilfs_direct *)bmap;
204 lastkey = NILFS_DIRECT_KEY_MAX + 1;
205 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
206 if (nilfs_direct_get_ptr(direct, key) !=
207 NILFS_BMAP_INVALID_PTR)
208 lastkey = key;
209
210 if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
211 return -ENOENT;
212
213 *keyp = lastkey;
214
215 return 0;
216}
217
218static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
219{
220 return key > NILFS_DIRECT_KEY_MAX;
221}
222
223static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
224 __u64 *keys, __u64 *ptrs, int nitems)
225{
226 struct nilfs_direct *direct;
227 __u64 key;
228 __u64 ptr;
229 int n;
230
231 direct = (struct nilfs_direct *)bmap;
232 if (nitems > NILFS_DIRECT_NBLOCKS)
233 nitems = NILFS_DIRECT_NBLOCKS;
234 n = 0;
235 for (key = 0; key < nitems; key++) {
236 ptr = nilfs_direct_get_ptr(direct, key);
237 if (ptr != NILFS_BMAP_INVALID_PTR) {
238 keys[n] = key;
239 ptrs[n] = ptr;
240 n++;
241 }
242 }
243 return n;
244}
245
246int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
247 __u64 key, __u64 *keys, __u64 *ptrs,
248 int n, __u64 low, __u64 high)
249{
250 struct nilfs_direct *direct;
251 __le64 *dptrs;
252 int ret, i, j;
253
254 /* no need to allocate any resource for conversion */
255
256 /* delete */
257 ret = bmap->b_ops->bop_delete(bmap, key);
258 if (ret < 0)
259 return ret;
260
261 /* free resources */
262 if (bmap->b_ops->bop_clear != NULL)
263 bmap->b_ops->bop_clear(bmap);
264
265 /* convert */
266 direct = (struct nilfs_direct *)bmap;
267 dptrs = nilfs_direct_dptrs(direct);
268 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
269 if ((j < n) && (i == keys[j])) {
270 dptrs[i] = (i != key) ?
271 nilfs_bmap_ptr_to_dptr(ptrs[j]) :
272 NILFS_BMAP_INVALID_PTR;
273 j++;
274 } else
275 dptrs[i] = NILFS_BMAP_INVALID_PTR;
276 }
277
278 nilfs_direct_init(bmap, low, high);
279
280 return 0;
281}
282
283static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
284 struct buffer_head *bh)
285{
286 union nilfs_bmap_ptr_req oldreq, newreq;
287 __u64 key;
288 __u64 ptr;
289 int ret;
290
291 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
292 ptr = nilfs_direct_get_ptr(direct, key);
293 if (!buffer_nilfs_volatile(bh)) {
294 oldreq.bpr_ptr = ptr;
295 newreq.bpr_ptr = ptr;
296 ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
297 &newreq);
298 if (ret < 0)
299 return ret;
300 nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
301 set_buffer_nilfs_volatile(bh);
302 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
303 } else
304 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
305
306 return ret;
307}
308
309static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
310 struct buffer_head *bh)
311{
312 struct nilfs_direct *direct;
313
314 direct = (struct nilfs_direct *)bmap;
315 return (direct->d_ops->dop_propagate != NULL) ?
316 direct->d_ops->dop_propagate(direct, bh) :
317 0;
318}
319
320static int nilfs_direct_assign_v(struct nilfs_direct *direct,
321 __u64 key, __u64 ptr,
322 struct buffer_head **bh,
323 sector_t blocknr,
324 union nilfs_binfo *binfo)
325{
326 union nilfs_bmap_ptr_req req;
327 int ret;
328
329 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
331 &direct->d_bmap, &req);
332 if (ret < 0)
333 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
339
340 return 0;
341}
342
343static int nilfs_direct_assign_p(struct nilfs_direct *direct,
344 __u64 key, __u64 ptr,
345 struct buffer_head **bh,
346 sector_t blocknr,
347 union nilfs_binfo *binfo)
348{
349 nilfs_direct_set_ptr(direct, key, blocknr);
350
351 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
352 binfo->bi_dat.bi_level = 0;
353
354 return 0;
355}
356
357static int nilfs_direct_assign(struct nilfs_bmap *bmap,
358 struct buffer_head **bh,
359 sector_t blocknr,
360 union nilfs_binfo *binfo)
361{
362 struct nilfs_direct *direct;
363 __u64 key;
364 __u64 ptr;
365
366 direct = (struct nilfs_direct *)bmap;
367 key = nilfs_bmap_data_get_key(bmap, *bh);
368 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
369 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
370 (unsigned long long)key);
371 return -EINVAL;
372 }
373 ptr = nilfs_direct_get_ptr(direct, key);
374 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
375 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
376 (unsigned long long)ptr);
377 return -EINVAL;
378 }
379
380 return direct->d_ops->dop_assign(direct, key, ptr, bh,
381 blocknr, binfo);
382}
383
384static const struct nilfs_bmap_operations nilfs_direct_ops = {
385 .bop_lookup = nilfs_direct_lookup,
386 .bop_insert = nilfs_direct_insert,
387 .bop_delete = nilfs_direct_delete,
388 .bop_clear = NULL,
389
390 .bop_propagate = nilfs_direct_propagate,
391
392 .bop_lookup_dirty_buffers = NULL,
393
394 .bop_assign = nilfs_direct_assign,
395 .bop_mark = NULL,
396
397 .bop_last_key = nilfs_direct_last_key,
398 .bop_check_insert = nilfs_direct_check_insert,
399 .bop_check_delete = NULL,
400 .bop_gather_data = nilfs_direct_gather_data,
401};
402
403
404static const struct nilfs_direct_operations nilfs_direct_ops_v = {
405 .dop_find_target = nilfs_direct_find_target_v,
406 .dop_set_target = nilfs_direct_set_target_v,
407 .dop_propagate = nilfs_direct_propagate_v,
408 .dop_assign = nilfs_direct_assign_v,
409};
410
411static const struct nilfs_direct_operations nilfs_direct_ops_p = {
412 .dop_find_target = NULL,
413 .dop_set_target = NULL,
414 .dop_propagate = NULL,
415 .dop_assign = nilfs_direct_assign_p,
416};
417
418int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
419{
420 struct nilfs_direct *direct;
421
422 direct = (struct nilfs_direct *)bmap;
423 bmap->b_ops = &nilfs_direct_ops;
424 bmap->b_low = low;
425 bmap->b_high = high;
426 switch (bmap->b_inode->i_ino) {
427 case NILFS_DAT_INO:
428 direct->d_ops = &nilfs_direct_ops_p;
429 break;
430 default:
431 direct->d_ops = &nilfs_direct_ops_v;
432 break;
433 }
434
435 return 0;
436}
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 000000000000..45d2c5cda812
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
1/*
2 * direct.h - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DIRECT_H
24#define _NILFS_DIRECT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include "bmap.h"
29
30
31struct nilfs_direct;
32
33/**
34 * struct nilfs_direct_operations - direct mapping operation table
35 */
36struct nilfs_direct_operations {
37 __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
38 void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
39 int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
40 int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
41 struct buffer_head **, sector_t,
42 union nilfs_binfo *);
43};
44
45/**
46 * struct nilfs_direct_node - direct node
47 * @dn_flags: flags
48 * @pad: padding
49 */
50struct nilfs_direct_node {
51 __u8 dn_flags;
52 __u8 pad[7];
53};
54
55/**
56 * struct nilfs_direct - direct mapping
57 * @d_bmap: bmap structure
58 * @d_ops: direct mapping operation table
59 */
60struct nilfs_direct {
61 struct nilfs_bmap d_bmap;
62
63 /* direct-mapping-specific members */
64 const struct nilfs_direct_operations *d_ops;
65};
66
67
68#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
69#define NILFS_DIRECT_KEY_MIN 0
70#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
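/*
 * Arithmetic sketch: if NILFS_BMAP_SIZE covers seven __le64 slots (the
 * bmap area embedded in the on-disk inode), one slot is consumed by the
 * nilfs_direct_node header, so NILFS_DIRECT_NBLOCKS == 6 and the direct
 * mapping addresses file blocks 0 through NILFS_DIRECT_KEY_MAX == 5;
 * files that outgrow this range are converted to a B-tree mapping (see
 * nilfs_direct_check_insert and nilfs_direct_delete_and_convert).
 */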
71
72
73int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
74int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
75 __u64 *, int, __u64, __u64);
76
77
78#endif /* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 000000000000..6bd84a0d8238
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
1/*
2 * file.c - NILFS regular file handling primitives including fsync().
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include "nilfs.h"
28#include "segment.h"
29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 /*
33 * Called from fsync() system call
34	 * This is the only entry point that can catch write and sync
35	 * timing for both data blocks and intermediate blocks.
36	 *
37	 * This function should be revised once the writeback function
38	 * is implemented.
39	 */
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 if (!nilfs_inode_dirty(inode))
44 return 0;
45
46 if (datasync)
47 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
48 LLONG_MAX);
49 else
50 err = nilfs_construct_segment(inode->i_sb);
51
52 return err;
53}
54
55static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
56{
57 struct page *page = vmf->page;
58 struct inode *inode = vma->vm_file->f_dentry->d_inode;
59 struct nilfs_transaction_info ti;
60 int ret;
61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
68 unlock_page(page);
69 return VM_FAULT_NOPAGE; /* make the VM retry the fault */
70 }
71
72 /*
73 * check to see if the page is mapped already (no holes)
74 */
75 if (PageMappedToDisk(page)) {
76 unlock_page(page);
77 goto mapped;
78 }
79 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head;
81 int fully_mapped = 1;
82
83 bh = head = page_buffers(page);
84 do {
85 if (!buffer_mapped(bh)) {
86 fully_mapped = 0;
87 break;
88 }
89 } while (bh = bh->b_this_page, bh != head);
90
91 if (fully_mapped) {
92 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped;
95 }
96 }
97 unlock_page(page);
98
99 /*
100 * fill hole blocks
101 */
102 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
103 /* never returns -ENOMEM, but may return -ENOSPC */
104 if (unlikely(ret))
105 return VM_FAULT_SIGBUS;
106
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) {
109 nilfs_transaction_abort(inode->i_sb);
110 return ret;
111 }
112 nilfs_transaction_commit(inode->i_sb);
113
114 mapped:
115 SetPageChecked(page);
116 wait_on_page_writeback(page);
117 return 0;
118}
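/*
 * A minimal sketch of the transaction bracket used above, the standard
 * NILFS write-path pattern; do_update() is a hypothetical placeholder
 * for the caller's own modification step:
 *
 *	struct nilfs_transaction_info ti;
 *	int err = nilfs_transaction_begin(sb, &ti, 1);
 *	if (unlikely(err))
 *		return err;	(may fail with -ENOSPC, never -ENOMEM)
 *	err = do_update();
 *	if (unlikely(err))
 *		nilfs_transaction_abort(sb);
 *	else
 *		nilfs_transaction_commit(sb);	(never fails)
 *	return err;
 */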
119
120struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite,
123};
124
125static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128 vma->vm_ops = &nilfs_file_vm_ops;
129 vma->vm_flags |= VM_CAN_NONLINEAR;
130 return 0;
131}
132
133/*
134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem.
136 */
137struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek,
139 .read = do_sync_read,
140 .write = do_sync_write,
141 .aio_read = generic_file_aio_read,
142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl,
146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open,
149 /* .release = nilfs_release_file, */
150 .fsync = nilfs_sync_file,
151 .splice_read = generic_file_splice_read,
152};
153
154struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission,
158};
159
160/* end of file */
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 000000000000..93383c5cee90
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_clear_dirty_pages(mapping);
65 nilfs_copy_back_pages(mapping, gmapping);
66 /* note: mdt dirty flags should be cleared by segctor. */
67
68 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
69 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
70
71 up_write(&NILFS_MDT(dat)->mi_sem);
72}
73
74void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
75{
76 struct inode *gcdat = nilfs->ns_gc_dat;
77 struct nilfs_inode_info *gii = NILFS_I(gcdat);
78
79 gcdat->i_state = I_CLEAR;
80 gii->i_flags = 0;
81
82 truncate_inode_pages(gcdat->i_mapping, 0);
83 truncate_inode_pages(&gii->i_btnode_cache, 0);
84}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
1/*
2 * gcinode.c - dummy inodes to buffer blocks for garbage collection
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
23 *
24 */
25/*
26 * This file implements the cache of on-disk blocks to be moved in
27 * garbage collection. The disk blocks are held with dummy inodes
28 * (called gcinodes), and this file provides the lookup function for
29 * the dummy inodes and their buffer read function.
30 *
31 * Since NILFS2 keeps multiple checkpoints/snapshots across GC, it
32 * has to handle blocks that belong to the same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) takes a checkpoint number argument as well
36 * as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes are released each time
39 * after they are copied to a new log. Dirty blocks made on the
40 * current generation never overlap the blocks to be moved by GC,
41 * because the dirty blocks make a new generation; rather, they must be
42 * written individually.
43 */
44
45#include <linux/buffer_head.h>
46#include <linux/mpage.h>
47#include <linux/hash.h>
48#include <linux/swap.h>
49#include "nilfs.h"
50#include "page.h"
51#include "mdt.h"
52#include "dat.h"
53#include "ifile.h"
54
55static struct address_space_operations def_gcinode_aops = {};
56/* XXX need def_gcinode_iops/fops? */
57
58/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
60 * @inode - gc inode
61 * @blkoff - dummy offset treated as the key for the page cache
62 * @pbn - physical block number of the block
63 * @vbn - virtual block number of the block, 0 for non-virtual block
64 * @out_bh - indirect pointer to a buffer_head struct to receive the results
65 *
66 * Description: nilfs_gccache_submit_read_data() registers the data buffer
67 * specified by @pbn to the GC pagecache with the key @blkoff.
68 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
69 *
70 * Return Value: On success, 0 is returned. On error, one of the following
71 * negative error codes is returned.
72 *
73 * %-EIO - I/O error.
74 *
75 * %-ENOMEM - Insufficient amount of memory available.
76 *
77 * %-ENOENT - The block specified with @pbn does not exist.
78 */
79int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
80 sector_t pbn, __u64 vbn,
81 struct buffer_head **out_bh)
82{
83 struct buffer_head *bh;
84 int err;
85
86 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
87 if (unlikely(!bh))
88 return -ENOMEM;
89
90 if (buffer_uptodate(bh))
91 goto out;
92
93 if (pbn == 0) {
94 struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
95 /* use original dat, not gc dat. */
96 err = nilfs_dat_translate(dat_inode, vbn, &pbn);
97 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
98 brelse(bh);
99 goto failed;
100 }
101 }
102
103 lock_buffer(bh);
104 if (buffer_uptodate(bh)) {
105 unlock_buffer(bh);
106 goto out;
107 }
108
109 if (!buffer_mapped(bh)) {
110 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
111 set_buffer_mapped(bh);
112 }
113 bh->b_blocknr = pbn;
114 bh->b_end_io = end_buffer_read_sync;
115 get_bh(bh);
116 submit_bh(READ, bh);
117 if (vbn)
118 bh->b_blocknr = vbn;
119 out:
120 err = 0;
121 *out_bh = bh;
122
123 failed:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 return err;
127}
128
129/*
130 * nilfs_gccache_submit_read_node() - add node buffer and submit read request
131 * @inode - gc inode
132 * @pbn - physical block number for the block
133 * @vbn - virtual block number for the block
134 * @out_bh - indirect pointer to a buffer_head struct to receive the results
135 *
136 * Description: nilfs_gccache_submit_read_node() registers the node buffer
137 * specified by @vbn to the GC pagecache. @pbn can be supplied by the
138 * caller to avoid translation of the disk block address.
139 *
140 * Return Value: On success, 0 is returned. On error, one of the following
141 * negative error codes is returned.
142 *
143 * %-EIO - I/O error.
144 *
145 * %-ENOMEM - Insufficient amount of memory available.
146 */
147int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
148 __u64 vbn, struct buffer_head **out_bh)
149{
150 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
151 vbn ? : pbn, pbn, out_bh, 0);
152 if (ret == -EEXIST) /* internal code (cache hit) */
153 ret = 0;
154 return ret;
155}
156
157int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
158{
159 wait_on_buffer(bh);
160 if (!buffer_uptodate(bh))
161 return -EIO;
162 if (buffer_dirty(bh))
163 return -EEXIST;
164
165 if (buffer_nilfs_node(bh))
166 nilfs_btnode_mark_dirty(bh);
167 else
168 nilfs_mdt_mark_buffer_dirty(bh);
169 return 0;
170}
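/*
 * A minimal sketch of how the helpers above combine in the GC read
 * protocol (error handling elided; gc_inode, blkoff, pbn and vbn stand
 * for the caller's values):
 *
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_gccache_submit_read_data(gc_inode, blkoff, pbn, vbn,
 *					     &bh);
 *	if (!err) {
 *		err = nilfs_gccache_wait_and_mark_dirty(bh);
 *		(here -EEXIST means the block was already dirty)
 *		brelse(bh);
 *	}
 */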
171
172/*
173 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
174 * @nilfs - the_nilfs
175 *
176 * Return Value: On success, 0.
177 * On error, a negative error code is returned.
178 */
179int nilfs_init_gccache(struct the_nilfs *nilfs)
180{
181 int loop;
182
183 BUG_ON(nilfs->ns_gc_inodes_h);
184
185 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
186
187 nilfs->ns_gc_inodes_h =
188 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
189 GFP_NOFS);
190 if (nilfs->ns_gc_inodes_h == NULL)
191 return -ENOMEM;
192
193 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
194 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
195 return 0;
196}
197
198/*
199 * nilfs_destroy_gccache() - free gc_inode hash table
200 * @nilfs - the nilfs
201 */
202void nilfs_destroy_gccache(struct the_nilfs *nilfs)
203{
204 if (nilfs->ns_gc_inodes_h) {
205 nilfs_remove_all_gcinode(nilfs);
206 kfree(nilfs->ns_gc_inodes_h);
207 nilfs->ns_gc_inodes_h = NULL;
208 }
209}
210
211static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
212 __u64 cno)
213{
214 struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
215 struct nilfs_inode_info *ii;
216
217 if (!inode)
218 return NULL;
219
220 inode->i_op = NULL;
221 inode->i_fop = NULL;
222 inode->i_mapping->a_ops = &def_gcinode_aops;
223
224 ii = NILFS_I(inode);
225 ii->i_cno = cno;
226 ii->i_flags = 0;
227 ii->i_state = 1 << NILFS_I_GCINODE;
228 ii->i_bh = NULL;
229 nilfs_bmap_init_gc(ii->i_bmap);
230
231 return inode;
232}
233
234static unsigned long ihash(ino_t ino, __u64 cno)
235{
236 return hash_long((unsigned long)((ino << 2) + cno),
237 NILFS_GCINODE_HASH_BITS);
238}
239
240/*
241 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
242 */
243struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
244{
245 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
246 struct hlist_node *node;
247 struct inode *inode;
248
249 hlist_for_each_entry(inode, node, head, i_hash) {
250 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
251 return inode;
252 }
253
254 inode = alloc_gcinode(nilfs, ino, cno);
255 if (likely(inode)) {
256 hlist_add_head(&inode->i_hash, head);
257 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
258 }
259 return inode;
260}
261
262/*
263 * nilfs_clear_gcinode() - clear and free a gc inode
264 */
265void nilfs_clear_gcinode(struct inode *inode)
266{
267 nilfs_mdt_clear(inode);
268 nilfs_mdt_destroy(inode);
269}
270
271/*
272 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
273 */
274void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
275{
276 struct hlist_head *head = nilfs->ns_gc_inodes_h;
277 struct hlist_node *node, *n;
278 struct inode *inode;
279 int loop;
280
281 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
282 hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
283 hlist_del_init(&inode->i_hash);
284 list_del_init(&NILFS_I(inode)->i_dirty);
285 nilfs_clear_gcinode(inode); /* might sleep */
286 }
287 }
288}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 000000000000..de86401f209f
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
1/*
2 * ifile.c - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "ifile.h"
31
32/**
33 * nilfs_ifile_create_inode - create a new disk inode
34 * @ifile: ifile inode
35 * @out_ino: pointer to a variable to store inode number
36 * @out_bh: buffer_head contains newly allocated disk inode
37 *
38 * Return Value: On success, 0 is returned, the newly allocated inode
39 * number is stored in the place pointed to by @out_ino, and a buffer_head
40 * pointer that contains the newly allocated disk inode structure is
41 * stored in the place pointed to by @out_bh.
42 * On error, one of the following negative error codes is returned.
43 *
44 * %-EIO - I/O error.
45 *
46 * %-ENOMEM - Insufficient amount of memory available.
47 *
48 * %-ENOSPC - No free inode left.
49 */
50int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
51 struct buffer_head **out_bh)
52{
53 struct nilfs_palloc_req req;
54 int ret;
55
56	req.pr_entry_nr = 0;	/* 0 means: find a free inode from the
57				   beginning of a group. dull code!! */
58 req.pr_entry_bh = NULL;
59
60 ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
61 if (!ret) {
62 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
63 &req.pr_entry_bh);
64 if (ret < 0)
65 nilfs_palloc_abort_alloc_entry(ifile, &req);
66 }
67 if (ret < 0) {
68 brelse(req.pr_entry_bh);
69 return ret;
70 }
71 nilfs_palloc_commit_alloc_entry(ifile, &req);
72 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
73 nilfs_mdt_mark_dirty(ifile);
74 *out_ino = (ino_t)req.pr_entry_nr;
75 *out_bh = req.pr_entry_bh;
76 return 0;
77}
78
79/**
80 * nilfs_ifile_delete_inode - delete a disk inode
81 * @ifile: ifile inode
82 * @ino: inode number
83 *
84 * Return Value: On success, 0 is returned. On error, one of the following
85 * negative error codes is returned.
86 *
87 * %-EIO - I/O error.
88 *
89 * %-ENOMEM - Insufficient amount of memory available.
90 *
91 * %-ENOENT - The inode number @ino has not been allocated.
92 */
93int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
94{
95 struct nilfs_palloc_req req = {
96 .pr_entry_nr = ino, .pr_entry_bh = NULL
97 };
98 struct nilfs_inode *raw_inode;
99 void *kaddr;
100 int ret;
101
102 ret = nilfs_palloc_prepare_free_entry(ifile, &req);
103 if (!ret) {
104 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
105 &req.pr_entry_bh);
106 if (ret < 0)
107 nilfs_palloc_abort_free_entry(ifile, &req);
108 }
109 if (ret < 0) {
110 brelse(req.pr_entry_bh);
111 return ret;
112 }
113
114 kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
115 raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
116 req.pr_entry_bh, kaddr);
117 raw_inode->i_flags = 0;
118 kunmap_atomic(kaddr, KM_USER0);
119
120 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
121 brelse(req.pr_entry_bh);
122
123 nilfs_palloc_commit_free_entry(ifile, &req);
124
125 return 0;
126}
127
128int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
129 struct buffer_head **out_bh)
130{
131 struct super_block *sb = ifile->i_sb;
132 int err;
133
134 if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
135 nilfs_error(sb, __func__, "bad inode number: %lu",
136 (unsigned long) ino);
137 return -EINVAL;
138 }
139
140 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
141 if (unlikely(err)) {
142 if (err == -EINVAL)
143 nilfs_error(sb, __func__, "ifile is broken");
144 else
145 nilfs_warning(sb, __func__,
146 "unable to read inode: %lu",
147 (unsigned long) ino);
148 }
149 return err;
150}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 000000000000..5d30a35679b5
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
1/*
2 * ifile.h - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24
25#ifndef _NILFS_IFILE_H
26#define _NILFS_IFILE_H
27
28#include <linux/fs.h>
29#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h>
31#include "mdt.h"
32#include "alloc.h"
33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35
36static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
38{
39 void *kaddr = kmap(ibh->b_page);
40 return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
41}
42
43static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
44 struct buffer_head *ibh)
45{
46 kunmap(ibh->b_page);
47}
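/*
 * A minimal sketch of the intended map/unmap pairing when reading one
 * raw inode from the ifile (error handling elided):
 *
 *	struct buffer_head *ibh;
 *	struct nilfs_inode *raw_inode;
 *
 *	if (nilfs_ifile_get_inode_block(ifile, ino, &ibh) == 0) {
 *		raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
 *		... read fields of *raw_inode ...
 *		nilfs_ifile_unmap_inode(ifile, ino, ibh);
 *		brelse(ibh);
 *	}
 */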
48
49int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
50int nilfs_ifile_delete_inode(struct inode *, ino_t);
51int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
52
53#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 000000000000..49ab4a49bb4f
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
1/*
2 * inode.c - NILFS inode operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/mpage.h>
26#include <linux/writeback.h>
27#include <linux/uio.h>
28#include "nilfs.h"
29#include "segment.h"
30#include "page.h"
31#include "mdt.h"
32#include "cpfile.h"
33#include "ifile.h"
34
35
36/**
37 * nilfs_get_block() - get a file block on the filesystem (callback function)
38 * @inode - inode struct of the target file
39 * @blkoff - file block number
40 * @bh_result - buffer head to be mapped on
41 * @create - indicate whether allocating the block or not when it has not
42 * been allocated yet.
43 *
44 * This function does not issue actual read request of the specified data
45 * block. It is done by VFS.
46 * Bulk read for direct-io is not supported yet. (should be supported)
47 */
48int nilfs_get_block(struct inode *inode, sector_t blkoff,
49 struct buffer_head *bh_result, int create)
50{
51 struct nilfs_inode_info *ii = NILFS_I(inode);
52 unsigned long blknum = 0;
53 int err = 0, ret;
54 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
55
56 /* This exclusion control is a workaround; should be revised */
57 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
58 ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
59 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
60 if (ret == 0) { /* found */
61 map_bh(bh_result, inode->i_sb, blknum);
62 goto out;
63 }
64 /* data block was not found */
65 if (ret == -ENOENT && create) {
66 struct nilfs_transaction_info ti;
67
68 bh_result->b_blocknr = 0;
69 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
70 if (unlikely(err))
71 goto out;
72 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
73 (unsigned long)bh_result);
74 if (unlikely(err != 0)) {
75 if (err == -EEXIST) {
76 /*
77 * The get_block() function could be called
78 * from multiple callers for an inode.
79 * However, the page having this block must
80 * be locked in this case.
81 */
82 printk(KERN_WARNING
83 "nilfs_get_block: a race condition "
84 "while inserting a data block. "
85 "(inode number=%lu, file block "
86 "offset=%llu)\n",
87 inode->i_ino,
88 (unsigned long long)blkoff);
89 err = 0;
90 } else if (err == -EINVAL) {
91 nilfs_error(inode->i_sb, __func__,
92 "broken bmap (inode=%lu)\n",
93 inode->i_ino);
94 err = -EIO;
95 }
96 nilfs_transaction_abort(inode->i_sb);
97 goto out;
98 }
99 nilfs_transaction_commit(inode->i_sb); /* never fails */
100 /* Error handling should be made more detailed */
101 set_buffer_new(bh_result);
102 map_bh(bh_result, inode->i_sb, 0); /* disk block number must be
103 changed to the proper value later */
104 } else if (ret == -ENOENT) {
105 /* "not found" is not an error here (e.g. a hole); return
106 without setting the mapped state flag. */
107 ;
108 } else {
109 err = ret;
110 }
111
112 out:
113 return err;
114}
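/*
 * A minimal sketch of the get_block contract above from a caller's point
 * of view; demo_probe_block() is illustrative only and not part of this
 * patch. On return, bh_result is either mapped to a disk block or left
 * unmapped for a hole.
 */
static int demo_probe_block(struct inode *inode, sector_t blkoff)
{
	struct buffer_head bh;
	int err;

	memset(&bh, 0, sizeof(bh));
	bh.b_size = 1 << inode->i_blkbits;	/* map at most one block */
	err = nilfs_get_block(inode, blkoff, &bh, 0);	/* create == 0 */
	if (err)
		return err;
	return buffer_mapped(&bh);	/* 0: hole, 1: mapped */
}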
115
116/**
117 * nilfs_readpage() - implement the readpage() method of the nilfs_aops
118 * address_space_operations
119 * @file: file struct of the file to be read
120 * @page: the page to be read
121 */
122static int nilfs_readpage(struct file *file, struct page *page)
123{
124 return mpage_readpage(page, nilfs_get_block);
125}
126
127/**
128 * nilfs_readpages() - implement the readpages() method of the nilfs_aops
129 * address_space_operations
130 * @file: file struct of the file to be read
131 * @mapping: address_space struct used for reading multiple pages
132 * @pages: the pages to be read
133 * @nr_pages: number of pages to be read
134 */
135static int nilfs_readpages(struct file *file, struct address_space *mapping,
136 struct list_head *pages, unsigned nr_pages)
137{
138 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
139}
140
141static int nilfs_writepages(struct address_space *mapping,
142 struct writeback_control *wbc)
143{
144 struct inode *inode = mapping->host;
145 int err = 0;
146
147 if (wbc->sync_mode == WB_SYNC_ALL)
148 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
149 wbc->range_start,
150 wbc->range_end);
151 return err;
152}
153
154static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
155{
156 struct inode *inode = page->mapping->host;
157 int err;
158
159 redirty_page_for_writepage(wbc, page);
160 unlock_page(page);
161
162 if (wbc->sync_mode == WB_SYNC_ALL) {
163 err = nilfs_construct_segment(inode->i_sb);
164 if (unlikely(err))
165 return err;
166 } else if (wbc->for_reclaim)
167 nilfs_flush_segment(inode->i_sb, inode->i_ino);
168
169 return 0;
170}
171
172static int nilfs_set_page_dirty(struct page *page)
173{
174 int ret = __set_page_dirty_buffers(page);
175
176 if (ret) {
177 struct inode *inode = page->mapping->host;
178 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
179 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
180
181 nilfs_set_file_dirty(sbi, inode, nr_dirty);
182 }
183 return ret;
184}
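/*
 * The nr_dirty computation above is simply blocks-per-page. A worked
 * example with assumed sizes (not from this patch): with 4 KiB pages
 * (PAGE_SHIFT == 12) and 1 KiB blocks (i_blkbits == 10),
 * nr_dirty = 1 << (12 - 10) = 4, i.e. four buffers are accounted to the
 * dirty-file bookkeeping for each newly dirtied page.
 */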
185
186static int nilfs_write_begin(struct file *file, struct address_space *mapping,
187 loff_t pos, unsigned len, unsigned flags,
188 struct page **pagep, void **fsdata)
190{
191 struct inode *inode = mapping->host;
192 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
193
194 if (unlikely(err))
195 return err;
196
197 *pagep = NULL;
198 err = block_write_begin(file, mapping, pos, len, flags, pagep,
199 fsdata, nilfs_get_block);
200 if (unlikely(err))
201 nilfs_transaction_abort(inode->i_sb);
202 return err;
203}
204
205static int nilfs_write_end(struct file *file, struct address_space *mapping,
206 loff_t pos, unsigned len, unsigned copied,
207 struct page *page, void *fsdata)
208{
209 struct inode *inode = mapping->host;
210 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
211 unsigned nr_dirty;
212 int err;
213
214 nr_dirty = nilfs_page_count_clean_buffers(page, start,
215 start + copied);
216 copied = generic_write_end(file, mapping, pos, len, copied, page,
217 fsdata);
218 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
219 err = nilfs_transaction_commit(inode->i_sb);
220 return err ? : copied;
221}
222
223static ssize_t
224nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
225 loff_t offset, unsigned long nr_segs)
226{
227 struct file *file = iocb->ki_filp;
228 struct inode *inode = file->f_mapping->host;
229 ssize_t size;
230
231 if (rw == WRITE)
232 return 0;
233
234 /* Needs synchronization with the cleaner */
235 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
236 offset, nr_segs, nilfs_get_block, NULL);
237 return size;
238}
239
240struct address_space_operations nilfs_aops = {
241 .writepage = nilfs_writepage,
242 .readpage = nilfs_readpage,
243 /* .sync_page = nilfs_sync_page, */
244 .writepages = nilfs_writepages,
245 .set_page_dirty = nilfs_set_page_dirty,
246 .readpages = nilfs_readpages,
247 .write_begin = nilfs_write_begin,
248 .write_end = nilfs_write_end,
249 /* .releasepage = nilfs_releasepage, */
250 .invalidatepage = block_invalidatepage,
251 .direct_IO = nilfs_direct_IO,
252};
253
254struct inode *nilfs_new_inode(struct inode *dir, int mode)
255{
256 struct super_block *sb = dir->i_sb;
257 struct nilfs_sb_info *sbi = NILFS_SB(sb);
258 struct inode *inode;
259 struct nilfs_inode_info *ii;
260 int err = -ENOMEM;
261 ino_t ino;
262
263 inode = new_inode(sb);
264 if (unlikely(!inode))
265 goto failed;
266
267 mapping_set_gfp_mask(inode->i_mapping,
268 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
269
270 ii = NILFS_I(inode);
271 ii->i_state = 1 << NILFS_I_NEW;
272
273 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
274 if (unlikely(err))
275 goto failed_ifile_create_inode;
276 /* the reference count of i_bh is inherited from nilfs_mdt_read_block() */
277
278 atomic_inc(&sbi->s_inodes_count);
279
280 inode->i_uid = current_fsuid();
281 if (dir->i_mode & S_ISGID) {
282 inode->i_gid = dir->i_gid;
283 if (S_ISDIR(mode))
284 mode |= S_ISGID;
285 } else
286 inode->i_gid = current_fsgid();
287
288 inode->i_mode = mode;
289 inode->i_ino = ino;
290 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
291
292 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
293 err = nilfs_bmap_read(ii->i_bmap, NULL);
294 if (err < 0)
295 goto failed_bmap;
296
297 set_bit(NILFS_I_BMAP, &ii->i_state);
298 /* No lock is needed; iget() ensures it. */
299 }
300
301 ii->i_flags = NILFS_I(dir)->i_flags;
302 if (S_ISLNK(mode))
303 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
304 if (!S_ISDIR(mode))
305 ii->i_flags &= ~NILFS_DIRSYNC_FL;
306
307 /* ii->i_file_acl = 0; */
308 /* ii->i_dir_acl = 0; */
309 ii->i_dir_start_lookup = 0;
310#ifdef CONFIG_NILFS_FS_POSIX_ACL
311 ii->i_acl = NULL;
312 ii->i_default_acl = NULL;
313#endif
314 ii->i_cno = 0;
315 nilfs_set_inode_flags(inode);
316 spin_lock(&sbi->s_next_gen_lock);
317 inode->i_generation = sbi->s_next_generation++;
318 spin_unlock(&sbi->s_next_gen_lock);
319 insert_inode_hash(inode);
320
321 err = nilfs_init_acl(inode, dir);
322 if (unlikely(err))
323 goto failed_acl; /* never occurs. When nilfs_init_acl() is
324 supported, proper cancellation of the
325 jobs above should be considered */
326
327 mark_inode_dirty(inode);
328 return inode;
329
330 failed_acl:
331 failed_bmap:
332 inode->i_nlink = 0;
333 iput(inode); /* raw_inode will be deleted through
334 generic_delete_inode() */
335 goto failed;
336
337 failed_ifile_create_inode:
338 make_bad_inode(inode);
339 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
340 called */
341 failed:
342 return ERR_PTR(err);
343}
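/*
 * A hypothetical caller sketch for nilfs_new_inode(), which follows the
 * usual ERR_PTR() convention; demo_create() is illustrative only.
 */
static int demo_create(struct inode *dir, int mode)
{
	struct inode *inode = nilfs_new_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);
	/* ... add a directory entry and instantiate the dentry ... */
	iput(inode);
	return 0;
}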
344
345void nilfs_free_inode(struct inode *inode)
346{
347 struct super_block *sb = inode->i_sb;
348 struct nilfs_sb_info *sbi = NILFS_SB(sb);
349
350 clear_inode(inode);
351 /* XXX: check the error code? Is there anything we can do? */
352 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
353 atomic_dec(&sbi->s_inodes_count);
354}
355
356void nilfs_set_inode_flags(struct inode *inode)
357{
358 unsigned int flags = NILFS_I(inode)->i_flags;
359
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL)
363 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL)
365 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
376}
377
378int nilfs_read_inode_common(struct inode *inode,
379 struct nilfs_inode *raw_inode)
380{
381 struct nilfs_inode_info *ii = NILFS_I(inode);
382 int err;
383
384 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
385 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
386 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
387 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
388 inode->i_size = le64_to_cpu(raw_inode->i_size);
389 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
390 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
391 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
392 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
393 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
394 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
395 if (inode->i_nlink == 0 && inode->i_mode == 0)
396 return -EINVAL; /* this inode is deleted */
397
398 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
399 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
400#if 0
401 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
402 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
403 0 : le32_to_cpu(raw_inode->i_dir_acl);
404#endif
405 ii->i_cno = 0;
406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
407
408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
409 S_ISLNK(inode->i_mode)) {
410 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
411 if (err < 0)
412 return err;
413 set_bit(NILFS_I_BMAP, &ii->i_state);
414 /* No lock is needed; iget() ensures it. */
415 }
416 return 0;
417}
418
419static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
420 struct inode *inode)
421{
422 struct nilfs_sb_info *sbi = NILFS_SB(sb);
423 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
424 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode;
426 int err;
427
428 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
429 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
430 if (unlikely(err))
431 goto bad_inode;
432
433 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
434
435#ifdef CONFIG_NILFS_FS_POSIX_ACL
436 NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
437 NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
438#endif
439 err = nilfs_read_inode_common(inode, raw_inode);
440 if (err) goto failed_unmap;
441
442 if (S_ISREG(inode->i_mode)) {
443 inode->i_op = &nilfs_file_inode_operations;
444 inode->i_fop = &nilfs_file_operations;
445 inode->i_mapping->a_ops = &nilfs_aops;
446 } else if (S_ISDIR(inode->i_mode)) {
447 inode->i_op = &nilfs_dir_inode_operations;
448 inode->i_fop = &nilfs_dir_operations;
449 inode->i_mapping->a_ops = &nilfs_aops;
450 } else if (S_ISLNK(inode->i_mode)) {
451 inode->i_op = &nilfs_symlink_inode_operations;
452 inode->i_mapping->a_ops = &nilfs_aops;
453 } else {
454 inode->i_op = &nilfs_special_inode_operations;
455 init_special_inode(
456 inode, inode->i_mode,
457 new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
458 }
459 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
460 brelse(bh);
461 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
462 nilfs_set_inode_flags(inode);
463 return 0;
464
465 failed_unmap:
466 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
467 brelse(bh);
468
469 bad_inode:
470 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
471 return err;
472}
473
474struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
475{
476 struct inode *inode;
477 int err;
478
479 inode = iget_locked(sb, ino);
480 if (unlikely(!inode))
481 return ERR_PTR(-ENOMEM);
482 if (!(inode->i_state & I_NEW))
483 return inode;
484
485 err = __nilfs_read_inode(sb, ino, inode);
486 if (unlikely(err)) {
487 iget_failed(inode);
488 return ERR_PTR(err);
489 }
490 unlock_new_inode(inode);
491 return inode;
492}
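/*
 * nilfs_iget() never returns NULL; failures come back as ERR_PTR()
 * values. A hedged usage sketch; demo_open_ino() is illustrative only.
 */
static int demo_open_ino(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = nilfs_iget(sb, ino);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* -ENOMEM or a read error */
	/* ... use the fully read-in, unlocked inode ... */
	iput(inode);
	return 0;
}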
493
494void nilfs_write_inode_common(struct inode *inode,
495 struct nilfs_inode *raw_inode, int has_bmap)
496{
497 struct nilfs_inode_info *ii = NILFS_I(inode);
498
499 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
500 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
501 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
502 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
503 raw_inode->i_size = cpu_to_le64(inode->i_size);
504 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
505 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
506 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
507 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
508 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
509
510 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
511 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
512
513 if (has_bmap)
514 nilfs_bmap_write(ii->i_bmap, raw_inode);
515 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
516 raw_inode->i_device_code =
517 cpu_to_le64(new_encode_dev(inode->i_rdev));
518 /* When extending the inode format, nilfs->ns_inode_size should be
519 checked before filling in the appended fields */
520}
521
522void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
523{
524 ino_t ino = inode->i_ino;
525 struct nilfs_inode_info *ii = NILFS_I(inode);
526 struct super_block *sb = inode->i_sb;
527 struct nilfs_sb_info *sbi = NILFS_SB(sb);
528 struct nilfs_inode *raw_inode;
529
530 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
531
532 /* The buffer is guarded with lock_buffer() by the caller */
533 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
534 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
535 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
536
537 nilfs_write_inode_common(inode, raw_inode, 0);
538 /* XXX: calling with has_bmap = 0 is a workaround to avoid a
539 bmap deadlock; it delays the update of i_bmap until just
540 before writing */
541 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
542}
543
544#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
545
546static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
547 unsigned long from)
548{
549 unsigned long b;
550 int ret;
551
552 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
553 return;
554 repeat:
555 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
556 if (ret == -ENOENT)
557 return;
558 else if (ret < 0)
559 goto failed;
560
561 if (b < from)
562 return;
563
564 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
565 ret = nilfs_bmap_truncate(ii->i_bmap, b);
566 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
567 if (!ret || (ret == -ENOMEM &&
568 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
569 goto repeat;
570
571 failed:
572 if (ret == -EINVAL)
573 nilfs_error(ii->vfs_inode.i_sb, __func__,
574 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
575 else
576 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 "failed to truncate bmap (ino=%lu, err=%d)",
578 ii->vfs_inode.i_ino, ret);
579}
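/*
 * The loop above trims the bmap from the tail in bounded steps so that a
 * huge truncate cannot stall the system. A worked pass with assumed
 * numbers: truncating to from == 0 when the last key is 40000 proceeds
 * roughly as b = 40000 - 16384 = 23616, then about 7232, then 0, i.e. at
 * most NILFS_MAX_TRUNCATE_BLOCKS blocks per nilfs_bmap_truncate() call,
 * with memory pressure relaxed between passes.
 */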
580
581void nilfs_truncate(struct inode *inode)
582{
583 unsigned long blkoff;
584 unsigned int blocksize;
585 struct nilfs_transaction_info ti;
586 struct super_block *sb = inode->i_sb;
587 struct nilfs_inode_info *ii = NILFS_I(inode);
588
589 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
590 return;
591 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
592 return;
593
594 blocksize = sb->s_blocksize;
595 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
596 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
597
598 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
599
600 nilfs_truncate_bmap(ii, blkoff);
601
602 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
603 if (IS_SYNC(inode))
604 nilfs_set_transaction_flag(NILFS_TI_SYNC);
605
606 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
607 nilfs_transaction_commit(sb);
608 /* This may construct a logical segment and fail in sync mode,
609 but truncate has no return value to propagate the error. */
610}
611
612void nilfs_delete_inode(struct inode *inode)
613{
614 struct nilfs_transaction_info ti;
615 struct super_block *sb = inode->i_sb;
616 struct nilfs_inode_info *ii = NILFS_I(inode);
617
618 if (unlikely(is_bad_inode(inode))) {
619 if (inode->i_data.nrpages)
620 truncate_inode_pages(&inode->i_data, 0);
621 clear_inode(inode);
622 return;
623 }
624 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
625
626 if (inode->i_data.nrpages)
627 truncate_inode_pages(&inode->i_data, 0);
628
629 nilfs_truncate_bmap(ii, 0);
630 nilfs_free_inode(inode);
631 /* nilfs_free_inode() marks inode buffer dirty */
632 if (IS_SYNC(inode))
633 nilfs_set_transaction_flag(NILFS_TI_SYNC);
634 nilfs_transaction_commit(sb);
635 /* This may construct a logical segment and fail in sync mode,
636 but delete_inode has no return value to propagate the error. */
637}
638
639int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
640{
641 struct nilfs_transaction_info ti;
642 struct inode *inode = dentry->d_inode;
643 struct super_block *sb = inode->i_sb;
644 int err;
645
646 err = inode_change_ok(inode, iattr);
647 if (err)
648 return err;
649
650 err = nilfs_transaction_begin(sb, &ti, 0);
651 if (unlikely(err))
652 return err;
653 err = inode_setattr(inode, iattr);
654 if (!err && (iattr->ia_valid & ATTR_MODE))
655 err = nilfs_acl_chmod(inode);
656 if (likely(!err))
657 err = nilfs_transaction_commit(sb);
658 else
659 nilfs_transaction_abort(sb);
660
661 return err;
662}
663
664int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
665 struct buffer_head **pbh)
666{
667 struct nilfs_inode_info *ii = NILFS_I(inode);
668 int err;
669
670 spin_lock(&sbi->s_inode_lock);
671 /* ii->i_bh is protected by s_inode_lock, which is taken here */
672 if (ii->i_bh == NULL) {
673 spin_unlock(&sbi->s_inode_lock);
674 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
675 pbh);
676 if (unlikely(err))
677 return err;
678 spin_lock(&sbi->s_inode_lock);
679 if (ii->i_bh == NULL)
680 ii->i_bh = *pbh;
681 else {
682 brelse(*pbh);
683 *pbh = ii->i_bh;
684 }
685 } else
686 *pbh = ii->i_bh;
687
688 get_bh(*pbh);
689 spin_unlock(&sbi->s_inode_lock);
690 return 0;
691}
692
693int nilfs_inode_dirty(struct inode *inode)
694{
695 struct nilfs_inode_info *ii = NILFS_I(inode);
696 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
697 int ret = 0;
698
699 if (!list_empty(&ii->i_dirty)) {
700 spin_lock(&sbi->s_inode_lock);
701 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
702 test_bit(NILFS_I_BUSY, &ii->i_state);
703 spin_unlock(&sbi->s_inode_lock);
704 }
705 return ret;
706}
707
708int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
709 unsigned nr_dirty)
710{
711 struct nilfs_inode_info *ii = NILFS_I(inode);
712
713 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
714
715 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
716 return 0;
717
718 spin_lock(&sbi->s_inode_lock);
719 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
720 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
721 /* Because this routine may race with nilfs_dispose_list(),
722 we have to check NILFS_I_QUEUED here, too. */
723 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
724 /* This will happen when somebody is freeing
725 this inode. */
726 nilfs_warning(sbi->s_super, __func__,
727 "cannot get inode (ino=%lu)\n",
728 inode->i_ino);
729 spin_unlock(&sbi->s_inode_lock);
730 return -EINVAL; /* NILFS_I_DIRTY may remain while the
731 inode is being freed */
732 }
733 list_del(&ii->i_dirty);
734 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
735 set_bit(NILFS_I_QUEUED, &ii->i_state);
736 }
737 spin_unlock(&sbi->s_inode_lock);
738 return 0;
739}
740
741int nilfs_mark_inode_dirty(struct inode *inode)
742{
743 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
744 struct buffer_head *ibh;
745 int err;
746
747 err = nilfs_load_inode_block(sbi, inode, &ibh);
748 if (unlikely(err)) {
749 nilfs_warning(inode->i_sb, __func__,
750 "failed to reget inode block.\n");
751 return err;
752 }
753 lock_buffer(ibh);
754 nilfs_update_inode(inode, ibh);
755 unlock_buffer(ibh);
756 nilfs_mdt_mark_buffer_dirty(ibh);
757 nilfs_mdt_mark_dirty(sbi->s_ifile);
758 brelse(ibh);
759 return 0;
760}
761
762/**
763 * nilfs_dirty_inode - reflect changes on the given inode to an inode block.
764 * @inode: inode of the file to be registered.
765 *
766 * nilfs_dirty_inode() loads an inode block containing the specified
767 * @inode and copies its data to the corresponding inode entry in the
768 * inode block. This operation is excluded from the segment
769 * construction. This function can be called both as a single operation
770 * and as a part of indivisible file operations.
771 */
772void nilfs_dirty_inode(struct inode *inode)
773{
774 struct nilfs_transaction_info ti;
775
776 if (is_bad_inode(inode)) {
777 nilfs_warning(inode->i_sb, __func__,
778 "tried to mark bad_inode dirty. ignored.\n");
779 dump_stack();
780 return;
781 }
782 nilfs_transaction_begin(inode->i_sb, &ti, 0);
783 nilfs_mark_inode_dirty(inode);
784 nilfs_transaction_commit(inode->i_sb); /* never fails */
785}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000000..108d281ebca5
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,654 @@
1/*
2 * ioctl.c - NILFS ioctl operations.
3 *
4 * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/nilfs2_fs.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "bmap.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35
36
37static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
38 struct nilfs_argv *argv, int dir,
39 ssize_t (*dofunc)(struct the_nilfs *,
40 __u64 *, int,
41 void *, size_t, size_t))
42{
43 void *buf;
44 void __user *base = (void __user *)(unsigned long)argv->v_base;
45 size_t maxmembs, total, n;
46 ssize_t nr;
47 int ret, i;
48 __u64 pos, ppos;
49
50 if (argv->v_nmembs == 0)
51 return 0;
52
53 if (argv->v_size > PAGE_SIZE)
54 return -EINVAL;
55
56 buf = (void *)__get_free_pages(GFP_NOFS, 0);
57 if (unlikely(!buf))
58 return -ENOMEM;
59 maxmembs = PAGE_SIZE / argv->v_size;
60
61 ret = 0;
62 total = 0;
63 pos = argv->v_index;
64 for (i = 0; i < argv->v_nmembs; i += n) {
65 n = (argv->v_nmembs - i < maxmembs) ?
66 argv->v_nmembs - i : maxmembs;
67 if ((dir & _IOC_WRITE) &&
68 copy_from_user(buf, base + argv->v_size * i,
69 argv->v_size * n)) {
70 ret = -EFAULT;
71 break;
72 }
73 ppos = pos;
74 nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
75 n);
76 if (nr < 0) {
77 ret = nr;
78 break;
79 }
80 if ((dir & _IOC_READ) &&
81 copy_to_user(base + argv->v_size * i, buf,
82 argv->v_size * nr)) {
83 ret = -EFAULT;
84 break;
85 }
86 total += nr;
87 if ((size_t)nr < n)
88 break;
89 if (pos == ppos)
90 pos += n;
91 }
92 argv->v_nmembs = total;
93
94 free_pages((unsigned long)buf, 0);
95 return ret;
96}
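/*
 * The copy loop above batches entries through one page. A worked example
 * with assumed sizes (not from this patch): with PAGE_SIZE == 4096 and
 * argv->v_size == 32, maxmembs == 128, so a request with
 * argv->v_nmembs == 300 is served in batches of 128, 128 and 44 entries.
 */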
97
98static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
99 unsigned int cmd, void __user *argp)
100{
101 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
102 struct nilfs_transaction_info ti;
103 struct nilfs_cpmode cpmode;
104 int ret;
105
106 if (!capable(CAP_SYS_ADMIN))
107 return -EPERM;
108 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
109 return -EFAULT;
110
111 nilfs_transaction_begin(inode->i_sb, &ti, 0);
112 ret = nilfs_cpfile_change_cpmode(
113 cpfile, cpmode.cm_cno, cpmode.cm_mode);
114 if (unlikely(ret < 0)) {
115 nilfs_transaction_abort(inode->i_sb);
116 return ret;
117 }
118 nilfs_transaction_commit(inode->i_sb); /* never fails */
119 return ret;
120}
121
122static int
123nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
124 unsigned int cmd, void __user *argp)
125{
126 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
127 struct nilfs_transaction_info ti;
128 __u64 cno;
129 int ret;
130
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133 if (copy_from_user(&cno, argp, sizeof(cno)))
134 return -EFAULT;
135
136 nilfs_transaction_begin(inode->i_sb, &ti, 0);
137 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
138 if (unlikely(ret < 0)) {
139 nilfs_transaction_abort(inode->i_sb);
140 return ret;
141 }
142 nilfs_transaction_commit(inode->i_sb); /* never fails */
143 return ret;
144}
145
146static ssize_t
147nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
148 void *buf, size_t size, size_t nmembs)
149{
150 return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
151 nmembs);
152}
153
154static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp,
155 unsigned int cmd, void __user *argp)
156{
157 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
158 struct nilfs_argv argv;
159 int ret;
160
161 if (copy_from_user(&argv, argp, sizeof(argv)))
162 return -EFAULT;
163
164 down_read(&nilfs->ns_segctor_sem);
165 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
166 nilfs_ioctl_do_get_cpinfo);
167 up_read(&nilfs->ns_segctor_sem);
168 if (ret < 0)
169 return ret;
170
171 if (copy_to_user(argp, &argv, sizeof(argv)))
172 ret = -EFAULT;
173 return ret;
174}
175
176static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
177 unsigned int cmd, void __user *argp)
178{
179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
180 struct nilfs_cpstat cpstat;
181 int ret;
182
183 down_read(&nilfs->ns_segctor_sem);
184 ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
185 up_read(&nilfs->ns_segctor_sem);
186 if (ret < 0)
187 return ret;
188
189 if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
190 ret = -EFAULT;
191 return ret;
192}
193
194static ssize_t
195nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
196 void *buf, size_t size, size_t nmembs)
197{
198 return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
199}
200
201static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp,
202 unsigned int cmd, void __user *argp)
203{
204 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
205 struct nilfs_argv argv;
206 int ret;
207
208 if (copy_from_user(&argv, argp, sizeof(argv)))
209 return -EFAULT;
210
211 down_read(&nilfs->ns_segctor_sem);
212 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
213 nilfs_ioctl_do_get_suinfo);
214 up_read(&nilfs->ns_segctor_sem);
215 if (ret < 0)
216 return ret;
217
218 if (copy_to_user(argp, &argv, sizeof(argv)))
219 ret = -EFAULT;
220 return ret;
221}
222
223static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
224 unsigned int cmd, void __user *argp)
225{
226 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
227 struct nilfs_sustat sustat;
228 int ret;
229
230 down_read(&nilfs->ns_segctor_sem);
231 ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
232 up_read(&nilfs->ns_segctor_sem);
233 if (ret < 0)
234 return ret;
235
236 if (copy_to_user(argp, &sustat, sizeof(sustat)))
237 ret = -EFAULT;
238 return ret;
239}
240
241static ssize_t
242nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs)
244{
245 return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
246}
247
248static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp,
249 unsigned int cmd, void __user *argp)
250{
251 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
252 struct nilfs_argv argv;
253 int ret;
254
255 if (copy_from_user(&argv, argp, sizeof(argv)))
256 return -EFAULT;
257
258 down_read(&nilfs->ns_segctor_sem);
259 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
260 nilfs_ioctl_do_get_vinfo);
261 up_read(&nilfs->ns_segctor_sem);
262 if (ret < 0)
263 return ret;
264
265 if (copy_to_user(argp, &argv, sizeof(argv)))
266 ret = -EFAULT;
267 return ret;
268}
269
270static ssize_t
271nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
272 void *buf, size_t size, size_t nmembs)
273{
274 struct inode *dat = nilfs_dat_inode(nilfs);
275 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
276 struct nilfs_bdesc *bdescs = buf;
277 int ret, i;
278
279 for (i = 0; i < nmembs; i++) {
280 ret = nilfs_bmap_lookup_at_level(bmap,
281 bdescs[i].bd_offset,
282 bdescs[i].bd_level + 1,
283 &bdescs[i].bd_blocknr);
284 if (ret < 0) {
285 if (ret != -ENOENT)
286 return ret;
287 bdescs[i].bd_blocknr = 0;
288 }
289 }
290 return nmembs;
291}
292
293static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
294 unsigned int cmd, void __user *argp)
295{
296 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
297 struct nilfs_argv argv;
298 int ret;
299
300 if (copy_from_user(&argv, argp, sizeof(argv)))
301 return -EFAULT;
302
303 down_read(&nilfs->ns_segctor_sem);
304 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
305 nilfs_ioctl_do_get_bdescs);
306 up_read(&nilfs->ns_segctor_sem);
307 if (ret < 0)
308 return ret;
309
310 if (copy_to_user(argp, &argv, sizeof(argv)))
311 ret = -EFAULT;
312 return ret;
313}
314
315static int nilfs_ioctl_move_inode_block(struct inode *inode,
316 struct nilfs_vdesc *vdesc,
317 struct list_head *buffers)
318{
319 struct buffer_head *bh;
320 int ret;
321
322 if (vdesc->vd_flags == 0)
323 ret = nilfs_gccache_submit_read_data(
324 inode, vdesc->vd_offset, vdesc->vd_blocknr,
325 vdesc->vd_vblocknr, &bh);
326 else
327 ret = nilfs_gccache_submit_read_node(
328 inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
329
330 if (unlikely(ret < 0)) {
331 if (ret == -ENOENT)
332 printk(KERN_CRIT
333 "%s: invalid virtual block address (%s): "
334 "ino=%llu, cno=%llu, offset=%llu, "
335 "blocknr=%llu, vblocknr=%llu\n",
336 __func__, vdesc->vd_flags ? "node" : "data",
337 (unsigned long long)vdesc->vd_ino,
338 (unsigned long long)vdesc->vd_cno,
339 (unsigned long long)vdesc->vd_offset,
340 (unsigned long long)vdesc->vd_blocknr,
341 (unsigned long long)vdesc->vd_vblocknr);
342 return ret;
343 }
344 bh->b_private = vdesc;
345 list_add_tail(&bh->b_assoc_buffers, buffers);
346 return 0;
347}
348
349static ssize_t
350nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags,
351 void *buf, size_t size, size_t nmembs)
352{
353 struct inode *inode;
354 struct nilfs_vdesc *vdesc;
355 struct buffer_head *bh, *n;
356 LIST_HEAD(buffers);
357 ino_t ino;
358 __u64 cno;
359 int i, ret;
360
361 for (i = 0, vdesc = buf; i < nmembs; ) {
362 ino = vdesc->vd_ino;
363 cno = vdesc->vd_cno;
364 inode = nilfs_gc_iget(nilfs, ino, cno);
365 if (unlikely(inode == NULL)) {
366 ret = -ENOMEM;
367 goto failed;
368 }
369 do {
370 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
371 &buffers);
372 if (unlikely(ret < 0))
373 goto failed;
374 vdesc++;
375 } while (++i < nmembs &&
376 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
377 }
378
379 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
380 ret = nilfs_gccache_wait_and_mark_dirty(bh);
381 if (unlikely(ret < 0)) {
382 if (ret == -EEXIST) {
383 vdesc = bh->b_private;
384 printk(KERN_CRIT
385 "%s: conflicting %s buffer: "
386 "ino=%llu, cno=%llu, offset=%llu, "
387 "blocknr=%llu, vblocknr=%llu\n",
388 __func__,
389 vdesc->vd_flags ? "node" : "data",
390 (unsigned long long)vdesc->vd_ino,
391 (unsigned long long)vdesc->vd_cno,
392 (unsigned long long)vdesc->vd_offset,
393 (unsigned long long)vdesc->vd_blocknr,
394 (unsigned long long)vdesc->vd_vblocknr);
395 }
396 goto failed;
397 }
398 list_del_init(&bh->b_assoc_buffers);
399 bh->b_private = NULL;
400 brelse(bh);
401 }
402 return nmembs;
403
404 failed:
405 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
406 list_del_init(&bh->b_assoc_buffers);
407 bh->b_private = NULL;
408 brelse(bh);
409 }
410 return ret;
411}
412
413static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
414 struct nilfs_argv *argv,
415 int dir)
416{
417 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
418 nilfs_ioctl_do_move_blocks);
419}
420
421static ssize_t
422nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp,
423 int flags, void *buf, size_t size,
424 size_t nmembs)
425{
426 struct inode *cpfile = nilfs->ns_cpfile;
427 struct nilfs_period *periods = buf;
428 int ret, i;
429
430 for (i = 0; i < nmembs; i++) {
431 ret = nilfs_cpfile_delete_checkpoints(
432 cpfile, periods[i].p_start, periods[i].p_end);
433 if (ret < 0)
434 return ret;
435 }
436 return nmembs;
437}
438
439static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
440 struct nilfs_argv *argv,
441 int dir)
442{
443 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
444 nilfs_ioctl_do_delete_checkpoints);
445}
446
447static ssize_t
448nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags,
449 void *buf, size_t size, size_t nmembs)
450{
451 int ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
452
453 return (ret < 0) ? ret : nmembs;
454}
455
456static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
457 struct nilfs_argv *argv,
458 int dir)
459{
460 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
461 nilfs_ioctl_do_free_vblocknrs);
462}
463
464static ssize_t
465nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp,
466 int flags, void *buf, size_t size,
467 size_t nmembs)
468{
469 struct inode *dat = nilfs_dat_inode(nilfs);
470 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
471 struct nilfs_bdesc *bdescs = buf;
472 int ret, i;
473
474 for (i = 0; i < nmembs; i++) {
475 /* XXX: use a macro or an inline function to check liveness */
476 ret = nilfs_bmap_lookup_at_level(bmap,
477 bdescs[i].bd_offset,
478 bdescs[i].bd_level + 1,
479 &bdescs[i].bd_blocknr);
480 if (ret < 0) {
481 if (ret != -ENOENT)
482 return ret;
483 bdescs[i].bd_blocknr = 0;
484 }
485 if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
486 /* skip dead block */
487 continue;
488 if (bdescs[i].bd_level == 0) {
489 ret = nilfs_mdt_mark_block_dirty(dat,
490 bdescs[i].bd_offset);
491 if (ret < 0) {
492 WARN_ON(ret == -ENOENT);
493 return ret;
494 }
495 } else {
496 ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
497 bdescs[i].bd_level);
498 if (ret < 0) {
499 WARN_ON(ret == -ENOENT);
500 return ret;
501 }
502 }
503 }
504 return nmembs;
505}
506
507static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
508 struct nilfs_argv *argv,
509 int dir)
510{
511 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
512 nilfs_ioctl_do_mark_blocks_dirty);
513}
514
515static ssize_t
516nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags,
517 void *buf, size_t size, size_t nmembs)
518{
519 struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
520 int ret;
521
522 if (unlikely(!sbi))
523 return -EROFS;
524 ret = nilfs_segctor_add_segments_to_be_freed(
525 NILFS_SC(sbi), buf, nmembs);
526 nilfs_put_writer(nilfs);
527
528 return (ret < 0) ? ret : nmembs;
529}
530
531static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
532 struct nilfs_argv *argv,
533 int dir)
534{
535 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
536 nilfs_ioctl_do_free_segments);
537}
538
539int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
540 void __user *argp)
541{
542 struct nilfs_argv argv[5];
543 const char *msg;
544 int dir, ret;
545
546 if (copy_from_user(argv, argp, sizeof(argv)))
547 return -EFAULT;
548
549 dir = _IOC_WRITE;
550 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir);
551 if (ret < 0) {
552 msg = "cannot read source blocks";
553 goto failed;
554 }
555 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir);
556 if (ret < 0) {
557 /*
558 * can safely abort because checkpoints can be removed
559 * independently.
560 */
561 msg = "cannot delete checkpoints";
562 goto failed;
563 }
564 ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir);
565 if (ret < 0) {
566 /*
567 * can safely abort because DAT file is updated atomically
568 * using a copy-on-write technique.
569 */
570 msg = "cannot delete virtual blocks from DAT file";
571 goto failed;
572 }
573 ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir);
574 if (ret < 0) {
575 /*
576 * can safely abort because the operation is nondestructive.
577 */
578 msg = "cannot mark copying blocks dirty";
579 goto failed;
580 }
581 ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir);
582 if (ret < 0) {
583 /*
584 * can safely abort because this operation is atomic.
585 */
586 msg = "cannot set segments to be freed";
587 goto failed;
588 }
589 return 0;
590
591 failed:
592 nilfs_remove_all_gcinode(nilfs);
593 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
594 msg, ret);
595 return ret;
596}
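/*
 * Each of the five argv slots above describes one user-space vector. A
 * hypothetical fill of the first slot (the move-blocks vector); the
 * demo_fill_argv() name and its parameters are assumptions, and the
 * other slots follow the same pattern with their own entry types.
 */
static void demo_fill_argv(struct nilfs_argv *mv,
			   struct nilfs_vdesc *vdescs, unsigned int n)
{
	mv->v_base   = (unsigned long)vdescs;	/* buffer with n entries */
	mv->v_nmembs = n;			/* number of entries */
	mv->v_size   = sizeof(struct nilfs_vdesc);
	mv->v_index  = 0;
	mv->v_flags  = 0;
}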
597
598static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
599 unsigned int cmd, void __user *argp)
600{
601 if (!capable(CAP_SYS_ADMIN))
602 return -EPERM;
603 return nilfs_clean_segments(inode->i_sb, argp);
604}
605
606static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
607 unsigned int cmd, void __user *argp)
608{
609 __u64 cno;
610 int ret;
611
612 ret = nilfs_construct_segment(inode->i_sb);
613 if (ret < 0)
614 return ret;
615
616 if (argp != NULL) {
617 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
618 if (copy_to_user(argp, &cno, sizeof(cno)))
619 return -EFAULT;
620 }
621 return 0;
622}
623
624long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
625{
626 struct inode *inode = filp->f_dentry->d_inode;
627 void __user *argp = (void __user *)arg;
628
629 switch (cmd) {
630 case NILFS_IOCTL_CHANGE_CPMODE:
631 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
632 case NILFS_IOCTL_DELETE_CHECKPOINT:
633 return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
634 case NILFS_IOCTL_GET_CPINFO:
635 return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp);
636 case NILFS_IOCTL_GET_CPSTAT:
637 return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
638 case NILFS_IOCTL_GET_SUINFO:
639 return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp);
640 case NILFS_IOCTL_GET_SUSTAT:
641 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
642 case NILFS_IOCTL_GET_VINFO:
643 /* XXX: rename to ??? */
644 return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp);
645 case NILFS_IOCTL_GET_BDESCS:
646 return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
647 case NILFS_IOCTL_CLEAN_SEGMENTS:
648 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
649 case NILFS_IOCTL_SYNC:
650 return nilfs_ioctl_sync(inode, filp, cmd, argp);
651 default:
652 return -ENOTTY;
653 }
654}
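/*
 * From user space these commands go through the ordinary ioctl(2) path.
 * A hedged user-space sketch of NILFS_IOCTL_SYNC; fd is assumed to be an
 * open file descriptor on a mounted nilfs2 volume (this is not kernel
 * code):
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nilfs2_fs.h>
 *
 *	__u64 cno;
 *	if (ioctl(fd, NILFS_IOCTL_SYNC, &cno) == 0)
 *		printf("checkpoint %llu written\n",
 *		       (unsigned long long)cno);
 */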
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 000000000000..47dd815433fd
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,563 @@
1/*
2 * mdt.c - meta data file for NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/mpage.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include <linux/backing-dev.h>
28#include <linux/swap.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "page.h"
32#include "mdt.h"
33
34
35#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
36
37#define INIT_UNUSED_INODE_FIELDS
38
39static int
40nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
41 struct buffer_head *bh,
42 void (*init_block)(struct inode *,
43 struct buffer_head *, void *))
44{
45 struct nilfs_inode_info *ii = NILFS_I(inode);
46 void *kaddr;
47 int ret;
48
49 /* The caller excludes read accesses using the page lock */
50
51 /* set_buffer_new(bh); */
52 bh->b_blocknr = 0;
53
54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
55 if (unlikely(ret))
56 return ret;
57
58 set_buffer_mapped(bh);
59
60 kaddr = kmap_atomic(bh->b_page, KM_USER0);
61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
62 if (init_block)
63 init_block(inode, bh, kaddr);
64 flush_dcache_page(bh->b_page);
65 kunmap_atomic(kaddr, KM_USER0);
66
67 set_buffer_uptodate(bh);
68 nilfs_mark_buffer_dirty(bh);
69 nilfs_mdt_mark_dirty(inode);
70 return 0;
71}
72
73static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
74 struct buffer_head **out_bh,
75 void (*init_block)(struct inode *,
76 struct buffer_head *,
77 void *))
78{
79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
80 struct nilfs_sb_info *writer = NULL;
81 struct super_block *sb = inode->i_sb;
82 struct nilfs_transaction_info ti;
83 struct buffer_head *bh;
84 int err;
85
86 if (!sb) {
87 writer = nilfs_get_writer(nilfs);
88 if (!writer) {
89 err = -EROFS;
90 goto out;
91 }
92 sb = writer->s_super;
93 }
94
95 nilfs_transaction_begin(sb, &ti, 0);
96
97 err = -ENOMEM;
98 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
99 if (unlikely(!bh))
100 goto failed_unlock;
101
102 err = -EEXIST;
103 if (buffer_uptodate(bh) || buffer_mapped(bh))
104 goto failed_bh;
105#if 0
106 /* The uptodate flag is not protected by the page lock, but
107 the mapped flag is. Thus, we don't have to wait for the buffer. */
108 wait_on_buffer(bh);
109 if (buffer_uptodate(bh))
110 goto failed_bh;
111#endif
112
113 bh->b_bdev = nilfs->ns_bdev;
114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
115 if (likely(!err)) {
116 get_bh(bh);
117 *out_bh = bh;
118 }
119
120 failed_bh:
121 unlock_page(bh->b_page);
122 page_cache_release(bh->b_page);
123 brelse(bh);
124
125 failed_unlock:
126 if (likely(!err))
127 err = nilfs_transaction_commit(sb);
128 else
129 nilfs_transaction_abort(sb);
130 if (writer)
131 nilfs_put_writer(nilfs);
132 out:
133 return err;
134}
135
136static int
137nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
138 int mode, struct buffer_head **out_bh)
139{
140 struct buffer_head *bh;
141 unsigned long blknum = 0;
142 int ret = -ENOMEM;
143
144 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
145 if (unlikely(!bh))
146 goto failed;
147
148 ret = -EEXIST; /* internal code */
149 if (buffer_uptodate(bh))
150 goto out;
151
152 if (mode == READA) {
153 if (!trylock_buffer(bh)) {
154 ret = -EBUSY;
155 goto failed_bh;
156 }
157 } else /* mode == READ */
158 lock_buffer(bh);
159
160 if (buffer_uptodate(bh)) {
161 unlock_buffer(bh);
162 goto out;
163 }
164 if (!buffer_mapped(bh)) { /* unused buffer */
165 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
166 &blknum);
167 if (unlikely(ret)) {
168 unlock_buffer(bh);
169 goto failed_bh;
170 }
171 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
172 bh->b_blocknr = blknum;
173 set_buffer_mapped(bh);
174 }
175
176 bh->b_end_io = end_buffer_read_sync;
177 get_bh(bh);
178 submit_bh(mode, bh);
179 ret = 0;
180 out:
181 get_bh(bh);
182 *out_bh = bh;
183
184 failed_bh:
185 unlock_page(bh->b_page);
186 page_cache_release(bh->b_page);
187 brelse(bh);
188 failed:
189 return ret;
190}
191
192static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
193 struct buffer_head **out_bh)
194{
195 struct buffer_head *first_bh, *bh;
196 unsigned long blkoff;
197 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
198 int err;
199
200 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
201 if (err == -EEXIST) /* internal code */
202 goto out;
203
204 if (unlikely(err))
205 goto failed;
206
207 blkoff = block + 1;
208 for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
209 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
210 if (likely(!err || err == -EEXIST))
211 brelse(bh);
212 else if (err != -EBUSY)
213 break; /* abort readahead if bmap lookup failed */
214
215 if (!buffer_locked(first_bh))
216 goto out_no_wait;
217 }
218
219 wait_on_buffer(first_bh);
220
221 out_no_wait:
222 err = -EIO;
223 if (!buffer_uptodate(first_bh))
224 goto failed_bh;
225 out:
226 *out_bh = first_bh;
227 return 0;
228
229 failed_bh:
230 brelse(first_bh);
231 failed:
232 return err;
233}
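/*
 * The readahead policy above in concrete terms (illustrative): a call
 * nilfs_mdt_read_block(inode, 100, &bh) issues a synchronous READ for
 * block 100 plus opportunistic READA for blocks 101..115
 * (NILFS_MDT_MAX_RA_BLOCKS == 15); busy buffers (-EBUSY) are skipped,
 * and readahead stops early if a bmap lookup fails.
 */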
234
235/**
236 * nilfs_mdt_get_block - read or create a buffer on meta data file.
237 * @inode: inode of the meta data file
238 * @blkoff: block offset
239 * @create: create flag
240 * @init_block: initializer used for newly allocated block
241 * @out_bh: output of a pointer to the buffer_head
242 *
243 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
244 * a new buffer if @create is not zero. On success, the returned buffer is
245 * assured to be either existing or newly formatted, under the buffer lock.
246 * @out_bh is set only when zero is returned.
247 *
248 * Return Value: On success, it returns 0. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-ENOMEM - Insufficient memory available.
252 *
253 * %-EIO - I/O error
254 *
255 * %-ENOENT - the specified block does not exist (hole block)
256 *
257 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
258 *
259 * %-EROFS - Read only filesystem (for create mode)
260 */
261int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
262 void (*init_block)(struct inode *,
263 struct buffer_head *, void *),
264 struct buffer_head **out_bh)
265{
266 int ret;
267
268 /* Should be rewritten by merging it with nilfs_mdt_read_block() */
269 retry:
270 ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
271 if (!create || ret != -ENOENT)
272 return ret;
273
274 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
275 if (unlikely(ret == -EEXIST)) {
276 /* create = 0; */ /* limit read-create loop retries */
277 goto retry;
278 }
279 return ret;
280}
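/*
 * A hypothetical caller showing both the create flag and the init_block
 * hook; the demo_* names are illustrative only, not part of this patch.
 */
static void demo_init_block(struct inode *inode, struct buffer_head *bh,
			    void *kaddr)
{
	/* the block has already been zeroed; stamp an assumed 8-byte tag */
	memset(kaddr + bh_offset(bh), 0xff, 8);
}

static int demo_touch_block(struct inode *mdt, unsigned long blkoff)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_mdt_get_block(mdt, blkoff, 1, demo_init_block, &bh);
	if (err)
		return err;	/* -ENOENT cannot occur when create != 0 */
	brelse(bh);		/* drop the reference handed to us */
	return 0;
}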
281
282/**
283 * nilfs_mdt_delete_block - make a hole on the meta data file.
284 * @inode: inode of the meta data file
285 * @block: block offset
286 *
287 * Return Value: On success, zero is returned.
288 * On error, one of the following negative error codes is returned.
289 *
290 * %-ENOMEM - Insufficient memory available.
291 *
292 * %-EIO - I/O error
293 *
294 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
295 */
296int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
297{
298 struct nilfs_inode_info *ii = NILFS_I(inode);
299 int err;
300
301 err = nilfs_bmap_delete(ii->i_bmap, block);
302 if (likely(!err)) {
303 nilfs_mdt_mark_dirty(inode);
304 nilfs_mdt_forget_block(inode, block);
305 }
306 return err;
307}
308
309/**
310 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
311 * @inode: inode of the meta data file
312 * @block: block offset
313 *
314 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer and
315 * tries to release the page containing the buffer from the page cache.
316 *
317 * Return Value: On success, 0 is returned. On error, one of the following
318 * negative error codes is returned.
319 *
320 * %-EBUSY - page has an active buffer.
321 *
322 * %-ENOENT - page cache has no page addressed by the offset.
323 */
324int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
325{
326 pgoff_t index = (pgoff_t)block >>
327 (PAGE_CACHE_SHIFT - inode->i_blkbits);
328 struct page *page;
329 unsigned long first_block;
330 int ret = 0;
331 int still_dirty;
332
333 page = find_lock_page(inode->i_mapping, index);
334 if (!page)
335 return -ENOENT;
336
337 wait_on_page_writeback(page);
338
339 first_block = (unsigned long)index <<
340 (PAGE_CACHE_SHIFT - inode->i_blkbits);
341 if (page_has_buffers(page)) {
342 struct buffer_head *bh;
343
344 bh = nilfs_page_get_nth_block(page, block - first_block);
345 nilfs_forget_buffer(bh);
346 }
347 still_dirty = PageDirty(page);
348 unlock_page(page);
349 page_cache_release(page);
350
351 if (still_dirty ||
352 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
353 ret = -EBUSY;
354 return ret;
355}
356
357/**
358 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
359 * @inode: inode of the meta data file
360 * @block: block offset
361 *
362 * Return Value: On success, it returns 0. On error, one of the following
363 * negative error codes is returned.
364 *
365 * %-ENOMEM - Insufficient memory available.
366 *
367 * %-EIO - I/O error
368 *
369 * %-ENOENT - the specified block does not exist (hole block)
370 *
371 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
372 */
373int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
374{
375 struct buffer_head *bh;
376 int err;
377
378 err = nilfs_mdt_read_block(inode, block, &bh);
379 if (unlikely(err))
380 return err;
381 nilfs_mark_buffer_dirty(bh);
382 nilfs_mdt_mark_dirty(inode);
383 brelse(bh);
384 return 0;
385}
386
387int nilfs_mdt_fetch_dirty(struct inode *inode)
388{
389 struct nilfs_inode_info *ii = NILFS_I(inode);
390
391 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
392 set_bit(NILFS_I_DIRTY, &ii->i_state);
393 return 1;
394 }
395 return test_bit(NILFS_I_DIRTY, &ii->i_state);
396}
397
398static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{
401 struct inode *inode = container_of(page->mapping,
402 struct inode, i_data);
403 struct super_block *sb = inode->i_sb;
404 struct nilfs_sb_info *writer = NULL;
405 int err = 0;
406
407 redirty_page_for_writepage(wbc, page);
408 unlock_page(page);
409
410 if (page->mapping->assoc_mapping)
411 return 0; /* Do not request flush for shadow page cache */
412 if (!sb) {
413 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
414 if (!writer)
415 return -EROFS;
416 sb = writer->s_super;
417 }
418
419 if (wbc->sync_mode == WB_SYNC_ALL)
420 err = nilfs_construct_segment(sb);
421 else if (wbc->for_reclaim)
422 nilfs_flush_segment(sb, inode->i_ino);
423
424 if (writer)
425 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
426 return err;
427}
428
429
430static struct address_space_operations def_mdt_aops = {
431 .writepage = nilfs_mdt_write_page,
432};
433
434static struct inode_operations def_mdt_iops;
435static struct file_operations def_mdt_fops;
436
437/*
438 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
439 * ifile, or gcinodes. This allows the B-tree code and segment constructor
440 * to treat them like regular files, and this helps to simplify the
441 * implementation.
442 * On the other hand, some of these pseudo inodes are irregular:
443 * they don't have a valid inode->i_sb pointer because their lifetimes are
444 * longer than those of the super block structs; they may persist across
445 * several consecutive mounts/umounts. This needs further discussion.
446 */
447struct inode *
448nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
449 ino_t ino, gfp_t gfp_mask)
450{
451 struct inode *inode = nilfs_alloc_inode(sb);
452
453 if (!inode)
454 return NULL;
455 else {
456 struct address_space * const mapping = &inode->i_data;
457 struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
458
459 if (!mi) {
460 nilfs_destroy_inode(inode);
461 return NULL;
462 }
463 mi->mi_nilfs = nilfs;
464 init_rwsem(&mi->mi_sem);
465
466 inode->i_sb = sb; /* sb may be NULL for some meta data files */
467 inode->i_blkbits = nilfs->ns_blocksize_bits;
468 inode->i_flags = 0;
469 atomic_set(&inode->i_count, 1);
470 inode->i_nlink = 1;
471 inode->i_ino = ino;
472 inode->i_mode = S_IFREG;
473 inode->i_private = mi;
474
475#ifdef INIT_UNUSED_INODE_FIELDS
476 atomic_set(&inode->i_writecount, 0);
477 inode->i_size = 0;
478 inode->i_blocks = 0;
479 inode->i_bytes = 0;
480 inode->i_generation = 0;
481#ifdef CONFIG_QUOTA
482 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
483#endif
484 inode->i_pipe = NULL;
485 inode->i_bdev = NULL;
486 inode->i_cdev = NULL;
487 inode->i_rdev = 0;
488#ifdef CONFIG_SECURITY
489 inode->i_security = NULL;
490#endif
491 inode->dirtied_when = 0;
492
493 INIT_LIST_HEAD(&inode->i_list);
494 INIT_LIST_HEAD(&inode->i_sb_list);
495 inode->i_state = 0;
496#endif
497
498 spin_lock_init(&inode->i_lock);
499 mutex_init(&inode->i_mutex);
500 init_rwsem(&inode->i_alloc_sem);
501
502 mapping->host = NULL; /* instead of inode */
503 mapping->flags = 0;
504 mapping_set_gfp_mask(mapping, gfp_mask);
505 mapping->assoc_mapping = NULL;
506 mapping->backing_dev_info = nilfs->ns_bdi;
507
508 inode->i_mapping = mapping;
509 }
510
511 return inode;
512}
513
514struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
515 ino_t ino, gfp_t gfp_mask)
516{
517 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
518
519 if (!inode)
520 return NULL;
521
522 inode->i_op = &def_mdt_iops;
523 inode->i_fop = &def_mdt_fops;
524 inode->i_mapping->a_ops = &def_mdt_aops;
525 return inode;
526}
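/*
 * A hypothetical construction of a meta data inode; the inode number and
 * the entry/header sizes here are assumptions for illustration.
 */
static struct inode *demo_new_mdt(struct the_nilfs *nilfs,
				  struct super_block *sb)
{
	struct inode *inode = nilfs_mdt_new(nilfs, sb, NILFS_IFILE_INO,
					    NILFS_MDT_GFP);

	if (inode)
		nilfs_mdt_set_entry_size(inode, 128, 64);
	return inode;
}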
527
528void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
529 unsigned header_size)
530{
531 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
532
533 mi->mi_entry_size = entry_size;
534 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
535 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
536}
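/*
 * A worked example with assumed sizes: for a 4096-byte block with
 * entry_size == 128 and header_size == 64,
 *	mi_entries_per_block  = 4096 / 128 = 32,
 *	mi_first_entry_offset = DIV_ROUND_UP(64, 128) = 1,
 * so entries start at the second 128-byte slot of each block.
 */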
537
538void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
539{
540 shadow->i_mapping->assoc_mapping = orig->i_mapping;
541 NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
542 &NILFS_I(orig)->i_btnode_cache;
543}
544
545void nilfs_mdt_clear(struct inode *inode)
546{
547 struct nilfs_inode_info *ii = NILFS_I(inode);
548
549 invalidate_mapping_pages(inode->i_mapping, 0, -1);
550 truncate_inode_pages(inode->i_mapping, 0);
551
552 nilfs_bmap_clear(ii->i_bmap);
553 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
554}
555
556void nilfs_mdt_destroy(struct inode *inode)
557{
558 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
559
560 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
561 kfree(mdi);
562 nilfs_destroy_inode(inode);
563}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 000000000000..df683e0bca6a
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
1/*
2 * mdt.h - NILFS meta data file prototype and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#ifndef _NILFS_MDT_H
24#define _NILFS_MDT_H
25
26#include <linux/buffer_head.h>
27#include <linux/blockgroup_lock.h>
28#include "nilfs.h"
29#include "page.h"
30
31/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block
39 * @mi_blocks_per_group: number of blocks in a group
40 * @mi_blocks_per_desc_block: number of blocks per descriptor block
41 */
42struct nilfs_mdt_info {
43 struct the_nilfs *mi_nilfs;
44 struct rw_semaphore mi_sem;
45 struct blockgroup_lock *mi_bgl;
46 unsigned mi_entry_size;
47 unsigned mi_first_entry_offset;
48 unsigned long mi_entries_per_block;
49 unsigned long mi_blocks_per_group;
50 unsigned long mi_blocks_per_desc_block;
51};
52
53static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
54{
55 return inode->i_private;
56}
57
58static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
59{
60 struct super_block *sb = inode->i_sb;
61
62 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
63}
64
65/* Default GFP flags using highmem */
66#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
67
68int nilfs_mdt_get_block(struct inode *, unsigned long, int,
69 void (*init_block)(struct inode *,
70 struct buffer_head *, void *),
71 struct buffer_head **);
72int nilfs_mdt_delete_block(struct inode *, unsigned long);
73int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *);
76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *);
82void nilfs_mdt_clear(struct inode *);
83void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
84void nilfs_mdt_set_shadow(struct inode *, struct inode *);
85
86
87#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
88
89static inline void nilfs_mdt_mark_dirty(struct inode *inode)
90{
91 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
92 set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
93}
94
95static inline void nilfs_mdt_clear_dirty(struct inode *inode)
96{
97 clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
98}
99
100static inline __u64 nilfs_mdt_cno(struct inode *inode)
101{
102 return NILFS_MDT(inode)->mi_nilfs->ns_cno;
103}
104
105#define nilfs_mdt_bgl_lock(inode, bg) \
106 (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
107
108
109static inline int
110nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
111 unsigned n)
112{
113 return nilfs_read_inode_common(
114 inode, (struct nilfs_inode *)(bh->b_data + n));
115}
116
117static inline void
118nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
119 unsigned n)
120{
121 nilfs_write_inode_common(
122 inode, (struct nilfs_inode *)(bh->b_data + n), 1);
123}
124
125#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 000000000000..df70dadb336f
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
1/*
2 * namei.c - NILFS pathname lookup operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23/*
24 * linux/fs/ext2/namei.c
25 *
26 * Copyright (C) 1992, 1993, 1994, 1995
27 * Remy Card (card@masi.ibp.fr)
28 * Laboratoire MASI - Institut Blaise Pascal
29 * Universite Pierre et Marie Curie (Paris VI)
30 *
31 * from
32 *
33 * linux/fs/minix/namei.c
34 *
35 * Copyright (C) 1991, 1992 Linus Torvalds
36 *
37 * Big-endian to little-endian byte-swapping/bitmaps by
38 * David S. Miller (davem@caip.rutgers.edu), 1995
39 */
40
41#include <linux/pagemap.h>
42#include "nilfs.h"
43
44
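/*
 * Link @inode at @dentry and instantiate the dentry; on failure, the
 * link count taken by the caller is dropped and the inode is released.
 */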
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{
47 int err = nilfs_add_link(dentry, inode);
48 if (!err) {
49 d_instantiate(dentry, inode);
50 return 0;
51 }
52 inode_dec_link_count(inode);
53 iput(inode);
54 return err;
55}
56
57/*
58 * Methods themselves.
59 */
60
61static struct dentry *
62nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
63{
64 struct inode *inode;
65 ino_t ino;
66
67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 ino = nilfs_inode_by_name(dir, dentry);
71 inode = NULL;
72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino);
74 if (IS_ERR(inode))
75 return ERR_CAST(inode);
76 }
77 return d_splice_alias(inode, dentry);
78}
79
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct dentry dotdot;
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino)
91 return ERR_PTR(-ENOENT);
92
93 inode = nilfs_iget(child->d_inode->i_sb, ino);
94 if (IS_ERR(inode))
95 return ERR_CAST(inode);
96 return d_obtain_alias(inode);
97}
98
99/*
100 * By the time this is called, we already have created
101 * the directory cache entry for the new file, but it
102 * is so far negative - it has no inode.
103 *
104 * If the create succeeds, we fill in the inode information
105 * with d_instantiate().
106 */
107static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
108 struct nameidata *nd)
109{
110 struct inode *inode;
111 struct nilfs_transaction_info ti;
112 int err;
113
114 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
115 if (err)
116 return err;
117 inode = nilfs_new_inode(dir, mode);
118 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) {
120 inode->i_op = &nilfs_file_inode_operations;
121 inode->i_fop = &nilfs_file_operations;
122 inode->i_mapping->a_ops = &nilfs_aops;
123 mark_inode_dirty(inode);
124 err = nilfs_add_nondir(dentry, inode);
125 }
126 if (!err)
127 err = nilfs_transaction_commit(dir->i_sb);
128 else
129 nilfs_transaction_abort(dir->i_sb);
130
131 return err;
132}
133
134static int
135nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
136{
137 struct inode *inode;
138 struct nilfs_transaction_info ti;
139 int err;
140
141 if (!new_valid_dev(rdev))
142 return -EINVAL;
143
144 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
145 if (err)
146 return err;
147 inode = nilfs_new_inode(dir, mode);
148 err = PTR_ERR(inode);
149 if (!IS_ERR(inode)) {
150 init_special_inode(inode, inode->i_mode, rdev);
151 mark_inode_dirty(inode);
152 err = nilfs_add_nondir(dentry, inode);
153 }
154 if (!err)
155 err = nilfs_transaction_commit(dir->i_sb);
156 else
157 nilfs_transaction_abort(dir->i_sb);
158
159 return err;
160}
161
162static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
163 const char *symname)
164{
165 struct nilfs_transaction_info ti;
166 struct super_block *sb = dir->i_sb;
167 unsigned l = strlen(symname)+1;
168 struct inode *inode;
169 int err;
170
171 if (l > sb->s_blocksize)
172 return -ENAMETOOLONG;
173
174 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
175 if (err)
176 return err;
177
178 inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
179 err = PTR_ERR(inode);
180 if (IS_ERR(inode))
181 goto out;
182
183 /* slow symlink */
184 inode->i_op = &nilfs_symlink_inode_operations;
185 inode->i_mapping->a_ops = &nilfs_aops;
186 err = page_symlink(inode, symname, l);
187 if (err)
188 goto out_fail;
189
190 /* mark_inode_dirty(inode); */
191 /* nilfs_new_inode() and page_symlink() do this */
192
193 err = nilfs_add_nondir(dentry, inode);
194out:
195 if (!err)
196 err = nilfs_transaction_commit(dir->i_sb);
197 else
198 nilfs_transaction_abort(dir->i_sb);
199
200 return err;
201
202out_fail:
203 inode_dec_link_count(inode);
204 iput(inode);
205 goto out;
206}
207
208static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
209 struct dentry *dentry)
210{
211 struct inode *inode = old_dentry->d_inode;
212 struct nilfs_transaction_info ti;
213 int err;
214
215 if (inode->i_nlink >= NILFS_LINK_MAX)
216 return -EMLINK;
217
218 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
219 if (err)
220 return err;
221
222 inode->i_ctime = CURRENT_TIME;
223 inode_inc_link_count(inode);
224 atomic_inc(&inode->i_count);
225
226 err = nilfs_add_nondir(dentry, inode);
227 if (!err)
228 err = nilfs_transaction_commit(dir->i_sb);
229 else
230 nilfs_transaction_abort(dir->i_sb);
231
232 return err;
233}
234
235static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
236{
237 struct inode *inode;
238 struct nilfs_transaction_info ti;
239 int err;
240
241 if (dir->i_nlink >= NILFS_LINK_MAX)
242 return -EMLINK;
243
244 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
245 if (err)
246 return err;
247
248 inode_inc_link_count(dir);
249
250 inode = nilfs_new_inode(dir, S_IFDIR | mode);
251 err = PTR_ERR(inode);
252 if (IS_ERR(inode))
253 goto out_dir;
254
255 inode->i_op = &nilfs_dir_inode_operations;
256 inode->i_fop = &nilfs_dir_operations;
257 inode->i_mapping->a_ops = &nilfs_aops;
258
259 inode_inc_link_count(inode);
260
261 err = nilfs_make_empty(inode, dir);
262 if (err)
263 goto out_fail;
264
265 err = nilfs_add_link(dentry, inode);
266 if (err)
267 goto out_fail;
268
269 d_instantiate(dentry, inode);
270out:
271 if (!err)
272 err = nilfs_transaction_commit(dir->i_sb);
273 else
274 nilfs_transaction_abort(dir->i_sb);
275
276 return err;
277
278out_fail:
279 inode_dec_link_count(inode);
280 inode_dec_link_count(inode);
281 iput(inode);
282out_dir:
283 inode_dec_link_count(dir);
284 goto out;
285}
286
287static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
288{
289 struct inode *inode;
290 struct nilfs_dir_entry *de;
291 struct page *page;
292 struct nilfs_transaction_info ti;
293 int err;
294
295 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
296 if (err)
297 return err;
298
299 err = -ENOENT;
300 de = nilfs_find_entry(dir, dentry, &page);
301 if (!de)
302 goto out;
303
304 inode = dentry->d_inode;
305 err = -EIO;
306 if (le64_to_cpu(de->inode) != inode->i_ino)
307 goto out;
308
309 if (!inode->i_nlink) {
310 nilfs_warning(inode->i_sb, __func__,
311 "deleting nonexistent file (%lu), %d\n",
312 inode->i_ino, inode->i_nlink);
313 inode->i_nlink = 1;
314 }
315 err = nilfs_delete_entry(de, page);
316 if (err)
317 goto out;
318
319 inode->i_ctime = dir->i_ctime;
320 inode_dec_link_count(inode);
321 err = 0;
322out:
323 if (!err)
324 err = nilfs_transaction_commit(dir->i_sb);
325 else
326 nilfs_transaction_abort(dir->i_sb);
327
328 return err;
329}
330
331static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
332{
333 struct inode *inode = dentry->d_inode;
334 struct nilfs_transaction_info ti;
335 int err;
336
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err)
339 return err;
340
341 err = -ENOTEMPTY;
342 if (nilfs_empty_dir(inode)) {
343 err = nilfs_unlink(dir, dentry);
344 if (!err) {
345 inode->i_size = 0;
346 inode_dec_link_count(inode);
347 inode_dec_link_count(dir);
348 }
349 }
350 if (!err)
351 err = nilfs_transaction_commit(dir->i_sb);
352 else
353 nilfs_transaction_abort(dir->i_sb);
354
355 return err;
356}
357
358static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
359 struct inode *new_dir, struct dentry *new_dentry)
360{
361 struct inode *old_inode = old_dentry->d_inode;
362 struct inode *new_inode = new_dentry->d_inode;
363 struct page *dir_page = NULL;
364 struct nilfs_dir_entry *dir_de = NULL;
365 struct page *old_page;
366 struct nilfs_dir_entry *old_de;
367 struct nilfs_transaction_info ti;
368 int err;
369
370 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
371 if (unlikely(err))
372 return err;
373
374 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
376 if (!old_de)
377 goto out;
378
379 if (S_ISDIR(old_inode->i_mode)) {
380 err = -EIO;
381 dir_de = nilfs_dotdot(old_inode, &dir_page);
382 if (!dir_de)
383 goto out_old;
384 }
385
386 if (new_inode) {
387 struct page *new_page;
388 struct nilfs_dir_entry *new_de;
389
390 err = -ENOTEMPTY;
391 if (dir_de && !nilfs_empty_dir(new_inode))
392 goto out_dir;
393
394 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
396 if (!new_de)
397 goto out_dir;
398 inode_inc_link_count(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode);
400 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de)
402 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode);
404 } else {
405 if (dir_de) {
406 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir;
409 }
410 inode_inc_link_count(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) {
413 inode_dec_link_count(old_inode);
414 goto out_dir;
415 }
416 if (dir_de)
417 inode_inc_link_count(new_dir);
418 }
419
420 /*
421 * Like most other Unix systems, set the ctime for inodes on a
422 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */
425 old_inode->i_ctime = CURRENT_TIME;
426
427 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode);
429
430 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir);
433 }
434
435 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err;
437
438out_dir:
439 if (dir_de) {
440 kunmap(dir_page);
441 page_cache_release(dir_page);
442 }
443out_old:
444 kunmap(old_page);
445 page_cache_release(old_page);
446out:
447 nilfs_transaction_abort(old_dir->i_sb);
448 return err;
449}
450
451struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create,
453 .lookup = nilfs_lookup,
454 .link = nilfs_link,
455 .unlink = nilfs_unlink,
456 .symlink = nilfs_symlink,
457 .mkdir = nilfs_mkdir,
458 .rmdir = nilfs_rmdir,
459 .mknod = nilfs_mknod,
460 .rename = nilfs_rename,
461 .setattr = nilfs_setattr,
462 .permission = nilfs_permission,
463};
464
465struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission,
468};
469
470struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link,
474};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000000..7558c977db02
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,318 @@
1/*
2 * nilfs.h - NILFS local header file.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_H
25#define _NILFS_H
26
27#include <linux/kernel.h>
28#include <linux/buffer_head.h>
29#include <linux/spinlock.h>
30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h"
35#include "bmap_union.h"
36
37/*
38 * NILFS filesystem version
39 */
40#define NILFS_VERSION "2.0.5"
41
42/*
43 * nilfs inode data in memory
44 */
45struct nilfs_inode_info {
46 __u32 i_flags;
47 unsigned long i_state; /* Dynamic state flags */
48 struct nilfs_bmap *i_bmap;
49 union nilfs_bmap_union i_bmap_union;
50 __u64 i_xattr; /* sector_t ??? */
51 __u32 i_dir_start_lookup;
52 __u64 i_cno; /* check point number for GC inode */
53 struct address_space i_btnode_cache;
54 struct list_head i_dirty; /* List for connecting dirty files */
55
56#ifdef CONFIG_NILFS_XATTR
57 /*
58 * Extended attributes can be read independently of the main file
59 * data. Taking i_sem even when reading would cause contention
60 * between readers of EAs and writers of regular file data, so
61 * instead we synchronize on xattr_sem when reading or changing
62 * EAs.
63 */
64 struct rw_semaphore xattr_sem;
65#endif
66#ifdef CONFIG_NILFS_POSIX_ACL
67 struct posix_acl *i_acl;
68 struct posix_acl *i_default_acl;
69#endif
70 struct buffer_head *i_bh; /* i_bh contains a new or dirty
71 disk inode */
72 struct inode vfs_inode;
73};
74
75static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
76{
77 return container_of(inode, struct nilfs_inode_info, vfs_inode);
78}
79
80static inline struct nilfs_inode_info *
81NILFS_BMAP_I(const struct nilfs_bmap *bmap)
82{
83 return container_of((union nilfs_bmap_union *)bmap,
84 struct nilfs_inode_info,
85 i_bmap_union);
86}
87
88static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
89{
90 struct nilfs_inode_info *ii =
91 container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
92 return &ii->vfs_inode;
93}
94
95static inline struct inode *NILFS_AS_I(struct address_space *mapping)
96{
97 return (mapping->host) ? :
98 container_of(mapping, struct inode, i_data);
99}
100
101/*
102 * Dynamic state flags of NILFS on-memory inode (i_state)
103 */
104enum {
105 NILFS_I_NEW = 0, /* Inode is newly created */
106 NILFS_I_DIRTY, /* The file is dirty */
107 NILFS_I_QUEUED, /* inode is in dirty_files list */
108 NILFS_I_BUSY, /* inode is grabbed by a segment
109 constructor */
110 NILFS_I_COLLECTED, /* All dirty blocks are collected */
111 NILFS_I_UPDATED, /* The file has been written back */
112 NILFS_I_INODE_DIRTY, /* write_inode is requested */
113 NILFS_I_BMAP, /* has bmap and btnode_cache */
114 NILFS_I_GCINODE, /* inode for GC, on memory only */
115 NILFS_I_GCDAT, /* shadow DAT, on memory only */
116};
117
118/*
119 * Macros to check inode numbers
120 */
121#define NILFS_MDT_INO_BITS \
122 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
123 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
124 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
125
126#define NILFS_SYS_INO_BITS \
127 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
128
129#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
130
131#define NILFS_MDT_INODE(sb, ino) \
132 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
133#define NILFS_VALID_INODE(sb, ino) \
134 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
135
136/**
137 * struct nilfs_transaction_info: context information for synchronization
138 * @ti_magic: Magic number
139 * @ti_save: Backup of journal_info field of task_struct
140 * @ti_flags: Flags
141 * @ti_count: Nest level
142 * @ti_garbage: List of inodes to be put when releasing the semaphore
143 */
144struct nilfs_transaction_info {
145 u32 ti_magic;
146 void *ti_save;
147 /* This should never be used. If this happens,
148 one of the other filesystems has a bug. */
149 unsigned short ti_flags;
150 unsigned short ti_count;
151 struct list_head ti_garbage;
152};
153
154/* ti_magic */
155#define NILFS_TI_MAGIC 0xd9e392fb
156
157/* ti_flags */
158#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
159#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
160 end of transaction. */
161#define NILFS_TI_GC 0x0004 /* GC context */
162#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
163#define NILFS_TI_WRITER 0x0010 /* Constructor context */
164
165
166int nilfs_transaction_begin(struct super_block *,
167 struct nilfs_transaction_info *, int);
168int nilfs_transaction_commit(struct super_block *);
169void nilfs_transaction_abort(struct super_block *);
170
171static inline void nilfs_set_transaction_flag(unsigned int flag)
172{
173 struct nilfs_transaction_info *ti = current->journal_info;
174
175 ti->ti_flags |= flag;
176}
177
178static inline int nilfs_test_transaction_flag(unsigned int flag)
179{
180 struct nilfs_transaction_info *ti = current->journal_info;
181
182 if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
183 return 0;
184 return !!(ti->ti_flags & flag);
185}
186
187static inline int nilfs_doing_gc(void)
188{
189 return nilfs_test_transaction_flag(NILFS_TI_GC);
190}
191
192static inline int nilfs_doing_construction(void)
193{
194 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
195}
196
197static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
198{
199 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
200}
201
202/*
203 * function prototypes
204 */
205#ifdef CONFIG_NILFS_POSIX_ACL
206#error "NILFS: not yet supported POSIX ACL"
207extern int nilfs_permission(struct inode *, int, struct nameidata *);
208extern int nilfs_acl_chmod(struct inode *);
209extern int nilfs_init_acl(struct inode *, struct inode *);
210#else
211#define nilfs_permission NULL
212
213static inline int nilfs_acl_chmod(struct inode *inode)
214{
215 return 0;
216}
217
218static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
219{
220 inode->i_mode &= ~current_umask();
221 return 0;
222}
223#endif
224
225#define NILFS_ATIME_DISABLE
226
227/* dir.c */
228extern int nilfs_add_link(struct dentry *, struct inode *);
229extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
230extern int nilfs_make_empty(struct inode *, struct inode *);
231extern struct nilfs_dir_entry *
232nilfs_find_entry(struct inode *, struct dentry *, struct page **);
233extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
234extern int nilfs_empty_dir(struct inode *);
235extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
236extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
237 struct page *, struct inode *);
238
239/* file.c */
240extern int nilfs_sync_file(struct file *, struct dentry *, int);
241
242/* ioctl.c */
243long nilfs_ioctl(struct file *, unsigned int, unsigned long);
244int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *);
245
246/* inode.c */
247extern struct inode *nilfs_new_inode(struct inode *, int);
248extern void nilfs_free_inode(struct inode *);
249extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
250extern void nilfs_set_inode_flags(struct inode *);
251extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
252extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
253extern struct inode *nilfs_iget(struct super_block *, unsigned long);
254extern void nilfs_update_inode(struct inode *, struct buffer_head *);
255extern void nilfs_truncate(struct inode *);
256extern void nilfs_delete_inode(struct inode *);
257extern int nilfs_setattr(struct dentry *, struct iattr *);
258extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
259 struct buffer_head **);
260extern int nilfs_inode_dirty(struct inode *);
261extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
262 unsigned);
263extern int nilfs_mark_inode_dirty(struct inode *);
264extern void nilfs_dirty_inode(struct inode *);
265
266/* namei.c */
267extern struct dentry *nilfs_get_parent(struct dentry *);
268
269/* super.c */
270extern struct inode *nilfs_alloc_inode(struct super_block *);
271extern void nilfs_destroy_inode(struct inode *);
272extern void nilfs_error(struct super_block *, const char *, const char *, ...)
273 __attribute__ ((format (printf, 3, 4)));
274extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
275 __attribute__ ((format (printf, 3, 4)));
276extern struct nilfs_super_block *
277nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
278extern int nilfs_store_magic_and_option(struct super_block *,
279 struct nilfs_super_block *, char *);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int);
281extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
282extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
283
284/* gcinode.c */
285int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
286 struct buffer_head **);
287int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
288 struct buffer_head **);
289int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
290int nilfs_init_gccache(struct the_nilfs *);
291void nilfs_destroy_gccache(struct the_nilfs *);
292void nilfs_clear_gcinode(struct inode *);
293struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
294void nilfs_remove_all_gcinode(struct the_nilfs *);
295
296/* gcdat.c */
297int nilfs_init_gcdat_inode(struct the_nilfs *);
298void nilfs_commit_gcdat_inode(struct the_nilfs *);
299void nilfs_clear_gcdat_inode(struct the_nilfs *);
300
301/*
302 * Inode and file operations
303 */
304extern struct file_operations nilfs_dir_operations;
305extern struct inode_operations nilfs_file_inode_operations;
306extern struct file_operations nilfs_file_operations;
307extern struct address_space_operations nilfs_aops;
308extern struct inode_operations nilfs_dir_inode_operations;
309extern struct inode_operations nilfs_special_inode_operations;
310extern struct inode_operations nilfs_symlink_inode_operations;
311
312/*
313 * filesystem type
314 */
315extern struct file_system_type nilfs_fs_type;
316
317
318#endif /* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 000000000000..1bfbba9c0e9a
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,540 @@
1/*
2 * page.c - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#include <linux/pagemap.h>
25#include <linux/writeback.h>
26#include <linux/swap.h>
27#include <linux/bitops.h>
28#include <linux/page-flags.h>
29#include <linux/list.h>
30#include <linux/highmem.h>
31#include <linux/pagevec.h>
32#include "nilfs.h"
33#include "page.h"
34#include "mdt.h"
35
36
37#define NILFS_BUFFER_INHERENT_BITS \
38 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
39 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
40
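/*
 * Return the buffer head that covers @block within @page, creating
 * empty buffers on the page first if it has none. The buffer is
 * touched, waited on, and returned with an extra reference taken.
 */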
41static struct buffer_head *
42__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
43 int blkbits, unsigned long b_state)
44
45{
46 unsigned long first_block;
47 struct buffer_head *bh;
48
49 if (!page_has_buffers(page))
50 create_empty_buffers(page, 1 << blkbits, b_state);
51
52 first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
53 bh = nilfs_page_get_nth_block(page, block - first_block);
54
55 touch_buffer(bh);
56 wait_on_buffer(bh);
57 return bh;
58}
59
60/*
61 * Since neither the page cache of B-tree node pages nor the data page
62 * cache of pseudo inodes has a valid mapping->host pointer, calling
63 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
64 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
65 * To avoid this problem, an old-style mark_buffer_dirty() is used instead.
66 */
67void nilfs_mark_buffer_dirty(struct buffer_head *bh)
68{
69 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
70 __set_page_dirty_nobuffers(bh->b_page);
71}
72
73struct buffer_head *nilfs_grab_buffer(struct inode *inode,
74 struct address_space *mapping,
75 unsigned long blkoff,
76 unsigned long b_state)
77{
78 int blkbits = inode->i_blkbits;
79 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
80 struct page *page, *opage;
81 struct buffer_head *bh, *obh;
82
83 page = grab_cache_page(mapping, index);
84 if (unlikely(!page))
85 return NULL;
86
87 bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
88 if (unlikely(!bh)) {
89 unlock_page(page);
90 page_cache_release(page);
91 return NULL;
92 }
93 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
94 /*
95 * A shadow page cache uses assoc_mapping to point to its original
96 * page cache. The following code falls back to the original
97 * cache when the given cache is a shadow and the lookup missed.
98 */
99 opage = find_lock_page(mapping->assoc_mapping, index);
100 if (!opage)
101 return bh;
102
103 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
104 b_state);
105 if (buffer_uptodate(obh)) {
106 nilfs_copy_buffer(bh, obh);
107 if (buffer_dirty(obh)) {
108 nilfs_mark_buffer_dirty(bh);
109 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
110 nilfs_mdt_mark_dirty(inode);
111 }
112 }
113 brelse(obh);
114 unlock_page(opage);
115 page_cache_release(opage);
116 }
117 return bh;
118}
119
120/**
121 * nilfs_forget_buffer - discard dirty state
122 * @bh: buffer head of the buffer to be discarded; the caller's
123 *      reference to @bh is released on return
124 */
125void nilfs_forget_buffer(struct buffer_head *bh)
126{
127 struct page *page = bh->b_page;
128
129 lock_buffer(bh);
130 clear_buffer_nilfs_volatile(bh);
131 if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page))
132 __nilfs_clear_page_dirty(page);
133
134 clear_buffer_uptodate(bh);
135 clear_buffer_mapped(bh);
136 bh->b_blocknr = -1;
137 ClearPageUptodate(page);
138 ClearPageMappedToDisk(page);
139 unlock_buffer(bh);
140 brelse(bh);
141}
142
143/**
144 * nilfs_copy_buffer -- copy buffer data and flags
145 * @dbh: destination buffer
146 * @sbh: source buffer
147 */
148void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
149{
150 void *kaddr0, *kaddr1;
151 unsigned long bits;
152 struct page *spage = sbh->b_page, *dpage = dbh->b_page;
153 struct buffer_head *bh;
154
155 kaddr0 = kmap_atomic(spage, KM_USER0);
156 kaddr1 = kmap_atomic(dpage, KM_USER1);
157 memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
158 kunmap_atomic(kaddr1, KM_USER1);
159 kunmap_atomic(kaddr0, KM_USER0);
160
161 dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
162 dbh->b_blocknr = sbh->b_blocknr;
163 dbh->b_bdev = sbh->b_bdev;
164
165 bh = dbh;
166 bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
167 while ((bh = bh->b_this_page) != dbh) {
168 lock_buffer(bh);
169 bits &= bh->b_state;
170 unlock_buffer(bh);
171 }
172 if (bits & (1UL << BH_Uptodate))
173 SetPageUptodate(dpage);
174 else
175 ClearPageUptodate(dpage);
176 if (bits & (1UL << BH_Mapped))
177 SetPageMappedToDisk(dpage);
178 else
179 ClearPageMappedToDisk(dpage);
180}
181
182/**
183 * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
184 * @page: page to be checked
185 *
186 * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
187 * Otherwise, it returns a non-zero value.
188 */
189int nilfs_page_buffers_clean(struct page *page)
190{
191 struct buffer_head *bh, *head;
192
193 bh = head = page_buffers(page);
194 do {
195 if (buffer_dirty(bh))
196 return 0;
197 bh = bh->b_this_page;
198 } while (bh != head);
199 return 1;
200}
201
202void nilfs_page_bug(struct page *page)
203{
204 struct address_space *m;
205 unsigned long ino = 0;
206
207 if (unlikely(!page)) {
208 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
209 return;
210 }
211
212 m = page->mapping;
213 if (m) {
214 struct inode *inode = NILFS_AS_I(m);
215 if (inode != NULL)
216 ino = inode->i_ino;
217 }
218 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
219 "mapping=%p ino=%lu\n",
220 page, atomic_read(&page->_count),
221 (unsigned long long)page->index, page->flags, m, ino);
222
223 if (page_has_buffers(page)) {
224 struct buffer_head *bh, *head;
225 int i = 0;
226
227 bh = head = page_buffers(page);
228 do {
229 printk(KERN_CRIT
230 " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
231 i++, bh, atomic_read(&bh->b_count),
232 (unsigned long long)bh->b_blocknr, bh->b_state);
233 bh = bh->b_this_page;
234 } while (bh != head);
235 }
236}
237
238/**
239 * nilfs_alloc_private_page - allocate a private page with buffer heads
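 * @bdev: block device to which the allocated buffers are assigned
 * @size: size of each buffer, in bytes
 * @state: initial state bits set on each buffer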
240 *
241 * Return Value: On success, a pointer to the allocated page is returned.
242 * On error, NULL is returned.
243 */
244struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
245 unsigned long state)
246{
247 struct buffer_head *bh, *head, *tail;
248 struct page *page;
249
250 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
251 if (unlikely(!page))
252 return NULL;
253
254 lock_page(page);
255 head = alloc_page_buffers(page, size, 0);
256 if (unlikely(!head)) {
257 unlock_page(page);
258 __free_page(page);
259 return NULL;
260 }
261
262 bh = head;
263 do {
264 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
265 tail = bh;
266 bh->b_bdev = bdev;
267 bh = bh->b_this_page;
268 } while (bh);
269
270 tail->b_this_page = head;
271 attach_page_buffers(page, head);
272
273 return page;
274}
275
276void nilfs_free_private_page(struct page *page)
277{
278 BUG_ON(!PageLocked(page));
279 BUG_ON(page->mapping);
280
281 if (page_has_buffers(page) && !try_to_free_buffers(page))
282 NILFS_PAGE_BUG(page, "failed to free page");
283
284 unlock_page(page);
285 __free_page(page);
286}
287
288/**
289 * nilfs_copy_page -- copy the page with buffers
290 * @dst: destination page
291 * @src: source page
292 * @copy_dirty: flag indicating whether to copy the dirty states of the page's buffer heads
293 *
294 * This function is for both data pages and btnode pages. The dirty flag
295 * must be handled by the caller. The page must not be under I/O.
296 * Both the src and dst pages must be locked.
297 */
298static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
299{
300 struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
301 unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
302
303 BUG_ON(PageWriteback(dst));
304
305 sbh = sbufs = page_buffers(src);
306 if (!page_has_buffers(dst))
307 create_empty_buffers(dst, sbh->b_size, 0);
308
309 if (copy_dirty)
310 mask |= (1UL << BH_Dirty);
311
312 dbh = dbufs = page_buffers(dst);
313 do {
314 lock_buffer(sbh);
315 lock_buffer(dbh);
316 dbh->b_state = sbh->b_state & mask;
317 dbh->b_blocknr = sbh->b_blocknr;
318 dbh->b_bdev = sbh->b_bdev;
319 sbh = sbh->b_this_page;
320 dbh = dbh->b_this_page;
321 } while (dbh != dbufs);
322
323 copy_highpage(dst, src);
324
325 if (PageUptodate(src) && !PageUptodate(dst))
326 SetPageUptodate(dst);
327 else if (!PageUptodate(src) && PageUptodate(dst))
328 ClearPageUptodate(dst);
329 if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
330 SetPageMappedToDisk(dst);
331 else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
332 ClearPageMappedToDisk(dst);
333
334 do {
335 unlock_buffer(sbh);
336 unlock_buffer(dbh);
337 sbh = sbh->b_this_page;
338 dbh = dbh->b_this_page;
339 } while (dbh != dbufs);
340}
341
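/*
 * Copy all dirty pages, together with their dirty state, from the
 * source page cache @smap into the destination cache @dmap. Returns 0
 * on success, or -ENOMEM when a destination page cannot be allocated.
 */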
342int nilfs_copy_dirty_pages(struct address_space *dmap,
343 struct address_space *smap)
344{
345 struct pagevec pvec;
346 unsigned int i;
347 pgoff_t index = 0;
348 int err = 0;
349
350 pagevec_init(&pvec, 0);
351repeat:
352 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
353 PAGEVEC_SIZE))
354 return 0;
355
356 for (i = 0; i < pagevec_count(&pvec); i++) {
357 struct page *page = pvec.pages[i], *dpage;
358
359 lock_page(page);
360 if (unlikely(!PageDirty(page)))
361 NILFS_PAGE_BUG(page, "inconsistent dirty state");
362
363 dpage = grab_cache_page(dmap, page->index);
364 if (unlikely(!dpage)) {
365 /* No empty page is added to the page cache */
366 err = -ENOMEM;
367 unlock_page(page);
368 break;
369 }
370 if (unlikely(!page_has_buffers(page)))
371 NILFS_PAGE_BUG(page,
372 "found empty page in dat page cache");
373
374 nilfs_copy_page(dpage, page, 1);
375 __set_page_dirty_nobuffers(dpage);
376
377 unlock_page(dpage);
378 page_cache_release(dpage);
379 unlock_page(page);
380 }
381 pagevec_release(&pvec);
382 cond_resched();
383
384 if (likely(!err))
385 goto repeat;
386 return err;
387}
388
389/**
390 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
391 * @dmap: destination page cache
392 * @smap: source page cache
393 *
394 * No pages must be added to the cache during this process.
395 * This must be ensured by the caller.
396 */
397void nilfs_copy_back_pages(struct address_space *dmap,
398 struct address_space *smap)
399{
400 struct pagevec pvec;
401 unsigned int i, n;
402 pgoff_t index = 0;
403 int err;
404
405 pagevec_init(&pvec, 0);
406repeat:
407 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
408 if (!n)
409 return;
410 index = pvec.pages[n - 1]->index + 1;
411
412 for (i = 0; i < pagevec_count(&pvec); i++) {
413 struct page *page = pvec.pages[i], *dpage;
414 pgoff_t offset = page->index;
415
416 lock_page(page);
417 dpage = find_lock_page(dmap, offset);
418 if (dpage) {
419 /* override existing page on the destination cache */
420 WARN_ON(PageDirty(dpage));
421 nilfs_copy_page(dpage, page, 0);
422 unlock_page(dpage);
423 page_cache_release(dpage);
424 } else {
425 struct page *page2;
426
427 /* move the page to the destination cache */
428 spin_lock_irq(&smap->tree_lock);
429 page2 = radix_tree_delete(&smap->page_tree, offset);
430 WARN_ON(page2 != page);
431
432 smap->nrpages--;
433 spin_unlock_irq(&smap->tree_lock);
434
435 spin_lock_irq(&dmap->tree_lock);
436 err = radix_tree_insert(&dmap->page_tree, offset, page);
437 if (unlikely(err < 0)) {
438 WARN_ON(err == -EEXIST);
439 page->mapping = NULL;
440 page_cache_release(page); /* for cache */
441 } else {
442 page->mapping = dmap;
443 dmap->nrpages++;
444 if (PageDirty(page))
445 radix_tree_tag_set(&dmap->page_tree,
446 offset,
447 PAGECACHE_TAG_DIRTY);
448 }
449 spin_unlock_irq(&dmap->tree_lock);
450 }
451 unlock_page(page);
452 }
453 pagevec_release(&pvec);
454 cond_resched();
455
456 goto repeat;
457}
458
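/*
 * Walk the dirty pages of @mapping and cancel their dirty state,
 * clearing the uptodate and mapped flags of both the pages and their
 * buffers.
 */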
459void nilfs_clear_dirty_pages(struct address_space *mapping)
460{
461 struct pagevec pvec;
462 unsigned int i;
463 pgoff_t index = 0;
464
465 pagevec_init(&pvec, 0);
466
467 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
468 PAGEVEC_SIZE)) {
469 for (i = 0; i < pagevec_count(&pvec); i++) {
470 struct page *page = pvec.pages[i];
471 struct buffer_head *bh, *head;
472
473 lock_page(page);
474 ClearPageUptodate(page);
475 ClearPageMappedToDisk(page);
476 bh = head = page_buffers(page);
477 do {
478 lock_buffer(bh);
479 clear_buffer_dirty(bh);
480 clear_buffer_nilfs_volatile(bh);
481 clear_buffer_uptodate(bh);
482 clear_buffer_mapped(bh);
483 unlock_buffer(bh);
484 bh = bh->b_this_page;
485 } while (bh != head);
486
487 __nilfs_clear_page_dirty(page);
488 unlock_page(page);
489 }
490 pagevec_release(&pvec);
491 cond_resched();
492 }
493}
494
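/*
 * Count the buffers of @page that overlap the byte range [@from, @to)
 * and are not dirty.
 */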
495unsigned nilfs_page_count_clean_buffers(struct page *page,
496 unsigned from, unsigned to)
497{
498 unsigned block_start, block_end;
499 struct buffer_head *bh, *head;
500 unsigned nc = 0;
501
502 for (bh = head = page_buffers(page), block_start = 0;
503 bh != head || !block_start;
504 block_start = block_end, bh = bh->b_this_page) {
505 block_end = block_start + bh->b_size;
506 if (block_end > from && block_start < to && !buffer_dirty(bh))
507 nc++;
508 }
509 return nc;
510}
511
512/*
513 * NILFS2 needs clear_page_dirty() in the following two cases:
514 *
515 * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
516 * page dirty flags when it copies back pages from the shadow cache
517 * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
518 * (dat->{i_mapping,i_btnode_cache}).
519 *
520 * 2) Some B-tree operations like insertion or deletion may dispose of dirty
521 * buffers, and this requires cancelling the dirty state of their pages.
522 */
523int __nilfs_clear_page_dirty(struct page *page)
524{
525 struct address_space *mapping = page->mapping;
526
527 if (mapping) {
528 spin_lock_irq(&mapping->tree_lock);
529 if (test_bit(PG_dirty, &page->flags)) {
530 radix_tree_tag_clear(&mapping->page_tree,
531 page_index(page),
532 PAGECACHE_TAG_DIRTY);
533 spin_unlock_irq(&mapping->tree_lock);
534 return clear_page_dirty_for_io(page);
535 }
536 spin_unlock_irq(&mapping->tree_lock);
537 return 0;
538 }
539 return TestClearPageDirty(page);
540}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 000000000000..8abca4d1c1f8
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
1/*
2 * page.h - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#ifndef _NILFS_PAGE_H
25#define _NILFS_PAGE_H
26
27#include <linux/buffer_head.h>
28#include "nilfs.h"
29
30/*
31 * Extended buffer state bits
32 */
33enum {
34 BH_NILFS_Allocated = BH_PrivateStart,
35 BH_NILFS_Node,
36 BH_NILFS_Volatile,
37};
38
39BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
40BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
41BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
42
43
44void nilfs_mark_buffer_dirty(struct buffer_head *bh);
45int __nilfs_clear_page_dirty(struct page *);
46
47struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
48 unsigned long, unsigned long);
49void nilfs_forget_buffer(struct buffer_head *);
50void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
51int nilfs_page_buffers_clean(struct page *);
52void nilfs_page_bug(struct page *);
53struct page *nilfs_alloc_private_page(struct block_device *, int,
54 unsigned long);
55void nilfs_free_private_page(struct page *);
56
57int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
58void nilfs_copy_back_pages(struct address_space *, struct address_space *);
59void nilfs_clear_dirty_pages(struct address_space *);
60unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
61
62#define NILFS_PAGE_BUG(page, m, a...) \
63 do { nilfs_page_bug(page); BUG(); } while (0)
64
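/*
 * Return the @count-th buffer head of @page (counting from zero) with
 * an extra reference taken on it.
 */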
65static inline struct buffer_head *
66nilfs_page_get_nth_block(struct page *page, unsigned int count)
67{
68 struct buffer_head *bh = page_buffers(page);
69
70 while (count-- > 0)
71 bh = bh->b_this_page;
72 get_bh(bh);
73 return bh;
74}
75
76#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 000000000000..6ade0963fc1d
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,929 @@
1/*
2 * recovery.c - NILFS recovery logic
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/swap.h>
26#include <linux/crc32.h>
27#include "nilfs.h"
28#include "segment.h"
29#include "sufile.h"
30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h"
33
34/*
35 * Segment check result
36 */
37enum {
38 NILFS_SEG_VALID,
39 NILFS_SEG_NO_SUPER_ROOT,
40 NILFS_SEG_FAIL_IO,
41 NILFS_SEG_FAIL_MAGIC,
42 NILFS_SEG_FAIL_SEQ,
43 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
44 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
45 NILFS_SEG_FAIL_CHECKSUM_FULL,
46 NILFS_SEG_FAIL_CONSISTENCY,
47};
48
49/* work structure for recovery */
50struct nilfs_recovery_block {
51 ino_t ino; /* Inode number of the file that this block
52 belongs to */
53 sector_t blocknr; /* block number */
54 __u64 vblocknr; /* virtual block number */
55 unsigned long blkoff; /* File offset of the data block (per block) */
56 struct list_head list;
57};
58
59
60static int nilfs_warn_segment_error(int err)
61{
62 switch (err) {
63 case NILFS_SEG_FAIL_IO:
64 printk(KERN_WARNING
65 "NILFS warning: I/O error on loading last segment\n");
66 return -EIO;
67 case NILFS_SEG_FAIL_MAGIC:
68 printk(KERN_WARNING
69 "NILFS warning: Segment magic number invalid\n");
70 break;
71 case NILFS_SEG_FAIL_SEQ:
72 printk(KERN_WARNING
73 "NILFS warning: Sequence number mismatch\n");
74 break;
75 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
76 printk(KERN_WARNING
77 "NILFS warning: Checksum error in segment summary\n");
78 break;
79 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
80 printk(KERN_WARNING
81 "NILFS warning: Checksum error in super root\n");
82 break;
83 case NILFS_SEG_FAIL_CHECKSUM_FULL:
84 printk(KERN_WARNING
85 "NILFS warning: Checksum error in segment payload\n");
86 break;
87 case NILFS_SEG_FAIL_CONSISTENCY:
88 printk(KERN_WARNING
89 "NILFS warning: Inconsistent segment\n");
90 break;
91 case NILFS_SEG_NO_SUPER_ROOT:
92 printk(KERN_WARNING
93 "NILFS warning: No super root in the last segment\n");
94 break;
95 }
96 return -EINVAL;
97}
98
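/*
 * Unpack the little-endian fields of an on-disk segment summary into
 * @ssi and derive the number of summary blocks and file blocks of the
 * partial segment.
 */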
99static void store_segsum_info(struct nilfs_segsum_info *ssi,
100 struct nilfs_segment_summary *sum,
101 unsigned int blocksize)
102{
103 ssi->flags = le16_to_cpu(sum->ss_flags);
104 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
105 ssi->ctime = le64_to_cpu(sum->ss_create);
106 ssi->next = le64_to_cpu(sum->ss_next);
107 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
108 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
109 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
110
111 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
112 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
113}
114
115/**
116 * calc_crc_cont - compute a CRC over a run of consecutive blocks
117 * @sbi: nilfs_sb_info
118 * @bhs: buffer head of the start block
119 * @sum: place to store the result
120 * @offset: byte offset in the first block
121 * @check_bytes: number of bytes to be checked
122 * @start: disk block number (DBN) of the start block
123 * @nblock: number of blocks to be checked
124 */
125static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
126 u32 *sum, unsigned long offset, u64 check_bytes,
127 sector_t start, unsigned long nblock)
128{
129 unsigned long blocksize = sbi->s_super->s_blocksize;
130 unsigned long size;
131 u32 crc;
132
133 BUG_ON(offset >= blocksize);
134 check_bytes -= offset;
135 size = min_t(u64, check_bytes, blocksize - offset);
136 crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
137 (unsigned char *)bhs->b_data + offset, size);
138 if (--nblock > 0) {
139 do {
140 struct buffer_head *bh
141 = sb_bread(sbi->s_super, ++start);
142 if (!bh)
143 return -EIO;
144 check_bytes -= size;
145 size = min_t(u64, check_bytes, blocksize);
146 crc = crc32_le(crc, bh->b_data, size);
147 brelse(bh);
148 } while (--nblock > 0);
149 }
150 *sum = crc;
151 return 0;
152}
153
154/**
155 * nilfs_read_super_root_block - read super root block
156 * @sb: super_block
157 * @sr_block: disk block number of the super root block
158 * @pbh: address of a buffer_head pointer to return super root buffer
159 * @check: CRC check flag
160 */
161int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
162 struct buffer_head **pbh, int check)
163{
164 struct buffer_head *bh_sr;
165 struct nilfs_super_root *sr;
166 u32 crc;
167 int ret;
168
169 *pbh = NULL;
170 bh_sr = sb_bread(sb, sr_block);
171 if (unlikely(!bh_sr)) {
172 ret = NILFS_SEG_FAIL_IO;
173 goto failed;
174 }
175
176 sr = (struct nilfs_super_root *)bh_sr->b_data;
177 if (check) {
178 unsigned bytes = le16_to_cpu(sr->sr_bytes);
179
180 if (bytes == 0 || bytes > sb->s_blocksize) {
181 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
182 goto failed_bh;
183 }
184 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
185 sizeof(sr->sr_sum), bytes, sr_block, 1)) {
186 ret = NILFS_SEG_FAIL_IO;
187 goto failed_bh;
188 }
189 if (crc != le32_to_cpu(sr->sr_sum)) {
190 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
191 goto failed_bh;
192 }
193 }
194 *pbh = bh_sr;
195 return 0;
196
197 failed_bh:
198 brelse(bh_sr);
199
200 failed:
201 return nilfs_warn_segment_error(ret);
202}
203
204/**
205 * load_segment_summary - read segment summary of the specified partial segment
206 * @sbi: nilfs_sb_info
207 * @pseg_start: start disk block number of partial segment
208 * @seg_seq: sequence number requested
209 * @ssi: pointer to nilfs_segsum_info struct to store information
210 * @full_check: full check flag
211 * (0: check only the segment summary CRC, 1: also check the data CRC)
212 */
213static int
214load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
215 u64 seg_seq, struct nilfs_segsum_info *ssi,
216 int full_check)
217{
218 struct buffer_head *bh_sum;
219 struct nilfs_segment_summary *sum;
220 unsigned long offset, nblock;
221 u64 check_bytes;
222 u32 crc, crc_sum;
223 int ret = NILFS_SEG_FAIL_IO;
224
225 bh_sum = sb_bread(sbi->s_super, pseg_start);
226 if (!bh_sum)
227 goto out;
228
229 sum = (struct nilfs_segment_summary *)bh_sum->b_data;
230
231 /* Check consistency of segment summary */
232 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
233 ret = NILFS_SEG_FAIL_MAGIC;
234 goto failed;
235 }
236 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
237 if (seg_seq != ssi->seg_seq) {
238 ret = NILFS_SEG_FAIL_SEQ;
239 goto failed;
240 }
241 if (full_check) {
242 offset = sizeof(sum->ss_datasum);
243 check_bytes =
244 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
245 nblock = ssi->nblocks;
246 crc_sum = le32_to_cpu(sum->ss_datasum);
247 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 } else { /* only checks segment summary */
249 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
250 check_bytes = ssi->sumbytes;
251 nblock = ssi->nsumblk;
252 crc_sum = le32_to_cpu(sum->ss_sumsum);
253 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
254 }
255
256 if (unlikely(nblock == 0 ||
257 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
258 /* This limits the number of blocks read in the CRC check */
259 ret = NILFS_SEG_FAIL_CONSISTENCY;
260 goto failed;
261 }
262 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
263 pseg_start, nblock)) {
264 ret = NILFS_SEG_FAIL_IO;
265 goto failed;
266 }
267 if (crc == crc_sum)
268 ret = 0;
269 failed:
270 brelse(bh_sum);
271 out:
272 return ret;
273}
274
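/*
 * Return a pointer to the next @bytes bytes of segment summary data and
 * advance *@offset past them, reading the following summary block when
 * the current one has been consumed (items are assumed not to straddle
 * block boundaries). Returns NULL on read failure.
 */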
275static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
276 unsigned int *offset, unsigned int bytes)
277{
278 void *ptr;
279 sector_t blocknr;
280
281 BUG_ON((*pbh)->b_size < *offset);
282 if (bytes > (*pbh)->b_size - *offset) {
283 blocknr = (*pbh)->b_blocknr;
284 brelse(*pbh);
285 *pbh = sb_bread(sb, blocknr + 1);
286 if (unlikely(!*pbh))
287 return NULL;
288 *offset = 0;
289 }
290 ptr = (*pbh)->b_data + *offset;
291 *offset += bytes;
292 return ptr;
293}
294
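/*
 * Skip @count summary items of @bytes bytes each, reading ahead to the
 * block that contains the next unread item when the run crosses block
 * boundaries.
 */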
295static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
296 unsigned int *offset, unsigned int bytes,
297 unsigned long count)
298{
299 unsigned int rest_item_in_current_block
300 = ((*pbh)->b_size - *offset) / bytes;
301
302 if (count <= rest_item_in_current_block) {
303 *offset += bytes * count;
304 } else {
305 sector_t blocknr = (*pbh)->b_blocknr;
306 unsigned int nitem_per_block = (*pbh)->b_size / bytes;
307 unsigned int bcnt;
308
309 count -= rest_item_in_current_block;
310 bcnt = DIV_ROUND_UP(count, nitem_per_block);
311 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
312
313 brelse(*pbh);
314 *pbh = sb_bread(sb, blocknr + bcnt);
315 }
316}
317
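/*
 * Scan the finfo/binfo entries of one partial segment's summary and
 * queue a nilfs_recovery_block on @head for each data block that may
 * need to be salvaged; node blocks are skipped.
 */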
318static int
319collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
320 struct nilfs_segsum_info *ssi,
321 struct list_head *head)
322{
323 struct buffer_head *bh;
324 unsigned int offset;
325 unsigned long nfinfo = ssi->nfinfo;
326 sector_t blocknr = sum_blocknr + ssi->nsumblk;
327 ino_t ino;
328 int err = -EIO;
329
330 if (!nfinfo)
331 return 0;
332
333 bh = sb_bread(sbi->s_super, sum_blocknr);
334 if (unlikely(!bh))
335 goto out;
336
337 offset = le16_to_cpu(
338 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
339 for (;;) {
340 unsigned long nblocks, ndatablk, nnodeblk;
341 struct nilfs_finfo *finfo;
342
343 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
344 if (unlikely(!finfo))
345 goto out;
346
347 ino = le64_to_cpu(finfo->fi_ino);
348 nblocks = le32_to_cpu(finfo->fi_nblocks);
349 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
350 nnodeblk = nblocks - ndatablk;
351
352 while (ndatablk-- > 0) {
353 struct nilfs_recovery_block *rb;
354 struct nilfs_binfo_v *binfo;
355
356 binfo = segsum_get(sbi->s_super, &bh, &offset,
357 sizeof(*binfo));
358 if (unlikely(!binfo))
359 goto out;
360
361 rb = kmalloc(sizeof(*rb), GFP_NOFS);
362 if (unlikely(!rb)) {
363 err = -ENOMEM;
364 goto out;
365 }
366 rb->ino = ino;
367 rb->blocknr = blocknr++;
368 rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
369 rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
370 /* INIT_LIST_HEAD(&rb->list); */
371 list_add_tail(&rb->list, head);
372 }
373 if (--nfinfo == 0)
374 break;
375 blocknr += nnodeblk; /* always 0 for the data sync segments */
376 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
377 nnodeblk);
378 if (unlikely(!bh))
379 goto out;
380 }
381 err = 0;
382 out:
383 brelse(bh); /* brelse(NULL) is just ignored */
384 return err;
385}
386
387static void dispose_recovery_list(struct list_head *head)
388{
389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb
391 = list_entry(head->next,
392 struct nilfs_recovery_block, list);
393 list_del(&rb->list);
394 kfree(rb);
395 }
396}
397
398void nilfs_dispose_segment_list(struct list_head *head)
399{
400 while (!list_empty(head)) {
401 struct nilfs_segment_entry *ent
402 = list_entry(head->next,
403 struct nilfs_segment_entry, list);
404 list_del(&ent->list);
405 nilfs_free_segment_entry(ent);
406 }
407}
408
409static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
410 struct nilfs_recovery_info *ri)
411{
412 struct list_head *head = &ri->ri_used_segments;
413 struct nilfs_segment_entry *ent, *n;
414 struct inode *sufile = nilfs->ns_sufile;
415 __u64 segnum[4];
416 time_t mtime;
417 int err;
418 int i;
419
420 segnum[0] = nilfs->ns_segnum;
421 segnum[1] = nilfs->ns_nextnum;
422 segnum[2] = ri->ri_segnum;
423 segnum[3] = ri->ri_nextnum;
424
425 /*
426 * Releasing the next segment of the latest super root.
427 * The next segment is invalidated by this recovery.
428 */
429 err = nilfs_sufile_free(sufile, segnum[1]);
430 if (unlikely(err))
431 goto failed;
432
433 err = -ENOMEM;
434 for (i = 1; i < 4; i++) {
435 ent = nilfs_alloc_segment_entry(segnum[i]);
436 if (unlikely(!ent))
437 goto failed;
438 list_add_tail(&ent->list, head);
439 }
440
441 /*
442 * Collecting segments written after the latest super root.
443 * These are marked dirty to avoid being reallocated in the next write.
444 */
445 mtime = get_seconds();
446 list_for_each_entry_safe(ent, n, head, list) {
447 if (ent->segnum == segnum[0]) {
448 list_del(&ent->list);
449 nilfs_free_segment_entry(ent);
450 continue;
451 }
452 err = nilfs_open_segment_entry(ent, sufile);
453 if (unlikely(err))
454 goto failed;
455 if (!nilfs_segment_usage_dirty(ent->raw_su)) {
456 /* make the segment garbage */
457 ent->raw_su->su_nblocks = cpu_to_le32(0);
458 ent->raw_su->su_lastmod = cpu_to_le32(mtime);
459 nilfs_segment_usage_set_dirty(ent->raw_su);
460 }
461 list_del(&ent->list);
462 nilfs_close_segment_entry(ent, sufile);
463 nilfs_free_segment_entry(ent);
464 }
465
466 /* Allocate new segments for recovery */
467 err = nilfs_sufile_alloc(sufile, &segnum[0]);
468 if (unlikely(err))
469 goto failed;
470
471 nilfs->ns_pseg_offset = 0;
472 nilfs->ns_seg_seq = ri->ri_seq + 2;
473 nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
474 return 0;
475
476 failed:
477 /* No need to recover sufile because it will be destroyed on error */
478 return err;
479}
480
481static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
482 struct nilfs_recovery_block *rb,
483 struct page *page)
484{
485 struct buffer_head *bh_org;
486 void *kaddr;
487
488 bh_org = sb_bread(sbi->s_super, rb->blocknr);
489 if (unlikely(!bh_org))
490 return -EIO;
491
492 kaddr = kmap_atomic(page, KM_USER0);
493 memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
494 kunmap_atomic(kaddr, KM_USER0);
495 brelse(bh_org);
496 return 0;
497}
498
499static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
500 struct list_head *head,
501 unsigned long *nr_salvaged_blocks)
502{
503 struct inode *inode;
504 struct nilfs_recovery_block *rb, *n;
505 unsigned blocksize = sbi->s_super->s_blocksize;
506 struct page *page;
507 loff_t pos;
508 int err = 0, err2 = 0;
509
510 list_for_each_entry_safe(rb, n, head, list) {
511 inode = nilfs_iget(sbi->s_super, rb->ino);
512 if (IS_ERR(inode)) {
513 err = PTR_ERR(inode);
514 inode = NULL;
515 goto failed_inode;
516 }
517
518 pos = rb->blkoff << inode->i_blkbits;
519 page = NULL;
520 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
521 0, &page, NULL, nilfs_get_block);
522 if (unlikely(err))
523 goto failed_inode;
524
525 err = nilfs_recovery_copy_block(sbi, rb, page);
526 if (unlikely(err))
527 goto failed_page;
528
529 err = nilfs_set_file_dirty(sbi, inode, 1);
530 if (unlikely(err))
531 goto failed_page;
532
533 block_write_end(NULL, inode->i_mapping, pos, blocksize,
534 blocksize, page, NULL);
535
536 unlock_page(page);
537 page_cache_release(page);
538
539 (*nr_salvaged_blocks)++;
540 goto next;
541
542 failed_page:
543 unlock_page(page);
544 page_cache_release(page);
545
546 failed_inode:
547 printk(KERN_WARNING
548 "NILFS warning: error recovering data block "
549 "(err=%d, ino=%lu, block-offset=%llu)\n",
550 err, rb->ino, (unsigned long long)rb->blkoff);
551 if (!err2)
552 err2 = err;
553 next:
554 iput(inode); /* iput(NULL) is just ignored */
555 list_del_init(&rb->list);
556 kfree(rb);
557 }
558 return err2;
559}
560
561/**
562 * nilfs_do_roll_forward - salvage logical segments newer than the latest
563 * checkpoint
564 * @nilfs: the_nilfs
565 * @sbi: nilfs_sb_info
566 * @ri: pointer to a nilfs_recovery_info
567 */
568static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
569 struct nilfs_sb_info *sbi,
570 struct nilfs_recovery_info *ri)
571{
572 struct nilfs_segsum_info ssi;
573 sector_t pseg_start;
574 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
575 unsigned long nsalvaged_blocks = 0;
576 u64 seg_seq;
577 __u64 segnum, nextnum = 0;
578 int empty_seg = 0;
579 int err = 0, ret;
580 LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */
581 enum {
582 RF_INIT_ST,
583 RF_DSYNC_ST, /* scanning data-sync segments */
584 };
585 int state = RF_INIT_ST;
586
587 nilfs_attach_writer(nilfs, sbi);
588 pseg_start = ri->ri_lsegs_start;
589 seg_seq = ri->ri_lsegs_start_seq;
590 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
591 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
592
593 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
594
595 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
596 if (ret) {
597 if (ret == NILFS_SEG_FAIL_IO) {
598 err = -EIO;
599 goto failed;
600 }
601 goto strayed;
602 }
603 if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
604 goto confused;
605
606 /* Found a valid partial segment; do recovery actions */
607 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
608 empty_seg = 0;
609 nilfs->ns_ctime = ssi.ctime;
610 if (!(ssi.flags & NILFS_SS_GC))
611 nilfs->ns_nongc_ctime = ssi.ctime;
612
613 switch (state) {
614 case RF_INIT_ST:
615 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
616 goto try_next_pseg;
617 state = RF_DSYNC_ST;
618 /* Fall through */
619 case RF_DSYNC_ST:
620 if (!NILFS_SEG_DSYNC(&ssi))
621 goto confused;
622
623 err = collect_blocks_from_segsum(
624 sbi, pseg_start, &ssi, &dsync_blocks);
625 if (unlikely(err))
626 goto failed;
627 if (NILFS_SEG_LOGEND(&ssi)) {
628 err = recover_dsync_blocks(
629 sbi, &dsync_blocks, &nsalvaged_blocks);
630 if (unlikely(err))
631 goto failed;
632 state = RF_INIT_ST;
633 }
634 break; /* Fall through to try_next_pseg */
635 }
636
637 try_next_pseg:
638 if (pseg_start == ri->ri_lsegs_end)
639 break;
640 pseg_start += ssi.nblocks;
641 if (pseg_start < seg_end)
642 continue;
643 goto feed_segment;
644
645 strayed:
646 if (pseg_start == ri->ri_lsegs_end)
647 break;
648
649 feed_segment:
650			/* Move on to the next full segment */
651 if (empty_seg++)
652 break;
653 seg_seq++;
654 segnum = nextnum;
655 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
656 pseg_start = seg_start;
657 }
658
659 if (nsalvaged_blocks) {
660 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
661 sbi->s_super->s_id, nsalvaged_blocks);
662 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
663 }
664 out:
665 dispose_recovery_list(&dsync_blocks);
666 nilfs_detach_writer(sbi->s_nilfs, sbi);
667 return err;
668
669 confused:
670 err = -EINVAL;
671 failed:
672 printk(KERN_ERR
673 "NILFS (device %s): Error roll-forwarding "
674 "(err=%d, pseg block=%llu). ",
675 sbi->s_super->s_id, err, (unsigned long long)pseg_start);
676 goto out;
677}
678
679static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
680 struct nilfs_sb_info *sbi,
681 struct nilfs_recovery_info *ri)
682{
683 struct buffer_head *bh;
684 int err;
685
686 if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
687 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
688 return;
689
690 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
691 BUG_ON(!bh);
692 memset(bh->b_data, 0, bh->b_size);
693 set_buffer_dirty(bh);
694 err = sync_dirty_buffer(bh);
695 if (unlikely(err))
696 printk(KERN_WARNING
697 "NILFS warning: buffer sync write failed during "
698 "post-cleaning of recovery.\n");
699 brelse(bh);
700}
701
702/**
703 * nilfs_recover_logical_segments - salvage logical segments written after
704 * the latest super root
705 * @nilfs: the_nilfs
706 * @sbi: nilfs_sb_info
707 * @ri: pointer to a nilfs_recovery_info struct to store search results.
708 *
709 * Return Value: On success, 0 is returned. On error, one of the following
710 * negative error codes is returned.
711 *
712 * %-EINVAL - Inconsistent filesystem state.
713 *
714 * %-EIO - I/O error
715 *
716 * %-ENOSPC - No space left on device (only in a panic state).
717 *
718 * %-ERESTARTSYS - Interrupted.
719 *
720 * %-ENOMEM - Insufficient memory available.
721 */
722int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
723 struct nilfs_sb_info *sbi,
724 struct nilfs_recovery_info *ri)
725{
726 int err;
727
728 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
729 return 0;
730
731 err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
732 if (unlikely(err)) {
733 printk(KERN_ERR
734 "NILFS: error loading the latest checkpoint.\n");
735 return err;
736 }
737
738 err = nilfs_do_roll_forward(nilfs, sbi, ri);
739 if (unlikely(err))
740 goto failed;
741
742 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
743 err = nilfs_prepare_segment_for_recovery(nilfs, ri);
744 if (unlikely(err)) {
745 printk(KERN_ERR "NILFS: Error preparing segments for "
746 "recovery.\n");
747 goto failed;
748 }
749
750 err = nilfs_attach_segment_constructor(sbi);
751 if (unlikely(err))
752 goto failed;
753
754 set_nilfs_discontinued(nilfs);
755 err = nilfs_construct_segment(sbi->s_super);
756 nilfs_detach_segment_constructor(sbi);
757
758 if (unlikely(err)) {
759 printk(KERN_ERR "NILFS: Oops! recovery failed. "
760 "(err=%d)\n", err);
761 goto failed;
762 }
763
764 nilfs_finish_roll_forward(nilfs, sbi, ri);
765 }
766
767 nilfs_detach_checkpoint(sbi);
768 return 0;
769
770 failed:
771 nilfs_detach_checkpoint(sbi);
772 nilfs_mdt_clear(nilfs->ns_cpfile);
773 nilfs_mdt_clear(nilfs->ns_sufile);
774 nilfs_mdt_clear(nilfs->ns_dat);
775 return err;
776}
777
778/**
779 * nilfs_search_super_root - search the latest valid super root
780 * @nilfs: the_nilfs
781 * @sbi: nilfs_sb_info
782 * @ri: pointer to a nilfs_recovery_info struct to store search results.
783 *
784 * nilfs_search_super_root() looks for the latest super-root from a partial
785 * segment pointed to by the superblock. It sets up struct the_nilfs through
786 * this search. It fills nilfs_recovery_info (ri) required for recovery.
787 *
788 * Return Value: On success, 0 is returned. On error, one of the following
789 * negative error codes is returned.
790 *
791 * %-EINVAL - No valid segment found
792 *
793 * %-EIO - I/O error
794 */
795int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
796 struct nilfs_recovery_info *ri)
797{
798 struct nilfs_segsum_info ssi;
799 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
800 sector_t seg_start, seg_end; /* range of full segment (block number) */
801 u64 seg_seq;
802 __u64 segnum, nextnum = 0;
803 __u64 cno;
804 struct nilfs_segment_entry *ent;
805 LIST_HEAD(segments);
806 int empty_seg = 0, scan_newer = 0;
807 int ret;
808
809 pseg_start = nilfs->ns_last_pseg;
810 seg_seq = nilfs->ns_last_seq;
811 cno = nilfs->ns_last_cno;
812 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
813
814 /* Calculate range of segment */
815 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
816
817 for (;;) {
818 /* Load segment summary */
819 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
820 if (ret) {
821 if (ret == NILFS_SEG_FAIL_IO)
822 goto failed;
823 goto strayed;
824 }
825 pseg_end = pseg_start + ssi.nblocks - 1;
826 if (unlikely(pseg_end > seg_end)) {
827 ret = NILFS_SEG_FAIL_CONSISTENCY;
828 goto strayed;
829 }
830
831 /* A valid partial segment */
832 ri->ri_pseg_start = pseg_start;
833 ri->ri_seq = seg_seq;
834 ri->ri_segnum = segnum;
835 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
836 ri->ri_nextnum = nextnum;
837 empty_seg = 0;
838
839 if (!NILFS_SEG_HAS_SR(&ssi)) {
840 if (!scan_newer) {
841 /* This will never happen because a superblock
842 (last_segment) always points to a pseg
843 having a super root. */
844 ret = NILFS_SEG_FAIL_CONSISTENCY;
845 goto failed;
846 }
847 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
848 ri->ri_lsegs_start = pseg_start;
849 ri->ri_lsegs_start_seq = seg_seq;
850 }
851 if (NILFS_SEG_LOGEND(&ssi))
852 ri->ri_lsegs_end = pseg_start;
853 goto try_next_pseg;
854 }
855
856 /* A valid super root was found. */
857 ri->ri_cno = cno++;
858 ri->ri_super_root = pseg_end;
859 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
860
861 nilfs_dispose_segment_list(&segments);
862 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
863 + ssi.nblocks - seg_start;
864 nilfs->ns_seg_seq = seg_seq;
865 nilfs->ns_segnum = segnum;
866 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
867 nilfs->ns_ctime = ssi.ctime;
868 nilfs->ns_nextnum = nextnum;
869
870 if (scan_newer)
871 ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
872 else {
873 if (nilfs->ns_mount_state & NILFS_VALID_FS)
874 goto super_root_found;
875 scan_newer = 1;
876 }
877
878 /* reset region for roll-forward */
879 pseg_start += ssi.nblocks;
880 if (pseg_start < seg_end)
881 continue;
882 goto feed_segment;
883
884 try_next_pseg:
885		/* Still on course, or an inconsistent state was found */
886 pseg_start += ssi.nblocks;
887 if (pseg_start < seg_end)
888 continue;
889 goto feed_segment;
890
891 strayed:
892 /* Off the trail */
893 if (!scan_newer)
894 /*
895 * This can happen if a checkpoint was written without
896 * barriers, or as a result of an I/O failure.
897 */
898 goto failed;
899
900 feed_segment:
901		/* Move on to the next full segment */
902 if (empty_seg++)
903 goto super_root_found; /* found a valid super root */
904
905 ent = nilfs_alloc_segment_entry(segnum);
906 if (unlikely(!ent)) {
907 ret = -ENOMEM;
908 goto failed;
909 }
910 list_add_tail(&ent->list, &segments);
911
912 seg_seq++;
913 segnum = nextnum;
914 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
915 pseg_start = seg_start;
916 }
917
918 super_root_found:
919 /* Updating pointers relating to the latest checkpoint */
920 list_splice(&segments, ri->ri_used_segments.prev);
921 nilfs->ns_last_pseg = sr_pseg_start;
922 nilfs->ns_last_seq = nilfs->ns_seg_seq;
923 nilfs->ns_last_cno = ri->ri_cno;
924 return 0;
925
926 failed:
927 nilfs_dispose_segment_list(&segments);
928 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
929}
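/*
 * A minimal sketch of how a mount path might drive the two entry points
 * above; the caller shown here is hypothetical, and the setup of "nilfs"
 * and "sbi" (reading the super block, attaching the sb_info) is assumed
 * to happen elsewhere in the tree:
 *
 *	struct nilfs_recovery_info ri;
 *	int err;
 *
 *	memset(&ri, 0, sizeof(ri));
 *	INIT_LIST_HEAD(&ri.ri_used_segments);
 *
 *	err = nilfs_search_super_root(nilfs, sbi, &ri);
 *	if (!err)
 *		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
 *	nilfs_dispose_segment_list(&ri.ri_used_segments);
 */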
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 000000000000..adccd4fc654e
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs;
39struct nilfs_sc_info;
40
41/*
42 * NILFS super-block data in memory
43 */
44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */
51 unsigned long s_mount_opt;
52 uid_t s_resuid;
53 gid_t s_resgid;
54
55 unsigned long s_interval; /* construction interval */
56 unsigned long s_watermark; /* threshold of data amount
57 for the segment construction */
58
59 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63
64 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */
66 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
67 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
68 It covers s_dirty_files list */
69
70 /* Metadata files */
71 struct inode *s_ifile; /* index file inode */
72
73 /* Inode allocator */
74 spinlock_t s_next_gen_lock;
75 u32 s_next_generation;
76};
77
78static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
79{
80 return sb->s_fs_info;
81}
82
83static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
84{
85 return sbi->s_sc_info;
86}
87
88/*
89 * Bit operations for the mount option
90 */
91#define nilfs_clear_opt(sbi, opt) \
92 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
93#define nilfs_set_opt(sbi, opt) \
94 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
95#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
96#define nilfs_write_opt(sbi, mask, opt) \
97 do { (sbi)->s_mount_opt = \
98 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
99 NILFS_MOUNT_##opt); \
100 } while (0)
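/*
 * For example, a mount-option parser might flip a flag with these
 * helpers (NILFS_MOUNT_BARRIER is assumed to be one of the NILFS_MOUNT_*
 * bits defined in include/linux/nilfs2_fs.h):
 *
 *	nilfs_set_opt(sbi, BARRIER);
 *	if (nilfs_test_opt(sbi, BARRIER))
 *		;	/* issue barrier writes */
 *	nilfs_clear_opt(sbi, BARRIER);
 */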
101
102#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000000..1e68821b4a9b
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
1/*
2 * segbuf.c - NILFS segment buffer
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/writeback.h>
26#include <linux/crc32.h>
27#include "page.h"
28#include "segbuf.h"
29#include "seglist.h"
30
31
32static struct kmem_cache *nilfs_segbuf_cachep;
33
34static void nilfs_segbuf_init_once(void *obj)
35{
36 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
37}
38
39int __init nilfs_init_segbuf_cache(void)
40{
41 nilfs_segbuf_cachep =
42 kmem_cache_create("nilfs2_segbuf_cache",
43 sizeof(struct nilfs_segment_buffer),
44 0, SLAB_RECLAIM_ACCOUNT,
45 nilfs_segbuf_init_once);
46
47 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
48}
49
50void nilfs_destroy_segbuf_cache(void)
51{
52 kmem_cache_destroy(nilfs_segbuf_cachep);
53}
54
55struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
56{
57 struct nilfs_segment_buffer *segbuf;
58
59 segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
60 if (unlikely(!segbuf))
61 return NULL;
62
63 segbuf->sb_super = sb;
64 INIT_LIST_HEAD(&segbuf->sb_list);
65 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
66 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
67 return segbuf;
68}
69
70void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
71{
72 kmem_cache_free(nilfs_segbuf_cachep, segbuf);
73}
74
75void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
76 unsigned long offset, struct the_nilfs *nilfs)
77{
78 segbuf->sb_segnum = segnum;
79 nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
80 &segbuf->sb_fseg_end);
81
82 segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
83 segbuf->sb_rest_blocks =
84 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
85}
86
87void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
88 __u64 nextnum, struct the_nilfs *nilfs)
89{
90 segbuf->sb_nextnum = nextnum;
91 segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
92}
93
94int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
95{
96 struct buffer_head *bh;
97
98 bh = sb_getblk(segbuf->sb_super,
99 segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
100 if (unlikely(!bh))
101 return -ENOMEM;
102
103 nilfs_segbuf_add_segsum_buffer(segbuf, bh);
104 return 0;
105}
106
107int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111
112 bh = sb_getblk(segbuf->sb_super,
113 segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
114 if (unlikely(!bh))
115 return -ENOMEM;
116
117 nilfs_segbuf_add_payload_buffer(segbuf, bh);
118 *bhp = bh;
119 return 0;
120}
121
122int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
123 time_t ctime)
124{
125 int err;
126
127 segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
128 err = nilfs_segbuf_extend_segsum(segbuf);
129 if (unlikely(err))
130 return err;
131
132 segbuf->sb_sum.flags = flags;
133 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
134 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
135 segbuf->sb_sum.ctime = ctime;
136
137 segbuf->sb_io_error = 0;
138 return 0;
139}
140
141/*
142 * Set up the segment summary
143 */
144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
145{
146 struct nilfs_segment_summary *raw_sum;
147 struct buffer_head *bh_sum;
148
149 bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
150 struct buffer_head, b_assoc_buffers);
151 raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
152
153 raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC);
154 raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum));
155 raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags);
156 raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq);
157 raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime);
158 raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next);
159 raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks);
160 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
161 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
162 raw_sum->ss_pad = 0;
163}
164
165/*
166 * CRC calculation routines
167 */
168void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
169 u32 seed)
170{
171 struct buffer_head *bh;
172 struct nilfs_segment_summary *raw_sum;
173 unsigned long size, bytes = segbuf->sb_sum.sumbytes;
174 u32 crc;
175
176 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
177 b_assoc_buffers);
178
179 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
180 size = min_t(unsigned long, bytes, bh->b_size);
181 crc = crc32_le(seed,
182 (unsigned char *)raw_sum +
183 sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
184 size - (sizeof(raw_sum->ss_datasum) +
185 sizeof(raw_sum->ss_sumsum)));
186
187 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
188 b_assoc_buffers) {
189 bytes -= size;
190 size = min_t(unsigned long, bytes, bh->b_size);
191 crc = crc32_le(crc, bh->b_data, size);
192 }
193 raw_sum->ss_sumsum = cpu_to_le32(crc);
194}
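/*
 * Note that the first crc32_le() call above starts just past ss_datasum
 * and ss_sumsum, so the checksum fields that lead the on-disk summary
 * header are excluded from the sum that protects them;
 * nilfs_segbuf_fill_in_data_crc() below skips only ss_datasum for the
 * same reason.
 */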
195
196void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
197 u32 seed)
198{
199 struct buffer_head *bh;
200 struct nilfs_segment_summary *raw_sum;
201 void *kaddr;
202 u32 crc;
203
204 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
205 b_assoc_buffers);
206 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
207 crc = crc32_le(seed,
208 (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
209 bh->b_size - sizeof(raw_sum->ss_datasum));
210
211 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
212 b_assoc_buffers) {
213 crc = crc32_le(crc, bh->b_data, bh->b_size);
214 }
215 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
216 kaddr = kmap_atomic(bh->b_page, KM_USER0);
217 crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
218 kunmap_atomic(kaddr, KM_USER0);
219 }
220 raw_sum->ss_datasum = cpu_to_le32(crc);
221}
222
223void nilfs_release_buffers(struct list_head *list)
224{
225 struct buffer_head *bh, *n;
226
227 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
228 list_del_init(&bh->b_assoc_buffers);
229 if (buffer_nilfs_allocated(bh)) {
230 struct page *clone_page = bh->b_page;
231
232 /* remove clone page */
233 brelse(bh);
234 page_cache_release(clone_page); /* for each bh */
235 if (page_count(clone_page) <= 2) {
236 lock_page(clone_page);
237 nilfs_free_private_page(clone_page);
238 }
239 continue;
240 }
241 brelse(bh);
242 }
243}
244
245/*
246 * BIO operations
247 */
248static void nilfs_end_bio_write(struct bio *bio, int err)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct nilfs_write_info *wi = bio->bi_private;
252
253 if (err == -EOPNOTSUPP) {
254 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
255 bio_put(bio);
256 /* to be detected by submit_seg_bio() */
257 }
258
259 if (!uptodate)
260 atomic_inc(&wi->err);
261
262 bio_put(bio);
263 complete(&wi->bio_event);
264}
265
266static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
267{
268 struct bio *bio = wi->bio;
269 int err;
270
271 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
272 wait_for_completion(&wi->bio_event);
273 wi->nbio--;
274 if (unlikely(atomic_read(&wi->err))) {
275 bio_put(bio);
276 err = -EIO;
277 goto failed;
278 }
279 }
280
281 bio->bi_end_io = nilfs_end_bio_write;
282 bio->bi_private = wi;
283 bio_get(bio);
284 submit_bio(mode, bio);
285 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
286 bio_put(bio);
287 err = -EOPNOTSUPP;
288 goto failed;
289 }
290 wi->nbio++;
291 bio_put(bio);
292
293 wi->bio = NULL;
294 wi->rest_blocks -= wi->end - wi->start;
295 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
296 wi->start = wi->end;
297 return 0;
298
299 failed:
300 wi->bio = NULL;
301 return err;
302}
303
304/**
305 * nilfs_alloc_seg_bio - allocate a bio for writing a segment.
306 * @sb: super block
307 * @start: beginning disk block number of this BIO.
308 * @nr_vecs: request size of page vector.
309 *
310 * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
311 *
312 * Return Value: On success, pointer to the struct bio is returned.
313 * On error, NULL is returned.
314 */
315static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316 int nr_vecs)
317{
318 struct bio *bio;
319
320 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
321 if (bio == NULL) {
322 while (!bio && (nr_vecs >>= 1))
323 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
324 }
325 if (likely(bio)) {
326 bio->bi_bdev = sb->s_bdev;
327 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
328 }
329 return bio;
330}
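/*
 * GFP_NOWAIT keeps this allocation from sleeping in the write-out path;
 * when a bio of the requested size is unavailable, the loop above
 * retries with progressively halved vector counts, and the caller sees
 * -ENOMEM only if even a single-vector bio cannot be allocated.
 */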
331
332void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
333 struct nilfs_write_info *wi)
334{
335 wi->bio = NULL;
336 wi->rest_blocks = segbuf->sb_sum.nblocks;
337 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
338 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
339 wi->start = wi->end = 0;
340 wi->nbio = 0;
341 wi->blocknr = segbuf->sb_pseg_start;
342
343 atomic_set(&wi->err, 0);
344 init_completion(&wi->bio_event);
345}
346
347static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
348 int mode)
349{
350 int len, err;
351
352 BUG_ON(wi->nr_vecs <= 0);
353 repeat:
354 if (!wi->bio) {
355 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
356 wi->nr_vecs);
357 if (unlikely(!wi->bio))
358 return -ENOMEM;
359 }
360
361 len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (len == bh->b_size) {
363 wi->end++;
364 return 0;
365 }
366 /* bio is FULL */
367 err = nilfs_submit_seg_bio(wi, mode);
368 /* never submit current bh */
369 if (likely(!err))
370 goto repeat;
371 return err;
372}
373
374int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
375 struct nilfs_write_info *wi)
376{
377 struct buffer_head *bh;
378 int res, rw = WRITE;
379
380 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
381 res = nilfs_submit_bh(wi, bh, rw);
382 if (unlikely(res))
383 goto failed_bio;
384 }
385
386 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
387 res = nilfs_submit_bh(wi, bh, rw);
388 if (unlikely(res))
389 goto failed_bio;
390 }
391
392 if (wi->bio) {
393 /*
394 * Last BIO is always sent through the following
395 * submission.
396 */
397 rw |= (1 << BIO_RW_SYNCIO);
398 res = nilfs_submit_seg_bio(wi, rw);
399 if (unlikely(res))
400 goto failed_bio;
401 }
402
403 res = 0;
404 out:
405 return res;
406
407 failed_bio:
408 atomic_inc(&wi->err);
409 goto out;
410}
411
412/**
413 * nilfs_segbuf_wait - wait for completion of requested BIOs
414 * @wi: nilfs_write_info
415 *
416 * Return Value: On success, 0 is returned. On error, one of the following
417 * negative error codes is returned.
418 *
419 * %-EIO - I/O error
420 */
421int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
422 struct nilfs_write_info *wi)
423{
424 int err = 0;
425
426 if (!wi->nbio)
427 return 0;
428
429 do {
430 wait_for_completion(&wi->bio_event);
431 } while (--wi->nbio > 0);
432
433 if (unlikely(atomic_read(&wi->err) > 0)) {
434 printk(KERN_ERR "NILFS: IO error writing segment\n");
435 err = -EIO;
436 segbuf->sb_io_error = 1;
437 }
438 return err;
439}
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000000..0c3076f4e592
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
1/*
2 * segbuf.h - NILFS Segment buffer prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGBUF_H
24#define _NILFS_SEGBUF_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/bio.h>
29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31
32/**
33 * struct nilfs_segsum_info - On-memory segment summary
34 * @flags: Flags
35 * @nfinfo: Number of file information structures
36 * @nblocks: Number of blocks included in the partial segment
37 * @nsumblk: Number of summary blocks
38 * @sumbytes: Byte count of segment summary
39 * @nfileblk: Total number of file blocks
40 * @seg_seq: Segment sequence number
41 * @ctime: Creation time
42 * @next: Block number of the next full segment
43 */
44struct nilfs_segsum_info {
45 unsigned int flags;
46 unsigned long nfinfo;
47 unsigned long nblocks;
48 unsigned long nsumblk;
49 unsigned long sumbytes;
50 unsigned long nfileblk;
51 u64 seg_seq;
52 time_t ctime;
53 sector_t next;
54};
55
56/* macro for the flags */
57#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
58#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
59#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
60#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
61#define NILFS_SEG_SIMPLEX(sum) \
62 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
63 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
64
65#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
66
67/**
68 * struct nilfs_segment_buffer - Segment buffer
69 * @sb_super: back pointer to a superblock struct
70 * @sb_list: List head to chain this structure
71 * @sb_sum: On-memory segment summary
72 * @sb_segnum: Index number of the full segment
73 * @sb_nextnum: Index number of the next full segment
74 * @sb_fseg_start: Start block number of the full segment
75 * @sb_fseg_end: End block number of the full segment
76 * @sb_pseg_start: Disk block number of partial segment
77 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status
81 */
82struct nilfs_segment_buffer {
83 struct super_block *sb_super;
84 struct list_head sb_list;
85
86 /* Segment information */
87 struct nilfs_segsum_info sb_sum;
88 __u64 sb_segnum;
89 __u64 sb_nextnum;
90 sector_t sb_fseg_start, sb_fseg_end;
91 sector_t sb_pseg_start;
92 unsigned sb_rest_blocks;
93
94 /* Buffers */
95 struct list_head sb_segsum_buffers;
96 struct list_head sb_payload_buffers; /* including super root */
97
98 /* io status */
99 int sb_io_error;
100};
101
102#define NILFS_LIST_SEGBUF(head) \
103 list_entry((head), struct nilfs_segment_buffer, sb_list)
104#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
105#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
106#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev)
107#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next)
108#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head))
109
110#define nilfs_for_each_segbuf_before(s, t, h) \
111 for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
112 (s) = NILFS_NEXT_SEGBUF(s))
113
114#define NILFS_SEGBUF_FIRST_BH(head) \
115 (list_entry((head)->next, struct buffer_head, b_assoc_buffers))
116#define NILFS_SEGBUF_NEXT_BH(bh) \
117 (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
118 b_assoc_buffers))
119#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
120
121
122int __init nilfs_init_segbuf_cache(void);
123void nilfs_destroy_segbuf_cache(void);
124struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
131int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
132int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
133 struct buffer_head **);
134void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
135void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
136void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
137
138static inline void
139nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
140 struct buffer_head *bh)
141{
142 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
143 segbuf->sb_sum.nblocks++;
144 segbuf->sb_sum.nsumblk++;
145}
146
147static inline void
148nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
149 struct buffer_head *bh)
150{
151 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
152 segbuf->sb_sum.nblocks++;
153}
154
155static inline void
156nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
157 struct buffer_head *bh)
158{
159 get_bh(bh);
160 nilfs_segbuf_add_payload_buffer(segbuf, bh);
161 segbuf->sb_sum.nfileblk++;
162}
163
164void nilfs_release_buffers(struct list_head *);
165
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
167{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170}
171
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 000000000000..d39df9144e99
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
1/*
2 * seglist.h - expedient structures and routines to handle lists of segments
3 * (to be removed in a future release)
4 *
5 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Written by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24#ifndef _NILFS_SEGLIST_H
25#define _NILFS_SEGLIST_H
26
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sufile.h"
31
32struct nilfs_segment_entry {
33 __u64 segnum;
34
35#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisionally.
36 It must be cancelled if the
37 construction is aborted */
38
39 unsigned flags;
40 struct list_head list;
41 struct buffer_head *bh_su;
42 struct nilfs_segment_usage *raw_su;
43};
44
45
46void nilfs_dispose_segment_list(struct list_head *);
47
48static inline struct nilfs_segment_entry *
49nilfs_alloc_segment_entry(__u64 segnum)
50{
51 struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
52
53 if (likely(ent)) {
54 ent->segnum = segnum;
55 ent->flags = 0;
56 ent->bh_su = NULL;
57 ent->raw_su = NULL;
58 INIT_LIST_HEAD(&ent->list);
59 }
60 return ent;
61}
62
63static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
64 struct inode *sufile)
65{
66 return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
67 &ent->raw_su, &ent->bh_su);
68}
69
70static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
71 struct inode *sufile)
72{
73 if (!ent->bh_su)
74 return;
75 nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
76 ent->bh_su = NULL;
77 ent->raw_su = NULL;
78}
79
80static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
81{
82 kfree(ent);
83}
84
85#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 000000000000..fb70ec3be20e
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2977 @@
1/*
2 * segment.c - NILFS segment constructor.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/pagemap.h>
25#include <linux/buffer_head.h>
26#include <linux/writeback.h>
27#include <linux/bio.h>
28#include <linux/completion.h>
29#include <linux/blkdev.h>
30#include <linux/backing-dev.h>
31#include <linux/freezer.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
34#include <linux/pagevec.h>
35#include "nilfs.h"
36#include "btnode.h"
37#include "page.h"
38#include "segment.h"
39#include "sufile.h"
40#include "cpfile.h"
41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h"
44
45
46/*
47 * Segment constructor
48 */
49#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
50
51#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
52 appended in collection retry loop */
53
54/* Construction mode */
55enum {
56 SC_LSEG_SR = 1, /* Make a logical segment having a super root */
57 SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
58 a logical segment without a super root */
59 SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
60 creating a checkpoint */
61 SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
62 a checkpoint */
63};
64
65/* Stage numbers of dirty block collection */
66enum {
67 NILFS_ST_INIT = 0,
68 NILFS_ST_GC, /* Collecting dirty blocks for GC */
69 NILFS_ST_FILE,
70 NILFS_ST_IFILE,
71 NILFS_ST_CPFILE,
72 NILFS_ST_SUFILE,
73 NILFS_ST_DAT,
74 NILFS_ST_SR, /* Super root */
75 NILFS_ST_DSYNC, /* Data sync blocks */
76 NILFS_ST_DONE,
77};
78
79/* State flags of collection */
80#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
81#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
82#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED)
83
84/* Operations depending on the construction mode and file type */
85struct nilfs_sc_operations {
86 int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
87 struct inode *);
88 int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
89 struct inode *);
90 int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
91 struct inode *);
92 void (*write_data_binfo)(struct nilfs_sc_info *,
93 struct nilfs_segsum_pointer *,
94 union nilfs_binfo *);
95 void (*write_node_binfo)(struct nilfs_sc_info *,
96 struct nilfs_segsum_pointer *,
97 union nilfs_binfo *);
98};
99
100/*
101 * Other definitions
102 */
103static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
104static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
105static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
106static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
107 int);
108
109#define nilfs_cnt32_gt(a, b) \
110 (typecheck(__u32, a) && typecheck(__u32, b) && \
111 ((__s32)(b) - (__s32)(a) < 0))
112#define nilfs_cnt32_ge(a, b) \
113 (typecheck(__u32, a) && typecheck(__u32, b) && \
114 ((__s32)(a) - (__s32)(b) >= 0))
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
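/*
 * Like the time_after() helpers for jiffies, these comparisons stay
 * correct across 32-bit wrap-around: nilfs_cnt32_gt(0x00000001,
 * 0xffffffff) is true because the signed difference
 * (__s32)(b) - (__s32)(a) is negative, i.e. 0x00000001 is "after"
 * 0xffffffff.
 */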
117
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error codes is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{
156 struct nilfs_transaction_info *cur_ti = current->journal_info;
157 void *save = NULL;
158
159 if (cur_ti) {
160 if (cur_ti->ti_magic == NILFS_TI_MAGIC)
161 return ++cur_ti->ti_count;
162 else {
163 /*
164 * If journal_info field is occupied by other FS,
165 * it is saved and will be restored on
166 * nilfs_transaction_commit().
167 */
168 printk(KERN_WARNING
169 "NILFS warning: journal info from a different "
170 "FS\n");
171 save = current->journal_info;
172 }
173 }
174 if (!ti) {
175 ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
176 if (!ti)
177 return -ENOMEM;
178 ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
179 } else {
180 ti->ti_flags = 0;
181 }
182 ti->ti_count = 0;
183 ti->ti_save = save;
184 ti->ti_magic = NILFS_TI_MAGIC;
185 current->journal_info = ti;
186 return 0;
187}
188
189/**
190 * nilfs_transaction_begin - start indivisible file operations.
191 * @sb: super block
192 * @ti: nilfs_transaction_info
193 * @vacancy_check: flags for vacancy rate checks
194 *
195 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
196 * the segment semaphore, to make a segment construction and write tasks
197 * exclusive. The function is used with nilfs_transaction_commit() in pairs.
198 * The region enclosed by these two functions can be nested. To avoid a
199 * deadlock, the semaphore is only acquired or released in the outermost call.
200 *
201 * This function allocates a nilfs_transaction_info struct to keep context
202 * information. The struct is initialized and hooked onto the current task
203 * in the outermost call. If a pre-allocated struct is given to @ti, it is
204 * used instead; otherwise a new struct is allocated from the slab cache.
205 *
206 * When @vacancy_check flag is set, this function will check the amount of
207 * free space and will wait for the GC to reclaim disk space if capacity is low.
208 *
209 * Return Value: On success, 0 is returned. On error, one of the following
210 * negative error codes is returned.
211 *
212 * %-ENOMEM - Insufficient memory available.
213 *
214 * %-ENOSPC - No space left on device
215 */
216int nilfs_transaction_begin(struct super_block *sb,
217 struct nilfs_transaction_info *ti,
218 int vacancy_check)
219{
220 struct nilfs_sb_info *sbi;
221 struct the_nilfs *nilfs;
222 int ret = nilfs_prepare_segment_lock(ti);
223
224 if (unlikely(ret < 0))
225 return ret;
226 if (ret > 0)
227 return 0;
228
229 sbi = NILFS_SB(sb);
230 nilfs = sbi->s_nilfs;
231 down_read(&nilfs->ns_segctor_sem);
232 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
233 up_read(&nilfs->ns_segctor_sem);
234 ret = -ENOSPC;
235 goto failed;
236 }
237 return 0;
238
239 failed:
240 ti = current->journal_info;
241 current->journal_info = ti->ti_save;
242 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
243 kmem_cache_free(nilfs_transaction_cachep, ti);
244 return ret;
245}
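/*
 * A typical caller brackets a file operation with the begin/commit pair;
 * a minimal sketch, assuming an on-stack transaction info and a caller
 * that wants the vacancy check:
 *
 *	struct nilfs_transaction_info ti;
 *	int err;
 *
 *	err = nilfs_transaction_begin(sb, &ti, 1);
 *	if (unlikely(err))
 *		return err;
 *	... modify inodes or blocks ...
 *	return nilfs_transaction_commit(sb);
 *
 * An error in between is unwound with nilfs_transaction_abort(sb)
 * instead of the commit.
 */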
246
247/**
248 * nilfs_transaction_commit - commit indivisible file operations.
249 * @sb: super block
250 *
251 * nilfs_transaction_commit() releases the read semaphore which is
252 * acquired by nilfs_transaction_begin(). This is only performed
253 * in outermost call of this function. If a commit flag is set,
254 * nilfs_transaction_commit() sets a timer to start the segment
255 * constructor. If a sync flag is set, it starts construction
256 * directly.
257 */
258int nilfs_transaction_commit(struct super_block *sb)
259{
260 struct nilfs_transaction_info *ti = current->journal_info;
261 struct nilfs_sb_info *sbi;
262 struct nilfs_sc_info *sci;
263 int err = 0;
264
265 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
266 ti->ti_flags |= NILFS_TI_COMMIT;
267 if (ti->ti_count > 0) {
268 ti->ti_count--;
269 return 0;
270 }
271 sbi = NILFS_SB(sb);
272 sci = NILFS_SC(sbi);
273 if (sci != NULL) {
274 if (ti->ti_flags & NILFS_TI_COMMIT)
275 nilfs_segctor_start_timer(sci);
276 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
277 sci->sc_watermark)
278 nilfs_segctor_do_flush(sci, 0);
279 }
280 up_read(&sbi->s_nilfs->ns_segctor_sem);
281 current->journal_info = ti->ti_save;
282
283 if (ti->ti_flags & NILFS_TI_SYNC)
284 err = nilfs_construct_segment(sb);
285 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
286 kmem_cache_free(nilfs_transaction_cachep, ti);
287 return err;
288}
289
290void nilfs_transaction_abort(struct super_block *sb)
291{
292 struct nilfs_transaction_info *ti = current->journal_info;
293
294 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
295 if (ti->ti_count > 0) {
296 ti->ti_count--;
297 return;
298 }
299 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
300
301 current->journal_info = ti->ti_save;
302 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
303 kmem_cache_free(nilfs_transaction_cachep, ti);
304}
305
306void nilfs_relax_pressure_in_lock(struct super_block *sb)
307{
308 struct nilfs_sb_info *sbi = NILFS_SB(sb);
309 struct nilfs_sc_info *sci = NILFS_SC(sbi);
310 struct the_nilfs *nilfs = sbi->s_nilfs;
311
312 if (!sci || !sci->sc_flush_request)
313 return;
314
315 set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
316 up_read(&nilfs->ns_segctor_sem);
317
318 down_write(&nilfs->ns_segctor_sem);
319 if (sci->sc_flush_request &&
320 test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
321 struct nilfs_transaction_info *ti = current->journal_info;
322
323 ti->ti_flags |= NILFS_TI_WRITER;
324 nilfs_segctor_do_immediate_flush(sci);
325 ti->ti_flags &= ~NILFS_TI_WRITER;
326 }
327 downgrade_write(&nilfs->ns_segctor_sem);
328}
329
330static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
331 struct nilfs_transaction_info *ti,
332 int gcflag)
333{
334 struct nilfs_transaction_info *cur_ti = current->journal_info;
335
336 WARN_ON(cur_ti);
337 ti->ti_flags = NILFS_TI_WRITER;
338 ti->ti_count = 0;
339 ti->ti_save = cur_ti;
340 ti->ti_magic = NILFS_TI_MAGIC;
341 INIT_LIST_HEAD(&ti->ti_garbage);
342 current->journal_info = ti;
343
344 for (;;) {
345 down_write(&sbi->s_nilfs->ns_segctor_sem);
346 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
347 break;
348
349 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
350
351 up_write(&sbi->s_nilfs->ns_segctor_sem);
352 yield();
353 }
354 if (gcflag)
355 ti->ti_flags |= NILFS_TI_GC;
356}
357
358static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
359{
360 struct nilfs_transaction_info *ti = current->journal_info;
361
362 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
363 BUG_ON(ti->ti_count > 0);
364
365 up_write(&sbi->s_nilfs->ns_segctor_sem);
366 current->journal_info = ti->ti_save;
367 if (!list_empty(&ti->ti_garbage))
368 nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
369}
370
371static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
372 struct nilfs_segsum_pointer *ssp,
373 unsigned bytes)
374{
375 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
376 unsigned blocksize = sci->sc_super->s_blocksize;
377 void *p;
378
379 if (unlikely(ssp->offset + bytes > blocksize)) {
380 ssp->offset = 0;
381 BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
382 &segbuf->sb_segsum_buffers));
383 ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
384 }
385 p = ssp->bh->b_data + ssp->offset;
386 ssp->offset += bytes;
387 return p;
388}
389
390/**
391 * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
392 * @sci: nilfs_sc_info
393 */
394static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
395{
396 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
397 struct buffer_head *sumbh;
398 unsigned sumbytes;
399 unsigned flags = 0;
400 int err;
401
402 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
405 if (unlikely(err))
406 return err;
407
408 sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
409 sumbytes = segbuf->sb_sum.sumbytes;
410 sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes;
411 sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes;
412 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
413 return 0;
414}
415
416static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
417{
418 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
419 if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
420 return -E2BIG; /* The current segment is filled up
421 (internal code) */
422 sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
423 return nilfs_segctor_reset_segment_buffer(sci);
424}
425
426static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
427{
428 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
429 int err;
430
431 if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
432 err = nilfs_segctor_feed_segment(sci);
433 if (err)
434 return err;
435 segbuf = sci->sc_curseg;
436 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
438 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err;
441}
442
443/*
444 * Functions for making segment summary and payloads
445 */
446static int nilfs_segctor_segsum_block_required(
447 struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
448 unsigned binfo_size)
449{
450 unsigned blocksize = sci->sc_super->s_blocksize;
451	/* The sizes of finfo and binfo are small enough relative to blocksize */
452
453 return ssp->offset + binfo_size +
454 (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
455 blocksize;
456}
457
458static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
459 struct inode *inode)
460{
461 sci->sc_curseg->sb_sum.nfinfo++;
462 sci->sc_binfo_ptr = sci->sc_finfo_ptr;
463 nilfs_segctor_map_segsum_entry(
464 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
465
466 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
467 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
468 /* skip finfo */
469}
470
471static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
472 struct inode *inode)
473{
474 struct nilfs_finfo *finfo;
475 struct nilfs_inode_info *ii;
476 struct nilfs_segment_buffer *segbuf;
477
478 if (sci->sc_blk_cnt == 0)
479 return;
480
481 ii = NILFS_I(inode);
482 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
483 sizeof(*finfo));
484 finfo->fi_ino = cpu_to_le64(inode->i_ino);
485 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
486 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
487 finfo->fi_cno = cpu_to_le64(ii->i_cno);
488
489 segbuf = sci->sc_curseg;
490 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
491 sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
492 sci->sc_finfo_ptr = sci->sc_binfo_ptr;
493 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
494}
495
496static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
497 struct buffer_head *bh,
498 struct inode *inode,
499 unsigned binfo_size)
500{
501 struct nilfs_segment_buffer *segbuf;
502 int required, err = 0;
503
504 retry:
505 segbuf = sci->sc_curseg;
506 required = nilfs_segctor_segsum_block_required(
507 sci, &sci->sc_binfo_ptr, binfo_size);
508 if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
509 nilfs_segctor_end_finfo(sci, inode);
510 err = nilfs_segctor_feed_segment(sci);
511 if (err)
512 return err;
513 goto retry;
514 }
515 if (unlikely(required)) {
516 err = nilfs_segbuf_extend_segsum(segbuf);
517 if (unlikely(err))
518 goto failed;
519 }
520 if (sci->sc_blk_cnt == 0)
521 nilfs_segctor_begin_finfo(sci, inode);
522
523 nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
524 /* Substitution to vblocknr is delayed until update_blocknr() */
525 nilfs_segbuf_add_file_buffer(segbuf, bh);
526 sci->sc_blk_cnt++;
527 failed:
528 return err;
529}
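/*
 * The "required + 1" test above reserves room for both a possible extra
 * summary block and the payload block itself; when the current segment
 * buffer cannot hold them, the open finfo is closed and collection
 * continues in the next segment buffer.
 */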
530
531static int nilfs_handle_bmap_error(int err, const char *fname,
532 struct inode *inode, struct super_block *sb)
533{
534 if (err == -EINVAL) {
535 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
536 inode->i_ino);
537 err = -EIO;
538 }
539 return err;
540}
541
542/*
543 * Callback functions that enumerate, mark, and collect dirty blocks
544 */
545static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
546 struct buffer_head *bh, struct inode *inode)
547{
548 int err;
549
550 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
551 if (unlikely(err < 0))
552 return nilfs_handle_bmap_error(err, __func__, inode,
553 sci->sc_super);
554
555 err = nilfs_segctor_add_file_block(sci, bh, inode,
556 sizeof(struct nilfs_binfo_v));
557 if (!err)
558 sci->sc_datablk_cnt++;
559 return err;
560}
561
562static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
563 struct buffer_head *bh,
564 struct inode *inode)
565{
566 int err;
567
568 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
569 if (unlikely(err < 0))
570 return nilfs_handle_bmap_error(err, __func__, inode,
571 sci->sc_super);
572 return 0;
573}
574
575static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
576 struct buffer_head *bh,
577 struct inode *inode)
578{
579 WARN_ON(!buffer_dirty(bh));
580 return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
581}
582
583static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
584 struct nilfs_segsum_pointer *ssp,
585 union nilfs_binfo *binfo)
586{
587 struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
588 sci, ssp, sizeof(*binfo_v));
589 *binfo_v = binfo->bi_v;
590}
591
592static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
593 struct nilfs_segsum_pointer *ssp,
594 union nilfs_binfo *binfo)
595{
596 __le64 *vblocknr = nilfs_segctor_map_segsum_entry(
597 sci, ssp, sizeof(*vblocknr));
598 *vblocknr = binfo->bi_v.bi_vblocknr;
599}
600
601struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap,
605 .write_data_binfo = nilfs_write_file_data_binfo,
606 .write_node_binfo = nilfs_write_file_node_binfo,
607};
608
609static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
610 struct buffer_head *bh, struct inode *inode)
611{
612 int err;
613
614 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
615 if (unlikely(err < 0))
616 return nilfs_handle_bmap_error(err, __func__, inode,
617 sci->sc_super);
618
619 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
620 if (!err)
621 sci->sc_datablk_cnt++;
622 return err;
623}
624
625static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
626 struct buffer_head *bh, struct inode *inode)
627{
628 WARN_ON(!buffer_dirty(bh));
629 return nilfs_segctor_add_file_block(sci, bh, inode,
630 sizeof(struct nilfs_binfo_dat));
631}
632
633static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
634 struct nilfs_segsum_pointer *ssp,
635 union nilfs_binfo *binfo)
636{
637 __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
638 sizeof(*blkoff));
639 *blkoff = binfo->bi_dat.bi_blkoff;
640}
641
642static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
643 struct nilfs_segsum_pointer *ssp,
644 union nilfs_binfo *binfo)
645{
646 struct nilfs_binfo_dat *binfo_dat =
647 nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
648 *binfo_dat = binfo->bi_dat;
649}
650
651struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap,
655 .write_data_binfo = nilfs_write_dat_data_binfo,
656 .write_node_binfo = nilfs_write_dat_node_binfo,
657};
658
659struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL,
662 .collect_bmap = NULL,
663 .write_data_binfo = nilfs_write_file_data_binfo,
664 .write_node_binfo = NULL,
665};
666
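/*
 * nilfs_lookup_dirty_data_buffers - gather dirty data buffers of @inode
 *
 * Scans the page cache for pages tagged dirty within [@start, @end],
 * attaches buffers to pages that have none yet, and queues up to
 * @nlimit dirty buffers on @listp.  Returns the number of buffers
 * collected.
 */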
667static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
668 struct list_head *listp,
669 size_t nlimit,
670 loff_t start, loff_t end)
671{
672 struct address_space *mapping = inode->i_mapping;
673 struct pagevec pvec;
674 pgoff_t index = 0, last = ULONG_MAX;
675 size_t ndirties = 0;
676 int i;
677
678 if (unlikely(start != 0 || end != LLONG_MAX)) {
679 /*
680	 * A valid range is given for syncing data pages. The
681	 * range is rounded to page boundaries; extra dirty
682	 * buffers may be included if blocksize < pagesize.
683 */
684 index = start >> PAGE_SHIFT;
685 last = end >> PAGE_SHIFT;
686 }
687 pagevec_init(&pvec, 0);
688 repeat:
689 if (unlikely(index > last) ||
690 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
691 min_t(pgoff_t, last - index,
692 PAGEVEC_SIZE - 1) + 1))
693 return ndirties;
694
695 for (i = 0; i < pagevec_count(&pvec); i++) {
696 struct buffer_head *bh, *head;
697 struct page *page = pvec.pages[i];
698
699 if (unlikely(page->index > last))
700 break;
701
702 if (mapping->host) {
703 lock_page(page);
704 if (!page_has_buffers(page))
705 create_empty_buffers(page,
706 1 << inode->i_blkbits, 0);
707 unlock_page(page);
708 }
709
710 bh = head = page_buffers(page);
711 do {
712 if (!buffer_dirty(bh))
713 continue;
714 get_bh(bh);
715 list_add_tail(&bh->b_assoc_buffers, listp);
716 ndirties++;
717 if (unlikely(ndirties >= nlimit)) {
718 pagevec_release(&pvec);
719 cond_resched();
720 return ndirties;
721 }
722 } while (bh = bh->b_this_page, bh != head);
723 }
724 pagevec_release(&pvec);
725 cond_resched();
726 goto repeat;
727}
728
729static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
730 struct list_head *listp)
731{
732 struct nilfs_inode_info *ii = NILFS_I(inode);
733 struct address_space *mapping = &ii->i_btnode_cache;
734 struct pagevec pvec;
735 struct buffer_head *bh, *head;
736 unsigned int i;
737 pgoff_t index = 0;
738
739 pagevec_init(&pvec, 0);
740
741 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
742 PAGEVEC_SIZE)) {
743 for (i = 0; i < pagevec_count(&pvec); i++) {
744 bh = head = page_buffers(pvec.pages[i]);
745 do {
746 if (buffer_dirty(bh)) {
747 get_bh(bh);
748 list_add_tail(&bh->b_assoc_buffers,
749 listp);
750 }
751 bh = bh->b_this_page;
752 } while (bh != head);
753 }
754 pagevec_release(&pvec);
755 cond_resched();
756 }
757}
758
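/*
 * nilfs_dispose_list - detach inodes from a dirty list and drop them
 *
 * Inodes still marked NILFS_I_DIRTY are re-queued on s_dirty_files
 * unless @force is set.  References are released in batches of
 * SC_N_INODEVEC so that iput() is never called under s_inode_lock.
 */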
759static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
760 struct list_head *head, int force)
761{
762 struct nilfs_inode_info *ii, *n;
763 struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
764 unsigned nv = 0;
765
766 while (!list_empty(head)) {
767 spin_lock(&sbi->s_inode_lock);
768 list_for_each_entry_safe(ii, n, head, i_dirty) {
769 list_del_init(&ii->i_dirty);
770 if (force) {
771 if (unlikely(ii->i_bh)) {
772 brelse(ii->i_bh);
773 ii->i_bh = NULL;
774 }
775 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
776 set_bit(NILFS_I_QUEUED, &ii->i_state);
777 list_add_tail(&ii->i_dirty,
778 &sbi->s_dirty_files);
779 continue;
780 }
781 ivec[nv++] = ii;
782 if (nv == SC_N_INODEVEC)
783 break;
784 }
785 spin_unlock(&sbi->s_inode_lock);
786
787 for (pii = ivec; nv > 0; pii++, nv--)
788 iput(&(*pii)->vfs_inode);
789 }
790}
791
792static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
793{
794 struct the_nilfs *nilfs = sbi->s_nilfs;
795 int ret = 0;
796
797 if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
798 ret++;
799 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
800 ret++;
801 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
802 ret++;
803 if (ret || nilfs_doing_gc())
804 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
805 ret++;
806 return ret;
807}
808
809static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
810{
811 return list_empty(&sci->sc_dirty_files) &&
812 !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
813 list_empty(&sci->sc_cleaning_segments) &&
814 (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
815}
816
817static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
818{
819 struct nilfs_sb_info *sbi = sci->sc_sbi;
820 int ret = 0;
821
822 if (nilfs_test_metadata_dirty(sbi))
823 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
824
825 spin_lock(&sbi->s_inode_lock);
826 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
827 ret++;
828
829 spin_unlock(&sbi->s_inode_lock);
830 return ret;
831}
832
833static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
834{
835 struct nilfs_sb_info *sbi = sci->sc_sbi;
836 struct the_nilfs *nilfs = sbi->s_nilfs;
837
838 nilfs_mdt_clear_dirty(sbi->s_ifile);
839 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
840 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
841 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
842}
843
844static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
845{
846 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
847 struct buffer_head *bh_cp;
848 struct nilfs_checkpoint *raw_cp;
849 int err;
850
851 /* XXX: this interface will be changed */
852 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
853 &raw_cp, &bh_cp);
854 if (likely(!err)) {
855	/* The following code duplicates part of cpfile; however, it is
856	   needed to collect the checkpoint even if it was not newly
857	   created */
858 nilfs_mdt_mark_buffer_dirty(bh_cp);
859 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
860 nilfs_cpfile_put_checkpoint(
861 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
862 } else
863 WARN_ON(err == -EINVAL || err == -ENOENT);
864
865 return err;
866}
867
868static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
869{
870 struct nilfs_sb_info *sbi = sci->sc_sbi;
871 struct the_nilfs *nilfs = sbi->s_nilfs;
872 struct buffer_head *bh_cp;
873 struct nilfs_checkpoint *raw_cp;
874 int err;
875
876 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
877 &raw_cp, &bh_cp);
878 if (unlikely(err)) {
879 WARN_ON(err == -EINVAL || err == -ENOENT);
880 goto failed_ibh;
881 }
882 raw_cp->cp_snapshot_list.ssl_next = 0;
883 raw_cp->cp_snapshot_list.ssl_prev = 0;
884 raw_cp->cp_inodes_count =
885 cpu_to_le64(atomic_read(&sbi->s_inodes_count));
886 raw_cp->cp_blocks_count =
887 cpu_to_le64(atomic_read(&sbi->s_blocks_count));
888 raw_cp->cp_nblk_inc =
889 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
890 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
891 raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
892
893 if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
894 nilfs_checkpoint_clear_minor(raw_cp);
895 else
896 nilfs_checkpoint_set_minor(raw_cp);
897
898 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
899 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
900 return 0;
901
902 failed_ibh:
903 return err;
904}
905
906static void nilfs_fill_in_file_bmap(struct inode *ifile,
907 struct nilfs_inode_info *ii)
909	{
910 struct buffer_head *ibh;
911 struct nilfs_inode *raw_inode;
912
913 if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
914 ibh = ii->i_bh;
915 BUG_ON(!ibh);
916 raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
917 ibh);
918 nilfs_bmap_write(ii->i_bmap, raw_inode);
919 nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
920 }
921}
922
923static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
924 struct inode *ifile)
925{
926 struct nilfs_inode_info *ii;
927
928 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
929 nilfs_fill_in_file_bmap(ifile, ii);
930 set_bit(NILFS_I_COLLECTED, &ii->i_state);
931 }
932}
933
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs)
965{
966 struct buffer_head *bh_sr = sci->sc_super_root;
967 struct nilfs_super_root *raw_sr =
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size;
970
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ?
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0;
976
977 nilfs_mdt_write_inode_direct(
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
979 nilfs_mdt_write_inode_direct(
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
981 nilfs_mdt_write_inode_direct(
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
983}
984
985static void nilfs_redirty_inodes(struct list_head *head)
986{
987 struct nilfs_inode_info *ii;
988
989 list_for_each_entry(ii, head, i_dirty) {
990 if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
991 clear_bit(NILFS_I_COLLECTED, &ii->i_state);
992 }
993}
994
995static void nilfs_drop_collected_inodes(struct list_head *head)
996{
997 struct nilfs_inode_info *ii;
998
999 list_for_each_entry(ii, head, i_dirty) {
1000 if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
1001 continue;
1002
1003 clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
1004 set_bit(NILFS_I_UPDATED, &ii->i_state);
1005 }
1006}
1007
1008static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
1009 struct inode *sufile)
1011	{
1012 struct list_head *head = &sci->sc_cleaning_segments;
1013 struct nilfs_segment_entry *ent;
1014 int err;
1015
1016 list_for_each_entry(ent, head, list) {
1017 if (!(ent->flags & NILFS_SLH_FREED))
1018 break;
1019 err = nilfs_sufile_cancel_free(sufile, ent->segnum);
1020	WARN_ON(err); /* never happens */
1021 ent->flags &= ~NILFS_SLH_FREED;
1022 }
1023}
1024
1025static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
1026 struct inode *sufile)
1027{
1028 struct list_head *head = &sci->sc_cleaning_segments;
1029 struct nilfs_segment_entry *ent;
1030 int err;
1031
1032 list_for_each_entry(ent, head, list) {
1033 err = nilfs_sufile_free(sufile, ent->segnum);
1034 if (unlikely(err))
1035 return err;
1036 ent->flags |= NILFS_SLH_FREED;
1037 }
1038 return 0;
1039}
1040
1041static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
1042{
1043 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
1044}
1045
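/*
 * nilfs_segctor_apply_buffers - run a collection callback over a
 * buffer list
 *
 * Applies @collect to each buffer on @listp, dropping the list
 * references as it goes.  On failure, or when @collect is NULL, the
 * remaining buffers are simply released.
 */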
1046static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
1047 struct inode *inode,
1048 struct list_head *listp,
1049 int (*collect)(struct nilfs_sc_info *,
1050 struct buffer_head *,
1051 struct inode *))
1052{
1053 struct buffer_head *bh, *n;
1054 int err = 0;
1055
1056 if (collect) {
1057 list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
1058 list_del_init(&bh->b_assoc_buffers);
1059 err = collect(sci, bh, inode);
1060 brelse(bh);
1061 if (unlikely(err))
1062 goto dispose_buffers;
1063 }
1064 return 0;
1065 }
1066
1067 dispose_buffers:
1068 while (!list_empty(listp)) {
1069 bh = list_entry(listp->next, struct buffer_head,
1070 b_assoc_buffers);
1071 list_del_init(&bh->b_assoc_buffers);
1072 brelse(bh);
1073 }
1074 return err;
1075}
1076
1077static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
1078{
1079 /* Remaining number of blocks within segment buffer */
1080 return sci->sc_segbuf_nblocks -
1081 (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
1082}
1083
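/*
 * nilfs_segctor_scan_file - collect the dirty blocks of a single file
 *
 * Gathers data, b-tree node, and bmap buffers of @inode and feeds them
 * to the corresponding callbacks in @sc_ops.  The NILFS_CF_NODE stage
 * flag records that the data pass has already completed, so a
 * collection interrupted by a full segment (-E2BIG) resumes with the
 * node pass.
 */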
1084static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
1085 struct inode *inode,
1086 struct nilfs_sc_operations *sc_ops)
1087{
1088 LIST_HEAD(data_buffers);
1089 LIST_HEAD(node_buffers);
1090 int err;
1091
1092 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1093 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1094
1095 n = nilfs_lookup_dirty_data_buffers(
1096 inode, &data_buffers, rest + 1, 0, LLONG_MAX);
1097 if (n > rest) {
1098 err = nilfs_segctor_apply_buffers(
1099 sci, inode, &data_buffers,
1100 sc_ops->collect_data);
1101 BUG_ON(!err); /* always receive -E2BIG or true error */
1102 goto break_or_fail;
1103 }
1104 }
1105 nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
1106
1107 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1108 err = nilfs_segctor_apply_buffers(
1109 sci, inode, &data_buffers, sc_ops->collect_data);
1110 if (unlikely(err)) {
1111 /* dispose node list */
1112 nilfs_segctor_apply_buffers(
1113 sci, inode, &node_buffers, NULL);
1114 goto break_or_fail;
1115 }
1116 sci->sc_stage.flags |= NILFS_CF_NODE;
1117 }
1118 /* Collect node */
1119 err = nilfs_segctor_apply_buffers(
1120 sci, inode, &node_buffers, sc_ops->collect_node);
1121 if (unlikely(err))
1122 goto break_or_fail;
1123
1124 nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
1125 err = nilfs_segctor_apply_buffers(
1126 sci, inode, &node_buffers, sc_ops->collect_bmap);
1127 if (unlikely(err))
1128 goto break_or_fail;
1129
1130 nilfs_segctor_end_finfo(sci, inode);
1131 sci->sc_stage.flags &= ~NILFS_CF_NODE;
1132
1133 break_or_fail:
1134 return err;
1135}
1136
1137static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1138 struct inode *inode)
1139{
1140 LIST_HEAD(data_buffers);
1141 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1142 int err;
1143
1144 n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
1145 sci->sc_dsync_start,
1146 sci->sc_dsync_end);
1147
1148 err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
1149 nilfs_collect_file_data);
1150 if (!err) {
1151 nilfs_segctor_end_finfo(sci, inode);
1152 BUG_ON(n > rest);
1153 /* always receive -E2BIG or true error if n > rest */
1154 }
1155 return err;
1156}
1157
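/*
 * nilfs_segctor_collect_blocks - state machine of the collection stage
 *
 * Walks through the collection stages (GC inodes, regular files, ifile,
 * cpfile, sufile, DAT, and finally the super root) with intentional
 * fall-through between the switch cases.  The current position is kept
 * in sci->sc_stage so that a run aborted with -E2BIG can be resumed
 * after more segment buffers have been added.
 */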
1158static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1159{
1160 struct nilfs_sb_info *sbi = sci->sc_sbi;
1161 struct the_nilfs *nilfs = sbi->s_nilfs;
1162 struct list_head *head;
1163 struct nilfs_inode_info *ii;
1164 int err = 0;
1165
1166 switch (sci->sc_stage.scnt) {
1167 case NILFS_ST_INIT:
1168 /* Pre-processes */
1169 sci->sc_stage.flags = 0;
1170
1171 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
1172 sci->sc_nblk_inc = 0;
1173 sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
1174 if (mode == SC_LSEG_DSYNC) {
1175 sci->sc_stage.scnt = NILFS_ST_DSYNC;
1176 goto dsync_mode;
1177 }
1178 }
1179
1180 sci->sc_stage.dirty_file_ptr = NULL;
1181 sci->sc_stage.gc_inode_ptr = NULL;
1182 if (mode == SC_FLUSH_DAT) {
1183 sci->sc_stage.scnt = NILFS_ST_DAT;
1184 goto dat_stage;
1185 }
1186 sci->sc_stage.scnt++; /* Fall through */
1187 case NILFS_ST_GC:
1188 if (nilfs_doing_gc()) {
1189 head = &sci->sc_gc_inodes;
1190 ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
1191 head, i_dirty);
1192 list_for_each_entry_continue(ii, head, i_dirty) {
1193 err = nilfs_segctor_scan_file(
1194 sci, &ii->vfs_inode,
1195 &nilfs_sc_file_ops);
1196 if (unlikely(err)) {
1197 sci->sc_stage.gc_inode_ptr = list_entry(
1198 ii->i_dirty.prev,
1199 struct nilfs_inode_info,
1200 i_dirty);
1201 goto break_or_fail;
1202 }
1203 set_bit(NILFS_I_COLLECTED, &ii->i_state);
1204 }
1205 sci->sc_stage.gc_inode_ptr = NULL;
1206 }
1207 sci->sc_stage.scnt++; /* Fall through */
1208 case NILFS_ST_FILE:
1209 head = &sci->sc_dirty_files;
1210 ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
1211 i_dirty);
1212 list_for_each_entry_continue(ii, head, i_dirty) {
1213 clear_bit(NILFS_I_DIRTY, &ii->i_state);
1214
1215 err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
1216 &nilfs_sc_file_ops);
1217 if (unlikely(err)) {
1218 sci->sc_stage.dirty_file_ptr =
1219 list_entry(ii->i_dirty.prev,
1220 struct nilfs_inode_info,
1221 i_dirty);
1222 goto break_or_fail;
1223 }
1224 /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
1225 /* XXX: required ? */
1226 }
1227 sci->sc_stage.dirty_file_ptr = NULL;
1228 if (mode == SC_FLUSH_FILE) {
1229 sci->sc_stage.scnt = NILFS_ST_DONE;
1230 return 0;
1231 }
1232 sci->sc_stage.scnt++;
1233 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1234 /* Fall through */
1235 case NILFS_ST_IFILE:
1236 err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
1237 &nilfs_sc_file_ops);
1238 if (unlikely(err))
1239 break;
1240 sci->sc_stage.scnt++;
1241 /* Creating a checkpoint */
1242 err = nilfs_segctor_create_checkpoint(sci);
1243 if (unlikely(err))
1244 break;
1245 /* Fall through */
1246 case NILFS_ST_CPFILE:
1247 err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
1248 &nilfs_sc_file_ops);
1249 if (unlikely(err))
1250 break;
1251 sci->sc_stage.scnt++; /* Fall through */
1252 case NILFS_ST_SUFILE:
1253 err = nilfs_segctor_prepare_free_segments(sci,
1254 nilfs->ns_sufile);
1255 if (unlikely(err))
1256 break;
1257 err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
1258 &nilfs_sc_file_ops);
1259 if (unlikely(err))
1260 break;
1261 sci->sc_stage.scnt++; /* Fall through */
1262 case NILFS_ST_DAT:
1263 dat_stage:
1264 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
1265 &nilfs_sc_dat_ops);
1266 if (unlikely(err))
1267 break;
1268 if (mode == SC_FLUSH_DAT) {
1269 sci->sc_stage.scnt = NILFS_ST_DONE;
1270 return 0;
1271 }
1272 sci->sc_stage.scnt++; /* Fall through */
1273 case NILFS_ST_SR:
1274 if (mode == SC_LSEG_SR) {
1275 /* Appending a super root */
1276 err = nilfs_segctor_add_super_root(sci);
1277 if (unlikely(err))
1278 break;
1279 }
1280 /* End of a logical segment */
1281 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1282 sci->sc_stage.scnt = NILFS_ST_DONE;
1283 return 0;
1284 case NILFS_ST_DSYNC:
1285 dsync_mode:
1286 sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
1287 ii = sci->sc_dsync_inode;
1288 if (!test_bit(NILFS_I_BUSY, &ii->i_state))
1289 break;
1290
1291 err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
1292 if (unlikely(err))
1293 break;
1294 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1295 sci->sc_stage.scnt = NILFS_ST_DONE;
1296 return 0;
1297 case NILFS_ST_DONE:
1298 return 0;
1299 default:
1300 BUG();
1301 }
1302
1303 break_or_fail:
1304 return err;
1305}
1306
1307static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
1308{
1309 struct buffer_head *bh_su;
1310 struct nilfs_segment_usage *raw_su;
1311 int err;
1312
1313 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1314 if (unlikely(err))
1315 return err;
1316 nilfs_mdt_mark_buffer_dirty(bh_su);
1317 nilfs_mdt_mark_dirty(sufile);
1318 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1319 return 0;
1320}
1321
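/*
 * nilfs_segctor_begin_construction - set up the first segment buffer
 *
 * Maps the head segment buffer onto the current on-disk position,
 * shifting to the next full segment when fewer than
 * NILFS_PSEG_MIN_BLOCKS remain, and allocates the next segment from
 * the sufile if the current one is the last allocated.  Leftover
 * segment buffers of a previous construction are freed.
 */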
1322static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1323 struct the_nilfs *nilfs)
1324{
1325 struct nilfs_segment_buffer *segbuf, *n;
1326 __u64 nextnum;
1327 int err;
1328
1329 if (list_empty(&sci->sc_segbufs)) {
1330 segbuf = nilfs_segbuf_new(sci->sc_super);
1331 if (unlikely(!segbuf))
1332 return -ENOMEM;
1333 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1334 } else
1335 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1336
1337 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
1338 nilfs);
1339
1340 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1341 nilfs_shift_to_next_segment(nilfs);
1342 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1343 }
1344 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1345
1346 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
1347 if (unlikely(err))
1348 return err;
1349
1350 if (nilfs->ns_segnum == nilfs->ns_nextnum) {
1351 /* Start from the head of a new full segment */
1352 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1353 if (unlikely(err))
1354 return err;
1355 } else
1356 nextnum = nilfs->ns_nextnum;
1357
1358 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1359 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1360
1361 /* truncating segment buffers */
1362 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1363 sb_list) {
1364 list_del_init(&segbuf->sb_list);
1365 nilfs_segbuf_free(segbuf);
1366 }
1367 return 0;
1368}
1369
1370static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1371 struct the_nilfs *nilfs, int nadd)
1372{
1373 struct nilfs_segment_buffer *segbuf, *prev, *n;
1374 struct inode *sufile = nilfs->ns_sufile;
1375 __u64 nextnextnum;
1376 LIST_HEAD(list);
1377 int err, ret, i;
1378
1379 prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
1380 /*
1381 * Since the segment specified with nextnum might be allocated during
1382 * the previous construction, the buffer including its segusage may
1383 * not be dirty. The following call ensures that the buffer is dirty
1384 * and will pin the buffer on memory until the sufile is written.
1385 */
1386 err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
1387 if (unlikely(err))
1388 return err;
1389
1390 for (i = 0; i < nadd; i++) {
1391 /* extend segment info */
1392 err = -ENOMEM;
1393 segbuf = nilfs_segbuf_new(sci->sc_super);
1394 if (unlikely(!segbuf))
1395 goto failed;
1396
1397 /* map this buffer to region of segment on-disk */
1398 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1399 sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
1400
1401 /* allocate the next next full segment */
1402 err = nilfs_sufile_alloc(sufile, &nextnextnum);
1403 if (unlikely(err))
1404 goto failed_segbuf;
1405
1406 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
1407 nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
1408
1409 list_add_tail(&segbuf->sb_list, &list);
1410 prev = segbuf;
1411 }
1412 list_splice(&list, sci->sc_segbufs.prev);
1413 return 0;
1414
1415 failed_segbuf:
1416 nilfs_segbuf_free(segbuf);
1417 failed:
1418 list_for_each_entry_safe(segbuf, n, &list, sb_list) {
1419 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1420 WARN_ON(ret); /* never fails */
1421 list_del_init(&segbuf->sb_list);
1422 nilfs_segbuf_free(segbuf);
1423 }
1424 return err;
1425}
1426
1427static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1428 struct the_nilfs *nilfs)
1429{
1430 struct nilfs_segment_buffer *segbuf;
1431 int ret, done = 0;
1432
1433 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1434 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1435 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1436 WARN_ON(ret); /* never fails */
1437 }
1438 if (segbuf->sb_io_error) {
1439 /* Case 1: The first segment failed */
1440 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1441 /* Case 1a: Partial segment appended into an existing
1442 segment */
1443 nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
1444 segbuf->sb_fseg_end);
1445 else /* Case 1b: New full segment */
1446 set_nilfs_discontinued(nilfs);
1447 done++;
1448 }
1449
1450 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1451 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1452 WARN_ON(ret); /* never fails */
1453 if (!done && segbuf->sb_io_error) {
1454 if (segbuf->sb_segnum != nilfs->ns_nextnum)
1455 /* Case 2: extended segment (!= next) failed */
1456 nilfs_sufile_set_error(nilfs->ns_sufile,
1457 segbuf->sb_segnum);
1458 done++;
1459 }
1460 }
1461}
1462
1463static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1464{
1465 struct nilfs_segment_buffer *segbuf;
1466
1467 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1468 nilfs_segbuf_clear(segbuf);
1469 sci->sc_super_root = NULL;
1470}
1471
1472static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1473{
1474 struct nilfs_segment_buffer *segbuf;
1475
1476 while (!list_empty(&sci->sc_segbufs)) {
1477 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1478 list_del_init(&segbuf->sb_list);
1479 nilfs_segbuf_free(segbuf);
1480 }
1481 /* sci->sc_curseg = NULL; */
1482}
1483
1484static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1485 struct the_nilfs *nilfs, int err)
1486{
1487 if (unlikely(err)) {
1488 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1489 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1490 }
1491 nilfs_segctor_clear_segment_buffers(sci);
1492}
1493
1494static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1495 struct inode *sufile)
1496{
1497 struct nilfs_segment_buffer *segbuf;
1498 struct buffer_head *bh_su;
1499 struct nilfs_segment_usage *raw_su;
1500 unsigned long live_blocks;
1501 int ret;
1502
1503 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1504 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1505 &raw_su, &bh_su);
1506	WARN_ON(ret); /* always succeeds because bh_su is dirty */
1507 live_blocks = segbuf->sb_sum.nblocks +
1508 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1509 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
1510 raw_su->su_nblocks = cpu_to_le32(live_blocks);
1511 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1512 bh_su);
1513 }
1514}
1515
1516static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
1517 struct inode *sufile)
1518{
1519 struct nilfs_segment_buffer *segbuf;
1520 struct buffer_head *bh_su;
1521 struct nilfs_segment_usage *raw_su;
1522 int ret;
1523
1524 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1525 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1526 &raw_su, &bh_su);
1527	WARN_ON(ret); /* always succeeds because bh_su is dirty */
1528 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
1529 segbuf->sb_fseg_start);
1530 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1531
1532 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1533 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1534 &raw_su, &bh_su);
1535	WARN_ON(ret); /* always succeeds */
1536 raw_su->su_nblocks = 0;
1537 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1538 bh_su);
1539 }
1540}
1541
1542static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1543 struct nilfs_segment_buffer *last,
1544 struct inode *sufile)
1545{
1546 struct nilfs_segment_buffer *segbuf = last, *n;
1547 int ret;
1548
1549 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1550 sb_list) {
1551 list_del_init(&segbuf->sb_list);
1552 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1553 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1554 WARN_ON(ret);
1555 nilfs_segbuf_free(segbuf);
1556 }
1557}
1558
1559
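/*
 * nilfs_segctor_collect - collection retry loop
 *
 * Repeats the block collection, doubling the number of segments added
 * (up to SC_MAX_SEGDELTA) each time the collection overflows with
 * -E2BIG, and restoring the stage saved in prev_stage before retrying.
 * Unused trailing segment buffers are truncated on success.
 */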
1560static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1561 struct the_nilfs *nilfs, int mode)
1562{
1563 struct nilfs_cstage prev_stage = sci->sc_stage;
1564 int err, nadd = 1;
1565
1566 /* Collection retry loop */
1567 for (;;) {
1568 sci->sc_super_root = NULL;
1569 sci->sc_nblk_this_inc = 0;
1570 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1571
1572 err = nilfs_segctor_reset_segment_buffer(sci);
1573 if (unlikely(err))
1574 goto failed;
1575
1576 err = nilfs_segctor_collect_blocks(sci, mode);
1577 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
1578 if (!err)
1579 break;
1580
1581 if (unlikely(err != -E2BIG))
1582 goto failed;
1583
1584 /* The current segment is filled up */
1585 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1586 break;
1587
1588 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1589 nilfs_segctor_clear_segment_buffers(sci);
1590
1591 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1592 if (unlikely(err))
1593 return err;
1594
1595 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1596 sci->sc_stage = prev_stage;
1597 }
1598 nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
1599 return 0;
1600
1601 failed:
1602 return err;
1603}
1604
1605static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
1606 struct buffer_head *new_bh)
1607{
1608 BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
1609
1610 list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
1611 /* The caller must release old_bh */
1612}
1613
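/*
 * nilfs_segctor_update_payload_blocknr - assign disk block numbers
 *
 * Walks the payload buffers of @segbuf in parallel with the finfo
 * entries in the segment summary, lets the bmap assign the real block
 * number of each buffer, and writes the resulting binfo through the
 * per-file write_data_binfo/write_node_binfo operations.
 */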
1614static int
1615nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1616 struct nilfs_segment_buffer *segbuf,
1617 int mode)
1618{
1619 struct inode *inode = NULL;
1620 sector_t blocknr;
1621 unsigned long nfinfo = segbuf->sb_sum.nfinfo;
1622 unsigned long nblocks = 0, ndatablk = 0;
1623 struct nilfs_sc_operations *sc_op = NULL;
1624 struct nilfs_segsum_pointer ssp;
1625 struct nilfs_finfo *finfo = NULL;
1626 union nilfs_binfo binfo;
1627 struct buffer_head *bh, *bh_org;
1628 ino_t ino = 0;
1629 int err = 0;
1630
1631 if (!nfinfo)
1632 goto out;
1633
1634 blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
1635 ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
1636 ssp.offset = sizeof(struct nilfs_segment_summary);
1637
1638 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1639 if (bh == sci->sc_super_root)
1640 break;
1641 if (!finfo) {
1642 finfo = nilfs_segctor_map_segsum_entry(
1643 sci, &ssp, sizeof(*finfo));
1644 ino = le64_to_cpu(finfo->fi_ino);
1645 nblocks = le32_to_cpu(finfo->fi_nblocks);
1646 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1647
1648 if (buffer_nilfs_node(bh))
1649 inode = NILFS_BTNC_I(bh->b_page->mapping);
1650 else
1651 inode = NILFS_AS_I(bh->b_page->mapping);
1652
1653 if (mode == SC_LSEG_DSYNC)
1654 sc_op = &nilfs_sc_dsync_ops;
1655 else if (ino == NILFS_DAT_INO)
1656 sc_op = &nilfs_sc_dat_ops;
1657 else /* file blocks */
1658 sc_op = &nilfs_sc_file_ops;
1659 }
1660 bh_org = bh;
1661 get_bh(bh_org);
1662 err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
1663 &binfo);
1664 if (bh != bh_org)
1665 nilfs_list_replace_buffer(bh_org, bh);
1666 brelse(bh_org);
1667 if (unlikely(err))
1668 goto failed_bmap;
1669
1670 if (ndatablk > 0)
1671 sc_op->write_data_binfo(sci, &ssp, &binfo);
1672 else
1673 sc_op->write_node_binfo(sci, &ssp, &binfo);
1674
1675 blocknr++;
1676 if (--nblocks == 0) {
1677 finfo = NULL;
1678 if (--nfinfo == 0)
1679 break;
1680 } else if (ndatablk > 0)
1681 ndatablk--;
1682 }
1683 out:
1684 return 0;
1685
1686 failed_bmap:
1687 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1688 return err;
1689}
1690
1691static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1692{
1693 struct nilfs_segment_buffer *segbuf;
1694 int err;
1695
1696 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1697 err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
1698 if (unlikely(err))
1699 return err;
1700 nilfs_segbuf_fill_in_segsum(segbuf);
1701 }
1702 return 0;
1703}
1704
1705static int
1706nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1707{
1708 struct page *clone_page;
1709 struct buffer_head *bh, *head, *bh2;
1710 void *kaddr;
1711
1712 bh = head = page_buffers(page);
1713
1714 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1715 if (unlikely(!clone_page))
1716 return -ENOMEM;
1717
1718 bh2 = page_buffers(clone_page);
1719 kaddr = kmap_atomic(page, KM_USER0);
1720 do {
1721 if (list_empty(&bh->b_assoc_buffers))
1722 continue;
1723 get_bh(bh2);
1724 page_cache_get(clone_page); /* for each bh */
1725 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1726 bh2->b_blocknr = bh->b_blocknr;
1727 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1728 list_add_tail(&bh->b_assoc_buffers, out);
1729 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1730 kunmap_atomic(kaddr, KM_USER0);
1731
1732 if (!TestSetPageWriteback(clone_page))
1733 inc_zone_page_state(clone_page, NR_WRITEBACK);
1734 unlock_page(clone_page);
1735
1736 return 0;
1737}
1738
1739static int nilfs_test_page_to_be_frozen(struct page *page)
1740{
1741 struct address_space *mapping = page->mapping;
1742
1743 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1744 return 0;
1745
1746 if (page_mapped(page)) {
1747 ClearPageChecked(page);
1748 return 1;
1749 }
1750 return PageChecked(page);
1751}
1752
1753static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1754{
1755 if (!page || PageWriteback(page))
1756	/* For split b-tree node pages, this function may be called
1757	   twice or more.  This check ignores the second and later calls. */
1758 return 0;
1759
1760 lock_page(page);
1761 clear_page_dirty_for_io(page);
1762 set_page_writeback(page);
1763 unlock_page(page);
1764
1765 if (nilfs_test_page_to_be_frozen(page)) {
1766 int err = nilfs_copy_replace_page_buffers(page, out);
1767 if (unlikely(err))
1768 return err;
1769 }
1770 return 0;
1771}
1772
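/*
 * nilfs_segctor_prepare_write - move collected pages into writeback
 *
 * Clears the dirty flag and sets the writeback state on every page
 * that carries segment summary or payload buffers.  Pages that might
 * be modified while under I/O (see nilfs_test_page_to_be_frozen) are
 * frozen by copying their buffers to private clone pages.
 */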
1773static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1774 struct page **failed_page)
1775{
1776 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL;
1778 struct list_head *list = &sci->sc_copied_buffers;
1779 int err;
1780
1781 *failed_page = NULL;
1782 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1783 struct buffer_head *bh;
1784
1785 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1786 b_assoc_buffers) {
1787 if (bh->b_page != bd_page) {
1788 if (bd_page) {
1789 lock_page(bd_page);
1790 clear_page_dirty_for_io(bd_page);
1791 set_page_writeback(bd_page);
1792 unlock_page(bd_page);
1793 }
1794 bd_page = bh->b_page;
1795 }
1796 }
1797
1798 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1799 b_assoc_buffers) {
1800 if (bh == sci->sc_super_root) {
1801 if (bh->b_page != bd_page) {
1802 lock_page(bd_page);
1803 clear_page_dirty_for_io(bd_page);
1804 set_page_writeback(bd_page);
1805 unlock_page(bd_page);
1806 bd_page = bh->b_page;
1807 }
1808 break;
1809 }
1810 if (bh->b_page != fs_page) {
1811 err = nilfs_begin_page_io(fs_page, list);
1812 if (unlikely(err)) {
1813 *failed_page = fs_page;
1814 goto out;
1815 }
1816 fs_page = bh->b_page;
1817 }
1818 }
1819 }
1820 if (bd_page) {
1821 lock_page(bd_page);
1822 clear_page_dirty_for_io(bd_page);
1823 set_page_writeback(bd_page);
1824 unlock_page(bd_page);
1825 }
1826 err = nilfs_begin_page_io(fs_page, list);
1827 if (unlikely(err))
1828 *failed_page = fs_page;
1829 out:
1830 return err;
1831}
1832
1833static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1834 struct backing_dev_info *bdi)
1835{
1836 struct nilfs_segment_buffer *segbuf;
1837 struct nilfs_write_info wi;
1838 int err, res;
1839
1840 wi.sb = sci->sc_super;
1841 wi.bh_sr = sci->sc_super_root;
1842 wi.bdi = bdi;
1843
1844 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1845 nilfs_segbuf_prepare_write(segbuf, &wi);
1846 err = nilfs_segbuf_write(segbuf, &wi);
1847
1848 res = nilfs_segbuf_wait(segbuf, &wi);
1849 err = unlikely(err) ? : res;
1850 if (unlikely(err))
1851 return err;
1852 }
1853 return 0;
1854}
1855
1856static int nilfs_page_has_uncleared_buffer(struct page *page)
1857{
1858 struct buffer_head *head, *bh;
1859
1860 head = bh = page_buffers(page);
1861 do {
1862 if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
1863 return 1;
1864 bh = bh->b_this_page;
1865 } while (bh != head);
1866 return 0;
1867}
1868
1869static void __nilfs_end_page_io(struct page *page, int err)
1870{
1871 if (!err) {
1872 if (!nilfs_page_buffers_clean(page))
1873 __set_page_dirty_nobuffers(page);
1874 ClearPageError(page);
1875 } else {
1876 __set_page_dirty_nobuffers(page);
1877 SetPageError(page);
1878 }
1879
1880 if (buffer_nilfs_allocated(page_buffers(page))) {
1881 if (TestClearPageWriteback(page))
1882 dec_zone_page_state(page, NR_WRITEBACK);
1883 } else
1884 end_page_writeback(page);
1885}
1886
1887static void nilfs_end_page_io(struct page *page, int err)
1888{
1889 if (!page)
1890 return;
1891
1892 if (buffer_nilfs_node(page_buffers(page)) &&
1893 nilfs_page_has_uncleared_buffer(page))
1894 /* For b-tree node pages, this function may be called twice
1895 or more because they might be split in a segment.
1896	   This check ensures that cleanup has been done for all
1897 buffers in a split btnode page. */
1898 return;
1899
1900 __nilfs_end_page_io(page, err);
1901}
1902
1903static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1904{
1905 struct buffer_head *bh, *head;
1906 struct page *page;
1907
1908 while (!list_empty(list)) {
1909 bh = list_entry(list->next, struct buffer_head,
1910 b_assoc_buffers);
1911 page = bh->b_page;
1912 page_cache_get(page);
1913 head = bh = page_buffers(page);
1914 do {
1915 if (!list_empty(&bh->b_assoc_buffers)) {
1916 list_del_init(&bh->b_assoc_buffers);
1917 if (!err) {
1918 set_buffer_uptodate(bh);
1919 clear_buffer_dirty(bh);
1920 clear_buffer_nilfs_volatile(bh);
1921 }
1922 brelse(bh); /* for b_assoc_buffers */
1923 }
1924 } while ((bh = bh->b_this_page) != head);
1925
1926 __nilfs_end_page_io(page, err);
1927 page_cache_release(page);
1928 }
1929}
1930
1931static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1932 struct page *failed_page, int err)
1933{
1934 struct nilfs_segment_buffer *segbuf;
1935 struct page *bd_page = NULL, *fs_page = NULL;
1936
1937 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1938 struct buffer_head *bh;
1939
1940 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1941 b_assoc_buffers) {
1942 if (bh->b_page != bd_page) {
1943 if (bd_page)
1944 end_page_writeback(bd_page);
1945 bd_page = bh->b_page;
1946 }
1947 }
1948
1949 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1950 b_assoc_buffers) {
1951 if (bh == sci->sc_super_root) {
1952 if (bh->b_page != bd_page) {
1953 end_page_writeback(bd_page);
1954 bd_page = bh->b_page;
1955 }
1956 break;
1957 }
1958 if (bh->b_page != fs_page) {
1959 nilfs_end_page_io(fs_page, err);
1960 if (unlikely(fs_page == failed_page))
1961 goto done;
1962 fs_page = bh->b_page;
1963 }
1964 }
1965 }
1966 if (bd_page)
1967 end_page_writeback(bd_page);
1968
1969 nilfs_end_page_io(fs_page, err);
1970 done:
1971 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1972}
1973
1974static void nilfs_set_next_segment(struct the_nilfs *nilfs,
1975 struct nilfs_segment_buffer *segbuf)
1976{
1977 nilfs->ns_segnum = segbuf->sb_segnum;
1978 nilfs->ns_nextnum = segbuf->sb_nextnum;
1979 nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
1980 + segbuf->sb_sum.nblocks;
1981 nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
1982 nilfs->ns_ctime = segbuf->sb_sum.ctime;
1983}
1984
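/*
 * nilfs_segctor_complete_write - finish a successful segment write
 *
 * Cleans the dirty state of all written buffers, ends the writeback of
 * their pages, updates the log cursor kept in the_nilfs, and, when a
 * super root was written, advances the checkpoint number and marks the
 * super block dirty for a later update.
 */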
1985static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1986{
1987 struct nilfs_segment_buffer *segbuf;
1988 struct page *bd_page = NULL, *fs_page = NULL;
1989 struct nilfs_sb_info *sbi = sci->sc_sbi;
1990 struct the_nilfs *nilfs = sbi->s_nilfs;
1991 int update_sr = (sci->sc_super_root != NULL);
1992
1993 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1994 struct buffer_head *bh;
1995
1996 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1997 b_assoc_buffers) {
1998 set_buffer_uptodate(bh);
1999 clear_buffer_dirty(bh);
2000 if (bh->b_page != bd_page) {
2001 if (bd_page)
2002 end_page_writeback(bd_page);
2003 bd_page = bh->b_page;
2004 }
2005 }
2006 /*
2007 * We assume that the buffers which belong to the same page
2008 * continue over the buffer list.
2009	 * Under this assumption, the last BH of each page is
2010 * identifiable by the discontinuity of bh->b_page
2011 * (page != fs_page).
2012 *
2013 * For B-tree node blocks, however, this assumption is not
2014 * guaranteed. The cleanup code of B-tree node pages needs
2015 * special care.
2016 */
2017 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
2018 b_assoc_buffers) {
2019 set_buffer_uptodate(bh);
2020 clear_buffer_dirty(bh);
2021 clear_buffer_nilfs_volatile(bh);
2022 if (bh == sci->sc_super_root) {
2023 if (bh->b_page != bd_page) {
2024 end_page_writeback(bd_page);
2025 bd_page = bh->b_page;
2026 }
2027 break;
2028 }
2029 if (bh->b_page != fs_page) {
2030 nilfs_end_page_io(fs_page, 0);
2031 fs_page = bh->b_page;
2032 }
2033 }
2034
2035 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
2036 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
2037 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2038 sci->sc_lseg_stime = jiffies;
2039 }
2040 if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
2041 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2042 }
2043 }
2044 /*
2045	 * Since pages may continue over multiple segment buffers,
2046	 * the end of the last page must be checked outside of the loop.
2047 */
2048 if (bd_page)
2049 end_page_writeback(bd_page);
2050
2051 nilfs_end_page_io(fs_page, 0);
2052
2053 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
2054
2055 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
2056
2057 if (nilfs_doing_gc()) {
2058 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
2059 if (update_sr)
2060 nilfs_commit_gcdat_inode(nilfs);
2061 } else
2062 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
2063
2064 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2065
2066 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
2067 nilfs_set_next_segment(nilfs, segbuf);
2068
2069 if (update_sr) {
2070 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2071 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2072 sbi->s_super->s_dirt = 1;
2073
2074 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2075 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2076 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2077 } else
2078 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2079}
2080
2081static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2082 struct nilfs_sb_info *sbi)
2083{
2084 struct nilfs_inode_info *ii, *n;
2085 __u64 cno = sbi->s_nilfs->ns_cno;
2086
2087 spin_lock(&sbi->s_inode_lock);
2088 retry:
2089 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
2090 if (!ii->i_bh) {
2091 struct buffer_head *ibh;
2092 int err;
2093
2094 spin_unlock(&sbi->s_inode_lock);
2095 err = nilfs_ifile_get_inode_block(
2096 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
2097 if (unlikely(err)) {
2098 nilfs_warning(sbi->s_super, __func__,
2099 "failed to get inode block.\n");
2100 return err;
2101 }
2102 nilfs_mdt_mark_buffer_dirty(ibh);
2103 nilfs_mdt_mark_dirty(sbi->s_ifile);
2104 spin_lock(&sbi->s_inode_lock);
2105 if (likely(!ii->i_bh))
2106 ii->i_bh = ibh;
2107 else
2108 brelse(ibh);
2109 goto retry;
2110 }
2111 ii->i_cno = cno;
2112
2113 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2114 set_bit(NILFS_I_BUSY, &ii->i_state);
2115 list_del(&ii->i_dirty);
2116 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2117 }
2118 spin_unlock(&sbi->s_inode_lock);
2119
2120 NILFS_I(sbi->s_ifile)->i_cno = cno;
2121
2122 return 0;
2123}
2124
2125static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2126 struct nilfs_sb_info *sbi)
2127{
2128 struct nilfs_transaction_info *ti = current->journal_info;
2129 struct nilfs_inode_info *ii, *n;
2130 __u64 cno = sbi->s_nilfs->ns_cno;
2131
2132 spin_lock(&sbi->s_inode_lock);
2133 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2134 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2135 test_bit(NILFS_I_DIRTY, &ii->i_state)) {
2136 /* The current checkpoint number (=nilfs->ns_cno) is
2137 changed between check-in and check-out only if the
2138 super root is written out. So, we can update i_cno
2139 for the inodes that remain in the dirty list. */
2140 ii->i_cno = cno;
2141 continue;
2142 }
2143 clear_bit(NILFS_I_BUSY, &ii->i_state);
2144 brelse(ii->i_bh);
2145 ii->i_bh = NULL;
2146 list_del(&ii->i_dirty);
2147 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2148 }
2149 spin_unlock(&sbi->s_inode_lock);
2150}
2151
2152/*
2153 * Main procedure of segment constructor
2154 */
2155static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2156{
2157 struct nilfs_sb_info *sbi = sci->sc_sbi;
2158 struct the_nilfs *nilfs = sbi->s_nilfs;
2159 struct page *failed_page;
2160 int err, has_sr = 0;
2161
2162 sci->sc_stage.scnt = NILFS_ST_INIT;
2163
2164 err = nilfs_segctor_check_in_files(sci, sbi);
2165 if (unlikely(err))
2166 goto out;
2167
2168 if (nilfs_test_metadata_dirty(sbi))
2169 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2170
2171 if (nilfs_segctor_clean(sci))
2172 goto out;
2173
2174 do {
2175 sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
2176
2177 err = nilfs_segctor_begin_construction(sci, nilfs);
2178 if (unlikely(err))
2179 goto out;
2180
2181 /* Update time stamp */
2182 sci->sc_seg_ctime = get_seconds();
2183
2184 err = nilfs_segctor_collect(sci, nilfs, mode);
2185 if (unlikely(err))
2186 goto failed;
2187
2188 has_sr = (sci->sc_super_root != NULL);
2189
2190 /* Avoid empty segment */
2191 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2192 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2193 nilfs_segctor_end_construction(sci, nilfs, 1);
2194 goto out;
2195 }
2196
2197 err = nilfs_segctor_assign(sci, mode);
2198 if (unlikely(err))
2199 goto failed;
2200
2201 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2202 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2203
2204 if (has_sr) {
2205 err = nilfs_segctor_fill_in_checkpoint(sci);
2206 if (unlikely(err))
2207 goto failed_to_make_up;
2208
2209 nilfs_segctor_fill_in_super_root(sci, nilfs);
2210 }
2211 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2212
2213 /* Write partial segments */
2214 err = nilfs_segctor_prepare_write(sci, &failed_page);
2215 if (unlikely(err))
2216 goto failed_to_write;
2217
2218 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2219
2220 err = nilfs_segctor_write(sci, nilfs->ns_bdi);
2221 if (unlikely(err))
2222 goto failed_to_write;
2223
2224 nilfs_segctor_complete_write(sci);
2225
2226 /* Commit segments */
2227 if (has_sr) {
2228 nilfs_segctor_commit_free_segments(sci);
2229 nilfs_segctor_clear_metadata_dirty(sci);
2230 }
2231
2232 nilfs_segctor_end_construction(sci, nilfs, 0);
2233
2234 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2235
2236 out:
2237 nilfs_segctor_destroy_segment_buffers(sci);
2238 nilfs_segctor_check_out_files(sci, sbi);
2239 return err;
2240
2241 failed_to_write:
2242 nilfs_segctor_abort_write(sci, failed_page, err);
2243 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2244
2245 failed_to_make_up:
2246 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2247 nilfs_redirty_inodes(&sci->sc_dirty_files);
2248
2249 failed:
2250 if (nilfs_doing_gc())
2251 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2252 nilfs_segctor_end_construction(sci, nilfs, err);
2253 goto out;
2254}
2255
2256/**
2257	 * nilfs_segctor_start_timer - set timer of background write
2258 * @sci: nilfs_sc_info
2259 *
2260 * If the timer has already been set, it ignores the new request.
2261 * This function MUST be called within a section locking the segment
2262 * semaphore.
2263 */
2264static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2265{
2266 spin_lock(&sci->sc_state_lock);
2267 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2268 sci->sc_timer->expires = jiffies + sci->sc_interval;
2269 add_timer(sci->sc_timer);
2270 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2271 }
2272 spin_unlock(&sci->sc_state_lock);
2273}
2274
2275static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2276{
2277 spin_lock(&sci->sc_state_lock);
2278 if (!(sci->sc_flush_request & (1 << bn))) {
2279 unsigned long prev_req = sci->sc_flush_request;
2280
2281 sci->sc_flush_request |= (1 << bn);
2282 if (!prev_req)
2283 wake_up(&sci->sc_wait_daemon);
2284 }
2285 spin_unlock(&sci->sc_state_lock);
2286}
2287
2288/**
2289 * nilfs_flush_segment - trigger a segment construction for resource control
2290 * @sb: super block
2291 * @ino: inode number of the file to be flushed out.
2292 */
2293void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2294{
2295 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2296 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2297
2298 if (!sci || nilfs_doing_construction())
2299 return;
2300 nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
2301 /* assign bit 0 to data files */
2302}
2303
2304int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
2305 __u64 *segnum, size_t nsegs)
2306{
2307 struct nilfs_segment_entry *ent;
2308 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2309 struct inode *sufile = nilfs->ns_sufile;
2310 LIST_HEAD(list);
2311 __u64 *pnum;
2312 size_t i;
2313 int err;
2314
2315 for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
2316 ent = nilfs_alloc_segment_entry(*pnum);
2317 if (unlikely(!ent)) {
2318 err = -ENOMEM;
2319 goto failed;
2320 }
2321 list_add_tail(&ent->list, &list);
2322
2323 err = nilfs_open_segment_entry(ent, sufile);
2324 if (unlikely(err))
2325 goto failed;
2326
2327 if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
2328 printk(KERN_WARNING "NILFS: unused segment is "
2329 "requested to be cleaned (segnum=%llu)\n",
2330 (unsigned long long)ent->segnum);
2331 nilfs_close_segment_entry(ent, sufile);
2332 }
2333 list_splice(&list, sci->sc_cleaning_segments.prev);
2334 return 0;
2335
2336 failed:
2337 nilfs_dispose_segment_list(&list);
2338 return err;
2339}
2340
2341void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
2342{
2343 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2344}
2345
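/*
 * Synchronous construction requests are serialized with sequence
 * numbers: nilfs_segctor_sync() queues a nilfs_segctor_wait_request on
 * sc_wait_request and sleeps until nilfs_segctor_wakeup() completes
 * every request whose sequence number the daemon has served.
 */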
2346struct nilfs_segctor_wait_request {
2347 wait_queue_t wq;
2348 __u32 seq;
2349 int err;
2350 atomic_t done;
2351};
2352
2353static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
2354{
2355 struct nilfs_segctor_wait_request wait_req;
2356 int err = 0;
2357
2358 spin_lock(&sci->sc_state_lock);
2359 init_wait(&wait_req.wq);
2360 wait_req.err = 0;
2361 atomic_set(&wait_req.done, 0);
2362 wait_req.seq = ++sci->sc_seq_request;
2363 spin_unlock(&sci->sc_state_lock);
2364
2365 init_waitqueue_entry(&wait_req.wq, current);
2366 add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
2367 set_current_state(TASK_INTERRUPTIBLE);
2368 wake_up(&sci->sc_wait_daemon);
2369
2370 for (;;) {
2371 if (atomic_read(&wait_req.done)) {
2372 err = wait_req.err;
2373 break;
2374 }
2375 if (!signal_pending(current)) {
2376 schedule();
2377 continue;
2378 }
2379 err = -ERESTARTSYS;
2380 break;
2381 }
2382 finish_wait(&sci->sc_wait_request, &wait_req.wq);
2383 return err;
2384}
2385
2386static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2387{
2388 struct nilfs_segctor_wait_request *wrq, *n;
2389 unsigned long flags;
2390
2391 spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
2392 list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
2393 wq.task_list) {
2394 if (!atomic_read(&wrq->done) &&
2395 nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
2396 wrq->err = err;
2397 atomic_set(&wrq->done, 1);
2398 }
2399 if (atomic_read(&wrq->done)) {
2400 wrq->wq.func(&wrq->wq,
2401 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2402 0, NULL);
2403 }
2404 }
2405 spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
2406}
2407
2408/**
2409 * nilfs_construct_segment - construct a logical segment
2410 * @sb: super block
2411 *
2412	 * Return Value: On success, 0 is returned. On errors, one of the following
2413	 * negative error codes is returned.
2414 *
2415 * %-EROFS - Read only filesystem.
2416 *
2417 * %-EIO - I/O error
2418 *
2419 * %-ENOSPC - No space left on device (only in a panic state).
2420 *
2421 * %-ERESTARTSYS - Interrupted.
2422 *
2423 * %-ENOMEM - Insufficient memory available.
2424 */
2425int nilfs_construct_segment(struct super_block *sb)
2426{
2427 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2428 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2429 struct nilfs_transaction_info *ti;
2430 int err;
2431
2432 if (!sci)
2433 return -EROFS;
2434
2435 /* A call inside transactions causes a deadlock. */
2436 BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
2437
2438 err = nilfs_segctor_sync(sci);
2439 return err;
2440}
2441
2442/**
2443 * nilfs_construct_dsync_segment - construct a data-only logical segment
2444 * @sb: super block
2445 * @inode: inode whose data blocks should be written out
2446 * @start: start byte offset
2447 * @end: end byte offset (inclusive)
2448 *
2449	 * Return Value: On success, 0 is returned. On errors, one of the following
2450	 * negative error codes is returned.
2451 *
2452 * %-EROFS - Read only filesystem.
2453 *
2454 * %-EIO - I/O error
2455 *
2456 * %-ENOSPC - No space left on device (only in a panic state).
2457 *
2458 * %-ERESTARTSYS - Interrupted.
2459 *
2460 * %-ENOMEM - Insufficient memory available.
2461 */
2462int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2463 loff_t start, loff_t end)
2464{
2465 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2466 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2467 struct nilfs_inode_info *ii;
2468 struct nilfs_transaction_info ti;
2469 int err = 0;
2470
2471 if (!sci)
2472 return -EROFS;
2473
2474 nilfs_transaction_lock(sbi, &ti, 0);
2475
2476 ii = NILFS_I(inode);
2477 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2478 nilfs_test_opt(sbi, STRICT_ORDER) ||
2479 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2480 nilfs_discontinued(sbi->s_nilfs)) {
2481 nilfs_transaction_unlock(sbi);
2482 err = nilfs_segctor_sync(sci);
2483 return err;
2484 }
2485
2486 spin_lock(&sbi->s_inode_lock);
2487 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2488 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2489 spin_unlock(&sbi->s_inode_lock);
2490 nilfs_transaction_unlock(sbi);
2491 return 0;
2492 }
2493 spin_unlock(&sbi->s_inode_lock);
2494 sci->sc_dsync_inode = ii;
2495 sci->sc_dsync_start = start;
2496 sci->sc_dsync_end = end;
2497
2498 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2499
2500 nilfs_transaction_unlock(sbi);
2501 return err;
2502}
2503
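/*
 * A construction pass driven by the daemon or a cleaner is bracketed
 * by nilfs_segctor_accept(), which snapshots the current request
 * sequence number and stops the timer, and nilfs_segctor_notify(),
 * which clears the served flush/commit requests and wakes up waiters.
 */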
2504struct nilfs_segctor_req {
2505 int mode;
2506 __u32 seq_accepted;
2507 int sc_err; /* construction failure */
2508 int sb_err; /* super block writeback failure */
2509};
2510
2511#define FLUSH_FILE_BIT (0x1) /* data file only */
2512#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
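/*
 * Editorial note: sc_flush_request is treated as an inode bitmap --
 * FLUSH_FILE_BIT marks pending data-file flushes and FLUSH_DAT_BIT
 * (keyed to the DAT inode number) marks pending DAT flushes.  A flush
 * presumably writes out dirty blocks without creating a checkpoint
 * (cf. the NILFS_SC_PRIOR_FLUSH flag in segment.h).
 */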
2513
2514static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
2515 struct nilfs_segctor_req *req)
2516{
2517 req->sc_err = req->sb_err = 0;
2518 spin_lock(&sci->sc_state_lock);
2519 req->seq_accepted = sci->sc_seq_request;
2520 spin_unlock(&sci->sc_state_lock);
2521
2522 if (sci->sc_timer)
2523 del_timer_sync(sci->sc_timer);
2524}
2525
2526static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2527 struct nilfs_segctor_req *req)
2528{
2529 /* Clear requests (even when the construction failed) */
2530 spin_lock(&sci->sc_state_lock);
2531
2532 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2533
2534 if (req->mode == SC_LSEG_SR) {
2535 sci->sc_seq_done = req->seq_accepted;
2536 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2537 sci->sc_flush_request = 0;
2538 } else if (req->mode == SC_FLUSH_FILE)
2539 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2540 else if (req->mode == SC_FLUSH_DAT)
2541 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2542
2543 spin_unlock(&sci->sc_state_lock);
2544}
2545
2546static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2547 struct nilfs_segctor_req *req)
2548{
2549 struct nilfs_sb_info *sbi = sci->sc_sbi;
2550 struct the_nilfs *nilfs = sbi->s_nilfs;
2551 int err = 0;
2552
2553 if (nilfs_discontinued(nilfs))
2554 req->mode = SC_LSEG_SR;
2555 if (!nilfs_segctor_confirm(sci)) {
2556 err = nilfs_segctor_do_construct(sci, req->mode);
2557 req->sc_err = err;
2558 }
2559 if (likely(!err)) {
2560 if (req->mode != SC_FLUSH_DAT)
2561 atomic_set(&nilfs->ns_ndirtyblks, 0);
2562 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2563 nilfs_discontinued(nilfs)) {
2564 down_write(&nilfs->ns_sem);
2565 req->sb_err = nilfs_commit_super(sbi, 0);
2566 up_write(&nilfs->ns_sem);
2567 }
2568 }
2569 return err;
2570}
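/*
 * Editorial note: after a successful construction, the super block is
 * committed only when a super root was written while the log was marked
 * discontinued; this brings the last-segment position recorded on disk
 * back in sync and lets nilfs_sync_super() clear the discontinued flag.
 */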
2571
2572static void nilfs_construction_timeout(unsigned long data)
2573{
2574 struct task_struct *p = (struct task_struct *)data;
2575 wake_up_process(p);
2576}
2577
2578static void
2579nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2580{
2581 struct nilfs_inode_info *ii, *n;
2582
2583 list_for_each_entry_safe(ii, n, head, i_dirty) {
2584 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2585 continue;
2586 hlist_del_init(&ii->vfs_inode.i_hash);
2587 list_del_init(&ii->i_dirty);
2588 nilfs_clear_gcinode(&ii->vfs_inode);
2589 }
2590}
2591
2592int nilfs_clean_segments(struct super_block *sb, void __user *argp)
2593{
2594 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2595 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2596 struct the_nilfs *nilfs = sbi->s_nilfs;
2597 struct nilfs_transaction_info ti;
2598 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2599 int err;
2600
2601 if (unlikely(!sci))
2602 return -EROFS;
2603
2604 nilfs_transaction_lock(sbi, &ti, 1);
2605
2606 err = nilfs_init_gcdat_inode(nilfs);
2607 if (unlikely(err))
2608 goto out_unlock;
2609 err = nilfs_ioctl_prepare_clean_segments(nilfs, argp);
2610 if (unlikely(err))
2611 goto out_unlock;
2612
2613 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
2614
2615 for (;;) {
2616 nilfs_segctor_accept(sci, &req);
2617 err = nilfs_segctor_construct(sci, &req);
2618 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2619 nilfs_segctor_notify(sci, &req);
2620
2621 if (likely(!err))
2622 break;
2623
2624 nilfs_warning(sb, __func__,
2625 "segment construction failed. (err=%d)", err);
2626 set_current_state(TASK_INTERRUPTIBLE);
2627 schedule_timeout(sci->sc_interval);
2628 }
2629
2630 out_unlock:
2631 nilfs_clear_gcdat_inode(nilfs);
2632 nilfs_transaction_unlock(sbi);
2633 return err;
2634}
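/*
 * Editorial note: nilfs_clean_segments() keeps retrying the
 * construction (sleeping sc_interval jiffies between attempts) until it
 * succeeds; GC inodes whose blocks have been written are torn down
 * after each pass by nilfs_remove_written_gcinodes().
 */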
2635
2636static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2637{
2638 struct nilfs_sb_info *sbi = sci->sc_sbi;
2639 struct nilfs_transaction_info ti;
2640 struct nilfs_segctor_req req = { .mode = mode };
2641
2642 nilfs_transaction_lock(sbi, &ti, 0);
2643
2644 nilfs_segctor_accept(sci, &req);
2645 nilfs_segctor_construct(sci, &req);
2646 nilfs_segctor_notify(sci, &req);
2647
2648 /*
2649 * An unclosed segment should be retried. We do this using sc_timer.
2650 * When sc_timer expires, it triggers a complete construction, which
2651 * closes the current logical segment.
2652 */
2653 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2654 nilfs_segctor_start_timer(sci);
2655
2656 nilfs_transaction_unlock(sbi);
2657}
2658
2659static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
2660{
2661 int mode = 0;
2662 int err;
2663
2664 spin_lock(&sci->sc_state_lock);
2665 mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
2666 SC_FLUSH_DAT : SC_FLUSH_FILE;
2667 spin_unlock(&sci->sc_state_lock);
2668
2669 if (mode) {
2670 err = nilfs_segctor_do_construct(sci, mode);
2671
2672 spin_lock(&sci->sc_state_lock);
2673 sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
2674 ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
2675 spin_unlock(&sci->sc_state_lock);
2676 }
2677 clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
2678}
2679
2680static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2681{
2682 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2683 time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
2684 if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
2685 return SC_FLUSH_FILE;
2686 else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
2687 return SC_FLUSH_DAT;
2688 }
2689 return SC_LSEG_SR;
2690}
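/*
 * Editorial note: a plain flush is chosen unless the current logical
 * segment has stayed unclosed longer than sc_mjcp_freq, and only when
 * the pending requests are purely data-file or purely DAT; any other
 * combination escalates to a full construction with a super root
 * (SC_LSEG_SR).
 */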
2691
2692/**
2693 * nilfs_segctor_thread - main loop of the segment constructor thread.
2694 * @arg: pointer to a struct nilfs_sc_info.
2695 *
2696 * nilfs_segctor_thread() initializes a timer and serves as a daemon
2697 * to execute segment constructions.
2698 */
2699static int nilfs_segctor_thread(void *arg)
2700{
2701 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2702 struct timer_list timer;
2703 int timeout = 0;
2704
2705 init_timer(&timer);
2706 timer.data = (unsigned long)current;
2707 timer.function = nilfs_construction_timeout;
2708 sci->sc_timer = &timer;
2709
2710 /* start sync. */
2711 sci->sc_task = current;
2712 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
2713 printk(KERN_INFO
2714 "segctord starting. Construction interval = %lu seconds, "
2715 "CP frequency < %lu seconds\n",
2716 sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
2717
2718 spin_lock(&sci->sc_state_lock);
2719 loop:
2720 for (;;) {
2721 int mode;
2722
2723 if (sci->sc_state & NILFS_SEGCTOR_QUIT)
2724 goto end_thread;
2725
2726 if (timeout || sci->sc_seq_request != sci->sc_seq_done)
2727 mode = SC_LSEG_SR;
2728 else if (!sci->sc_flush_request)
2729 break;
2730 else
2731 mode = nilfs_segctor_flush_mode(sci);
2732
2733 spin_unlock(&sci->sc_state_lock);
2734 nilfs_segctor_thread_construct(sci, mode);
2735 spin_lock(&sci->sc_state_lock);
2736 timeout = 0;
2737 }
2738
2739
2740 if (freezing(current)) {
2741 spin_unlock(&sci->sc_state_lock);
2742 refrigerator();
2743 spin_lock(&sci->sc_state_lock);
2744 } else {
2745 DEFINE_WAIT(wait);
2746 int should_sleep = 1;
2747
2748 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2749 TASK_INTERRUPTIBLE);
2750
2751 if (sci->sc_seq_request != sci->sc_seq_done)
2752 should_sleep = 0;
2753 else if (sci->sc_flush_request)
2754 should_sleep = 0;
2755 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2756 should_sleep = time_before(jiffies,
2757 sci->sc_timer->expires);
2758
2759 if (should_sleep) {
2760 spin_unlock(&sci->sc_state_lock);
2761 schedule();
2762 spin_lock(&sci->sc_state_lock);
2763 }
2764 finish_wait(&sci->sc_wait_daemon, &wait);
2765 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2766 time_after_eq(jiffies, sci->sc_timer->expires));
2767 }
2768 goto loop;
2769
2770 end_thread:
2771 spin_unlock(&sci->sc_state_lock);
2772 del_timer_sync(sci->sc_timer);
2773 sci->sc_timer = NULL;
2774
2775 /* end sync. */
2776 sci->sc_task = NULL;
2777 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
2778 return 0;
2779}
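/*
 * Editorial note: the daemon loop runs under sc_state_lock.  It drains
 * sync requests (sc_seq_request vs. sc_seq_done) and flush bits, then
 * sleeps on sc_wait_daemon until a new request arrives or the
 * construction timer expires.
 */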
2780
2781static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2782{
2783 struct task_struct *t;
2784
2785 t = kthread_run(nilfs_segctor_thread, sci, "segctord");
2786 if (IS_ERR(t)) {
2787 int err = PTR_ERR(t);
2788
2789 printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
2790 err);
2791 return err;
2792 }
2793 wait_event(sci->sc_wait_task, sci->sc_task != NULL);
2794 return 0;
2795}
2796
2797static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2798{
2799 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2800
2801 while (sci->sc_task) {
2802 wake_up(&sci->sc_wait_daemon);
2803 spin_unlock(&sci->sc_state_lock);
2804 wait_event(sci->sc_wait_task, sci->sc_task == NULL);
2805 spin_lock(&sci->sc_state_lock);
2806 }
2807}
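/*
 * Editorial note: called with sc_state_lock held (see
 * nilfs_segctor_destroy()); the loop above drops and retakes the lock
 * around wait_event().
 */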
2808
2809static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2810{
2811 sci->sc_seq_done = sci->sc_seq_request;
2812
2813 return nilfs_segctor_start_thread(sci);
2814}
2815
2816/*
2817 * Setup & clean-up functions
2818 */
2819static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2820{
2821 struct nilfs_sc_info *sci;
2822
2823 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2824 if (!sci)
2825 return NULL;
2826
2827 sci->sc_sbi = sbi;
2828 sci->sc_super = sbi->s_super;
2829
2830 init_waitqueue_head(&sci->sc_wait_request);
2831 init_waitqueue_head(&sci->sc_wait_daemon);
2832 init_waitqueue_head(&sci->sc_wait_task);
2833 spin_lock_init(&sci->sc_state_lock);
2834 INIT_LIST_HEAD(&sci->sc_dirty_files);
2835 INIT_LIST_HEAD(&sci->sc_segbufs);
2836 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2837 INIT_LIST_HEAD(&sci->sc_cleaning_segments);
2838 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2839
2840 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2841 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2842 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2843
2844 if (sbi->s_interval)
2845 sci->sc_interval = sbi->s_interval;
2846 if (sbi->s_watermark)
2847 sci->sc_watermark = sbi->s_watermark;
2848 return sci;
2849}
2850
2851static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2852{
2853 int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
2854
2855 /* The segctord thread was stopped and its timer was removed,
2856 but some tasks may remain. */
2857 do {
2858 struct nilfs_sb_info *sbi = sci->sc_sbi;
2859 struct nilfs_transaction_info ti;
2860 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2861
2862 nilfs_transaction_lock(sbi, &ti, 0);
2863 nilfs_segctor_accept(sci, &req);
2864 ret = nilfs_segctor_construct(sci, &req);
2865 nilfs_segctor_notify(sci, &req);
2866 nilfs_transaction_unlock(sbi);
2867
2868 } while (ret && retrycount-- > 0);
2869}
2870
2871/**
2872 * nilfs_segctor_destroy - destroy the segment constructor.
2873 * @sci: nilfs_sc_info
2874 *
2875 * nilfs_segctor_destroy() kills the segctord thread and frees
2876 * the nilfs_sc_info struct.
2877 * Caller must hold the segment semaphore.
2878 */
2879static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2880{
2881 struct nilfs_sb_info *sbi = sci->sc_sbi;
2882 int flag;
2883
2884 up_write(&sbi->s_nilfs->ns_segctor_sem);
2885
2886 spin_lock(&sci->sc_state_lock);
2887 nilfs_segctor_kill_thread(sci);
2888 flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
2889 || sci->sc_seq_request != sci->sc_seq_done);
2890 spin_unlock(&sci->sc_state_lock);
2891
2892 if (flag || nilfs_segctor_confirm(sci))
2893 nilfs_segctor_write_out(sci);
2894
2895 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2896
2897 if (!list_empty(&sci->sc_dirty_files)) {
2898 nilfs_warning(sbi->s_super, __func__,
2899 "dirty file(s) after the final construction\n");
2900 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
2901 }
2902
2903 if (!list_empty(&sci->sc_cleaning_segments))
2904 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2905
2906 WARN_ON(!list_empty(&sci->sc_segbufs));
2907
2908 down_write(&sbi->s_nilfs->ns_segctor_sem);
2909
2910 kfree(sci);
2911}
2912
2913/**
2914 * nilfs_attach_segment_constructor - attach a segment constructor
2915 * @sbi: nilfs_sb_info
2916 *
2917 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2918 * initializes it, and starts the segment constructor.
2919 *
2920 * Return Value: On success, 0 is returned. On error, one of the following
2921 * negative error codes is returned.
2922 *
2923 * %-ENOMEM - Insufficient memory available.
2924 */
2925int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2926{
2927 struct the_nilfs *nilfs = sbi->s_nilfs;
2928 int err;
2929
2930 /* Each field of nilfs_segctor is cleared through the initialization
2931 of the super-block info */
2932 sbi->s_sc_info = nilfs_segctor_new(sbi);
2933 if (!sbi->s_sc_info)
2934 return -ENOMEM;
2935
2936 nilfs_attach_writer(nilfs, sbi);
2937 err = nilfs_segctor_init(NILFS_SC(sbi));
2938 if (err) {
2939 nilfs_detach_writer(nilfs, sbi);
2940 kfree(sbi->s_sc_info);
2941 sbi->s_sc_info = NULL;
2942 }
2943 return err;
2944}
2945
2946/**
2947 * nilfs_detach_segment_constructor - destroy the segment constructor
2948 * @sbi: nilfs_sb_info
2949 *
2950 * nilfs_detach_segment_constructor() kills the segment constructor daemon,
2951 * frees the struct nilfs_sc_info, and destroys the dirty file list.
2952 */
2953void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2954{
2955 struct the_nilfs *nilfs = sbi->s_nilfs;
2956 LIST_HEAD(garbage_list);
2957
2958 down_write(&nilfs->ns_segctor_sem);
2959 if (NILFS_SC(sbi)) {
2960 nilfs_segctor_destroy(NILFS_SC(sbi));
2961 sbi->s_sc_info = NULL;
2962 }
2963
2964 /* Force to free the list of dirty files */
2965 spin_lock(&sbi->s_inode_lock);
2966 if (!list_empty(&sbi->s_dirty_files)) {
2967 list_splice_init(&sbi->s_dirty_files, &garbage_list);
2968 nilfs_warning(sbi->s_super, __func__,
2969 "Non empty dirty list after the last "
2970 "segment construction\n");
2971 }
2972 spin_unlock(&sbi->s_inode_lock);
2973 up_write(&nilfs->ns_segctor_sem);
2974
2975 nilfs_dispose_list(sbi, &garbage_list, 1);
2976 nilfs_detach_writer(nilfs, sbi);
2977}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 000000000000..a98fc1ed0bbb
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,243 @@
1/*
2 * segment.h - NILFS Segment constructor prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGMENT_H
24#define _NILFS_SEGMENT_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sb.h"
31
32/**
33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint
37 * @ri_lsegs_start: Region for roll-forwarding (start block number)
38 * @ri_lsegs_end: Region for roll-forwarding (end block number)
39 * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
40 * @ri_used_segments: List of segments to be marked active
41 * @ri_pseg_start: Block number of the last partial segment
42 * @ri_seq: Sequence number on the last partial segment
43 * @ri_segnum: Segment number on the last partial segment
44 * @ri_nextnum: Next segment number on the last partial segment
45 */
46struct nilfs_recovery_info {
47 int ri_need_recovery;
48 sector_t ri_super_root;
49 __u64 ri_cno;
50
51 sector_t ri_lsegs_start;
52 sector_t ri_lsegs_end;
53 u64 ri_lsegs_start_seq;
54 struct list_head ri_used_segments;
55 sector_t ri_pseg_start;
56 u64 ri_seq;
57 __u64 ri_segnum;
58 __u64 ri_nextnum;
59};
60
61/* ri_need_recovery */
62#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */
63#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */
64
65/**
66 * struct nilfs_cstage - Context of collection stage
67 * @scnt: Stage count
68 * @flags: State flags
69 * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
70 * @gc_inode_ptr: Pointer on the list of gc-inodes
71 */
72struct nilfs_cstage {
73 int scnt;
74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr;
77};
78
79struct nilfs_segment_buffer;
80
81struct nilfs_segsum_pointer {
82 struct buffer_head *bh;
83 unsigned offset; /* offset in bytes */
84};
85
86/**
87 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct
90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive)
98 * @sc_segbufs: List of segment buffers
99 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
100 * @sc_curseg: Current segment buffer
101 * @sc_super_root: Pointer to the super root buffer
102 * @sc_stage: Collection stage
103 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
104 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
105 * @sc_blk_cnt: Block count of a file
106 * @sc_datablk_cnt: Data block count of a file
107 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
108 * @sc_seg_ctime: Creation time
109 * @sc_flags: Internal flags
110 * @sc_state_lock: spinlock for sc_state and so on
111 * @sc_state: Segctord state flags
112 * @sc_flush_request: inode bitmap of metadata files to be flushed
113 * @sc_wait_request: Client request queue
114 * @sc_wait_daemon: Daemon wait queue
115 * @sc_wait_task: Start/end wait queue to control segctord task
116 * @sc_seq_request: Request counter
117 * @sc_seq_done: Completion counter
118 * @sc_sync: Request of explicit sync operation
119 * @sc_interval: Timeout value of background construction
120 * @sc_mjcp_freq: Frequency of creating checkpoints
121 * @sc_lseg_stime: Start time of the latest logical segment
122 * @sc_watermark: Watermark for the number of dirty buffers
123 * @sc_timer: Timer for segctord
124 * @sc_task: current thread of segctord
125 */
126struct nilfs_sc_info {
127 struct super_block *sc_super;
128 struct nilfs_sb_info *sc_sbi;
129
130 unsigned long sc_nblk_inc;
131
132 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers;
136
137 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start;
139 loff_t sc_dsync_end;
140
141 /* Segment buffers */
142 struct list_head sc_segbufs;
143 unsigned long sc_segbuf_nblocks;
144 struct nilfs_segment_buffer *sc_curseg;
145 struct buffer_head *sc_super_root;
146
147 struct nilfs_cstage sc_stage;
148
149 struct nilfs_segsum_pointer sc_finfo_ptr;
150 struct nilfs_segsum_pointer sc_binfo_ptr;
151 unsigned long sc_blk_cnt;
152 unsigned long sc_datablk_cnt;
153 unsigned long sc_nblk_this_inc;
154 time_t sc_seg_ctime;
155
156 unsigned long sc_flags;
157
158 spinlock_t sc_state_lock;
159 unsigned long sc_state;
160 unsigned long sc_flush_request;
161
162 wait_queue_head_t sc_wait_request;
163 wait_queue_head_t sc_wait_daemon;
164 wait_queue_head_t sc_wait_task;
165
166 __u32 sc_seq_request;
167 __u32 sc_seq_done;
168
169 int sc_sync;
170 unsigned long sc_interval;
171 unsigned long sc_mjcp_freq;
172 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
173 unsigned long sc_watermark;
174
175 struct timer_list *sc_timer;
176 struct task_struct *sc_task;
177};
178
179/* sc_flags */
180enum {
181 NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
182 NILFS_SC_UNCLOSED, /* Logical segment is not closed */
183 NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
184 NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
185 checkpoint */
186 NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
187 other than DAT, cpfile, sufile, or files
188 moved by GC */
189};
190
191/* sc_state */
192#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
193#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
194
195/*
196 * Constant parameters
197 */
198#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
199 destroying segctord */
200
201/*
202 * Default values of timeout, in seconds.
203 */
204#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
205 It triggers construction of a
206 logical segment with a super root */
207#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
208 creation */
209
210/*
211 * The default threshold amount of data, in block counts.
212 */
213#define NILFS_SC_DEFAULT_WATERMARK 3600
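/*
 * For reference: nilfs_segctor_new() converts the second-based defaults
 * above into jiffies, i.e. sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT
 * and sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ.
 */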
214
215
216/* segment.c */
217extern int nilfs_init_transaction_cache(void);
218extern void nilfs_destroy_transaction_cache(void);
219extern void nilfs_relax_pressure_in_lock(struct super_block *);
220
221extern int nilfs_construct_segment(struct super_block *);
222extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
223 loff_t, loff_t);
224extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, void __user *);
226
227extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
228 __u64 *, size_t);
229extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
230
231extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
232extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
233
234/* recovery.c */
235extern int nilfs_read_super_root_block(struct super_block *, sector_t,
236 struct buffer_head **, int);
237extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
238 struct nilfs_recovery_info *);
239extern int nilfs_recover_logical_segments(struct the_nilfs *,
240 struct nilfs_sb_info *,
241 struct nilfs_recovery_info *);
242
243#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 000000000000..c774cf397e2f
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,640 @@
1/*
2 * sufile.c - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "sufile.h"
31
32
33static inline unsigned long
34nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
35{
36 return NILFS_MDT(sufile)->mi_entries_per_block;
37}
38
39static unsigned long
40nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
41{
42 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
43 do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
44 return (unsigned long)t;
45}
46
47static unsigned long
48nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
49{
50 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
51 return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
52}
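/*
 * Editorial note: do_div() divides its 64-bit argument in place and
 * returns the remainder, so get_blkoff() yields the block index while
 * get_offset() yields the entry offset within that block.
 */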
53
54static unsigned long
55nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
56 __u64 max)
57{
58 return min_t(unsigned long,
59 nilfs_sufile_segment_usages_per_block(sufile) -
60 nilfs_sufile_get_offset(sufile, curr),
61 max - curr + 1);
62}
63
64static inline struct nilfs_sufile_header *
65nilfs_sufile_block_get_header(const struct inode *sufile,
66 struct buffer_head *bh,
67 void *kaddr)
68{
69 return kaddr + bh_offset(bh);
70}
71
72static struct nilfs_segment_usage *
73nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
74 struct buffer_head *bh, void *kaddr)
75{
76 return kaddr + bh_offset(bh) +
77 nilfs_sufile_get_offset(sufile, segnum) *
78 NILFS_MDT(sufile)->mi_entry_size;
79}
80
81static inline int nilfs_sufile_get_header_block(struct inode *sufile,
82 struct buffer_head **bhp)
83{
84 return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
85}
86
87static inline int
88nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
89 int create, struct buffer_head **bhp)
90{
91 return nilfs_mdt_get_block(sufile,
92 nilfs_sufile_get_blkoff(sufile, segnum),
93 create, NULL, bhp);
94}
95
96/**
97 * nilfs_sufile_alloc - allocate a segment
98 * @sufile: inode of segment usage file
99 * @segnump: pointer to segment number
100 *
101 * Description: nilfs_sufile_alloc() allocates a clean segment.
102 *
103 * Return Value: On success, 0 is returned and the segment number of the
104 * allocated segment is stored in the place pointed by @segnump. On error, one
105 * of the following negative error codes is returned.
106 *
107 * %-EIO - I/O error.
108 *
109 * %-ENOMEM - Insufficient amount of memory available.
110 *
111 * %-ENOSPC - No clean segment left.
112 */
113int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
114{
115 struct buffer_head *header_bh, *su_bh;
116 struct the_nilfs *nilfs;
117 struct nilfs_sufile_header *header;
118 struct nilfs_segment_usage *su;
119 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
120 __u64 segnum, maxsegnum, last_alloc;
121 void *kaddr;
122 unsigned long nsegments, ncleansegs, nsus;
123 int ret, i, j;
124
125 down_write(&NILFS_MDT(sufile)->mi_sem);
126
127 nilfs = NILFS_MDT(sufile)->mi_nilfs;
128
129 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
130 if (ret < 0)
131 goto out_sem;
132 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
133 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
134 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
135 last_alloc = le64_to_cpu(header->sh_last_alloc);
136 kunmap_atomic(kaddr, KM_USER0);
137
138 nsegments = nilfs_sufile_get_nsegments(sufile);
139 segnum = last_alloc + 1;
140 maxsegnum = nsegments - 1;
141 for (i = 0; i < nsegments; i += nsus) {
142 if (segnum >= nsegments) {
143 /* wrap around */
144 segnum = 0;
145 maxsegnum = last_alloc;
146 }
147 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
148 &su_bh);
149 if (ret < 0)
150 goto out_header;
151 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
152 su = nilfs_sufile_block_get_segment_usage(
153 sufile, segnum, su_bh, kaddr);
154
155 nsus = nilfs_sufile_segment_usages_in_block(
156 sufile, segnum, maxsegnum);
157 for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
158 if (!nilfs_segment_usage_clean(su))
159 continue;
160 /* found a clean segment */
161 nilfs_segment_usage_set_dirty(su);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
165 header = nilfs_sufile_block_get_header(
166 sufile, header_bh, kaddr);
167 le64_add_cpu(&header->sh_ncleansegs, -1);
168 le64_add_cpu(&header->sh_ndirtysegs, 1);
169 header->sh_last_alloc = cpu_to_le64(segnum);
170 kunmap_atomic(kaddr, KM_USER0);
171
172 nilfs_mdt_mark_buffer_dirty(header_bh);
173 nilfs_mdt_mark_buffer_dirty(su_bh);
174 nilfs_mdt_mark_dirty(sufile);
175 brelse(su_bh);
176 *segnump = segnum;
177 goto out_header;
178 }
179
180 kunmap_atomic(kaddr, KM_USER0);
181 brelse(su_bh);
182 }
183
184 /* no segments left */
185 ret = -ENOSPC;
186
187 out_header:
188 brelse(header_bh);
189
190 out_sem:
191 up_write(&NILFS_MDT(sufile)->mi_sem);
192 return ret;
193}
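/*
 * Editorial note: the scan above starts just after sh_last_alloc and
 * wraps around the segment space once, so every segment usage entry is
 * examined at most once before -ENOSPC is returned.
 */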
194
195/**
196 * nilfs_sufile_cancel_free - cancel freeing of a segment
197 * @sufile: inode of segment usage file
198 * @segnum: segment number
199 *
200 * Description: nilfs_sufile_cancel_free() cancels the freeing of the
201 * segment specified by @segnum, marking its usage dirty again.
202 * Return Value: On success, 0 is returned. On error, one of the following
203 * negative error codes is returned.
204 *
205 * %-EIO - I/O error.
206 *
207 * %-ENOMEM - Insufficient amount of memory available.
208 */
209int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
210{
211 struct buffer_head *header_bh, *su_bh;
212 struct the_nilfs *nilfs;
213 struct nilfs_sufile_header *header;
214 struct nilfs_segment_usage *su;
215 void *kaddr;
216 int ret;
217
218 down_write(&NILFS_MDT(sufile)->mi_sem);
219
220 nilfs = NILFS_MDT(sufile)->mi_nilfs;
221
222 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
223 if (ret < 0)
224 goto out_sem;
225
226 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
227 if (ret < 0)
228 goto out_header;
229
230 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
231 su = nilfs_sufile_block_get_segment_usage(
232 sufile, segnum, su_bh, kaddr);
233 if (unlikely(!nilfs_segment_usage_clean(su))) {
234 printk(KERN_WARNING "%s: segment %llu must be clean\n",
235 __func__, (unsigned long long)segnum);
236 kunmap_atomic(kaddr, KM_USER0);
237 goto out_su_bh;
238 }
239 nilfs_segment_usage_set_dirty(su);
240 kunmap_atomic(kaddr, KM_USER0);
241
242 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
243 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
244 le64_add_cpu(&header->sh_ncleansegs, -1);
245 le64_add_cpu(&header->sh_ndirtysegs, 1);
246 kunmap_atomic(kaddr, KM_USER0);
247
248 nilfs_mdt_mark_buffer_dirty(header_bh);
249 nilfs_mdt_mark_buffer_dirty(su_bh);
250 nilfs_mdt_mark_dirty(sufile);
251
252 out_su_bh:
253 brelse(su_bh);
254 out_header:
255 brelse(header_bh);
256 out_sem:
257 up_write(&NILFS_MDT(sufile)->mi_sem);
258 return ret;
259}
260
261/**
262 * nilfs_sufile_freev - free segments
263 * @sufile: inode of segment usage file
264 * @segnum: array of segment numbers
265 * @nsegs: number of segments
266 *
267 * Description: nilfs_sufile_freev() frees segments specified by @segnum and
268 * @nsegs, which must have been returned by a previous call to
269 * nilfs_sufile_alloc().
270 *
271 * Return Value: On success, 0 is returned. On error, one of the following
272 * negative error codes is returned.
273 *
274 * %-EIO - I/O error.
275 *
276 * %-ENOMEM - Insufficient amount of memory available.
277 */
278#define NILFS_SUFILE_FREEV_PREALLOC 16
279int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs)
280{
281 struct buffer_head *header_bh, **su_bh,
282 *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC];
283 struct the_nilfs *nilfs;
284 struct nilfs_sufile_header *header;
285 struct nilfs_segment_usage *su;
286 void *kaddr;
287 int ret, i;
288
289 down_write(&NILFS_MDT(sufile)->mi_sem);
290
291 nilfs = NILFS_MDT(sufile)->mi_nilfs;
292
293 /* prepare resources */
294 if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC)
295 su_bh = su_bh_prealloc;
296 else {
297 su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS);
298 if (su_bh == NULL) {
299 ret = -ENOMEM;
300 goto out_sem;
301 }
302 }
303
304 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
305 if (ret < 0)
306 goto out_su_bh;
307 for (i = 0; i < nsegs; i++) {
308 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i],
309 0, &su_bh[i]);
310 if (ret < 0)
311 goto out_bh;
312 }
313
314 /* free segments */
315 for (i = 0; i < nsegs; i++) {
316 kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0);
317 su = nilfs_sufile_block_get_segment_usage(
318 sufile, segnum[i], su_bh[i], kaddr);
319 WARN_ON(nilfs_segment_usage_error(su));
320 nilfs_segment_usage_set_clean(su);
321 kunmap_atomic(kaddr, KM_USER0);
322 nilfs_mdt_mark_buffer_dirty(su_bh[i]);
323 }
324 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
325 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
326 le64_add_cpu(&header->sh_ncleansegs, nsegs);
327 le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs);
328 kunmap_atomic(kaddr, KM_USER0);
329 nilfs_mdt_mark_buffer_dirty(header_bh);
330 nilfs_mdt_mark_dirty(sufile);
331
332 out_bh:
333 for (i--; i >= 0; i--)
334 brelse(su_bh[i]);
335 brelse(header_bh);
336
337 out_su_bh:
338 if (su_bh != su_bh_prealloc)
339 kfree(su_bh);
340
341 out_sem:
342 up_write(&NILFS_MDT(sufile)->mi_sem);
343 return ret;
344}
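/*
 * Editorial note on the unwinding convention: on the error path, i
 * indexes the buffer head that failed, so the loop at out_bh releases
 * exactly the references taken so far (all of them on success, where
 * i == nsegs).
 */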
345
346/**
347 * nilfs_sufile_free - free a segment
348 * @sufile: inode of segment usage file
349 * @segnum: segment number to be freed
350 */
351int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
352{
353 return nilfs_sufile_freev(sufile, &segnum, 1);
354}
355
356/**
357 * nilfs_sufile_get_segment_usage - get a segment usage
358 * @sufile: inode of segment usage file
359 * @segnum: segment number
360 * @sup: pointer to segment usage
361 * @bhp: pointer to buffer head
362 *
363 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
364 * specified by @segnum.
365 *
366 * Return Value: On success, 0 is returned, and the segment usage and the
367 * buffer head of the buffer on which the segment usage is located are stored
368 * in the place pointed by @sup and @bhp, respectively. On error, one of the
369 * following negative error codes is returned.
370 *
371 * %-EIO - I/O error.
372 *
373 * %-ENOMEM - Insufficient amount of memory available.
374 *
375 * %-EINVAL - Invalid segment usage number.
376 */
377int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
378 struct nilfs_segment_usage **sup,
379 struct buffer_head **bhp)
380{
381 struct buffer_head *bh;
382 struct nilfs_segment_usage *su;
383 void *kaddr;
384 int ret;
385
386 /* segnum is 0 origin */
387 if (segnum >= nilfs_sufile_get_nsegments(sufile))
388 return -EINVAL;
389 down_write(&NILFS_MDT(sufile)->mi_sem);
390 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
391 if (ret < 0)
392 goto out_sem;
393 kaddr = kmap(bh->b_page);
394 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
395 if (nilfs_segment_usage_error(su)) {
396 kunmap(bh->b_page);
397 brelse(bh);
398 ret = -EINVAL;
399 goto out_sem;
400 }
401
402 if (sup != NULL)
403 *sup = su;
404 *bhp = bh;
405
406 out_sem:
407 up_write(&NILFS_MDT(sufile)->mi_sem);
408 return ret;
409}
410
411/**
412 * nilfs_sufile_put_segment_usage - put a segment usage
413 * @sufile: inode of segment usage file
414 * @segnum: segment number
415 * @bh: buffer head
416 *
417 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
418 * specified by @segnum. @bh must be the buffer head which has been returned
419 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
420 */
421void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
422 struct buffer_head *bh)
423{
424 kunmap(bh->b_page);
425 brelse(bh);
426}
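/*
 * Editorial note: this get/put pair uses kmap() rather than
 * kmap_atomic(), so callers may sleep while holding the mapping;
 * put_segment_usage() both unmaps the page and drops the buffer
 * reference taken by get_segment_usage().
 */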
427
428/**
429 * nilfs_sufile_get_stat - get segment usage statistics
430 * @sufile: inode of segment usage file
431 * @stat: pointer to a structure of segment usage statistics
432 *
433 * Description: nilfs_sufile_get_stat() returns information about segment
434 * usage.
435 *
436 * Return Value: On success, 0 is returned, and segment usage information is
437 * stored in the place pointed by @stat. On error, one of the following
438 * negative error codes is returned.
439 *
440 * %-EIO - I/O error.
441 *
442 * %-ENOMEM - Insufficient amount of memory available.
443 */
444int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
445{
446 struct buffer_head *header_bh;
447 struct nilfs_sufile_header *header;
448 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
449 void *kaddr;
450 int ret;
451
452 down_read(&NILFS_MDT(sufile)->mi_sem);
453
454 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
455 if (ret < 0)
456 goto out_sem;
457
458 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
459 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
460 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
461 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
462 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
463 sustat->ss_ctime = nilfs->ns_ctime;
464 sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
465 spin_lock(&nilfs->ns_last_segment_lock);
466 sustat->ss_prot_seq = nilfs->ns_prot_seq;
467 spin_unlock(&nilfs->ns_last_segment_lock);
468 kunmap_atomic(kaddr, KM_USER0);
469 brelse(header_bh);
470
471 out_sem:
472 up_read(&NILFS_MDT(sufile)->mi_sem);
473 return ret;
474}
475
476/**
477 * nilfs_sufile_get_ncleansegs - get the number of clean segments
478 * @sufile: inode of segment usage file
479 * @nsegsp: pointer to the number of clean segments
480 *
481 * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
482 * segments.
483 *
484 * Return Value: On success, 0 is returned and the number of clean segments is
485 * stored in the place pointed by @nsegsp. On error, one of the following
486 * negative error codes is returned.
487 *
488 * %-EIO - I/O error.
489 *
490 * %-ENOMEM - Insufficient amount of memory available.
491 */
492int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
493{
494 struct nilfs_sustat sustat;
495 int ret;
496
497 ret = nilfs_sufile_get_stat(sufile, &sustat);
498 if (ret == 0)
499 *nsegsp = sustat.ss_ncleansegs;
500 return ret;
501}
502
503/**
504 * nilfs_sufile_set_error - mark a segment as erroneous
505 * @sufile: inode of segment usage file
506 * @segnum: segment number
507 *
508 * Description: nilfs_sufile_set_error() marks the segment specified by
509 * @segnum as erroneous. The error segment will never be used again.
510 *
511 * Return Value: On success, 0 is returned. On error, one of the following
512 * negative error codes is returned.
513 *
514 * %-EIO - I/O error.
515 *
516 * %-ENOMEM - Insufficient amount of memory available.
517 *
518 * %-EINVAL - Invalid segment usage number.
519 */
520int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
521{
522 struct buffer_head *header_bh, *su_bh;
523 struct nilfs_segment_usage *su;
524 struct nilfs_sufile_header *header;
525 void *kaddr;
526 int ret;
527
528 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
529 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
530 __func__, (unsigned long long)segnum);
531 return -EINVAL;
532 }
533 down_write(&NILFS_MDT(sufile)->mi_sem);
534
535 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
536 if (ret < 0)
537 goto out_sem;
538 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
539 if (ret < 0)
540 goto out_header;
541
542 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
543 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
544 if (nilfs_segment_usage_error(su)) {
545 kunmap_atomic(kaddr, KM_USER0);
546 brelse(su_bh);
547 goto out_header;
548 }
549
550 nilfs_segment_usage_set_error(su);
551 kunmap_atomic(kaddr, KM_USER0);
552 brelse(su_bh);
553
554 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
555 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
556 le64_add_cpu(&header->sh_ndirtysegs, -1);
557 kunmap_atomic(kaddr, KM_USER0);
558 nilfs_mdt_mark_buffer_dirty(header_bh);
559 nilfs_mdt_mark_buffer_dirty(su_bh);
560 nilfs_mdt_mark_dirty(sufile);
561 brelse(su_bh);
562
563 out_header:
564 brelse(header_bh);
565
566 out_sem:
567 up_write(&NILFS_MDT(sufile)->mi_sem);
568 return ret;
569}
570
571/**
572 * nilfs_sufile_get_suinfo - get segment usage information
573 * @sufile: inode of segment usage file
574 * @segnum: segment number to start looking
575 * @si: array of suinfo
576 * @nsi: size of suinfo array
577 *
578 * Description: nilfs_sufile_get_suinfo() fetches usage information on up
579 * to @nsi segments starting at @segnum and stores it in the @si array.
580 * Return Value: On success, the number of entries stored in @si is
581 * returned. On error, one of the following negative error codes is returned.
582 *
583 * %-EIO - I/O error.
584 *
585 * %-ENOMEM - Insufficient amount of memory available.
586 */
587ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
588 struct nilfs_suinfo *si, size_t nsi)
589{
590 struct buffer_head *su_bh;
591 struct nilfs_segment_usage *su;
592 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
593 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
594 void *kaddr;
595 unsigned long nsegs, segusages_per_block;
596 ssize_t n;
597 int ret, i, j;
598
599 down_read(&NILFS_MDT(sufile)->mi_sem);
600
601 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
602 nsegs = min_t(unsigned long,
603 nilfs_sufile_get_nsegments(sufile) - segnum,
604 nsi);
605 for (i = 0; i < nsegs; i += n, segnum += n) {
606 n = min_t(unsigned long,
607 segusages_per_block -
608 nilfs_sufile_get_offset(sufile, segnum),
609 nsegs - i);
610 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
611 &su_bh);
612 if (ret < 0) {
613 if (ret != -ENOENT)
614 goto out;
615 /* hole */
616 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
617 continue;
618 }
619
620 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
621 su = nilfs_sufile_block_get_segment_usage(
622 sufile, segnum, su_bh, kaddr);
623 for (j = 0; j < n; j++, su = (void *)su + susz) {
624 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
625 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
626 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
627 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
628 if (nilfs_segment_is_active(nilfs, segnum + j))
629 si[i + j].sui_flags |=
630 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
631 }
632 kunmap_atomic(kaddr, KM_USER0);
633 brelse(su_bh);
634 }
635 ret = nsegs;
636
637 out:
638 up_read(&NILFS_MDT(sufile)->mi_sem);
639 return ret;
640}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 000000000000..d595f33a768d
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,54 @@
1/*
2 * sufile.h - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_SUFILE_H
24#define _NILFS_SUFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{
35 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
36}
37
38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_cancel_free(struct inode *, __u64);
40int nilfs_sufile_freev(struct inode *, __u64 *, size_t);
41int nilfs_sufile_free(struct inode *, __u64);
42int nilfs_sufile_get_segment_usage(struct inode *, __u64,
43 struct nilfs_segment_usage **,
44 struct buffer_head **);
45void nilfs_sufile_put_segment_usage(struct inode *, __u64,
46 struct buffer_head *);
47int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
48int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
49int nilfs_sufile_set_error(struct inode *, __u64);
50ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
51 size_t);
52
53
54#endif /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 000000000000..e117e1ea9bff
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1323 @@
1/*
2 * super.c - NILFS module and super block management.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22/*
23 * linux/fs/ext2/super.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/inode.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * Big-endian to little-endian byte-swapping/bitmaps by
37 * David S. Miller (davem@caip.rutgers.edu), 1995
38 */
39
40#include <linux/module.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/init.h>
44#include <linux/blkdev.h>
45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h>
50#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include "nilfs.h"
54#include "mdt.h"
55#include "alloc.h"
56#include "page.h"
57#include "cpfile.h"
58#include "ifile.h"
59#include "dat.h"
60#include "segment.h"
61#include "segbuf.h"
62
63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)");
66MODULE_VERSION(NILFS_VERSION);
67MODULE_LICENSE("GPL");
68
69static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70static int test_exclusive_mount(struct file_system_type *fs_type,
71 struct block_device *bdev, int flags);
72
73/**
74 * nilfs_error() - report failure condition on a filesystem
75 *
76 * nilfs_error() sets an ERROR_FS flag on the superblock as well as
77 * reporting an error message. It should be called when NILFS detects
78 * inconsistencies or defects in metadata on disk. For recoverable
79 * errors such as a one-off I/O error, nilfs_warning() or the printk()
80 * function should be used instead.
81 *
82 * The segment constructor must not call this function because it can
83 * kill itself.
84 */
85void nilfs_error(struct super_block *sb, const char *function,
86 const char *fmt, ...)
87{
88 struct nilfs_sb_info *sbi = NILFS_SB(sb);
89 va_list args;
90
91 va_start(args, fmt);
92 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
93 vprintk(fmt, args);
94 printk("\n");
95 va_end(args);
96
97 if (!(sb->s_flags & MS_RDONLY)) {
98 struct the_nilfs *nilfs = sbi->s_nilfs;
99
100 if (!nilfs_test_opt(sbi, ERRORS_CONT))
101 nilfs_detach_segment_constructor(sbi);
102
103 down_write(&nilfs->ns_sem);
104 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
105 nilfs->ns_mount_state |= NILFS_ERROR_FS;
106 nilfs->ns_sbp[0]->s_state |=
107 cpu_to_le16(NILFS_ERROR_FS);
108 nilfs_commit_super(sbi, 1);
109 }
110 up_write(&nilfs->ns_sem);
111
112 if (nilfs_test_opt(sbi, ERRORS_RO)) {
113 printk(KERN_CRIT "Remounting filesystem read-only\n");
114 sb->s_flags |= MS_RDONLY;
115 }
116 }
117
118 if (nilfs_test_opt(sbi, ERRORS_PANIC))
119 panic("NILFS (device %s): panic forced after error\n",
120 sb->s_id);
121}
122
123void nilfs_warning(struct super_block *sb, const char *function,
124 const char *fmt, ...)
125{
126 va_list args;
127
128 va_start(args, fmt);
129 printk(KERN_WARNING "NILFS warning (device %s): %s: ",
130 sb->s_id, function);
131 vprintk(fmt, args);
132 printk("\n");
133 va_end(args);
134}
135
136static struct kmem_cache *nilfs_inode_cachep;
137
138struct inode *nilfs_alloc_inode(struct super_block *sb)
139{
140 struct nilfs_inode_info *ii;
141
142 ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
143 if (!ii)
144 return NULL;
145 ii->i_bh = NULL;
146 ii->i_state = 0;
147 ii->vfs_inode.i_version = 1;
148 nilfs_btnode_cache_init(&ii->i_btnode_cache);
149 return &ii->vfs_inode;
150}
151
152void nilfs_destroy_inode(struct inode *inode)
153{
154 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
155}
156
157static void init_once(void *obj)
158{
159 struct nilfs_inode_info *ii = obj;
160
161 INIT_LIST_HEAD(&ii->i_dirty);
162#ifdef CONFIG_NILFS_XATTR
163 init_rwsem(&ii->xattr_sem);
164#endif
165 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
166 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
167 inode_init_once(&ii->vfs_inode);
168}
169
170static int nilfs_init_inode_cache(void)
171{
172 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
173 sizeof(struct nilfs_inode_info),
174 0, SLAB_RECLAIM_ACCOUNT,
175 init_once);
176
177 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
178}
179
180static inline void nilfs_destroy_inode_cache(void)
181{
182 kmem_cache_destroy(nilfs_inode_cachep);
183}
184
185static void nilfs_clear_inode(struct inode *inode)
186{
187 struct nilfs_inode_info *ii = NILFS_I(inode);
188
189#ifdef CONFIG_NILFS_POSIX_ACL
190 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
191 posix_acl_release(ii->i_acl);
192 ii->i_acl = NILFS_ACL_NOT_CACHED;
193 }
194 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
195 posix_acl_release(ii->i_default_acl);
196 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
197 }
198#endif
199 /*
200 * Free resources allocated in nilfs_read_inode(), here.
201 */
202 BUG_ON(!list_empty(&ii->i_dirty));
203 brelse(ii->i_bh);
204 ii->i_bh = NULL;
205
206 if (test_bit(NILFS_I_BMAP, &ii->i_state))
207 nilfs_bmap_clear(ii->i_bmap);
208
209 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
210}
211
212static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
213{
214 struct the_nilfs *nilfs = sbi->s_nilfs;
215 int err;
216 int barrier_done = 0;
217
218 if (nilfs_test_opt(sbi, BARRIER)) {
219 set_buffer_ordered(nilfs->ns_sbh[0]);
220 barrier_done = 1;
221 }
222 retry:
223 set_buffer_dirty(nilfs->ns_sbh[0]);
224 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
225 if (err == -EOPNOTSUPP && barrier_done) {
226 nilfs_warning(sbi->s_super, __func__,
227 "barrier-based sync failed. "
228 "disabling barriers\n");
229 nilfs_clear_opt(sbi, BARRIER);
230 barrier_done = 0;
231 clear_buffer_ordered(nilfs->ns_sbh[0]);
232 goto retry;
233 }
234 if (unlikely(err)) {
235 printk(KERN_ERR
236 "NILFS: unable to write superblock (err=%d)\n", err);
237 if (err == -EIO && nilfs->ns_sbh[1]) {
238 nilfs_fall_back_super_block(nilfs);
239 goto retry;
240 }
241 } else {
242 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
243
244 /*
245 * The latest segment becomes traceable from the position
246 * written in the superblock.
247 */
248 clear_nilfs_discontinued(nilfs);
249
250 /* update GC protection for recent segments */
251 if (nilfs->ns_sbh[1]) {
252 sbp = NULL;
253 if (dupsb) {
254 set_buffer_dirty(nilfs->ns_sbh[1]);
255 if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
256 sbp = nilfs->ns_sbp[1];
257 }
258 }
259 if (sbp) {
260 spin_lock(&nilfs->ns_last_segment_lock);
261 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
262 spin_unlock(&nilfs->ns_last_segment_lock);
263 }
264 }
265
266 return err;
267}
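/*
 * Editorial note on the failure handling above: -EOPNOTSUPP from a
 * barrier write disables barriers for this mount and retries without
 * them, while -EIO on the primary super block falls back to the
 * secondary copy when one exists.
 */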
268
269int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
270{
271 struct the_nilfs *nilfs = sbi->s_nilfs;
272 struct nilfs_super_block **sbp = nilfs->ns_sbp;
273 sector_t nfreeblocks;
274 time_t t;
275 int err;
276
277 /* nilfs->sem must be locked by the caller. */
278 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
279 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
280 nilfs_swap_super_block(nilfs);
281 else {
282 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
283 sbi->s_super->s_id);
284 return -EIO;
285 }
286 }
287 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
288 if (unlikely(err)) {
289 printk(KERN_ERR "NILFS: failed to count free blocks\n");
290 return err;
291 }
292 spin_lock(&nilfs->ns_last_segment_lock);
293 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
294 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
295 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
296 spin_unlock(&nilfs->ns_last_segment_lock);
297
298 t = get_seconds();
299 nilfs->ns_sbwtime[0] = t;
300 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
301 sbp[0]->s_wtime = cpu_to_le64(t);
302 sbp[0]->s_sum = 0;
303 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
304 (unsigned char *)sbp[0],
305 nilfs->ns_sbsize));
306 if (dupsb && sbp[1]) {
307 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
308 nilfs->ns_sbwtime[1] = t;
309 }
310 sbi->s_super->s_dirt = 0;
311 return nilfs_sync_super(sbi, dupsb);
312}
313
314static void nilfs_put_super(struct super_block *sb)
315{
316 struct nilfs_sb_info *sbi = NILFS_SB(sb);
317 struct the_nilfs *nilfs = sbi->s_nilfs;
318
319 nilfs_detach_segment_constructor(sbi);
320
321 if (!(sb->s_flags & MS_RDONLY)) {
322 down_write(&nilfs->ns_sem);
323 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
324 nilfs_commit_super(sbi, 1);
325 up_write(&nilfs->ns_sem);
326 }
327
328 nilfs_detach_checkpoint(sbi);
329 put_nilfs(sbi->s_nilfs);
330 sbi->s_super = NULL;
331 sb->s_fs_info = NULL;
332 kfree(sbi);
333}
334
335/**
336 * nilfs_write_super - write super block(s) of NILFS
337 * @sb: super_block
338 *
339 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
340 * clears s_dirt. This function is called in the section protected by
341 * lock_super().
342 *
343 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
344 * of the struct the_nilfs. Lock order must be as follows:
345 *
346 * 1. lock_super()
347 * 2. down_write(&nilfs->ns_sem)
348 *
349 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
350 * of the super block (nilfs->ns_sbp[]).
351 *
352 * In most cases, VFS functions call lock_super() before calling these
353 * methods, so we must be careful not to introduce deadlocks when using
354 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
355 *
356 * Note that the order of lock_kernel() and lock_super() depends on the
357 * VFS context. Also note that lock_kernel() may be nested inside its
358 * own protective section, in which case only the outermost call has an effect.
359 */
360static void nilfs_write_super(struct super_block *sb)
361{
362 struct nilfs_sb_info *sbi = NILFS_SB(sb);
363 struct the_nilfs *nilfs = sbi->s_nilfs;
364
365 down_write(&nilfs->ns_sem);
366 if (!(sb->s_flags & MS_RDONLY)) {
367 struct nilfs_super_block **sbp = nilfs->ns_sbp;
368 u64 t = get_seconds();
369 int dupsb;
370
371 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
372 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
373 up_write(&nilfs->ns_sem);
374 return;
375 }
376 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
377 nilfs_commit_super(sbi, dupsb);
378 }
379 sb->s_dirt = 0;
380 up_write(&nilfs->ns_sem);
381}
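/*
 * Editorial note: super block writeback is throttled here -- the
 * primary copy is skipped while it is younger than NILFS_SB_FREQ
 * seconds (unless the log became discontinued), and the secondary copy
 * is refreshed only once it is older than NILFS_ALTSB_FREQ seconds.
 */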
382
383static int nilfs_sync_fs(struct super_block *sb, int wait)
384{
385 int err = 0;
386
387 /* This function is called when super block should be written back */
388 if (wait)
389 err = nilfs_construct_segment(sb);
390 return err;
391}
392
393int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
394{
395 struct the_nilfs *nilfs = sbi->s_nilfs;
396 struct nilfs_checkpoint *raw_cp;
397 struct buffer_head *bh_cp;
398 int err;
399
400 down_write(&nilfs->ns_sem);
401 list_add(&sbi->s_list, &nilfs->ns_supers);
402 up_write(&nilfs->ns_sem);
403
404 sbi->s_ifile = nilfs_mdt_new(
405 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
406 if (!sbi->s_ifile)
407 return -ENOMEM;
408
409 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
410 if (unlikely(err))
411 goto failed;
412
413 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
414 &bh_cp);
415 if (unlikely(err)) {
416 if (err == -ENOENT || err == -EINVAL) {
417 printk(KERN_ERR
418 "NILFS: Invalid checkpoint "
419 "(checkpoint number=%llu)\n",
420 (unsigned long long)cno);
421 err = -EINVAL;
422 }
423 goto failed;
424 }
425 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
426 if (unlikely(err))
427 goto failed_bh;
428 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
429 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
430
431 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
432 return 0;
433
434 failed_bh:
435 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
436 failed:
437 nilfs_mdt_destroy(sbi->s_ifile);
438 sbi->s_ifile = NULL;
439
440 down_write(&nilfs->ns_sem);
441 list_del_init(&sbi->s_list);
442 up_write(&nilfs->ns_sem);
443
444 return err;
445}
446
447void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
448{
449 struct the_nilfs *nilfs = sbi->s_nilfs;
450
451 nilfs_mdt_clear(sbi->s_ifile);
452 nilfs_mdt_destroy(sbi->s_ifile);
453 sbi->s_ifile = NULL;
454 down_write(&nilfs->ns_sem);
455 list_del_init(&sbi->s_list);
456 up_write(&nilfs->ns_sem);
457}
458
459static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
460{
461 struct the_nilfs *nilfs = sbi->s_nilfs;
462 int err = 0;
463
464 down_write(&nilfs->ns_sem);
465 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
466 nilfs->ns_mount_state |= NILFS_VALID_FS;
467 err = nilfs_commit_super(sbi, 1);
468 if (likely(!err))
469 printk(KERN_INFO "NILFS: recovery complete.\n");
470 }
471 up_write(&nilfs->ns_sem);
472 return err;
473}
474
475static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
476{
477 struct super_block *sb = dentry->d_sb;
478 struct nilfs_sb_info *sbi = NILFS_SB(sb);
479 unsigned long long blocks;
480 unsigned long overhead;
481 unsigned long nrsvblocks;
482 sector_t nfreeblocks;
483 struct the_nilfs *nilfs = sbi->s_nilfs;
484 int err;
485
486 /*
487 * Compute all of the segment blocks
488 *
489 * The blocks before the first segment and after the last segment
490 * are excluded.
491 */
492 blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
493 - nilfs->ns_first_data_block;
494 nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
495
496 /*
497 * Compute the overhead
498 *
499 * When metadata blocks are distributed outside the segment
500 * structure, we must count them as overhead.
501 */
502 overhead = 0;
503
504 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
505 if (unlikely(err))
506 return err;
507
508 buf->f_type = NILFS_SUPER_MAGIC;
509 buf->f_bsize = sb->s_blocksize;
510 buf->f_blocks = blocks - overhead;
511 buf->f_bfree = nfreeblocks;
512 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
513 (buf->f_bfree - nrsvblocks) : 0;
514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN;
517 return 0;
518}
519
520static struct super_operations nilfs_sops = {
521 .alloc_inode = nilfs_alloc_inode,
522 .destroy_inode = nilfs_destroy_inode,
523 .dirty_inode = nilfs_dirty_inode,
524 /* .write_inode = nilfs_write_inode, */
525 /* .put_inode = nilfs_put_inode, */
526 /* .drop_inode = nilfs_drop_inode, */
527 .delete_inode = nilfs_delete_inode,
528 .put_super = nilfs_put_super,
529 .write_super = nilfs_write_super,
530 .sync_fs = nilfs_sync_fs,
531 /* .write_super_lockfs */
532 /* .unlockfs */
533 .statfs = nilfs_statfs,
534 .remount_fs = nilfs_remount,
535 .clear_inode = nilfs_clear_inode,
536 /* .umount_begin */
537 /* .show_options */
538};
539
540static struct inode *
541nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
542{
543 struct inode *inode;
544
545 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
546 ino != NILFS_SKETCH_INO)
547 return ERR_PTR(-ESTALE);
548
549 inode = nilfs_iget(sb, ino);
550 if (IS_ERR(inode))
551 return ERR_CAST(inode);
552 if (generation && inode->i_generation != generation) {
553 iput(inode);
554 return ERR_PTR(-ESTALE);
555 }
556
557 return inode;
558}
559
560static struct dentry *
561nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
562 int fh_type)
563{
564 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
565 nilfs_nfs_get_inode);
566}
567
568static struct dentry *
569nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct export_operations nilfs_export_ops = {
577 .fh_to_dentry = nilfs_fh_to_dentry,
578 .fh_to_parent = nilfs_fh_to_parent,
579 .get_parent = nilfs_get_parent,
580};
581
582enum {
583 Opt_err_cont, Opt_err_panic, Opt_err_ro,
584 Opt_barrier, Opt_snapshot, Opt_order,
585 Opt_err,
586};
587
588static match_table_t tokens = {
589 {Opt_err_cont, "errors=continue"},
590 {Opt_err_panic, "errors=panic"},
591 {Opt_err_ro, "errors=remount-ro"},
592 {Opt_barrier, "barrier=%s"},
593 {Opt_snapshot, "cp=%u"},
594 {Opt_order, "order=%s"},
595 {Opt_err, NULL}
596};
597
598static int match_bool(substring_t *s, int *result)
599{
600 int len = s->to - s->from;
601
602 if (strncmp(s->from, "on", len) == 0)
603 *result = 1;
604 else if (strncmp(s->from, "off", len) == 0)
605 *result = 0;
606 else
607 return 1;
608 return 0;
609}
610
611static int parse_options(char *options, struct super_block *sb)
612{
613 struct nilfs_sb_info *sbi = NILFS_SB(sb);
614 char *p;
615 substring_t args[MAX_OPT_ARGS];
616 int option;
617
618 if (!options)
619 return 1;
620
621 while ((p = strsep(&options, ",")) != NULL) {
622 int token;
623 if (!*p)
624 continue;
625
626 token = match_token(p, tokens, args);
627 switch (token) {
628 case Opt_barrier:
629 if (match_bool(&args[0], &option))
630 return 0;
631 if (option)
632 nilfs_set_opt(sbi, BARRIER);
633 else
634 nilfs_clear_opt(sbi, BARRIER);
635 break;
636 case Opt_order:
637 if (strcmp(args[0].from, "relaxed") == 0)
638 /* Ordered data semantics */
639 nilfs_clear_opt(sbi, STRICT_ORDER);
640 else if (strcmp(args[0].from, "strict") == 0)
641 /* Strict in-order semantics */
642 nilfs_set_opt(sbi, STRICT_ORDER);
643 else
644 return 0;
645 break;
646 case Opt_err_panic:
647 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
648 break;
649 case Opt_err_ro:
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
651 break;
652 case Opt_err_cont:
653 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
654 break;
655 case Opt_snapshot:
656 if (match_int(&args[0], &option) || option <= 0)
657 return 0;
658 if (!(sb->s_flags & MS_RDONLY))
659 return 0;
660 sbi->s_snapshot_cno = option;
661 nilfs_set_opt(sbi, SNAPSHOT);
662 break;
663 default:
664 printk(KERN_ERR
665 "NILFS: Unrecognized mount option \"%s\"\n", p);
666 return 0;
667 }
668 }
669 return 1;
670}
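/*
 * Example (hypothetical device and mount point): a snapshot mount such as
 * "mount -t nilfs2 -o ro,cp=17 /dev/sdb1 /mnt" reaches parse_options()
 * with the option string "cp=17"; the generic "ro" option has already been
 * turned into MS_RDONLY by the VFS, which is why the Opt_snapshot case
 * above can insist on (sb->s_flags & MS_RDONLY).
 */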
671
672static inline void
673nilfs_set_default_options(struct nilfs_sb_info *sbi,
674 struct nilfs_super_block *sbp)
675{
676 sbi->s_mount_opt =
677 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
678}
679
680static int nilfs_setup_super(struct nilfs_sb_info *sbi)
681{
682 struct the_nilfs *nilfs = sbi->s_nilfs;
683 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
684 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
685 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
686
687 /* nilfs->sem must be locked by the caller. */
688 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
689 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
690 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
691 printk(KERN_WARNING
692 "NILFS warning: mounting fs with errors\n");
693#if 0
694 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
695 printk(KERN_WARNING
696 "NILFS warning: maximal mount count reached\n");
697#endif
698 }
699 if (!max_mnt_count)
700 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
701
702 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
703 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
704 sbp->s_mtime = cpu_to_le64(get_seconds());
705 return nilfs_commit_super(sbi, 1);
706}
707
708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
709 u64 pos, int blocksize,
710 struct buffer_head **pbh)
711{
712 unsigned long long sb_index = pos;
713 unsigned long offset;
714
715 offset = do_div(sb_index, blocksize);
716 *pbh = sb_bread(sb, sb_index);
717 if (!*pbh)
718 return NULL;
719 return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
720}
721
722int nilfs_store_magic_and_option(struct super_block *sb,
723 struct nilfs_super_block *sbp,
724 char *data)
725{
726 struct nilfs_sb_info *sbi = NILFS_SB(sb);
727
728 sb->s_magic = le16_to_cpu(sbp->s_magic);
729
730 /* FS independent flags */
731#ifdef NILFS_ATIME_DISABLE
732 sb->s_flags |= MS_NOATIME;
733#endif
734
735 nilfs_set_default_options(sbi, sbp);
736
737 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
738 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
739 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
740 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
741
742 return !parse_options(data, sb) ? -EINVAL : 0 ;
743}
744
745/**
746 * nilfs_fill_super() - initialize a super block instance
747 * @sb: super_block
748 * @data: mount options
749 * @silent: silent mode flag
750 * @nilfs: the_nilfs struct
751 *
752 * This function is called exclusively with bd_mount_sem held.
753 * So, the recovery process is protected from other simultaneous mounts.
754 */
755static int
756nilfs_fill_super(struct super_block *sb, void *data, int silent,
757 struct the_nilfs *nilfs)
758{
759 struct nilfs_sb_info *sbi;
760 struct inode *root;
761 __u64 cno;
762 int err;
763
764 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
765 if (!sbi)
766 return -ENOMEM;
767
768 sb->s_fs_info = sbi;
769
770 get_nilfs(nilfs);
771 sbi->s_nilfs = nilfs;
772 sbi->s_super = sb;
773
774 err = init_nilfs(nilfs, sbi, (char *)data);
775 if (err)
776 goto failed_sbi;
777
778 spin_lock_init(&sbi->s_inode_lock);
779 INIT_LIST_HEAD(&sbi->s_dirty_files);
780 INIT_LIST_HEAD(&sbi->s_list);
781
782 /*
783 * The following initialization is redundant because the
784 * nilfs_sb_info structure has already been zeroed on allocation.
785 * We keep it anyway to make the intent explicit and to stay
786 * ready for future changes.
787 */
788 get_random_bytes(&sbi->s_next_generation,
789 sizeof(sbi->s_next_generation));
790 spin_lock_init(&sbi->s_next_gen_lock);
791
792 sb->s_op = &nilfs_sops;
793 sb->s_export_op = &nilfs_export_ops;
794 sb->s_root = NULL;
795 sb->s_time_gran = 1;
796
797 if (!nilfs_loaded(nilfs)) {
798 err = load_nilfs(nilfs, sbi);
799 if (err)
800 goto failed_sbi;
801 }
802 cno = nilfs_last_cno(nilfs);
803
804 if (sb->s_flags & MS_RDONLY) {
805 if (nilfs_test_opt(sbi, SNAPSHOT)) {
806 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
807 sbi->s_snapshot_cno);
808 if (err < 0)
809 goto failed_sbi;
810 if (!err) {
811 printk(KERN_ERR
812 "NILFS: The specified checkpoint is "
813 "not a snapshot "
814 "(checkpoint number=%llu).\n",
815 (unsigned long long)sbi->s_snapshot_cno);
816 err = -EINVAL;
817 goto failed_sbi;
818 }
819 cno = sbi->s_snapshot_cno;
820 } else
821 /* Read-only mount */
822 sbi->s_snapshot_cno = cno;
823 }
824
825 err = nilfs_attach_checkpoint(sbi, cno);
826 if (err) {
827 printk(KERN_ERR "NILFS: error loading a checkpoint"
828 " (checkpoint number=%llu).\n", (unsigned long long)cno);
829 goto failed_sbi;
830 }
831
832 if (!(sb->s_flags & MS_RDONLY)) {
833 err = nilfs_attach_segment_constructor(sbi);
834 if (err)
835 goto failed_checkpoint;
836 }
837
838 root = nilfs_iget(sb, NILFS_ROOT_INO);
839 if (IS_ERR(root)) {
840 printk(KERN_ERR "NILFS: get root inode failed\n");
841 err = PTR_ERR(root);
842 goto failed_segctor;
843 }
844 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
845 iput(root);
846 printk(KERN_ERR "NILFS: corrupt root inode.\n");
847 err = -EINVAL;
848 goto failed_segctor;
849 }
850 sb->s_root = d_alloc_root(root);
851 if (!sb->s_root) {
852 iput(root);
853 printk(KERN_ERR "NILFS: get root dentry failed\n");
854 err = -ENOMEM;
855 goto failed_segctor;
856 }
857
858 if (!(sb->s_flags & MS_RDONLY)) {
859 down_write(&nilfs->ns_sem);
860 nilfs_setup_super(sbi);
861 up_write(&nilfs->ns_sem);
862 }
863
864 err = nilfs_mark_recovery_complete(sbi);
865 if (unlikely(err)) {
866 printk(KERN_ERR "NILFS: recovery failed.\n");
867 goto failed_root;
868 }
869
870 return 0;
871
872 failed_root:
873 dput(sb->s_root);
874 sb->s_root = NULL;
875
876 failed_segctor:
877 nilfs_detach_segment_constructor(sbi);
878
879 failed_checkpoint:
880 nilfs_detach_checkpoint(sbi);
881
882 failed_sbi:
883 put_nilfs(nilfs);
884 sb->s_fs_info = NULL;
885 kfree(sbi);
886 return err;
887}
888
889static int nilfs_remount(struct super_block *sb, int *flags, char *data)
890{
891 struct nilfs_sb_info *sbi = NILFS_SB(sb);
892 struct nilfs_super_block *sbp;
893 struct the_nilfs *nilfs = sbi->s_nilfs;
894 unsigned long old_sb_flags;
895 struct nilfs_mount_options old_opts;
896 int err;
897
898 old_sb_flags = sb->s_flags;
899 old_opts.mount_opt = sbi->s_mount_opt;
900 old_opts.snapshot_cno = sbi->s_snapshot_cno;
901
902 if (!parse_options(data, sb)) {
903 err = -EINVAL;
904 goto restore_opts;
905 }
906 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
907
908 if ((*flags & MS_RDONLY) &&
909 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
910 printk(KERN_WARNING "NILFS (device %s): couldn't "
911 "remount to a different snapshot. \n",
912 sb->s_id);
913 err = -EINVAL;
914 goto restore_opts;
915 }
916
917 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
918 goto out;
919 if (*flags & MS_RDONLY) {
920 /* Shutting down the segment constructor */
921 nilfs_detach_segment_constructor(sbi);
922 sb->s_flags |= MS_RDONLY;
923
924 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
925 /* nilfs_set_opt(sbi, SNAPSHOT); */
926
927 /*
928 * Remounting a valid RW partition RDONLY, so set
929 * the RDONLY flag and then mark the partition as valid again.
930 */
931 down_write(&nilfs->ns_sem);
932 sbp = nilfs->ns_sbp[0];
933 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
934 (nilfs->ns_mount_state & NILFS_VALID_FS))
935 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
936 sbp->s_mtime = cpu_to_le64(get_seconds());
937 nilfs_commit_super(sbi, 1);
938 up_write(&nilfs->ns_sem);
939 } else {
940 /*
941 * Mounting a RDONLY partition read-write, so reread and
942 * store the current valid flag. (It may have been changed
943 * by fsck since we originally mounted the partition.)
944 */
945 down(&sb->s_bdev->bd_mount_sem);
946 /* Check existing RW-mount */
947 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
948 printk(KERN_WARNING "NILFS (device %s): couldn't "
949 "remount because a RW-mount exists.\n",
950 sb->s_id);
951 err = -EBUSY;
952 goto rw_remount_failed;
953 }
954 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
955 printk(KERN_WARNING "NILFS (device %s): couldn't "
956 "remount because the current RO-mount is not "
957 "the latest one.\n",
958 sb->s_id);
959 err = -EINVAL;
960 goto rw_remount_failed;
961 }
962 sb->s_flags &= ~MS_RDONLY;
963 nilfs_clear_opt(sbi, SNAPSHOT);
964 sbi->s_snapshot_cno = 0;
965
966 err = nilfs_attach_segment_constructor(sbi);
967 if (err)
968 goto rw_remount_failed;
969
970 down_write(&nilfs->ns_sem);
971 nilfs_setup_super(sbi);
972 up_write(&nilfs->ns_sem);
973
974 up(&sb->s_bdev->bd_mount_sem);
975 }
976 out:
977 return 0;
978
979 rw_remount_failed:
980 up(&sb->s_bdev->bd_mount_sem);
981 restore_opts:
982 sb->s_flags = old_sb_flags;
983 sbi->s_mount_opt = old_opts.mount_opt;
984 sbi->s_snapshot_cno = old_opts.snapshot_cno;
985 return err;
986}
987
988struct nilfs_super_data {
989 struct block_device *bdev;
990 __u64 cno;
991 int flags;
992};
993
994/**
995 * nilfs_identify - pre-read mount options needed to identify mount instance
996 * @data: mount options
997 * @sd: nilfs_super_data
998 */
999static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1000{
1001 char *p, *options = data;
1002 substring_t args[MAX_OPT_ARGS];
1003 int option, token;
1004 int ret = 0;
1005
1006 do {
1007 p = strsep(&options, ",");
1008 if (p != NULL && *p) {
1009 token = match_token(p, tokens, args);
1010 if (token == Opt_snapshot) {
1011 if (!(sd->flags & MS_RDONLY))
1012 ret++;
1013 else {
1014 ret = match_int(&args[0], &option);
1015 if (!ret) {
1016 if (option > 0)
1017 sd->cno = option;
1018 else
1019 ret++;
1020 }
1021 }
1022 }
1023 if (ret)
1024 printk(KERN_ERR
1025 "NILFS: invalid mount option: %s\n", p);
1026 }
1027 if (!options)
1028 break;
1029 BUG_ON(options == data);
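		/* strsep() replaced this ',' with a '\0'; restore it so that
		   parse_options() can re-parse the whole string later */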
1030 *(options - 1) = ',';
1031 } while (!ret);
1032 return ret;
1033}
1034
1035static int nilfs_set_bdev_super(struct super_block *s, void *data)
1036{
1037 struct nilfs_super_data *sd = data;
1038
1039 s->s_bdev = sd->bdev;
1040 s->s_dev = s->s_bdev->bd_dev;
1041 return 0;
1042}
1043
1044static int nilfs_test_bdev_super(struct super_block *s, void *data)
1045{
1046 struct nilfs_super_data *sd = data;
1047
1048 return s->s_bdev == sd->bdev;
1049}
1050
1051static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1052{
1053 struct nilfs_super_data *sd = data;
1054 int ret;
1055
1056 if (s->s_bdev != sd->bdev)
1057 return 0;
1058
1059 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1060 return 1; /* Reuse an old R/W-mode super_block */
1061
1062 if (s->s_flags & sd->flags & MS_RDONLY) {
1063 if (down_read_trylock(&s->s_umount)) {
1064 ret = s->s_root &&
1065 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1066 up_read(&s->s_umount);
1067 /*
1068 * This path is locked with sb_lock by sget().
1069 * So, drop_super() causes deadlock.
1070 */
1071 return ret;
1072 }
1073 }
1074 return 0;
1075}
1076
1077static int
1078nilfs_get_sb(struct file_system_type *fs_type, int flags,
1079 const char *dev_name, void *data, struct vfsmount *mnt)
1080{
1081 struct nilfs_super_data sd;
1082 struct super_block *s, *s2;
1083 struct the_nilfs *nilfs = NULL;
1084 int err, need_to_close = 1;
1085
1086 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
1087 if (IS_ERR(sd.bdev))
1088 return PTR_ERR(sd.bdev);
1089
1090 /*
1091 * To look up a mount instance with the sget() VFS routine, NILFS
1092 * needs more information than normal filesystems to identify the
1093 * instance: for snapshot mounts, not only the mount type (ro-mount
1094 * or rw-mount) but also a checkpoint number is required.
1095 * These values are passed to sget() through a nilfs_super_data struct.
1096 */
1097 sd.cno = 0;
1098 sd.flags = flags;
1099 if (nilfs_identify((char *)data, &sd)) {
1100 err = -EINVAL;
1101 goto failed;
1102 }
1103
1104 /*
1105 * once the super is inserted into the list by sget, s_umount
1106 * will protect the lockfs code from trying to start a snapshot
1107 * while we are mounting
1108 */
1109 down(&sd.bdev->bd_mount_sem);
1110 if (!sd.cno &&
1111 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1112 err = (err < 0) ? : -EBUSY;
1113 goto failed_unlock;
1114 }
1115
1116 /*
1117 * Phase-1: search any existent instance and get the_nilfs
1118 */
1119 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1120 if (IS_ERR(s))
1121 goto error_s;
1122
1123 if (!s->s_root) {
1124 err = -ENOMEM;
1125 nilfs = alloc_nilfs(sd.bdev);
1126 if (!nilfs)
1127 goto cancel_new;
1128 } else {
1129 struct nilfs_sb_info *sbi = NILFS_SB(s);
1130
1131 /*
1132 * s_umount protects super_block from unmount process;
1133 * It covers pointers of nilfs_sb_info and the_nilfs.
1134 */
1135 nilfs = sbi->s_nilfs;
1136 get_nilfs(nilfs);
1137 up_write(&s->s_umount);
1138
1139 /*
1140 * Phase-2: search specified snapshot or R/W mode super_block
1141 */
1142 if (!sd.cno)
1143 /* trying to get the latest checkpoint. */
1144 sd.cno = nilfs_last_cno(nilfs);
1145
1146 s2 = sget(fs_type, nilfs_test_bdev_super2,
1147 nilfs_set_bdev_super, &sd);
1148 deactivate_super(s);
1149 /*
1150 * deactivate_super() invokes close_bdev_exclusive() only via
1151 * kill_block_super(). Here, s is an existent mount, so we still
1152 * need one more close_bdev_exclusive() call.
1153 */
1154 s = s2;
1155 if (IS_ERR(s))
1156 goto error_s;
1157 }
1158
1159 if (!s->s_root) {
1160 char b[BDEVNAME_SIZE];
1161
1162 s->s_flags = flags;
1163 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1164 sb_set_blocksize(s, block_size(sd.bdev));
1165
1166 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
1167 if (err)
1168 goto cancel_new;
1169
1170 s->s_flags |= MS_ACTIVE;
1171 need_to_close = 0;
1172 } else if (!(s->s_flags & MS_RDONLY)) {
1173 err = -EBUSY;
1174 }
1175
1176 up(&sd.bdev->bd_mount_sem);
1177 put_nilfs(nilfs);
1178 if (need_to_close)
1179 close_bdev_exclusive(sd.bdev, flags);
1180 simple_set_mnt(mnt, s);
1181 return 0;
1182
1183 error_s:
1184 up(&sd.bdev->bd_mount_sem);
1185 if (nilfs)
1186 put_nilfs(nilfs);
1187 close_bdev_exclusive(sd.bdev, flags);
1188 return PTR_ERR(s);
1189
1190 failed_unlock:
1191 up(&sd.bdev->bd_mount_sem);
1192 failed:
1193 close_bdev_exclusive(sd.bdev, flags);
1194
1195 return err;
1196
1197 cancel_new:
1198 /* Abandoning the newly allocated superblock */
1199 up(&sd.bdev->bd_mount_sem);
1200 if (nilfs)
1201 put_nilfs(nilfs);
1202 up_write(&s->s_umount);
1203 deactivate_super(s);
1204 /*
1205 * deactivate_super() invokes close_bdev_exclusive().
1206 * We must finish all post-cleaning before this call;
1207 * put_nilfs() and unlocking bd_mount_sem need the block device.
1208 */
1209 return err;
1210}
1211
1212static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1213{
1214 struct nilfs_super_data *sd = data;
1215 int ret;
1216
1217 if (s->s_bdev != sd->bdev)
1218 return 0;
1219 if (down_read_trylock(&s->s_umount)) {
1220 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1221 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1222 up_read(&s->s_umount);
1223 if (ret)
1224 return 0; /* ignore snapshot mounts */
1225 }
1226 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1227}
1228
1229static int __false_bdev_super(struct super_block *s, void *data)
1230{
1231#if 0 /* XXX: workaround for lock debug. This is not good idea */
1232 up_write(&s->s_umount);
1233#endif
1234 return -EFAULT;
1235}
1236
1237/**
1238 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not
1239 * @fs_type: filesystem type
1240 * @bdev: block device
1241 * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1242 * Returns 1 if such a mount exists, 0 if not, or a negative error code.
1243 *
1244 * This function must be called within a section protected by bd_mount_sem.
1245 */
1246static int test_exclusive_mount(struct file_system_type *fs_type,
1247 struct block_device *bdev, int flags)
1248{
1249 struct super_block *s;
1250 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1251
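	/* __false_bdev_super() makes sget() fail with -EFAULT instead of
	   setting up a new superblock, so this sget() call acts as a pure
	   lookup of an existing mount. */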
1252 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1253 if (IS_ERR(s)) {
1254 if (PTR_ERR(s) != -EFAULT)
1255 return PTR_ERR(s);
1256 return 0; /* Not found */
1257 }
1258 up_write(&s->s_umount);
1259 deactivate_super(s);
1260 return 1; /* Found */
1261}
1262
1263struct file_system_type nilfs_fs_type = {
1264 .owner = THIS_MODULE,
1265 .name = "nilfs2",
1266 .get_sb = nilfs_get_sb,
1267 .kill_sb = kill_block_super,
1268 .fs_flags = FS_REQUIRES_DEV,
1269};
1270
1271static int __init init_nilfs_fs(void)
1272{
1273 int err;
1274
1275 err = nilfs_init_inode_cache();
1276 if (err)
1277 goto failed;
1278
1279 err = nilfs_init_transaction_cache();
1280 if (err)
1281 goto failed_inode_cache;
1282
1283 err = nilfs_init_segbuf_cache();
1284 if (err)
1285 goto failed_transaction_cache;
1286
1287 err = nilfs_btree_path_cache_init();
1288 if (err)
1289 goto failed_segbuf_cache;
1290
1291 err = register_filesystem(&nilfs_fs_type);
1292 if (err)
1293 goto failed_btree_path_cache;
1294
1295 return 0;
1296
1297 failed_btree_path_cache:
1298 nilfs_btree_path_cache_destroy();
1299
1300 failed_segbuf_cache:
1301 nilfs_destroy_segbuf_cache();
1302
1303 failed_transaction_cache:
1304 nilfs_destroy_transaction_cache();
1305
1306 failed_inode_cache:
1307 nilfs_destroy_inode_cache();
1308
1309 failed:
1310 return err;
1311}
1312
1313static void __exit exit_nilfs_fs(void)
1314{
1315 nilfs_destroy_segbuf_cache();
1316 nilfs_destroy_transaction_cache();
1317 nilfs_destroy_inode_cache();
1318 nilfs_btree_path_cache_destroy();
1319 unregister_filesystem(&nilfs_fs_type);
1320}
1321
1322module_init(init_nilfs_fs)
1323module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000000..33400cf0bbe2
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,637 @@
1/*
2 * the_nilfs.c - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/slab.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/crc32.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "alloc.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h"
37
38void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno)
40{
41 spin_lock(&nilfs->ns_last_segment_lock);
42 nilfs->ns_last_pseg = start_blocknr;
43 nilfs->ns_last_seq = seq;
44 nilfs->ns_last_cno = cno;
45 spin_unlock(&nilfs->ns_last_segment_lock);
46}
47
48/**
49 * alloc_nilfs - allocate the_nilfs structure
50 * @bdev: block device to which the_nilfs is related
51 *
52 * alloc_nilfs() allocates memory for the_nilfs and
53 * initializes its reference count and locks.
54 *
55 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned.
57 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{
60 struct the_nilfs *nilfs;
61
62 nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
63 if (!nilfs)
64 return NULL;
65
66 nilfs->ns_bdev = bdev;
67 atomic_set(&nilfs->ns_count, 1);
68 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem);
71 mutex_init(&nilfs->ns_writer_mutex);
72 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL;
75 init_rwsem(&nilfs->ns_segctor_sem);
76
77 return nilfs;
78}
79
80/**
81 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released
83 *
84 * put_nilfs() decrements a reference counter of the_nilfs.
85 * If the reference count reaches zero, the_nilfs is freed.
86 */
87void put_nilfs(struct the_nilfs *nilfs)
88{
89 if (!atomic_dec_and_test(&nilfs->ns_count))
90 return;
91 /*
92 * Increment of ns_count never occurs below because the caller
93 * of get_nilfs() holds at least one reference to the_nilfs.
94 * Thus no exclusion control is required here.
95 */
96 might_sleep();
97 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile);
99 nilfs_mdt_destroy(nilfs->ns_sufile);
100 nilfs_mdt_clear(nilfs->ns_cpfile);
101 nilfs_mdt_destroy(nilfs->ns_cpfile);
102 nilfs_mdt_clear(nilfs->ns_dat);
103 nilfs_mdt_destroy(nilfs->ns_dat);
104 /* XXX: how and when to clear nilfs->ns_gc_dat? */
105 nilfs_mdt_destroy(nilfs->ns_gc_dat);
106 }
107 if (nilfs_init(nilfs)) {
108 nilfs_destroy_gccache(nilfs);
109 brelse(nilfs->ns_sbh[0]);
110 brelse(nilfs->ns_sbh[1]);
111 }
112 kfree(nilfs);
113}
114
115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{
118 struct buffer_head *bh_sr;
119 struct nilfs_super_root *raw_sr;
120 struct nilfs_super_block **sbp = nilfs->ns_sbp;
121 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
122 unsigned inode_size;
123 int err;
124
125 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
126 if (unlikely(err))
127 return err;
128
129 down_read(&nilfs->ns_sem);
130 dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
131 checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
132 segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
133 up_read(&nilfs->ns_sem);
134
135 inode_size = nilfs->ns_inode_size;
136
137 err = -ENOMEM;
138 nilfs->ns_dat = nilfs_mdt_new(
139 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
140 if (unlikely(!nilfs->ns_dat))
141 goto failed;
142
143 nilfs->ns_gc_dat = nilfs_mdt_new(
144 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
145 if (unlikely(!nilfs->ns_gc_dat))
146 goto failed_dat;
147
148 nilfs->ns_cpfile = nilfs_mdt_new(
149 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
150 if (unlikely(!nilfs->ns_cpfile))
151 goto failed_gc_dat;
152
153 nilfs->ns_sufile = nilfs_mdt_new(
154 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
155 if (unlikely(!nilfs->ns_sufile))
156 goto failed_cpfile;
157
158 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
159 if (unlikely(err))
160 goto failed_sufile;
161
162 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
163 if (unlikely(err))
164 goto failed_sufile;
165
166 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
167 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
168 sizeof(struct nilfs_cpfile_header));
169 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
170 sizeof(struct nilfs_sufile_header));
171
172 err = nilfs_mdt_read_inode_direct(
173 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
174 if (unlikely(err))
175 goto failed_sufile;
176
177 err = nilfs_mdt_read_inode_direct(
178 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
179 if (unlikely(err))
180 goto failed_sufile;
181
182 err = nilfs_mdt_read_inode_direct(
183 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
184 if (unlikely(err))
185 goto failed_sufile;
186
187 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
188 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
189
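	/* Note: the success path also exits through the label below,
	   with err == 0 from the last read above. */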
190 failed:
191 brelse(bh_sr);
192 return err;
193
194 failed_sufile:
195 nilfs_mdt_destroy(nilfs->ns_sufile);
196
197 failed_cpfile:
198 nilfs_mdt_destroy(nilfs->ns_cpfile);
199
200 failed_gc_dat:
201 nilfs_mdt_destroy(nilfs->ns_gc_dat);
202
203 failed_dat:
204 nilfs_mdt_destroy(nilfs->ns_dat);
205 goto failed;
206}
207
208static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
209{
210 memset(ri, 0, sizeof(*ri));
211 INIT_LIST_HEAD(&ri->ri_used_segments);
212}
213
214static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
215{
216 nilfs_dispose_segment_list(&ri->ri_used_segments);
217}
218
219/**
220 * load_nilfs - load and recover the nilfs
221 * @nilfs: the_nilfs structure to be loaded
222 * @sbi: nilfs_sb_info used to recover past segments
223 *
224 * load_nilfs() searches for and loads the latest super root,
225 * attaches the last segment, and does recovery if needed.
226 * The caller must serialize calls for simultaneous mounts.
227 */
228int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
229{
230 struct nilfs_recovery_info ri;
231 unsigned int s_flags = sbi->s_super->s_flags;
232 int really_read_only = bdev_read_only(nilfs->ns_bdev);
233 unsigned valid_fs;
234 int err = 0;
235
236 nilfs_init_recovery_info(&ri);
237
238 down_write(&nilfs->ns_sem);
239 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
240 up_write(&nilfs->ns_sem);
241
242 if (!valid_fs && (s_flags & MS_RDONLY)) {
243 printk(KERN_INFO "NILFS: INFO: recovery "
244 "required for readonly filesystem.\n");
245 if (really_read_only) {
246 printk(KERN_ERR "NILFS: write access "
247 "unavailable, cannot proceed.\n");
248 err = -EROFS;
249 goto failed;
250 }
251 printk(KERN_INFO "NILFS: write access will "
252 "be enabled during recovery.\n");
253 sbi->s_super->s_flags &= ~MS_RDONLY;
254 }
255
256 err = nilfs_search_super_root(nilfs, sbi, &ri);
257 if (unlikely(err)) {
258 printk(KERN_ERR "NILFS: error searching super root.\n");
259 goto failed;
260 }
261
262 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
263 if (unlikely(err)) {
264 printk(KERN_ERR "NILFS: error loading super root.\n");
265 goto failed;
266 }
267
268 if (!valid_fs) {
269 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
270 if (unlikely(err)) {
271 nilfs_mdt_destroy(nilfs->ns_cpfile);
272 nilfs_mdt_destroy(nilfs->ns_sufile);
273 nilfs_mdt_destroy(nilfs->ns_dat);
274 goto failed;
275 }
276 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
277 sbi->s_super->s_dirt = 1;
278 }
279
280 set_nilfs_loaded(nilfs);
281
282 failed:
283 nilfs_clear_recovery_info(&ri);
284 sbi->s_super->s_flags = s_flags;
285 return err;
286}
287
288static unsigned long long nilfs_max_size(unsigned int blkbits)
289{
290 unsigned int max_bits;
291 unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
292
293 max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
294 if (max_bits < 64)
295 res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
296 return res;
297}
298
299static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
300 struct nilfs_super_block *sbp)
301{
302 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
303 printk(KERN_ERR "NILFS: revision mismatch "
304 "(superblock rev.=%d.%d, current rev.=%d.%d). "
305 "Please check the version of mkfs.nilfs.\n",
306 le32_to_cpu(sbp->s_rev_level),
307 le16_to_cpu(sbp->s_minor_rev_level),
308 NILFS_CURRENT_REV, NILFS_MINOR_REV);
309 return -EINVAL;
310 }
311 nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
312 if (nilfs->ns_sbsize > BLOCK_SIZE)
313 return -EINVAL;
314
315 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
316 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
317
318 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
319 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
320 printk(KERN_ERR "NILFS: too short segment.\n");
321 return -EINVAL;
322 }
323
324 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
325 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
326 nilfs->ns_r_segments_percentage =
327 le32_to_cpu(sbp->s_r_segments_percentage);
328 nilfs->ns_nrsvsegs =
329 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
330 DIV_ROUND_UP(nilfs->ns_nsegments *
331 nilfs->ns_r_segments_percentage, 100));
332 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
333 return 0;
334}
335
336static int nilfs_valid_sb(struct nilfs_super_block *sbp)
337{
338 static unsigned char sum[4];
339 const int sumoff = offsetof(struct nilfs_super_block, s_sum);
340 size_t bytes;
341 u32 crc;
342
343 if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
344 return 0;
345 bytes = le16_to_cpu(sbp->s_bytes);
346 if (bytes > BLOCK_SIZE)
347 return 0;
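	/* Compute the CRC with the stored s_sum field treated as zero:
	   the bytes before s_sum, then four zero bytes (sum[]), then the
	   remainder; this mirrors nilfs_commit_super(), which zeroes s_sum
	   before checksumming. */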
348 crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
349 sumoff);
350 crc = crc32_le(crc, sum, 4);
351 crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
352 bytes - sumoff - 4);
353 return crc == le32_to_cpu(sbp->s_sum);
354}
355
356static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
357{
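	/* An offset that falls inside the area covered by segments cannot
	   hold the secondary superblock, so treat it as bad. */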
358 return offset < ((le64_to_cpu(sbp->s_nsegments) *
359 le32_to_cpu(sbp->s_blocks_per_segment)) <<
360 (le32_to_cpu(sbp->s_log_block_size) + 10));
361}
362
363static void nilfs_release_super_block(struct the_nilfs *nilfs)
364{
365 int i;
366
367 for (i = 0; i < 2; i++) {
368 if (nilfs->ns_sbp[i]) {
369 brelse(nilfs->ns_sbh[i]);
370 nilfs->ns_sbh[i] = NULL;
371 nilfs->ns_sbp[i] = NULL;
372 }
373 }
374}
375
376void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
377{
378 brelse(nilfs->ns_sbh[0]);
379 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
380 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
381 nilfs->ns_sbh[1] = NULL;
382 nilfs->ns_sbp[1] = NULL;
383}
384
385void nilfs_swap_super_block(struct the_nilfs *nilfs)
386{
387 struct buffer_head *tsbh = nilfs->ns_sbh[0];
388 struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
389
390 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
391 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
392 nilfs->ns_sbh[1] = tsbh;
393 nilfs->ns_sbp[1] = tsbp;
394}
395
396static int nilfs_load_super_block(struct the_nilfs *nilfs,
397 struct super_block *sb, int blocksize,
398 struct nilfs_super_block **sbpp)
399{
400 struct nilfs_super_block **sbp = nilfs->ns_sbp;
401 struct buffer_head **sbh = nilfs->ns_sbh;
402 u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
403 int valid[2], swp = 0;
404
405 sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
406 &sbh[0]);
407 sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
408
409 if (!sbp[0]) {
410 if (!sbp[1]) {
411 printk(KERN_ERR "NILFS: unable to read superblock\n");
412 return -EIO;
413 }
414 printk(KERN_WARNING
415 "NILFS warning: unable to read primary superblock\n");
416 } else if (!sbp[1])
417 printk(KERN_WARNING
418 "NILFS warning: unable to read secondary superblock\n");
419
420 valid[0] = nilfs_valid_sb(sbp[0]);
421 valid[1] = nilfs_valid_sb(sbp[1]);
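	/* Prefer the spare superblock when the primary one is invalid or
	   the spare one carries a newer write time. */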
422 swp = valid[1] &&
423 (!valid[0] ||
424 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
425
426 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
427 brelse(sbh[1]);
428 sbh[1] = NULL;
429 sbp[1] = NULL;
430 swp = 0;
431 }
432 if (!valid[swp]) {
433 nilfs_release_super_block(nilfs);
434 printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
435 sb->s_id);
436 return -EINVAL;
437 }
438
439 if (swp) {
440 printk(KERN_WARNING "NILFS warning: broken superblock. "
441 "using spare superblock.\n");
442 nilfs_swap_super_block(nilfs);
443 }
444
445 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
446 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
447 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
448 *sbpp = sbp[0];
449 return 0;
450}
451
452/**
453 * init_nilfs - initialize a NILFS instance.
454 * @nilfs: the_nilfs structure
455 * @sbi: nilfs_sb_info
456 * @sb: super block
457 * @data: mount options
458 *
459 * init_nilfs() performs common initialization per block device (e.g.
460 * reading the super block, getting disk layout information, initializing
461 * shared fields in the_nilfs). It takes over part of the jobs
462 * typically done by a fill_super() routine. This division exists
463 * because multiple NILFS instances may be mounted on a device
464 * simultaneously.
465 * For multiple mounts on the same device, only the first mount
466 * invokes these tasks.
467 *
468 * Return Value: On success, 0 is returned. On error, a negative error
469 * code is returned.
470 */
471int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
472{
473 struct super_block *sb = sbi->s_super;
474 struct nilfs_super_block *sbp;
475 struct backing_dev_info *bdi;
476 int blocksize;
477 int err;
478
479 down_write(&nilfs->ns_sem);
480 if (nilfs_init(nilfs)) {
481 /* Load values from existing the_nilfs */
482 sbp = nilfs->ns_sbp[0];
483 err = nilfs_store_magic_and_option(sb, sbp, data);
484 if (err)
485 goto out;
486
487 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
488 if (sb->s_blocksize != blocksize &&
489 !sb_set_blocksize(sb, blocksize)) {
490 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
491 blocksize);
492 err = -EINVAL;
493 }
494 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
495 goto out;
496 }
497
498 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
499 if (!blocksize) {
500 printk(KERN_ERR "NILFS: unable to set blocksize\n");
501 err = -EINVAL;
502 goto out;
503 }
504 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
505 if (err)
506 goto out;
507
508 err = nilfs_store_magic_and_option(sb, sbp, data);
509 if (err)
510 goto failed_sbh;
511
512 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
513 if (sb->s_blocksize != blocksize) {
514 int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
515
516 if (blocksize < hw_blocksize) {
517 printk(KERN_ERR
518 "NILFS: blocksize %d too small for device "
519 "(sector-size = %d).\n",
520 blocksize, hw_blocksize);
521 err = -EINVAL;
522 goto failed_sbh;
523 }
524 nilfs_release_super_block(nilfs);
525 sb_set_blocksize(sb, blocksize);
526
527 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
528 if (err)
529 goto out;
530 /* not failed_sbh; sbh is released automatically
531 when reloading fails. */
532 }
533 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
534
535 err = nilfs_store_disk_layout(nilfs, sbp);
536 if (err)
537 goto failed_sbh;
538
539 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
540
541 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
542
543 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
544 if (!bdi)
545 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
546 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
547
548 /* Finding last segment */
549 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
550 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
551 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
552
553 nilfs->ns_seg_seq = nilfs->ns_last_seq;
554 nilfs->ns_segnum =
555 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
556 nilfs->ns_cno = nilfs->ns_last_cno + 1;
557 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
558 printk(KERN_ERR "NILFS: invalid last segment number.\n");
559 err = -EINVAL;
560 goto failed_sbh;
561 }
562 /* Dummy values */
563 nilfs->ns_free_segments_count =
564 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
565
566 /* Initialize gcinode cache */
567 err = nilfs_init_gccache(nilfs);
568 if (err)
569 goto failed_sbh;
570
571 set_nilfs_init(nilfs);
572 err = 0;
573 out:
574 up_write(&nilfs->ns_sem);
575 return err;
576
577 failed_sbh:
578 nilfs_release_super_block(nilfs);
579 goto out;
580}
581
582int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
583{
584 struct inode *dat = nilfs_dat_inode(nilfs);
585 unsigned long ncleansegs;
586 int err;
587
588 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
589 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
590 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
591 if (likely(!err))
592 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
593 return err;
594}
595
596int nilfs_near_disk_full(struct the_nilfs *nilfs)
597{
598 struct inode *sufile = nilfs->ns_sufile;
599 unsigned long ncleansegs, nincsegs;
600 int ret;
601
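	/*
	 * Roughly estimate how many extra segments the dirty blocks will
	 * consume, and report "near full" when the clean segments cannot
	 * cover both that estimate and the reserved segments.
	 */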
602 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
603 if (likely(!ret)) {
604 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
605 nilfs->ns_blocks_per_segment + 1;
606 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
607 ret++;
608 }
609 return ret;
610}
611
612int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
613 int snapshot_mount)
614{
615 struct nilfs_sb_info *sbi;
616 int ret = 0;
617
618 down_read(&nilfs->ns_sem);
619 if (cno == 0 || cno > nilfs->ns_cno)
620 goto out_unlock;
621
622 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
623 if (sbi->s_snapshot_cno == cno &&
624 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
625 /* exclude read-only mounts */
626 ret++;
627 break;
628 }
629 }
630 /* for protecting recent checkpoints */
631 if (cno >= nilfs_last_cno(nilfs))
632 ret++;
633
634 out_unlock:
635 up_read(&nilfs->ns_sem);
636 return ret;
637}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000000..30fe58778d05
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
1/*
2 * the_nilfs.h - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _THE_NILFS_H
25#define _THE_NILFS_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/blkdev.h>
31#include <linux/backing-dev.h>
32#include "sb.h"
33
34/* the_nilfs struct */
35enum {
36 THE_NILFS_INIT = 0, /* Information from super_block is set */
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has been done and
38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40};
41
42/**
43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags
45 * @ns_count: reference count
46 * @ns_bdev: block device
47 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer
52 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks
55 * @ns_sbsize: size of valid data in super block
56 * @ns_supers: list of nilfs super block structs
57 * @ns_seg_seq: segment sequence counter
58 * @ns_segnum: index number of the latest full segment.
59 * @ns_nextnum: index number of the full segment index to be used next
60 * @ns_pseg_offset: offset of next partial segment in the current full segment
61 * @ns_cno: next checkpoint number
62 * @ns_ctime: write time of the last segment
63 * @ns_nongc_ctime: write time of the last segment not for cleaner operation
64 * @ns_ndirtyblks: Number of dirty data blocks
65 * @ns_last_segment_lock: lock protecting fields for the latest segment
66 * @ns_last_pseg: start block number of the latest segment
67 * @ns_last_seq: sequence value of the latest segment
68 * @ns_last_cno: checkpoint number of the latest segment
69 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
70 * @ns_free_segments_count: counter of free segments
71 * @ns_segctor_sem: segment constructor semaphore
72 * @ns_dat: DAT file inode
73 * @ns_cpfile: checkpoint file inode
74 * @ns_sufile: segusage file inode
75 * @ns_gc_dat: shadow inode of the DAT file inode for GC
76 * @ns_gc_inodes: dummy inodes to keep live blocks
77 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
78 * @ns_blocksize_bits: bit length of block size
79 * @ns_nsegments: number of segments in filesystem
80 * @ns_blocks_per_segment: number of blocks per segment
81 * @ns_r_segments_percentage: reserved segments percentage
82 * @ns_nrsvsegs: number of reserved segments
83 * @ns_first_data_block: block number of first data block
84 * @ns_inode_size: size of on-disk inode
85 * @ns_first_ino: first not-special inode number
86 * @ns_crc_seed: seed value of CRC32 calculation
87 */
88struct the_nilfs {
89 unsigned long ns_flags;
90 atomic_t ns_count;
91
92 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem;
96 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount;
98
99 /*
100 * used for
101 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment.
103 * - protecting s_dirt in the super_block struct
104 * (see nilfs_write_super) and the following fields.
105 */
106 struct buffer_head *ns_sbh[2];
107 struct nilfs_super_block *ns_sbp[2];
108 time_t ns_sbwtime[2];
109 unsigned ns_sbsize;
110 unsigned ns_mount_state;
111 struct list_head ns_supers;
112
113 /*
114 * The following fields are dedicated to a writable FS-instance.
115 * Except for the period of seeking a checkpoint, code outside the
116 * segment constructor must lock the segment semaphore while
117 * accessing these fields.
118 * Only one writable FS-instance exists during the lifetime of the_nilfs.
119 */
120 u64 ns_seg_seq;
121 __u64 ns_segnum;
122 __u64 ns_nextnum;
123 unsigned long ns_pseg_offset;
124 __u64 ns_cno;
125 time_t ns_ctime;
126 time_t ns_nongc_ctime;
127 atomic_t ns_ndirtyblks;
128
129 /*
130 * The following fields hold information on the latest partial segment
131 * written to disk with a super root. These fields are protected by
132 * ns_last_segment_lock.
133 */
134 spinlock_t ns_last_segment_lock;
135 sector_t ns_last_pseg;
136 u64 ns_last_seq;
137 __u64 ns_last_cno;
138 u64 ns_prot_seq;
139 unsigned long ns_free_segments_count;
140
141 struct rw_semaphore ns_segctor_sem;
142
143 /*
144 * The following fields are lock-free, except during the period
145 * before the_nilfs is initialized.
146 */
147 struct inode *ns_dat;
148 struct inode *ns_cpfile;
149 struct inode *ns_sufile;
150 struct inode *ns_gc_dat;
151
152 /* GC inode list and hash table head */
153 struct list_head ns_gc_inodes;
154 struct hlist_head *ns_gc_inodes_h;
155
156 /* Disk layout information (static) */
157 unsigned int ns_blocksize_bits;
158 unsigned long ns_nsegments;
159 unsigned long ns_blocks_per_segment;
160 unsigned long ns_r_segments_percentage;
161 unsigned long ns_nrsvsegs;
162 unsigned long ns_first_data_block;
163 int ns_inode_size;
164 int ns_first_ino;
165 u32 ns_crc_seed;
166};
167
168#define NILFS_GCINODE_HASH_BITS 8
169#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
170
171#define THE_NILFS_FNS(bit, name) \
172static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
173{ \
174 set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
175} \
176static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \
177{ \
178 clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
179} \
180static inline int nilfs_##name(struct the_nilfs *nilfs) \
181{ \
182 return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
183}
184
185THE_NILFS_FNS(INIT, init)
186THE_NILFS_FNS(LOADED, loaded)
187THE_NILFS_FNS(DISCONTINUED, discontinued)
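/*
 * For instance, THE_NILFS_FNS(INIT, init) above expands to:
 *
 *	static inline void set_nilfs_init(struct the_nilfs *nilfs)
 *	{
 *		set_bit(THE_NILFS_INIT, &(nilfs)->ns_flags);
 *	}
 *
 * plus the matching clear_nilfs_init() and nilfs_init() test helpers.
 */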
188
189/* Minimum interval of periodical update of superblocks (in seconds) */
190#define NILFS_SB_FREQ 10
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *);
202void nilfs_swap_super_block(struct the_nilfs *);
203
204
205static inline void get_nilfs(struct the_nilfs *nilfs)
206{
207 /* Caller must have at least one reference of the_nilfs. */
208 atomic_inc(&nilfs->ns_count);
209}
210
211static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
212{
213 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
214 mutex_lock(&nilfs->ns_writer_mutex);
215 return nilfs->ns_writer;
216}
217
218static inline void nilfs_put_writer(struct the_nilfs *nilfs)
219{
220 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
221 mutex_unlock(&nilfs->ns_writer_mutex);
222}
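/*
 * Note on the two helpers above: ns_writer_refcount starts at -1 (see
 * alloc_nilfs()), so the first nilfs_get_writer() raises it to 0, makes
 * atomic_inc_and_test() return true, and takes ns_writer_mutex; the last
 * nilfs_put_writer() drops it back to -1 via atomic_add_negative() and
 * releases the mutex. The mutex is therefore held for as long as at
 * least one referrer is using ns_writer.
 */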
223
224static inline void
225nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
226{
227 mutex_lock(&nilfs->ns_writer_mutex);
228 nilfs->ns_writer = sbi;
229 mutex_unlock(&nilfs->ns_writer_mutex);
230}
231
232static inline void
233nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
234{
235 mutex_lock(&nilfs->ns_writer_mutex);
236 if (sbi == nilfs->ns_writer)
237 nilfs->ns_writer = NULL;
238 mutex_unlock(&nilfs->ns_writer_mutex);
239}
240
241static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end)
244{
245 *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
246 *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
247 if (segnum == 0)
248 *seg_start = nilfs->ns_first_data_block;
249}
250
251static inline sector_t
252nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
253{
254 return (segnum == 0) ? nilfs->ns_first_data_block :
255 (sector_t)nilfs->ns_blocks_per_segment * segnum;
256}
257
258static inline __u64
259nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
260{
261 sector_t segnum = blocknr;
262
263 sector_div(segnum, nilfs->ns_blocks_per_segment);
264 return segnum;
265}
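/*
 * Worked example for the segment-geometry helpers above (illustrative
 * values, not taken from a real layout): with ns_blocks_per_segment =
 * 2048 and ns_first_data_block = 64, block 5000 belongs to segment
 * 5000 / 2048 = 2, which spans blocks 4096..6143; segment 0 instead
 * starts at block 64.
 */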
266
267static inline void
268nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
269 sector_t seg_end)
270{
271 /* terminate the current full segment (used in case of I/O-error) */
272 nilfs->ns_pseg_offset = seg_end - seg_start + 1;
273}
274
275static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
276{
277 /* move forward with a full segment */
278 nilfs->ns_segnum = nilfs->ns_nextnum;
279 nilfs->ns_pseg_offset = 0;
280 nilfs->ns_seg_seq++;
281}
282
283static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
284{
285 __u64 cno;
286
287 spin_lock(&nilfs->ns_last_segment_lock);
288 cno = nilfs->ns_last_cno;
289 spin_unlock(&nilfs->ns_last_segment_lock);
290 return cno;
291}
292
293static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
294{
295 return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
296}
297
298#endif /* _THE_NILFS_H */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
 		return PTR_ERR(acl);
 	}
 	if (!acl)
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					  u64 blkno)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	return le64_to_cpu(dx_root->dr_last_eb_blk);
+}
+
+static void ocfs2_dx_root_update_clusters(struct inode *inode,
+					  struct ocfs2_extent_tree *et,
+					  u32 clusters)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	le32_add_cpu(&dx_root->dr_clusters, clusters);
+}
+
+static int ocfs2_dx_root_sanity_check(struct inode *inode,
+				      struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+
+	return 0;
+}
+
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	et->et_root_el = &dx_root->dr_list;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
+	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
+	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
+};
+
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
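ocfs2_dx_root_et_ops plugs the new dx root block into the existing extent-tree abstraction: generic btree code calls through a small table of function pointers and never touches the concrete on-disk structure. The pattern in miniature (standalone, all names invented for illustration):

#include <stdio.h>

struct tree;

struct tree_operations {
	unsigned long long (*get_last_eb_blk)(struct tree *t);
	void (*set_last_eb_blk)(struct tree *t, unsigned long long blkno);
};

struct tree {
	const struct tree_operations *ops;
	unsigned long long last_eb_blk;
};

static unsigned long long demo_get(struct tree *t)
{
	return t->last_eb_blk;
}

static void demo_set(struct tree *t, unsigned long long blkno)
{
	t->last_eb_blk = blkno;
}

/* One ops table per backing structure; generic code only sees t->ops. */
static const struct tree_operations demo_ops = {
	.get_last_eb_blk = demo_get,
	.set_last_eb_blk = demo_set,
};

int main(void)
{
	struct tree t = { .ops = &demo_ops };

	t.ops->set_last_eb_blk(&t, 4096);
	printf("last_eb_blk = %llu\n", t.ops->get_last_eb_blk(&t));
	return 0;
}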
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 				 &ocfs2_xattr_value_et_ops);
 }
 
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+				 NULL, &ocfs2_dx_root_et_ops);
+}
+
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					    u64 new_last_eb_blk)
 {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
 					struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh);
 
 /*
  * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 }
 
 const struct address_space_operations ocfs2_aops = {
 	.readpage		= ocfs2_readpage,
 	.readpages		= ocfs2_readpages,
 	.writepage		= ocfs2_writepage,
 	.write_begin		= ocfs2_write_begin,
 	.write_end		= ocfs2_write_end,
 	.bmap			= ocfs2_bmap,
 	.sync_page		= block_sync_page,
 	.direct_IO		= ocfs2_direct_IO,
 	.invalidatepage		= ocfs2_invalidatepage,
 	.releasepage		= ocfs2_releasepage,
 	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
 };
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
 #include <linux/random.h>
 #include <linux/crc32.h>
 #include <linux/time.h>
+#include <linux/debugfs.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+#define O2HB_DEBUG_DIR		"o2hb"
+#define O2HB_DEBUG_LIVENODES	"livenodes"
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
+
 static LIST_HEAD(o2hb_all_regions);
 
 static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
 	return 0;
 }
 
-void o2hb_init(void)
+#ifdef CONFIG_DEBUG_FS
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	char *buf = NULL;
+	int i = -1;
+	int out = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	o2hb_fill_node_map(map, sizeof(map));
+
+	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+	i_size_write(inode, out);
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+#else
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+static struct file_operations o2hb_debug_fops = {
+	.open =		o2hb_debug_open,
+	.release =	o2hb_debug_release,
+	.read =		o2hb_debug_read,
+	.llseek =	generic_file_llseek,
+};
+
+void o2hb_exit(void)
+{
+	if (o2hb_debug_livenodes)
+		debugfs_remove(o2hb_debug_livenodes);
+	if (o2hb_debug_dir)
+		debugfs_remove(o2hb_debug_dir);
+}
+
+int o2hb_init(void)
 {
 	int i;
 
@@ -918,6 +994,24 @@ void o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
+						   S_IFREG|S_IRUSR,
+						   o2hb_debug_dir, NULL,
+						   &o2hb_debug_fops);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(-ENOMEM);
+		debugfs_remove(o2hb_debug_dir);
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
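With debugfs mounted at its conventional location, the new livenodes attribute reads like an ordinary file. A user-space sketch (the /sys/kernel/debug path is the usual default, not guaranteed on every system):

#include <stdio.h>

int main(void)
{
	/* Conventional debugfs mount point; adjust if mounted elsewhere. */
	FILE *f = fopen("/sys/kernel/debug/o2hb/livenodes", "r");
	char line[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("live o2hb nodes: %s", line);
	fclose(f);
	return 0;
}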
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
 			      struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
-void o2hb_init(void);
+void o2hb_exit(void);
+int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
 	o2cb_sys_shutdown();
 
 	o2net_exit();
+	o2hb_exit();
 }
 
 static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
 
 	cluster_print_version();
 
-	o2hb_init();
+	ret = o2hb_init();
+	if (ret)
+		goto out;
 
 	ret = o2net_init();
 	if (ret)
-		goto out;
+		goto out_o2hb;
 
 	ret = o2net_register_hb_callbacks();
 	if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
 	o2net_unregister_hb_callbacks();
 out_o2net:
 	o2net_exit();
+out_o2hb:
+	o2hb_exit();
 out:
 	return ret;
 }
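The init_o2nm() change above is a textbook instance of the kernel's goto-unwind idiom: each setup step that succeeds earns a label in the error path, and teardown runs in exactly the reverse order of setup. A standalone sketch of the shape (names invented):

#include <stdio.h>

static int setup_a(void) { puts("setup a"); return 0; }
static void teardown_a(void) { puts("teardown a"); }
static int setup_b(int fail) { puts("setup b"); return fail ? -1 : 0; }
static void teardown_b(void) { puts("teardown b"); }

static int init_all(int fail_b)
{
	int ret;

	ret = setup_a();
	if (ret)
		goto out;

	ret = setup_b(fail_b);
	if (ret)
		goto out_a;	/* unwind only what already succeeded */

	return 0;

out_a:
	teardown_a();
out:
	return ret;
}

int main(void)
{
	printf("ok path -> %d\n", init_all(0));
	teardown_b();
	teardown_a();
	printf("failing path -> %d\n", init_all(1));
	return 0;
}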
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
+#include <linux/sort.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
-			    struct inode *dir,
-			    struct buffer_head *parent_fe_bh,
-			    unsigned int blocks_wanted,
-			    struct buffer_head **new_de_bh);
 static int ocfs2_do_extend_dir(struct super_block *sb,
 			       handle_t *handle,
 			       struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
 
 /*
  * These are distinct checks because future versions of the file system will
  * want to have a trailing dirent structure independent of indexing.
  */
-static int ocfs2_dir_has_trailer(struct inode *dir)
+static int ocfs2_supports_dir_trailer(struct inode *dir)
 {
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
-	return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+	return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
 }
 
-static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+/*
+ * "new' here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail in when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
 {
-	return ocfs2_meta_ecc(osb);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	return ocfs2_meta_ecc(osb) ||
+		ocfs2_supports_indexed_dirs(osb);
 }
 
 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 {
 	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
-	if (!ocfs2_dir_has_trailer(dir))
+	if (!ocfs2_supports_dir_trailer(dir))
 		return 0;
 
 	if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 }
 
 static void ocfs2_init_dir_trailer(struct inode *inode,
-				   struct buffer_head *bh)
+				   struct buffer_head *bh, u16 rec_len)
 {
 	struct ocfs2_dir_block_trailer *trailer;
 
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
 		cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
 	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
 	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+	trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+				     struct buffer_head *dx_root_bh,
+				     struct buffer_head *dirdata_bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	trailer->db_free_next = dx_root->dr_free_blk;
+	dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+	return res->dl_prev_leaf_bh == NULL;
+}
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+	brelse(res->dl_dx_root_bh);
+	brelse(res->dl_leaf_bh);
+	brelse(res->dl_dx_leaf_bh);
+	brelse(res->dl_prev_leaf_bh);
+}
+
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+	return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32 sum = 0;
+	__u32 b0 = buf[0], b1 = buf[1];
+	__u32 a = in[0], b = in[1], c = in[2], d = in[3];
+	int n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32 pad, val;
+	int i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+				   struct ocfs2_dx_hinfo *hinfo)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	const char *p;
+	__u32 in[8], buf[4];
+
+	/*
+	 * XXX: Is this really necessary, if the index is never looked
+	 * at by readdir? Is a hash value of '0' a bad idea?
+	 */
+	if ((len == 1 && !strncmp(".", name, 1)) ||
+	    (len == 2 && !strncmp("..", name, 2))) {
+		buf[0] = buf[1] = 0;
+		goto out;
+	}
+
+#ifdef OCFS2_DEBUG_DX_DIRS
+	/*
+	 * This makes it very easy to debug indexing problems. We
+	 * should never allow this to be selected without hand editing
+	 * this file though.
+	 */
+	buf[0] = buf[1] = len;
+	goto out;
+#endif
+
+	memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+
+	p = name;
+	while (len > 0) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		len -= 16;
+		p += 16;
+	}
+
+out:
+	hinfo->major_hash = buf[0];
+	hinfo->minor_hash = buf[1];
 }
 
 /*
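The name hash above is self-contained enough to run in isolation. A user-space restatement for experimentation -- the volume seed (osb_dx_seed) is zeroed here and the byte cast is made explicit, so the values are illustrative rather than what a real volume would compute:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DELTA 0x9E3779B9u

static void tea_transform(uint32_t buf[4], const uint32_t in[8])
{
	uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
	uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
	int n = 16;

	do {
		sum += DELTA;
		b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
		b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
	} while (--n);

	buf[0] += b0;
	buf[1] += b1;
}

static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
{
	uint32_t pad, val;
	int i;

	pad = (uint32_t)len | ((uint32_t)len << 8);
	pad |= pad << 16;

	val = pad;
	if (len > num * 4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		/* kernel code adds plain char; the cast pins down
		 * behavior for high-bit bytes in this sketch */
		val = (unsigned char)msg[i] + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
			val = pad;
			num--;
		}
	}
	if (--num >= 0)
		*buf++ = val;
	while (--num >= 0)
		*buf++ = pad;
}

int main(void)
{
	const char *name = "lost+found";
	int len = (int)strlen(name);
	uint32_t in[8], buf[4] = { 0, 0, 0, 0 };	/* zero seed (assumed) */
	const char *p = name;

	while (len > 0) {
		str2hashbuf(p, len, in, 4);
		tea_transform(buf, in);
		len -= 16;
		p += 16;
	}
	printf("major=0x%08x minor=0x%08x\n", buf[0], buf[1]);
	return 0;
}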
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
 }
 
 /*
+ * Validate a directory trailer.
+ *
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
+ */
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+	int rc = 0;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+	if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Invalid dirblock #%llu: "
+			    "signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    trailer->db_signature);
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu has an invalid "
+			    "db_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_parent_dinode) !=
+	    OCFS2_I(dir)->ip_blkno) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu on dinode "
+			    "#%llu has an invalid parent_dinode "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/*
  * This function forces all errors to -EIO for consistency with its
  * predecessor, ocfs2_bread(). We haven't audited what returning the
  * real error codes would do to callers. We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
-	struct ocfs2_dir_block_trailer *trailer;
 
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
 				    ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 		goto out;
 	}
 
-	/*
-	 * We check the trailer here rather than in
-	 * ocfs2_validate_dir_block() because that function doesn't have
-	 * the inode to test.
-	 */
 	if (!(flags & OCFS2_BH_READAHEAD) &&
-	    ocfs2_dir_has_trailer(inode)) {
-		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
-		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Invalid dirblock #%llu: "
-				    "signature = %.*s\n",
-				    (unsigned long long)tmp->b_blocknr, 7,
-				    trailer->db_signature);
-			goto out;
-		}
-		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Directory block #%llu has an invalid "
-				    "db_blkno of %llu",
-				    (unsigned long long)tmp->b_blocknr,
-				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
-			goto out;
-		}
-		if (le64_to_cpu(trailer->db_parent_dinode) !=
-		    OCFS2_I(inode)->ip_blkno) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Directory block #%llu on dinode "
-				    "#%llu has an invalid parent_dinode "
-				    "of %llu",
-				    (unsigned long long)tmp->b_blocknr,
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+	    ocfs2_supports_dir_trailer(inode)) {
+		rc = ocfs2_check_dir_trailer(inode, tmp);
+		if (rc) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(rc);
 			goto out;
 		}
 	}
@@ -379,6 +553,141 @@ out:
 	return rc ? -EIO : 0;
 }
 
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+				       struct buffer_head **bh)
+{
+	int ret;
+	struct buffer_head *tmp = *bh;
+
+	ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_supports_dir_trailer(dir)) {
+		ret = ocfs2_check_dir_trailer(dir, tmp);
+		if (ret) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!ret && !*bh)
+		*bh = tmp;
+out:
+	return ret;
+}
+
+static int ocfs2_validate_dx_root(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index root block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+		ocfs2_error(sb,
+			    "Dir Index Root # %llu has bad signature %.*s",
+			    (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+			    7, dx_root->dr_signature);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+			      struct buffer_head **dx_root_bh)
+{
+	int ret;
+	u64 blkno = le64_to_cpu(di->i_dx_root);
+	struct buffer_head *tmp = *dx_root_bh;
+
+	ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_root_bh)
+		*dx_root_bh = tmp;
+
+	return ret;
+}
+
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index leaf block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+		ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+			    7, dx_leaf->dl_signature);
+		return -EROFS;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+			      struct buffer_head **dx_leaf_bh)
+{
+	int ret;
+	struct buffer_head *tmp = *dx_leaf_bh;
+
+	ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_leaf_bh)
+		*dx_leaf_bh = tmp;
+
+	return ret;
+}
+
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+				struct buffer_head **dx_leaf_bhs)
+{
+	int ret;
+
+	ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+				ocfs2_validate_dx_leaf);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 					       struct inode *dir,
 					       struct ocfs2_dir_entry **res_dir)
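These readers follow ocfs2's read-with-validator convention: the block reader takes a callback that checks ECC and signature before the buffer is handed to any caller, so no path yields an unvalidated "trusted" buffer. A toy standalone version of the pattern (names invented):

#include <stdio.h>
#include <string.h>

typedef int (*validate_fn)(const unsigned char *block, size_t len);

/* The reader owns the "I/O"; the caller supplies the integrity check. */
static int read_block(const unsigned char *disk, size_t blkno,
		      size_t blksz, unsigned char *out, validate_fn validate)
{
	memcpy(out, disk + blkno * blksz, blksz);
	return validate ? validate(out, blksz) : 0;
}

static int validate_signature(const unsigned char *block, size_t len)
{
	return (len >= 4 && memcmp(block, "DXR1", 4) == 0) ? 0 : -1;
}

int main(void)
{
	unsigned char disk[2][8] = { "DXR1ok", "junk!!" };
	unsigned char buf[8];

	printf("block 0: %d\n",
	       read_block(&disk[0][0], 0, 8, buf, validate_signature));
	printf("block 1: %d\n",
	       read_block(&disk[0][0], 1, 8, buf, validate_signature));
	return 0;
}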
@@ -480,39 +789,340 @@ cleanup_and_exit:
 	return ret;
 }
 
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+				   struct ocfs2_extent_list *el,
+				   u32 major_hash,
+				   u32 *ret_cpos,
+				   u64 *ret_phys_blkno,
+				   unsigned int *ret_clen)
+{
+	int ret = 0, i, found;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "btree tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in btree", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+	if (ret_cpos)
+		*ret_cpos = le32_to_cpu(rec->e_cpos);
+	if (ret_clen)
+		*ret_clen = le16_to_cpu(rec->e_leaf_clusters);
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Returns the block index, from the start of the cluster which this
+ * hash belongs too.
+ */
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+						   u32 minor_hash)
+{
+	return minor_hash & osb->osb_dx_mask;
+}
+
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+						 struct ocfs2_dx_hinfo *hinfo)
+{
+	return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
+}
+
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+			       struct ocfs2_extent_list *el,
+			       struct ocfs2_dx_hinfo *hinfo,
+			       u32 *ret_cpos,
+			       u64 *ret_phys_blkno)
+{
+	int ret = 0;
+	unsigned int cend, uninitialized_var(clen);
+	u32 uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	u32 name_hash = hinfo->major_hash;
+
+	ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
+				      &clen);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cend = cpos + clen;
+	if (name_hash >= cend) {
+		/* We want the last cluster */
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
+		cpos += clen - 1;
+	} else {
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb,
+						  name_hash - cpos);
+		cpos = name_hash;
+	}
+
+	/*
+	 * We now have the cluster which should hold our entry. To
+	 * find the exact block from the start of the cluster to
+	 * search, we take the lower bits of the hash.
+	 */
+	blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = blkno;
+	if (ret_cpos)
+		*ret_cpos = cpos;
+
+out:
+
+	return ret;
+}
+
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dx_root_block *dx_root,
+			       struct ocfs2_dir_lookup_result *res)
+{
+	int ret, i, found;
+	u64 uninitialized_var(phys);
+	struct buffer_head *dx_leaf_bh = NULL;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = NULL;
+	struct buffer_head *dir_ent_bh = NULL;
+	struct ocfs2_dir_entry *dir_ent = NULL;
+	struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+	struct ocfs2_extent_list *dr_el;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
+
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+		goto search;
+	}
+
+	dr_el = &dx_root->dr_list;
+
+	ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
+	     "returns: %llu\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     namelen, name, hinfo->major_hash, hinfo->minor_hash,
+	     (unsigned long long)phys);
+
+	ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
+
+	mlog(0, "leaf info: num_used: %d, count: %d\n",
+	     le16_to_cpu(dx_leaf->dl_list.de_num_used),
+	     le16_to_cpu(dx_leaf->dl_list.de_count));
+
+	entry_list = &dx_leaf->dl_list;
+
+search:
+	/*
+	 * Empty leaf is legal, so no need to check for that.
+	 */
+	found = 0;
+	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+		dx_entry = &entry_list->de_entries[i];
+
+		if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
+		    || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
+			continue;
+
+		/*
+		 * Search unindexed leaf block now. We're not
+		 * guaranteed to find anything.
+		 */
+		ret = ocfs2_read_dir_block_direct(dir,
+					  le64_to_cpu(dx_entry->dx_dirent_blk),
+					  &dir_ent_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * XXX: We should check the unindexed block here,
+		 * before using it.
+		 */
+
+		found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
+					      0, dir_ent_bh->b_data,
+					      dir->i_sb->s_blocksize, &dir_ent);
+		if (found == 1)
+			break;
+
+		if (found == -1) {
+			/* This means we found a bad directory entry. */
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		brelse(dir_ent_bh);
+		dir_ent_bh = NULL;
+	}
+
+	if (found <= 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	res->dl_leaf_bh = dir_ent_bh;
+	res->dl_entry = dir_ent;
+	res->dl_dx_leaf_bh = dx_leaf_bh;
+	res->dl_dx_entry = dx_entry;
+
+	ret = 0;
+out:
+	if (ret) {
+		brelse(dx_leaf_bh);
+		brelse(dir_ent_bh);
+	}
+	return ret;
+}
+
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+
+	ret = ocfs2_read_inode_block(dir, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+	if (ret) {
+		if (ret != -ENOENT)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	lookup->dl_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * Try to find an entry of the provided name within 'dir'.
  *
- * If nothing was found, NULL is returned. Otherwise, a buffer_head
- * and pointer to the dir entry are passed back.
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
  *
  * Caller can NOT assume anything about the contents of the
- * buffer_head - it is passed back only so that it can be passed into
- * any one of the manipulation functions (add entry, delete entry,
- * etc). As an example, bh in the extent directory case is a data
- * block, in the inline-data case it actually points to an inode.
+ * buffer_heads - they are passed back only so that it can be passed
+ * into any one of the manipulation functions (add entry, delete
+ * entry, etc). As an example, bh in the extent directory case is a
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
  */
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir)
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
 {
-	*res_dir = NULL;
+	struct buffer_head *bh;
+	struct ocfs2_dir_entry *res_dir = NULL;
 
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+
+	/*
+	 * The unindexed dir code only uses part of the lookup
+	 * structure, so there's no reason to push it down further
+	 * than this.
+	 */
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_find_entry_id(name, namelen, dir, res_dir);
+		bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+	else
+		bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+
+	if (bh == NULL)
+		return -ENOENT;
 
-	return ocfs2_find_entry_el(name, namelen, dir, res_dir);
+	lookup->dl_leaf_bh = bh;
+	lookup->dl_entry = res_dir;
+	return 0;
 }
 
 /*
  * Update inode number and type of a previously found directory entry.
  */
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode)
 {
 	int ret;
 	ocfs2_journal_access_func access = ocfs2_journal_access_db;
+	struct ocfs2_dir_entry *de = res->dl_entry;
+	struct buffer_head *de_bh = res->dl_leaf_bh;
 
 	/*
 	 * The same code works fine for both inline-data and extent
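The two-step mapping in ocfs2_dx_dir_lookup() is worth pinning down with numbers: the major hash walks the extent map to a cluster, then the low bits of the minor hash pick the block inside that cluster. A standalone restatement with example geometry (the constants are assumptions, not on-disk values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example geometry: 4K blocks, 32K clusters -> 8 blocks/cluster. */
	const uint32_t blocks_per_cluster = 8;
	const uint32_t dx_mask = blocks_per_cluster - 1;

	/* Pretend extent record: clusters [cpos, cpos+clen) live at blkno. */
	uint32_t cpos = 100, clen = 4;
	uint64_t blkno = 50000;

	uint32_t major_hash = 103;		/* falls inside the record */
	uint32_t minor_hash = 0xdeadbeef;

	/* Step 1: walk forward to the cluster covering major_hash. */
	blkno += (uint64_t)(major_hash - cpos) * blocks_per_cluster;

	/* Step 2: the low bits of minor_hash select the block within it. */
	blkno += minor_hash & dx_mask;

	(void)clen;
	printf("dx leaf block = %llu\n", (unsigned long long)blkno);
	return 0;
}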
@@ -538,6 +1148,10 @@ out:
 	return ret;
 }
 
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
 static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 				struct ocfs2_dir_entry *de_del,
 				struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
 	return status;
 }
 
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+	unsigned int hole;
+
+	if (le64_to_cpu(de->inode) == 0)
+		hole = le16_to_cpu(de->rec_len);
+	else
+		hole = le16_to_cpu(de->rec_len) -
+			OCFS2_DIR_REC_LEN(de->name_len);
+
+	return hole;
+}
+
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+				  struct buffer_head *dirblock_bh)
+{
+	int size, this_hole, largest_hole = 0;
+	char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
+	struct ocfs2_dir_entry *de;
+
+	trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+	size = ocfs2_dir_trailer_blk_off(sb);
+	limit = start + size;
+	de_buf = start;
+	de = (struct ocfs2_dir_entry *)de_buf;
+	do {
+		if (de_buf != trailer) {
+			this_hole = ocfs2_figure_dirent_hole(de);
+			if (this_hole > largest_hole)
+				largest_hole = this_hole;
+		}
+
+		de_buf += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)de_buf;
+	} while (de_buf < limit);
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
+}
+
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+				       int index)
+{
+	int num_used = le16_to_cpu(entry_list->de_num_used);
+
+	if (num_used == 1 || index == (num_used - 1))
+		goto clear;
+
+	memmove(&entry_list->de_entries[index],
+		&entry_list->de_entries[index + 1],
+		(num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+clear:
+	num_used--;
+	memset(&entry_list->de_entries[num_used], 0,
+	       sizeof(struct ocfs2_dx_entry));
+	entry_list->de_num_used = cpu_to_le16(num_used);
+}
+
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, index, max_rec_len, add_to_free_list = 0;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+	struct ocfs2_dir_block_trailer *trailer;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * This function gets a bit messy because we might have to
+	 * modify the root block, regardless of whether the indexed
+	 * entries are stored inline.
+	 */
+
+	/*
+	 * *Only* set 'entry_list' here, based on where we're looking
+	 * for the indexed entries. Later, we might still want to
+	 * journal both blocks, based on free list state.
+	 */
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+	} else {
+		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+		entry_list = &dx_leaf->dl_list;
+	}
+
+	/* Neither of these are a disk corruption - that should have
+	 * been caught by lookup, before we got here. */
+	BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+	BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+	index = (char *)dx_entry - (char *)entry_list->de_entries;
+	index /= sizeof(*dx_entry);
+
+	if (index >= le16_to_cpu(entry_list->de_num_used)) {
+		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+		     entry_list, dx_entry);
+		return -EIO;
+	}
+
+	/*
+	 * We know that removal of this dirent will leave enough room
+	 * for a new one, so add this block to the free list if it
+	 * isn't already there.
+	 */
+	trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+	if (trailer->db_free_rec_len == 0)
+		add_to_free_list = 1;
+
+	/*
+	 * Add the block holding our index into the journal before
+	 * removing the unindexed entry. If we get an error return
+	 * from __ocfs2_delete_entry(), then it hasn't removed the
+	 * entry yet. Likewise, successful return means we *must*
+	 * remove the indexed entry.
+	 *
+	 * We're also careful to journal the root tree block here as
+	 * the entry count needs to be updated. Also, we might be
+	 * adding to the start of the free list.
+	 */
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_journal_access_dl(handle, dir,
+					      lookup->dl_dx_leaf_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	mlog(0, "Dir %llu: delete entry at index: %d\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+
+	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+	trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+	if (add_to_free_list) {
+		trailer->db_free_next = dx_root->dr_free_blk;
+		dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+		ocfs2_journal_dirty(handle, dx_root_bh);
+	}
+
+	/* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	le32_add_cpu(&dx_root->dr_num_entries, -1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ocfs2_dx_list_remove_entry(entry_list, index);
+
+	if (!ocfs2_dx_root_inline(dx_root))
+		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+	return ret;
+}
+
 static inline int ocfs2_delete_entry_id(handle_t *handle,
 					struct inode *dir,
 					struct ocfs2_dir_entry *de_del,
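ocfs2_delete_entry_dx() recovers the array index of the matched entry from raw pointer arithmetic against the base of de_entries. The same computation in isolation:

#include <stdint.h>
#include <stdio.h>

struct dx_entry {
	uint32_t major_hash;
	uint32_t minor_hash;
	uint64_t dirent_blk;
};

int main(void)
{
	struct dx_entry entries[8];
	struct dx_entry *hit = &entries[5];	/* pretend lookup result */

	/* Byte offset between the element pointer and the array base,
	 * divided by the element size, recovers the index. */
	int index = (int)(((char *)hit - (char *)entries) / sizeof(*hit));

	printf("index = %d\n", index);		/* prints 5 */
	return 0;
}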
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
  */
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh)
+		       struct ocfs2_dir_lookup_result *res)
 {
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_delete_entry_dx(handle, dir, res);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+		return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+					     res->dl_leaf_bh);
 
-	return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+	return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+				     res->dl_leaf_bh);
 }
 
 /*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
 	return 0;
 }
 
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+					  struct ocfs2_dx_entry *dx_new_entry)
+{
+	int i;
+
+	i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+	le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+				       struct ocfs2_dx_hinfo *hinfo,
+				       u64 dirent_blk)
+{
+	int i;
+	struct ocfs2_dx_entry *dx_entry;
+
+	i = le16_to_cpu(entry_list->de_num_used);
+	dx_entry = &entry_list->de_entries[i];
+
+	memset(dx_entry, 0, sizeof(*dx_entry));
+	dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+	dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+	dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+	le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+				      struct ocfs2_dx_hinfo *hinfo,
+				      u64 dirent_blk,
+				      struct buffer_head *dx_leaf_bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf;
+
+	ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+					struct ocfs2_dx_hinfo *hinfo,
+					u64 dirent_blk,
+					struct ocfs2_dx_root_block *dx_root)
+{
+	ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ocfs2_dx_inline_root_insert(dir, handle,
+					    &lookup->dl_hinfo,
+					    lookup->dl_leaf_bh->b_blocknr,
+					    dx_root);
+	} else {
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+						 lookup->dl_leaf_bh->b_blocknr,
+						 lookup->dl_dx_leaf_bh);
+		if (ret)
+			goto out;
+	}
+
+	le32_add_cpu(&dx_root->dr_num_entries, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+					      handle_t *handle,
+					      struct ocfs2_dir_lookup_result *lookup)
+{
+	struct ocfs2_dir_block_trailer *trailer, *prev;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *bh;
+
+	trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+	if (ocfs2_free_list_at_root(lookup)) {
+		bh = lookup->dl_dx_root_bh;
+		dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+		dx_root->dr_free_blk = trailer->db_free_next;
+	} else {
+		bh = lookup->dl_prev_leaf_bh;
+		prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+		prev->db_free_next = trailer->db_free_next;
+	}
+
+	trailer->db_free_rec_len = cpu_to_le16(0);
+	trailer->db_free_next = cpu_to_le64(0);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int max_rec_len;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	/* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+	if (max_rec_len) {
+		/*
+		 * There's still room in this block, so no need to remove it
+		 * from the free list. In this case, we just want to update
+		 * the rec len accounting.
+		 */
+		trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+		trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+		ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+	} else {
+		ocfs2_remove_block_from_free_list(dir, handle, lookup);
+	}
+}
+
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
  */
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh)
+		      struct ocfs2_dir_lookup_result *lookup)
 {
 	unsigned long offset;
 	unsigned short rec_len;
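The free list managed above is a singly linked list threaded through the unindexed blocks' trailers, with dx_root->dr_free_blk as the head: blocks are pushed at the front when they gain space and unlinked when it runs out. A miniature with plain pointers standing in for block numbers (illustrative only):

#include <stdio.h>
#include <stddef.h>

struct dir_block {
	int id;
	int free_bytes;
	struct dir_block *free_next;
};

static struct dir_block *free_head;

static void free_list_add(struct dir_block *b)
{
	b->free_next = free_head;	/* like trailer->db_free_next */
	free_head = b;			/* like dx_root->dr_free_blk */
}

static void free_list_remove(struct dir_block *b, struct dir_block *prev)
{
	if (prev == NULL)		/* block sits at the list head */
		free_head = b->free_next;
	else
		prev->free_next = b->free_next;
	b->free_next = NULL;
	b->free_bytes = 0;
}

int main(void)
{
	struct dir_block a = { 1, 64, NULL }, b = { 2, 128, NULL };

	free_list_add(&a);
	free_list_add(&b);		/* list: b -> a */
	free_list_remove(&b, NULL);	/* unlink from the head */

	printf("head is block %d\n", free_head ? free_head->id : -1);
	return 0;
}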
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct super_block *sb = dir->i_sb;
 	int retval, status;
 	unsigned int size = sb->s_blocksize;
+	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
 	mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
 	if (!namelen)
 		return -EINVAL;
 
-	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+	if (ocfs2_dir_indexed(dir)) {
+		struct buffer_head *bh;
+
+		/*
+		 * An indexed dir may require that we update the free space
+		 * list. Reserve a write to the previous node in the list so
+		 * that we don't fail later.
+		 *
+		 * XXX: This can be either a dx_root_block, or an unindexed
+		 * directory tree leaf block.
+		 */
+		if (ocfs2_free_list_at_root(lookup)) {
+			bh = lookup->dl_dx_root_bh;
+			retval = ocfs2_journal_access_dr(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		} else {
+			bh = lookup->dl_prev_leaf_bh;
+			retval = ocfs2_journal_access_db(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		}
+		if (retval) {
+			mlog_errno(retval);
+			return retval;
+		}
+	} else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		data_start = di->id2.i_data.id_data;
 		size = i_size_read(dir);
 
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
 				status = ocfs2_journal_access_di(handle, dir,
 								 insert_bh,
 							 OCFS2_JOURNAL_ACCESS_WRITE);
-			else
+			else {
 				status = ocfs2_journal_access_db(handle, dir,
 								 insert_bh,
 							 OCFS2_JOURNAL_ACCESS_WRITE);
+
+				if (ocfs2_dir_indexed(dir)) {
+					status = ocfs2_dx_dir_insert(dir,
+								handle,
+								lookup);
+					if (status) {
+						mlog_errno(status);
+						goto bail;
+					}
+				}
+			}
+
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
 			de->name_len = namelen;
 			memcpy(de->name, name, namelen);
 
+			if (ocfs2_dir_indexed(dir))
+				ocfs2_recalc_free_list(dir, handle, lookup);
+
 			dir->i_version++;
 			status = ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
@@ -870,6 +1851,10 @@ out:
 	return 0;
 }
 
+/*
+ * NOTE: This function can be called against unindexed directories,
+ * and indexed ones.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
 				    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent)
+			     struct ocfs2_dir_lookup_result *lookup)
 {
 	int status = -ENOENT;
 
-	mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-		   namelen, name, blkno, inode, dirent_bh, dirent);
+	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
-	if (!*dirent_bh || !*dirent) {
-		status = -ENOENT;
+	status = ocfs2_find_entry(name, namelen, inode, lookup);
+	if (status)
 		goto leave;
-	}
 
-	*blkno = le64_to_cpu((*dirent)->inode);
+	*blkno = le64_to_cpu(lookup->dl_entry->inode);
 
 	status = 0;
 leave:
-	if (status < 0) {
-		*dirent = NULL;
-		brelse(*dirent_bh);
-		*dirent_bh = NULL;
-	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
-	brelse(bh);
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 			      int namelen)
 {
 	int ret;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("dir %llu, name '%.*s'\n",
 		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
-	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
-	if (dirent_bh)
+	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
 		goto bail;
 
 	ret = 0;
 bail:
-	brelse(dirent_bh);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(ret);
 	return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
+	unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 				   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	/*
 	 * Check the positions of "." and ".." records to be sure
 	 * they're in the correct place.
+	 *
+	 * Indexed directories don't need to proceed past the first
+	 * two entries, so we end the scan after seeing '..'. Despite
+	 * that, we allow the scan to proceed In the event that we
+	 * have a corrupted indexed directory (no dot or dot dot
+	 * entries). This allows us to double check for existing
+	 * entries which might not have been found in the index.
 	 */
 	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
 		p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	if (name_len == 2 && !strncmp("..", name, 2) &&
 	    pos == OCFS2_DIR_REC_LEN(1)) {
 		p->seen_dot_dot = 1;
+
+		if (p->dx_dir && p->seen_dot)
+			return 1;
+
 		return 0;
 	}
 
 	p->seen_other = 1;
 	return 1;
 }
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+			      struct ocfs2_empty_dir_priv *priv)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dx_root_block *dx_root;
+
+	priv->dx_dir = 1;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+		priv->seen_other = 1;
+
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * routine to check that the specified directory is empty (for rmdir)
  *
  * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
1188 2210
1189 memset(&priv, 0, sizeof(priv)); 2211 memset(&priv, 0, sizeof(priv));
1190 2212
2213 if (ocfs2_dir_indexed(inode)) {
2214 ret = ocfs2_empty_dir_dx(inode, &priv);
2215 if (ret)
2216 mlog_errno(ret);
2217 /*
2218 * We still run ocfs2_dir_foreach to get the checks
2219 * for "." and "..".
2220 */
2221 }
2222
1191 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2223 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1192 if (ret) 2224 if (ret)
1193 mlog_errno(ret); 2225 mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1280 struct inode *parent, 2312 struct inode *parent,
1281 struct inode *inode, 2313 struct inode *inode,
1282 struct buffer_head *fe_bh, 2314 struct buffer_head *fe_bh,
1283 struct ocfs2_alloc_context *data_ac) 2315 struct ocfs2_alloc_context *data_ac,
2316 struct buffer_head **ret_new_bh)
1284{ 2317{
1285 int status; 2318 int status;
1286 unsigned int size = osb->sb->s_blocksize; 2319 unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1289 2322
1290 mlog_entry_void(); 2323 mlog_entry_void();
1291 2324
1292 if (ocfs2_supports_dir_trailer(osb)) 2325 if (ocfs2_new_dir_wants_trailer(inode))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2326 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294 2327
1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 2328 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2343 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1311 2344
1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2345 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1313 if (ocfs2_supports_dir_trailer(osb)) 2346 if (ocfs2_new_dir_wants_trailer(inode)) {
1314 ocfs2_init_dir_trailer(inode, new_bh); 2347 int size = le16_to_cpu(de->rec_len);
2348
2349 /*
2350 * Figure out the size of the hole left over after
2351 * insertion of '.' and '..'. The trailer wants this
2352 * information.
2353 */
2354 size -= OCFS2_DIR_REC_LEN(2);
2355 size -= sizeof(struct ocfs2_dir_block_trailer);
2356
2357 ocfs2_init_dir_trailer(inode, new_bh, size);
2358 }
1315 2359
1316 status = ocfs2_journal_dirty(handle, new_bh); 2360 status = ocfs2_journal_dirty(handle, new_bh);
1317 if (status < 0) { 2361 if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1329 } 2373 }
1330 2374
1331 status = 0; 2375 status = 0;
2376 if (ret_new_bh) {
2377 *ret_new_bh = new_bh;
2378 new_bh = NULL;
2379 }
1332bail: 2380bail:
1333 brelse(new_bh); 2381 brelse(new_bh);
1334 2382
@@ -1336,20 +2384,427 @@ bail:
1336 return status; 2384 return status;
1337} 2385}
1338 2386
2387static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2388 handle_t *handle, struct inode *dir,
2389 struct buffer_head *di_bh,
2390 struct buffer_head *dirdata_bh,
2391 struct ocfs2_alloc_context *meta_ac,
2392 int dx_inline, u32 num_entries,
2393 struct buffer_head **ret_dx_root_bh)
2394{
2395 int ret;
2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2397 u16 dr_suballoc_bit;
2398 u64 dr_blkno;
2399 unsigned int num_bits;
2400 struct buffer_head *dx_root_bh = NULL;
2401 struct ocfs2_dx_root_block *dx_root;
2402 struct ocfs2_dir_block_trailer *trailer =
2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2404
2405 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2406 &num_bits, &dr_blkno);
2407 if (ret) {
2408 mlog_errno(ret);
2409 goto out;
2410 }
2411
2412 mlog(0, "Dir %llu, attach new index block: %llu\n",
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno);
2415
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) {
2418 ret = -EIO;
2419 goto out;
2420 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2422
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429
2430 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2431 memset(dx_root, 0, osb->sb->s_blocksize);
2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2433 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2434 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2435 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2436 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2437 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2438 dx_root->dr_num_entries = cpu_to_le32(num_entries);
2439 if (le16_to_cpu(trailer->db_free_rec_len))
2440 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2441 else
2442 dx_root->dr_free_blk = cpu_to_le64(0);
2443
2444 if (dx_inline) {
2445 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2446 dx_root->dr_entries.de_count =
2447 cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2448 } else {
2449 dx_root->dr_list.l_count =
2450 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2451 }
2452
2453 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2454 if (ret)
2455 mlog_errno(ret);
2456
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) {
2460 mlog_errno(ret);
2461 goto out;
2462 }
2463
2464 di->i_dx_root = cpu_to_le64(dr_blkno);
2465
2466 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2467 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2468
2469 ret = ocfs2_journal_dirty(handle, di_bh);
2470 if (ret)
2471 mlog_errno(ret);
2472
2473 *ret_dx_root_bh = dx_root_bh;
2474 dx_root_bh = NULL;
2475
2476out:
2477 brelse(dx_root_bh);
2478 return ret;
2479}
2480
2481static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2482 handle_t *handle, struct inode *dir,
2483 struct buffer_head **dx_leaves,
2484 int num_dx_leaves, u64 start_blk)
2485{
2486 int ret, i;
2487 struct ocfs2_dx_leaf *dx_leaf;
2488 struct buffer_head *bh;
2489
2490 for (i = 0; i < num_dx_leaves; i++) {
2491 bh = sb_getblk(osb->sb, start_blk + i);
2492 if (bh == NULL) {
2493 ret = -EIO;
2494 goto out;
2495 }
2496 dx_leaves[i] = bh;
2497
2498 ocfs2_set_new_buffer_uptodate(dir, bh);
2499
2500 ret = ocfs2_journal_access_dl(handle, dir, bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) {
2503 mlog_errno(ret);
2504 goto out;
2505 }
2506
2507 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2508
2509 memset(dx_leaf, 0, osb->sb->s_blocksize);
2510 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2511 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2512 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2513 dx_leaf->dl_list.de_count =
2514 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2515
2516 mlog(0,
2517 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2518 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2519 (unsigned long long)bh->b_blocknr,
2520 le16_to_cpu(dx_leaf->dl_list.de_count));
2521
2522 ocfs2_journal_dirty(handle, bh);
2523 }
2524
2525 ret = 0;
2526out:
2527 return ret;
2528}
2529
2530/*
2531 * Allocates and formats a new cluster for use in an indexed dir
2532 * leaf. This version will not do the extent insert, so that it can be
2533 * used by operations which need careful ordering.
2534 */
2535static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2536 u32 cpos, handle_t *handle,
2537 struct ocfs2_alloc_context *data_ac,
2538 struct buffer_head **dx_leaves,
2539 int num_dx_leaves, u64 *ret_phys_blkno)
2540{
2541 int ret;
2542 u32 phys, num;
2543 u64 phys_blkno;
2544 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2545
2546 /*
2547 * XXX: For create, this should claim cluster for the index
2548 * *before* the unindexed insert so that we have a better
2549 * chance of contiguousness as the directory grows in number
2550 * of entries.
2551 */
2552 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2553 if (ret) {
2554 mlog_errno(ret);
2555 goto out;
2556 }
2557
2558 /*
2559 * Format the new cluster first. That way, we're inserting
2560 * valid data.
2561 */
2562 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2563 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2564 num_dx_leaves, phys_blkno);
2565 if (ret) {
2566 mlog_errno(ret);
2567 goto out;
2568 }
2569
2570 *ret_phys_blkno = phys_blkno;
2571out:
2572 return ret;
2573}
2574
2575static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2576 struct ocfs2_extent_tree *et,
2577 u32 cpos, handle_t *handle,
2578 struct ocfs2_alloc_context *data_ac,
2579 struct ocfs2_alloc_context *meta_ac,
2580 struct buffer_head **dx_leaves,
2581 int num_dx_leaves)
2582{
2583 int ret;
2584 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno);
2589 if (ret) {
2590 mlog_errno(ret);
2591 goto out;
2592 }
2593
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2595 meta_ac);
2596 if (ret)
2597 mlog_errno(ret);
2598out:
2599 return ret;
2600}
2601
2602static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2603 int *ret_num_leaves)
2604{
2605 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2606 struct buffer_head **dx_leaves;
2607
2608 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2609 GFP_NOFS);
2610 if (dx_leaves && ret_num_leaves)
2611 *ret_num_leaves = num_dx_leaves;
2612
2613 return dx_leaves;
2614}
2615
2616static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2617 handle_t *handle,
2618 struct inode *parent,
2619 struct inode *inode,
2620 struct buffer_head *di_bh,
2621 struct ocfs2_alloc_context *data_ac,
2622 struct ocfs2_alloc_context *meta_ac)
2623{
2624 int ret;
2625 struct buffer_head *leaf_bh = NULL;
2626 struct buffer_head *dx_root_bh = NULL;
2627 struct ocfs2_dx_hinfo hinfo;
2628 struct ocfs2_dx_root_block *dx_root;
2629 struct ocfs2_dx_entry_list *entry_list;
2630
2631 /*
2632 * Our strategy is to create the directory as though it were
2633 * unindexed, then add the index block. This works with very
2634 * little complication since the state of a new directory is a
 2635 * very well-known quantity.
2636 *
 2637 * Essentially, we have two dirents ("." and "..") in the 1st
2638 * block which need indexing. These are easily inserted into
2639 * the index block.
2640 */
2641
2642 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2643 data_ac, &leaf_bh);
2644 if (ret) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648
2649 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2650 meta_ac, 1, 2, &dx_root_bh);
2651 if (ret) {
2652 mlog_errno(ret);
2653 goto out;
2654 }
2655 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2656 entry_list = &dx_root->dr_entries;
2657
2658 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2659 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2660 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2661
2662 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2663 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2664
2665out:
2666 brelse(dx_root_bh);
2667 brelse(leaf_bh);
2668 return ret;
2669}
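ocfs2_dx_entry_list_insert() does the actual indexing of "." and ".." above, but its body is outside this hunk. Judged from its call sites, the shape is an append of the (major hash, minor hash, block) triple into a bounded array; a hedged in-memory model (field names and the array bound are assumptions for illustration):

```c
#include <stdint.h>
#include <assert.h>

/* Simplified stand-ins for the on-disk structures. */
struct dx_entry {
	uint32_t major_hash;
	uint32_t minor_hash;
	uint64_t blkno;		/* unindexed block holding the dirent */
};

struct dx_entry_list {
	uint16_t de_num_used;
	uint16_t de_count;
	struct dx_entry de_entries[16];
};

static void dx_entry_list_insert(struct dx_entry_list *list, uint32_t major,
				 uint32_t minor, uint64_t blkno)
{
	struct dx_entry *e;

	assert(list->de_num_used < list->de_count);
	e = &list->de_entries[list->de_num_used++];
	e->major_hash = major;
	e->minor_hash = minor;
	e->blkno = blkno;
}
```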
2670
1339int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2671int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1340 handle_t *handle, 2672 handle_t *handle,
1341 struct inode *parent, 2673 struct inode *parent,
1342 struct inode *inode, 2674 struct inode *inode,
1343 struct buffer_head *fe_bh, 2675 struct buffer_head *fe_bh,
1344 struct ocfs2_alloc_context *data_ac) 2676 struct ocfs2_alloc_context *data_ac,
2677 struct ocfs2_alloc_context *meta_ac)
2678
1345{ 2679{
1346 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2680 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1347 2681
1348 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2682 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1349 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2683 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1350 2684
2685 if (ocfs2_supports_indexed_dirs(osb))
2686 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2687 data_ac, meta_ac);
2688
1351 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2689 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1352 data_ac); 2690 data_ac, NULL);
2691}
2692
2693static int ocfs2_dx_dir_index_block(struct inode *dir,
2694 handle_t *handle,
2695 struct buffer_head **dx_leaves,
2696 int num_dx_leaves,
2697 u32 *num_dx_entries,
2698 struct buffer_head *dirent_bh)
2699{
 2700 int ret = 0, namelen, i;
2701 char *de_buf, *limit;
2702 struct ocfs2_dir_entry *de;
2703 struct buffer_head *dx_leaf_bh;
2704 struct ocfs2_dx_hinfo hinfo;
2705 u64 dirent_blk = dirent_bh->b_blocknr;
2706
2707 de_buf = dirent_bh->b_data;
2708 limit = de_buf + dir->i_sb->s_blocksize;
2709
2710 while (de_buf < limit) {
2711 de = (struct ocfs2_dir_entry *)de_buf;
2712
2713 namelen = de->name_len;
2714 if (!namelen || !de->inode)
2715 goto inc;
2716
2717 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2718
2719 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2720 dx_leaf_bh = dx_leaves[i];
2721
2722 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2723 dirent_blk, dx_leaf_bh);
2724 if (ret) {
2725 mlog_errno(ret);
2726 goto out;
2727 }
2728
2729 *num_dx_entries = *num_dx_entries + 1;
2730
2731inc:
2732 de_buf += le16_to_cpu(de->rec_len);
2733 }
2734
2735out:
2736 return ret;
2737}
2738
2739/*
2740 * XXX: This expects dx_root_bh to already be part of the transaction.
2741 */
2742static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2743 struct buffer_head *dx_root_bh,
2744 struct buffer_head *dirent_bh)
2745{
2746 char *de_buf, *limit;
2747 struct ocfs2_dx_root_block *dx_root;
2748 struct ocfs2_dir_entry *de;
2749 struct ocfs2_dx_hinfo hinfo;
2750 u64 dirent_blk = dirent_bh->b_blocknr;
2751
2752 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2753
2754 de_buf = dirent_bh->b_data;
2755 limit = de_buf + dir->i_sb->s_blocksize;
2756
2757 while (de_buf < limit) {
2758 de = (struct ocfs2_dir_entry *)de_buf;
2759
2760 if (!de->name_len || !de->inode)
2761 goto inc;
2762
2763 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2764
2765 mlog(0,
2766 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
2767 (unsigned long long)dir->i_ino, hinfo.major_hash,
2768 hinfo.minor_hash,
2769 le16_to_cpu(dx_root->dr_entries.de_num_used),
2770 de->name_len, de->name);
2771
2772 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2773 dirent_blk);
2774
2775 le32_add_cpu(&dx_root->dr_num_entries, 1);
2776inc:
2777 de_buf += le16_to_cpu(de->rec_len);
2778 }
2779}
2780
2781/*
2782 * Count the number of inline directory entries in di_bh and compare
2783 * them against the number of entries we can hold in an inline dx root
2784 * block.
2785 */
2786static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2787 struct buffer_head *di_bh)
2788{
2789 int dirent_count = 0;
2790 char *de_buf, *limit;
2791 struct ocfs2_dir_entry *de;
2792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2793
2794 de_buf = di->id2.i_data.id_data;
2795 limit = de_buf + i_size_read(dir);
2796
2797 while (de_buf < limit) {
2798 de = (struct ocfs2_dir_entry *)de_buf;
2799
2800 if (de->name_len && de->inode)
2801 dirent_count++;
2802
2803 de_buf += le16_to_cpu(de->rec_len);
2804 }
2805
2806 /* We are careful to leave room for one extra record. */
2807 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
1353} 2808}
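The counting loop above is the classic rec_len walk used throughout this file: every record, live or deleted, stores the distance to the next one, so the chain always covers the whole region. A self-contained userspace model (simplified fields, not the on-disk layout):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_de {		/* simplified dirent header */
	uint64_t inode;
	uint16_t rec_len;	/* distance to the next record */
	uint8_t  name_len;
};

static int count_used(char *buf, unsigned int size)
{
	int count = 0;
	char *p = buf, *limit = buf + size;

	while (p < limit) {
		struct fake_de *de = (struct fake_de *)p;

		if (de->name_len && de->inode)	/* skip unused slots */
			count++;
		p += de->rec_len;		/* rec_len chains the records */
	}
	return count;
}

int main(void)
{
	_Alignas(uint64_t) char buf[64];
	struct fake_de *de = (struct fake_de *)buf;

	memset(buf, 0, sizeof(buf));
	de->inode = 5; de->rec_len = 16; de->name_len = 1;	/* "."  */
	de = (struct fake_de *)(buf + 16);
	de->inode = 2; de->rec_len = 48; de->name_len = 2;	/* ".." */

	printf("%d entries in use\n", count_used(buf, sizeof(buf)));
	return 0;
}
```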
1354 2809
1355/* 2810/*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1358 * expansion from an inline directory to one with extents. The first dir block 2813 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block. 2814 * in that case is taken from the inline data portion of the inode block.
1360 * 2815 *
2816 * This will also return the largest amount of contiguous space for a dirent
 2817 * in the block. That space is *not* necessarily in the last dirent, even after
2818 * expansion. The directory indexing code wants this value for free space
2819 * accounting. We do this here since we're already walking the entire dir
2820 * block.
2821 *
1361 * We add the dir trailer if this filesystem wants it. 2822 * We add the dir trailer if this filesystem wants it.
1362 */ 2823 */
1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2824static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1364 struct super_block *sb) 2825 struct inode *dir)
1365{ 2826{
2827 struct super_block *sb = dir->i_sb;
1366 struct ocfs2_dir_entry *de; 2828 struct ocfs2_dir_entry *de;
1367 struct ocfs2_dir_entry *prev_de; 2829 struct ocfs2_dir_entry *prev_de;
1368 char *de_buf, *limit; 2830 char *de_buf, *limit;
1369 unsigned int new_size = sb->s_blocksize; 2831 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes; 2832 unsigned int bytes, this_hole;
2833 unsigned int largest_hole = 0;
1371 2834
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 2835 if (ocfs2_new_dir_wants_trailer(dir))
1373 new_size = ocfs2_dir_trailer_blk_off(sb); 2836 new_size = ocfs2_dir_trailer_blk_off(sb);
1374 2837
1375 bytes = new_size - old_size; 2838 bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1378 de_buf = start; 2841 de_buf = start;
1379 de = (struct ocfs2_dir_entry *)de_buf; 2842 de = (struct ocfs2_dir_entry *)de_buf;
1380 do { 2843 do {
2844 this_hole = ocfs2_figure_dirent_hole(de);
2845 if (this_hole > largest_hole)
2846 largest_hole = this_hole;
2847
1381 prev_de = de; 2848 prev_de = de;
1382 de_buf += le16_to_cpu(de->rec_len); 2849 de_buf += le16_to_cpu(de->rec_len);
1383 de = (struct ocfs2_dir_entry *)de_buf; 2850 de = (struct ocfs2_dir_entry *)de_buf;
1384 } while (de_buf < limit); 2851 } while (de_buf < limit);
1385 2852
1386 le16_add_cpu(&prev_de->rec_len, bytes); 2853 le16_add_cpu(&prev_de->rec_len, bytes);
2854
 2855 /* We need to double-check this after modification of the final
2856 * dirent. */
2857 this_hole = ocfs2_figure_dirent_hole(prev_de);
2858 if (this_hole > largest_hole)
2859 largest_hole = this_hole;
2860
2861 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2862 return largest_hole;
2863 return 0;
1387} 2864}
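ocfs2_figure_dirent_hole() is not part of this hunk; from the way it is used here, the idea appears to be that an unused record contributes its entire rec_len while a live record contributes only the slack left after its padded name. A sketch under that assumption:

```c
#include <stdint.h>

#define DIR_REC_LEN(n)	(((n) + 12 + 3) & ~3)	/* illustrative, as above */

/* Contiguous space reusable within one record: the whole record if
 * the slot is free, otherwise whatever the current name leaves over. */
static unsigned int dirent_hole(uint64_t inode, uint16_t rec_len,
				uint8_t name_len)
{
	if (inode == 0)
		return rec_len;
	return rec_len - DIR_REC_LEN(name_len);
}
```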
1388 2865
1389/* 2866/*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1396 */ 2873 */
1397static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2874static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1398 unsigned int blocks_wanted, 2875 unsigned int blocks_wanted,
2876 struct ocfs2_dir_lookup_result *lookup,
1399 struct buffer_head **first_block_bh) 2877 struct buffer_head **first_block_bh)
1400{ 2878{
1401 u32 alloc, bit_off, len; 2879 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
1402 struct super_block *sb = dir->i_sb; 2880 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2881 int ret, i, num_dx_leaves = 0, dx_inline = 0,
1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2882 credits = ocfs2_inline_to_extents_credits(sb);
2883 u64 dx_insert_blkno, blkno,
2884 bytes = blocks_wanted << sb->s_blocksize_bits;
1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2885 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1406 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2886 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1407 struct ocfs2_alloc_context *data_ac; 2887 struct ocfs2_alloc_context *data_ac;
2888 struct ocfs2_alloc_context *meta_ac = NULL;
1408 struct buffer_head *dirdata_bh = NULL; 2889 struct buffer_head *dirdata_bh = NULL;
2890 struct buffer_head *dx_root_bh = NULL;
2891 struct buffer_head **dx_leaves = NULL;
1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2892 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1410 handle_t *handle; 2893 handle_t *handle;
1411 struct ocfs2_extent_tree et; 2894 struct ocfs2_extent_tree et;
1412 int did_quota = 0; 2895 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0;
1413 2897
1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1415 2899
1416 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2900 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0;
2902
2903 if (ocfs2_supports_indexed_dirs(osb)) {
2904 credits += ocfs2_add_dir_index_credits(sb);
2905
2906 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2907 if (!dx_inline) {
2908 /* Add one more cluster for an index leaf */
2909 dx_alloc++;
2910 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2911 &num_dx_leaves);
2912 if (!dx_leaves) {
2913 ret = -ENOMEM;
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917 }
2918
2919 /* This gets us the dx_root */
2920 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2921 if (ret) {
2922 mlog_errno(ret);
2923 goto out;
2924 }
2925 }
1417 2926
1418 /* 2927 /*
1419 * We should never need more than 2 clusters for this - 2928 * We should never need more than 2 clusters for the unindexed
1420 * maximum dirent size is far less than one block. In fact, 2929 * tree - maximum dirent size is far less than one block. In
1421 * the only time we'd need more than one cluster is if 2930 * fact, the only time we'd need more than one cluster is if
1422 * blocksize == clustersize and the dirent won't fit in the 2931 * blocksize == clustersize and the dirent won't fit in the
1423 * extra space that the expansion to a single block gives. As 2932 * extra space that the expansion to a single block gives. As
1424 * of today, that only happens on 4k/4k file systems. 2933 * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1435 2944
1436 /* 2945 /*
1437 * Prepare for worst case allocation scenario of two separate 2946 * Prepare for worst case allocation scenario of two separate
1438 * extents. 2947 * extents in the unindexed tree.
1439 */ 2948 */
1440 if (alloc == 2) 2949 if (alloc == 2)
1441 credits += OCFS2_SUBALLOC_ALLOC; 2950 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1448 } 2957 }
1449 2958
1450 if (vfs_dq_alloc_space_nodirty(dir, 2959 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2960 ocfs2_clusters_to_bytes(osb->sb,
2961 alloc + dx_alloc))) {
1452 ret = -EDQUOT; 2962 ret = -EDQUOT;
1453 goto out_commit; 2963 goto out_commit;
1454 } 2964 }
1455 did_quota = 1; 2965 did_quota = 1;
2966
2967 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2968 /*
2969 * Allocate our index cluster first, to maximize the
2970 * possibility that unindexed leaves grow
2971 * contiguously.
2972 */
2973 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2974 dx_leaves, num_dx_leaves,
2975 &dx_insert_blkno);
2976 if (ret) {
2977 mlog_errno(ret);
2978 goto out_commit;
2979 }
2980 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2981 }
2982
1456 /* 2983 /*
1457 * Try to claim as many clusters as the bitmap can give though 2984 * Try to claim as many clusters as the bitmap can give though
1458 * if we only get one now, that's enough to continue. The rest 2985 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1463 mlog_errno(ret); 2990 mlog_errno(ret);
1464 goto out_commit; 2991 goto out_commit;
1465 } 2992 }
2993 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1466 2994
1467 /* 2995 /*
1468 * Operations are carefully ordered so that we set up the new 2996 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 3017 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1490 memset(dirdata_bh->b_data + i_size_read(dir), 0, 3018 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1491 sb->s_blocksize - i_size_read(dir)); 3019 sb->s_blocksize - i_size_read(dir));
1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); 3020 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
1493 if (ocfs2_supports_dir_trailer(osb)) 3021 if (ocfs2_new_dir_wants_trailer(dir)) {
1494 ocfs2_init_dir_trailer(dir, dirdata_bh); 3022 /*
3023 * Prepare the dir trailer up front. It will otherwise look
3024 * like a valid dirent. Even if inserting the index fails
 3025 * (unlikely), all we'll have done is give the first dir
3026 * block a small amount of fragmentation.
3027 */
3028 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3029 }
1495 3030
1496 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3031 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1497 if (ret) { 3032 if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1499 goto out_commit; 3034 goto out_commit;
1500 } 3035 }
1501 3036
3037 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3038 /*
3039 * Dx dirs with an external cluster need to do this up
 3040 * front. Inline dx roots get handled later, after
3041 * we've allocated our root block. We get passed back
3042 * a total number of items so that dr_num_entries can
3043 * be correctly set once the dx_root has been
3044 * allocated.
3045 */
3046 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
3047 num_dx_leaves, &num_dx_entries,
3048 dirdata_bh);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out_commit;
3052 }
3053 }
3054
1502 /* 3055 /*
1503 * Set extent, i_size, etc on the directory. After this, the 3056 * Set extent, i_size, etc on the directory. After this, the
1504 * inode should contain the same exact dirents as before and 3057 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1551 goto out_commit; 3104 goto out_commit;
1552 } 3105 }
1553 3106
3107 if (ocfs2_supports_indexed_dirs(osb)) {
3108 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3109 dirdata_bh, meta_ac, dx_inline,
3110 num_dx_entries, &dx_root_bh);
3111 if (ret) {
3112 mlog_errno(ret);
3113 goto out_commit;
3114 }
3115
3116 if (dx_inline) {
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh);
3119 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL);
3123 if (ret)
3124 mlog_errno(ret);
3125 }
3126 }
3127
1554 /* 3128 /*
1555 * We asked for two clusters, but only got one in the 1st 3129 * We asked for two clusters, but only got one in the 1st
1556 * pass. Claim the 2nd cluster as a separate extent. 3130 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1570 mlog_errno(ret); 3144 mlog_errno(ret);
1571 goto out_commit; 3145 goto out_commit;
1572 } 3146 }
3147 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1573 } 3148 }
1574 3149
1575 *first_block_bh = dirdata_bh; 3150 *first_block_bh = dirdata_bh;
1576 dirdata_bh = NULL; 3151 dirdata_bh = NULL;
3152 if (ocfs2_supports_indexed_dirs(osb)) {
3153 unsigned int off;
3154
3155 if (!dx_inline) {
3156 /*
3157 * We need to return the correct block within the
3158 * cluster which should hold our entry.
3159 */
3160 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
3161 &lookup->dl_hinfo);
3162 get_bh(dx_leaves[off]);
3163 lookup->dl_dx_leaf_bh = dx_leaves[off];
3164 }
3165 lookup->dl_dx_root_bh = dx_root_bh;
3166 dx_root_bh = NULL;
3167 }
1577 3168
1578out_commit: 3169out_commit:
1579 if (ret < 0 && did_quota) 3170 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir, 3171 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1581 ocfs2_clusters_to_bytes(osb->sb, 2)); 3172
1582 ocfs2_commit_trans(osb, handle); 3173 ocfs2_commit_trans(osb, handle);
1583 3174
1584out_sem: 3175out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
1587out: 3178out:
1588 if (data_ac) 3179 if (data_ac)
1589 ocfs2_free_alloc_context(data_ac); 3180 ocfs2_free_alloc_context(data_ac);
3181 if (meta_ac)
3182 ocfs2_free_alloc_context(meta_ac);
3183
3184 if (dx_leaves) {
3185 for (i = 0; i < num_dx_leaves; i++)
3186 brelse(dx_leaves[i]);
3187 kfree(dx_leaves);
3188 }
1590 3189
1591 brelse(dirdata_bh); 3190 brelse(dirdata_bh);
3191 brelse(dx_root_bh);
1592 3192
1593 return ret; 3193 return ret;
1594} 3194}
@@ -1658,11 +3258,14 @@ bail:
1658 * is to be turned into an extent based one. The size of the dirent to 3258 * is to be turned into an extent based one. The size of the dirent to
1659 * insert might be larger than the space gained by growing to just one 3259 * insert might be larger than the space gained by growing to just one
1660 * block, so we may have to grow the inode by two blocks in that case. 3260 * block, so we may have to grow the inode by two blocks in that case.
3261 *
3262 * If the directory is already indexed, dx_root_bh must be provided.
1661 */ 3263 */
1662static int ocfs2_extend_dir(struct ocfs2_super *osb, 3264static int ocfs2_extend_dir(struct ocfs2_super *osb,
1663 struct inode *dir, 3265 struct inode *dir,
1664 struct buffer_head *parent_fe_bh, 3266 struct buffer_head *parent_fe_bh,
1665 unsigned int blocks_wanted, 3267 unsigned int blocks_wanted,
3268 struct ocfs2_dir_lookup_result *lookup,
1666 struct buffer_head **new_de_bh) 3269 struct buffer_head **new_de_bh)
1667{ 3270{
1668 int status = 0; 3271 int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1677 struct ocfs2_dir_entry * de; 3280 struct ocfs2_dir_entry * de;
1678 struct super_block *sb = osb->sb; 3281 struct super_block *sb = osb->sb;
1679 struct ocfs2_extent_tree et; 3282 struct ocfs2_extent_tree et;
3283 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1680 3284
1681 mlog_entry_void(); 3285 mlog_entry_void();
1682 3286
1683 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3287 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3288 /*
3289 * This would be a code error as an inline directory should
3290 * never have an index root.
3291 */
3292 BUG_ON(dx_root_bh);
3293
1684 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3294 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1685 blocks_wanted, &new_bh); 3295 blocks_wanted, lookup,
3296 &new_bh);
1686 if (status) { 3297 if (status) {
1687 mlog_errno(status); 3298 mlog_errno(status);
1688 goto bail; 3299 goto bail;
1689 } 3300 }
1690 3301
3302 /* Expansion from inline to an indexed directory will
3303 * have given us this. */
3304 dx_root_bh = lookup->dl_dx_root_bh;
3305
1691 if (blocks_wanted == 1) { 3306 if (blocks_wanted == 1) {
1692 /* 3307 /*
1693 * If the new dirent will fit inside the space 3308 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1751 } 3366 }
1752 3367
1753do_extend: 3368do_extend:
3369 if (ocfs2_dir_indexed(dir))
3370 credits++; /* For attaching the new dirent block to the
3371 * dx_root */
3372
1754 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3373 down_write(&OCFS2_I(dir)->ip_alloc_sem);
1755 drop_alloc_sem = 1; 3374 drop_alloc_sem = 1;
1756 3375
@@ -1781,9 +3400,19 @@ do_extend:
1781 3400
1782 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3401 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1783 de->inode = 0; 3402 de->inode = 0;
1784 if (ocfs2_dir_has_trailer(dir)) { 3403 if (ocfs2_supports_dir_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3404 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh); 3405
3406 ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3407
3408 if (ocfs2_dir_indexed(dir)) {
3409 status = ocfs2_dx_dir_link_trailer(dir, handle,
3410 dx_root_bh, new_bh);
3411 if (status) {
3412 mlog_errno(status);
3413 goto bail;
3414 }
3415 }
1787 } else { 3416 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize); 3417 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 } 3418 }
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1839 * This calculates how many free bytes we'd have in block zero, should 3468 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree. 3469 * this function force expansion to an extent tree.
1841 */ 3470 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 3471 if (ocfs2_new_dir_wants_trailer(dir))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3472 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else 3473 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir); 3474 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
1970 return status; 3599 return status;
1971} 3600}
1972 3601
3602static int dx_leaf_sort_cmp(const void *a, const void *b)
3603{
3604 const struct ocfs2_dx_entry *entry1 = a;
3605 const struct ocfs2_dx_entry *entry2 = b;
3606 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3607 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3608 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3609 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3610
3611 if (major_hash1 > major_hash2)
3612 return 1;
3613 if (major_hash1 < major_hash2)
3614 return -1;
3615
3616 /*
3617 * It is not strictly necessary to sort by minor
3618 */
3619 if (minor_hash1 > minor_hash2)
3620 return 1;
3621 if (minor_hash1 < minor_hash2)
3622 return -1;
3623 return 0;
3624}
3625
3626static void dx_leaf_sort_swap(void *a, void *b, int size)
3627{
3628 struct ocfs2_dx_entry *entry1 = a;
3629 struct ocfs2_dx_entry *entry2 = b;
3630 struct ocfs2_dx_entry tmp;
3631
3632 BUG_ON(size != sizeof(*entry1));
3633
3634 tmp = *entry1;
3635 *entry1 = *entry2;
3636 *entry2 = tmp;
3637}
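For comparison, the same ordering expressed for userspace: libc qsort() needs only the comparator, whereas the kernel's sort() also accepts the explicit swap callback defined above. A minimal sketch on host-endian stand-ins:

```c
#include <stdint.h>
#include <stdlib.h>

struct dx_entry_host {		/* host-endian stand-in for ocfs2_dx_entry */
	uint32_t major_hash;
	uint32_t minor_hash;
};

static int dx_cmp(const void *a, const void *b)
{
	const struct dx_entry_host *e1 = a, *e2 = b;

	if (e1->major_hash != e2->major_hash)
		return e1->major_hash < e2->major_hash ? -1 : 1;
	if (e1->minor_hash != e2->minor_hash)
		return e1->minor_hash < e2->minor_hash ? -1 : 1;
	return 0;
}

/* usage: qsort(entries, num_used, sizeof(entries[0]), dx_cmp); */
```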
3638
3639static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3640{
3641 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3642 int i, num = le16_to_cpu(dl_list->de_num_used);
3643
3644 for (i = 0; i < (num - 1); i++) {
3645 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3646 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3647 return 0;
3648 }
3649
3650 return 1;
3651}
3652
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
 3672 * There are a couple of rare but nasty corner cases we have to
 3673 * check for here. All of them involve a leaf where all values
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
 3713 * val cannot be the same as insert_hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guaranteed that not all records in this block are the same,
 3730 * we simply travel forward from the median and pick the 1st
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
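The decision above reduces to a small pure function over the sorted major hashes. A userspace model with one worked case (values hypothetical): a leaf at cpos 5 whose entries all hash to 5, with an insert hash of 9, splits at 6 so the new entry lands in the new cluster.

```c
#include <assert.h>

/* Returns -1 for the no-space case, else picks *split such that
 * entries with hash >= *split move to the new cluster. */
static int find_split(const unsigned int *h, int n, unsigned int leaf_cpos,
		      unsigned int insert_hash, unsigned int *split)
{
	int i, allsame = 1;

	for (i = 0; i < n - 1; i++)
		if (h[i] != h[i + 1])
			allsame = 0;

	if (allsame) {
		if (h[0] == insert_hash)
			return -1;		/* splitting can't make room */
		if (h[0] == leaf_cpos) {
			*split = leaf_cpos + 1;	/* insert_hash must be larger */
			return 0;
		}
		if (h[0] > insert_hash) {
			*split = h[0];
			return 0;
		}
		*split = insert_hash;
		return 0;
	}

	for (i = n / 2; i < n; i++)	/* first value past the median */
		if (h[i] > leaf_cpos)
			break;
	assert(i < n);			/* impossible for sorted, mixed input */
	*split = h[i];
	return 0;
}

int main(void)
{
	unsigned int all5[4] = { 5, 5, 5, 5 }, split;

	assert(find_split(all5, 4, 5, 9, &split) == 0 && split == 6);
	return 0;
}
```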
3742
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
 3748 * Since the block offset inside a cluster is a constant mask
 3749 * of minor_hash, we can optimize: an item at block offset X within
 3750 * the original cluster will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
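The constant-offset claim in the comment above follows from how a name is mapped to a block within its cluster. A sketch of that mapping, assuming (as the real helper appears to) a power-of-two number of leaf blocks per cluster:

```c
/* With N leaf blocks per cluster, N a power of two, the block index
 * depends only on the minor hash, never on which cluster holds the
 * leaf -- so an entry keeps its offset when it moves clusters. */
static inline unsigned int dx_dir_hash_idx(unsigned int blocks_per_cluster,
					   unsigned int minor_hash)
{
	return minor_hash & (blocks_per_cluster - 1);
}
```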
3795
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
 3845 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance non-full leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928 * won't happen if there's an error before that. Once the
3929 * insert is done then, we can transfer from one leaf into the
3930 * other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
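Structurally, the loop above is a singly linked list walk, threaded through the dirent-block trailers on disk, that also remembers the previous block so the caller can unlink the found block cheaply later. An in-memory sketch of the same shape:

```c
#include <stddef.h>

struct free_blk {			/* stand-in for a block trailer */
	unsigned int free_rec_len;	/* largest hole in the block */
	struct free_blk *next;
};

/* Returns the first block that can host rec_len bytes, or NULL
 * (the -ENOSPC case). *prev_ret is NULL when the head matched. */
static struct free_blk *find_space(struct free_blk *head, unsigned int rec_len,
				   struct free_blk **prev_ret)
{
	struct free_blk *prev = NULL, *n;

	for (n = head; n; prev = n, n = n->next)
		if (rec_len <= n->free_rec_len)
			break;

	*prev_ret = prev;
	return n;
}
```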
4121
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166 * failure to add the dx_root_bh to the journal won't result
 4167 * in us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real error
 4302 * and we cannot continue.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. Success returns zero, and enough context in the dir
 4353 * lookup result that ocfs2_add_entry() will be able to complete the task
4354 * with minimal performance impact.
4355 */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh)
+				 struct ocfs2_dir_lookup_result *lookup)
 {
 	int ret;
 	unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
 	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	*ret_de_bh = NULL;
-
 	if (!namelen) {
 		ret = -EINVAL;
 		mlog_errno(ret);
 		goto out;
 	}
 
+	/*
+	 * Do this up front to reduce confusion.
+	 *
+	 * The directory might start inline, then be turned into an
+	 * indexed one, in which case we'd need to hash deep inside
+	 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
 					      namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(bh);
 
 		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-				       &bh);
+				       lookup, &bh);
 		if (ret) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(!bh);
 	}
 
-	*ret_de_bh = bh;
+	lookup->dl_leaf_bh = bh;
 	bh = NULL;
 out:
 	brelse(bh);
 	return ret;
 }
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dx_root_bh)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+	struct inode *dx_alloc_inode = NULL;
+	struct buffer_head *dx_alloc_bh = NULL;
+	handle_t *handle;
+	u64 blk;
+	u16 bit;
+	u64 bg_blkno;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(dx_root->dr_suballoc_slot));
+	if (!dx_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&dx_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+	di->i_dx_root = cpu_to_le64(0ULL);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	blk = le64_to_cpu(dx_root->dr_blkno);
+	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+	mutex_unlock(&dx_alloc_inode->i_mutex);
+	brelse(dx_alloc_bh);
+out:
+	iput(dx_alloc_inode);
+	return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+	int ret;
+	unsigned int uninitialized_var(clen);
+	u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!ocfs2_dir_indexed(dir))
+		return 0;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (ocfs2_dx_root_inline(dx_root))
+		goto remove_index;
+
+	ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+	/* XXX: What if dr_clusters is too large? */
+	while (le32_to_cpu(dx_root->dr_clusters)) {
+		ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+					      major_hash, &cpos, &blkno, &clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
+					       &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (cpos == 0)
+			break;
+
+		major_hash = cpos - 1;
+	}
+
+remove_index:
+	ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(dir, dx_root_bh);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	brelse(dx_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head	*dl_leaf_bh;	/* Unindexed leaf
+						 * block */
+	struct ocfs2_dir_entry	*dl_entry;	/* Target dirent in
+						 * unindexed leaf */
+
+	struct buffer_head	*dl_dx_root_bh;	/* Root of indexed
+						 * tree */
+
+	struct buffer_head	*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry	*dl_dx_entry;	/* Target dx_entry in
+						 * indexed leaf */
+	struct ocfs2_dx_hinfo	dl_hinfo;	/* Name hash results */
+
+	struct buffer_head	*dl_prev_leaf_bh;/* Previous entry in
+						  * dir free space
+						  * list. NULL if
+						  * previous entry is
+						  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh);
+		       struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh);
+		      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
+				  struct ocfs2_dir_lookup_result *lookup)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
 				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
+				 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent);
+			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh);
+				 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
 		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 							    void *data);
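
A caller-side sketch of the lookup-result API above (illustrative only, not part of the patch; the journal handle, allocation reservations, and error handling are elided, and blkno stands for the new child's inode block number):

	struct ocfs2_dir_lookup_result lookup = { NULL, };

	/* Reserve space in both trees; fills lookup with the target blocks. */
	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
					      dentry->d_name.name,
					      dentry->d_name.len, &lookup);
	if (status == 0)
		/* Consumes the context gathered above to write the dirent. */
		status = ocfs2_add_entry(handle, dentry, inode, blkno,
					 parent_fe_bh, &lookup);

	/* Drops the held buffer heads (dl_leaf_bh, dl_dx_root_bh, ...). */
	ocfs2_free_dir_lookup_result(&lookup);
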
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
 	DLM_MLE_BLOCK,
 	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
+	DLM_MLE_MIGRATION,
+	DLM_MLE_NUM_TYPES
 };
 
 struct dlm_master_list_entry {
-	struct list_head list;
+	struct hlist_node master_hash_node;
 	struct list_head hb_events;
 	struct dlm_ctxt *dlm;
 	spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
 	enum dlm_mle_type type;
 	struct o2hb_callback_func mle_hb_up;
 	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
 };
 
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
-	struct list_head master_list;
+	struct hlist_head **master_hash;
 	struct list_head mle_hb_events;
 
 	/* these give a really vague idea of the system load */
-	atomic_t local_resources;
-	atomic_t remote_resources;
-	atomic_t unknown_resources;
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;
 
 	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
 	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
 
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len);
 
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res,
-			      u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 						 const char *lockid,
 						 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 				    DLM_LOCK_RES_MIGRATING));
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
 	return bit;
 }
 
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}
 
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}
 
 #endif /* DLMCOMMON_H */
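
With this change the master list entries are looked up the same way as lock resources: hash the name, pick a bucket, walk it. A sketch of that resolution (it mirrors dlm_find_mle() in dlmmaster.c below; dlm_mle_equal() and dlm_get_mle() are static there, so this is illustrative rather than a usable external API):

	struct dlm_master_list_entry *mle;
	struct hlist_node *node;
	unsigned int hash = dlm_lockid_hash(name, namelen);
	struct hlist_head *bucket = dlm_master_hash(dlm, hash);

	assert_spin_locked(&dlm->master_lock);
	hlist_for_each(node, bucket) {
		mle = hlist_entry(node, struct dlm_master_list_entry,
				  master_hash_node);
		if (dlm_mle_equal(dlm, mle, name, namelen)) {
			dlm_get_mle(mle);	/* pin before master_lock drops */
			break;
		}
	}
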
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
 	int out = 0;
-	unsigned int namelen;
-	const char *name;
 	char *mle_type;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
 	if (mle->type == DLM_MLE_BLOCK)
 		mle_type = "BLK";
 	else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 	else
 		mle_type = "MIG";
 
-	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
 	out += snprintf(buf + out, len - out,
 			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
 			mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
-	int out = 0;
-	unsigned long total = 0;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bktcnt = 0;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
 
 	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list) {
-		++total;
-		if (db->len - out < 200)
-			continue;
-		out += dump_mle(mle, db->buf + out, db->len - out);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			++total;
+			++bktcnt;
+			if (db->len - out < 200)
+				continue;
+			out += dump_mle(mle, db->buf + out, db->len - out);
		}
+		longest = max(longest, bktcnt);
+		bktcnt = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(db->buf + out, db->len - out,
-			"Total on list: %ld\n", total);
+			"Total: %ld, Longest: %ld\n", total, longest);
 	return out;
 }
 
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	int out = 0;
 	struct dlm_reco_node_data *node;
 	char *state;
-	int lres, rres, ures, tres;
-
-	lres = atomic_read(&dlm->local_resources);
-	rres = atomic_read(&dlm->remote_resources);
-	ures = atomic_read(&dlm->unknown_resources);
-	tres = lres + rres + ures;
+	int cur_mles = 0, tot_mles = 0;
+	int i;
 
 	spin_lock(&dlm->spinlock);
 
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 			     db->buf + out, db->len - out);
 	out += snprintf(db->buf + out, db->len - out, "\n");
 
-	/* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/* Blocking: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/* Mastery: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/* Migration: xxx (xxx) */
 	out += snprintf(db->buf + out, db->len - out,
-			"Mastered Resources Total: %d Locally: %d "
-			"Remotely: %d Unknown: %d\n",
-			tres, lres, rres, ures);
+			"  Migration: %d (%d)\n",
			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
 
 	/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
 	out += snprintf(db->buf + out, db->len - out,
 			"Lists: Dirty=%s Purge=%s PendingASTs=%s "
-			"PendingBASTs=%s Master=%s\n",
+			"PendingBASTs=%s\n",
 			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
 
 	/* Purge Count: xxx Refs: xxx */
 	out += snprintf(db->buf + out, db->len - out,
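
The counters read as current (cumulative-since-mount). With the format strings above, the new block of the debugfs state file looks like this (values are illustrative, not from a real run):

	Lock Resources: 32 (1024)
	MLEs: 2 (540)
	  Blocking: 1 (500)
	  Mastery: 1 (35)
	  Migration: 0 (5)
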
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
 	if (dlm->name)
 		kfree(dlm->name);
 
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		mlog_errno(-ENOMEM);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
 	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
 	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 		kfree(dlm->name);
 		kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	init_waitqueue_head(&dlm->reco.event);
 	init_waitqueue_head(&dlm->ast_wq);
 	init_waitqueue_head(&dlm->migration_wq);
-	INIT_LIST_HEAD(&dlm->master_list);
 	INIT_LIST_HEAD(&dlm->mle_hb_events);
 
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	atomic_set(&dlm->local_resources, 0);
-	atomic_set(&dlm->remote_resources, 0);
-	atomic_set(&dlm->unknown_resources, 0);
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}
 
 	spin_lock_init(&dlm->work_lock);
 	INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 				const char *name,
 				unsigned int namelen)
 {
-	struct dlm_lock_resource *res;
-
 	if (dlm != mle->dlm)
 		return 0;
 
-	if (mle->type == DLM_MLE_BLOCK ||
-	    mle->type == DLM_MLE_MIGRATION) {
-		if (namelen != mle->u.name.len ||
-		    memcmp(name, mle->u.name.name, namelen)!=0)
-			return 0;
-	} else {
-		res = mle->u.res;
-		if (namelen != res->lockname.len ||
-		    memcmp(res->lockname.name, name, namelen) != 0)
-			return 0;
-	}
+	if (namelen != mle->mnamelen ||
+	    memcmp(name, mle->mname, namelen) != 0)
+		return 0;
+
 	return 1;
 }
 
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 
 	mle->dlm = dlm;
 	mle->type = type;
-	INIT_LIST_HEAD(&mle->list);
+	INIT_HLIST_NODE(&mle->master_hash_node);
 	INIT_LIST_HEAD(&mle->hb_events);
 	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
 	spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	mle->new_master = O2NM_MAX_NODES;
 	mle->inuse = 0;
 
+	BUG_ON(mle->type != DLM_MLE_BLOCK &&
+	       mle->type != DLM_MLE_MASTER &&
+	       mle->type != DLM_MLE_MIGRATION);
+
 	if (mle->type == DLM_MLE_MASTER) {
 		BUG_ON(!res);
-		mle->u.res = res;
-	} else if (mle->type == DLM_MLE_BLOCK) {
-		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
-	} else /* DLM_MLE_MIGRATION */ {
+		mle->mleres = res;
+		memcpy(mle->mname, res->lockname.name, res->lockname.len);
+		mle->mnamelen = res->lockname.len;
+		mle->mnamehash = res->lockname.hash;
+	} else {
 		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
+		mle->mleres = NULL;
+		memcpy(mle->mname, name, namelen);
+		mle->mnamelen = namelen;
+		mle->mnamehash = dlm_lockid_hash(name, namelen);
 	}
 
+	atomic_inc(&dlm->mle_tot_count[mle->type]);
+	atomic_inc(&dlm->mle_cur_count[mle->type]);
+
 	/* copy off the node_map and register hb callbacks on our copy */
 	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
 	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	__dlm_mle_attach_hb_events(dlm, mle);
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	if (!hlist_unhashed(&mle->master_hash_node))
+		hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	struct hlist_head *bucket;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	bucket = dlm_master_hash(dlm, mle->mnamehash);
+	hlist_add_head(&mle->master_hash_node, bucket);
+}
 
 /* returns 1 if found, 0 if not */
 static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 			char *name, unsigned int namelen)
 {
 	struct dlm_master_list_entry *tmpmle;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int hash;
 
 	assert_spin_locked(&dlm->master_lock);
 
-	list_for_each_entry(tmpmle, &dlm->master_list, list) {
+	hash = dlm_lockid_hash(name, namelen);
+	bucket = dlm_master_hash(dlm, hash);
+	hlist_for_each(list, bucket) {
+		tmpmle = hlist_entry(list, struct dlm_master_list_entry,
+				     master_hash_node);
 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 			continue;
 		dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 	dlm = mle->dlm;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.name.len, mle->u.name.name, mle->type);
-	} else {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.res->lockname.len,
-		     mle->u.res->lockname.name, mle->type);
-	}
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
 
+	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+	     mle->type);
+
 	/* remove from list if not already */
-	if (!list_empty(&mle->list))
-		list_del_init(&mle->list);
+	__dlm_unlink_mle(dlm, mle);
 
 	/* detach the mle from the domain node up/down events */
 	__dlm_mle_detach_hb_events(dlm, mle);
 
+	atomic_dec(&dlm->mle_cur_count[mle->type]);
+
 	/* NOTE: kfree under spinlock here.
 	 * if this is bad, we can move this to a freelist. */
 	kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
 	kmem_cache_destroy(dlm_lockres_cache);
 }
 
-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
-				  struct dlm_lock_resource *res,
-				  u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
-
-	if (owner == dlm->node_num)
-		atomic_inc(&dlm->local_resources);
-	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_inc(&dlm->unknown_resources);
-	else
-		atomic_inc(&dlm->remote_resources);
-
-	res->owner = owner;
-}
-
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res, u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	if (owner == res->owner)
-		return;
-
-	if (res->owner == dlm->node_num)
-		atomic_dec(&dlm->local_resources);
-	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_dec(&dlm->unknown_resources);
-	else
-		atomic_dec(&dlm->remote_resources);
-
-	dlm_set_lockres_owner(dlm, res, owner);
-}
-
-
 static void dlm_lockres_release(struct kref *kref)
 {
 	struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
 	}
 	spin_unlock(&dlm->track_lock);
 
+	atomic_dec(&dlm->res_cur_count);
+
 	dlm_put(dlm);
 
 	if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	kref_init(&res->refs);
 
+	atomic_inc(&dlm->res_tot_count);
+	atomic_inc(&dlm->res_cur_count);
+
 	/* just for consistency */
 	spin_lock(&res->spinlock);
 	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
 		alloc_mle = NULL;
 		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
 		set_bit(dlm->node_num, mle->maybe_map);
-		list_add(&mle->list, &dlm->master_list);
+		__dlm_insert_mle(dlm, mle);
 
 		/* still holding the dlm spinlock, check the recovery map
 		 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 					  res->lockname.len,
 					  res->lockname.name);
 				mle->type = DLM_MLE_MASTER;
-				mle->u.res = res;
+				mle->mleres = res;
 			}
 		}
 	}
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
 
 	BUG_ON(mle->type == DLM_MLE_MIGRATION);
 
-	if (mle->type != DLM_MLE_MASTER) {
-		request.namelen = mle->u.name.len;
-		memcpy(request.name, mle->u.name.name, request.namelen);
-	} else {
-		request.namelen = mle->u.res->lockname.len;
-		memcpy(request.name, mle->u.res->lockname.name,
-		       request.namelen);
-	}
+	request.namelen = (u8)mle->mnamelen;
+	memcpy(request.name, mle->mname, request.namelen);
 
 again:
 	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
 			// "add the block.\n");
 			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
 			set_bit(request->node_idx, mle->maybe_map);
-			list_add(&mle->list, &dlm->master_list);
+			__dlm_insert_mle(dlm, mle);
 			response = DLM_MASTER_RESP_NO;
 		} else {
 			// mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
 			     assert->node_idx, rr, extra_ref, mle->inuse);
 			dlm_print_one_mle(mle);
 		}
-		list_del_init(&mle->list);
+		__dlm_unlink_mle(dlm, mle);
 		__dlm_mle_detach_hb_events(dlm, mle);
 		__dlm_put_mle(mle);
 		if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 		tmp->master = master;
 		atomic_set(&tmp->woken, 1);
 		wake_up(&tmp->wq);
-		/* remove it from the list so that only one
-		 * mle will be found */
-		list_del_init(&tmp->list);
-		/* this was obviously WRONG.  mle is uninited here.  should be tmp. */
+		/* remove it so that only one mle will be found */
+		__dlm_unlink_mle(dlm, tmp);
 		__dlm_mle_detach_hb_events(dlm, tmp);
 		ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
 		mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 	mle->master = master;
 	/* do this for consistency with other mle types */
 	set_bit(new_master, mle->maybe_map);
-	list_add(&mle->list, &dlm->master_list);
+	__dlm_insert_mle(dlm, mle);
 
 	return ret;
 }
 
-
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
-{
-	struct dlm_master_list_entry *mle, *next;
-	struct dlm_lock_resource *res;
-	unsigned int hash;
-
-	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-top:
-	assert_spin_locked(&dlm->spinlock);
-
-	/* clean the master list */
-	spin_lock(&dlm->master_lock);
-	list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
-		BUG_ON(mle->type != DLM_MLE_BLOCK &&
-		       mle->type != DLM_MLE_MASTER &&
-		       mle->type != DLM_MLE_MIGRATION);
-
-		/* MASTER mles are initiated locally.  the waiting
-		 * process will notice the node map change
-		 * shortly.  let that happen as normal. */
-		if (mle->type == DLM_MLE_MASTER)
-			continue;
-
-
-		/* BLOCK mles are initiated by other nodes.
-		 * need to clean up if the dead node would have
-		 * been the master. */
-		if (mle->type == DLM_MLE_BLOCK) {
-			int bit;
-
-			spin_lock(&mle->spinlock);
-			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
-			if (bit != dead_node) {
-				mlog(0, "mle found, but dead node %u would "
-				     "not have been master\n", dead_node);
-				spin_unlock(&mle->spinlock);
-			} else {
-				/* must drop the refcount by one since the
-				 * assert_master will never arrive.  this
-				 * may result in the mle being unlinked and
-				 * freed, but there may still be a process
-				 * waiting in the dlmlock path which is fine. */
-				mlog(0, "node %u was expected master\n",
-				     dead_node);
-				atomic_set(&mle->woken, 1);
-				spin_unlock(&mle->spinlock);
-				wake_up(&mle->wq);
-				/* do not need events any longer, so detach
-				 * from heartbeat */
-				__dlm_mle_detach_hb_events(dlm, mle);
-				__dlm_put_mle(mle);
-			}
-			continue;
-		}
-
-		/* everything else is a MIGRATION mle */
-
-		/* the rule for MIGRATION mles is that the master
-		 * becomes UNKNOWN if *either* the original or
-		 * the new master dies.  all UNKNOWN lockreses
-		 * are sent to whichever node becomes the recovery
-		 * master.  the new master is responsible for
-		 * determining if there is still a master for
-		 * this lockres, or if he needs to take over
-		 * mastery.  either way, this node should expect
-		 * another message to resolve this. */
-		if (mle->master != dead_node &&
-		    mle->new_master != dead_node)
-			continue;
-
-		/* if we have reached this point, this mle needs to
-		 * be removed from the list and freed. */
-
-		/* remove from the list early.  NOTE: unlinking
-		 * list_head while in list_for_each_safe */
-		__dlm_mle_detach_hb_events(dlm, mle);
-		spin_lock(&mle->spinlock);
-		list_del_init(&mle->list);
-		atomic_set(&mle->woken, 1);
-		spin_unlock(&mle->spinlock);
-		wake_up(&mle->wq);
-
-		mlog(0, "%s: node %u died during migration from "
-		     "%u to %u!\n", dlm->name, dead_node,
-		     mle->master, mle->new_master);
-		/* if there is a lockres associated with this
-		 * mle, find it and set its owner to UNKNOWN */
-		hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
-		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-					   mle->u.name.len, hash);
-		if (res) {
-			/* unfortunately if we hit this rare case, our
-			 * lock ordering is messed.  we need to drop
-			 * the master lock so that we can take the
-			 * lockres lock, meaning that we will have to
-			 * restart from the head of list. */
-			spin_unlock(&dlm->master_lock);
-
-			/* move lockres onto recovery list */
-			spin_lock(&res->spinlock);
-			dlm_set_lockres_owner(dlm, res,
-					      DLM_LOCK_RES_OWNER_UNKNOWN);
-			dlm_move_lockres_to_recovery_list(dlm, res);
-			spin_unlock(&res->spinlock);
-			dlm_lockres_put(res);
-
-			/* about to get rid of mle, detach from heartbeat */
-			__dlm_mle_detach_hb_events(dlm, mle);
-
-			/* dump the mle */
-			spin_lock(&dlm->master_lock);
-			__dlm_put_mle(mle);
-			spin_unlock(&dlm->master_lock);
-
-			/* restart */
-			goto top;
-		}
-
-		/* this may be the last reference */
-		__dlm_put_mle(mle);
-	}
-	spin_unlock(&dlm->master_lock);
-}
+/*
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+					struct dlm_master_list_entry *mle)
+{
+	struct dlm_lock_resource *res;
+
+	/* Find the lockres associated to the mle and set its owner to UNK */
+	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+				   mle->mnamehash);
+	if (res) {
+		spin_unlock(&dlm->master_lock);
+
+		/* move lockres onto recovery list */
+		spin_lock(&res->spinlock);
+		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+		dlm_move_lockres_to_recovery_list(dlm, res);
+		spin_unlock(&res->spinlock);
+		dlm_lockres_put(res);
+
+		/* about to get rid of mle, detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+
+		/* dump the mle */
+		spin_lock(&dlm->master_lock);
+		__dlm_put_mle(mle);
+		spin_unlock(&dlm->master_lock);
+	}
+
+	return res;
+}
+
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+				    struct dlm_master_list_entry *mle)
+{
+	__dlm_mle_detach_hb_events(dlm, mle);
+
+	spin_lock(&mle->spinlock);
+	__dlm_unlink_mle(dlm, mle);
+	atomic_set(&mle->woken, 1);
+	spin_unlock(&mle->spinlock);
+
+	wake_up(&mle->wq);
+}
+
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle, u8 dead_node)
+{
+	int bit;
+
+	BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+	spin_lock(&mle->spinlock);
+	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+	if (bit != dead_node) {
+		mlog(0, "mle found, but dead node %u would not have been "
+		     "master\n", dead_node);
+		spin_unlock(&mle->spinlock);
+	} else {
+		/* Must drop the refcount by one since the assert_master will
+		 * never arrive. This may result in the mle being unlinked and
+		 * freed, but there may still be a process waiting in the
+		 * dlmlock path which is fine. */
+		mlog(0, "node %u was expected master\n", dead_node);
+		atomic_set(&mle->woken, 1);
+		spin_unlock(&mle->spinlock);
+		wake_up(&mle->wq);
+
+		/* Do not need events any longer, so detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
+	}
+}
+
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int i;
+
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+
+			BUG_ON(mle->type != DLM_MLE_BLOCK &&
+			       mle->type != DLM_MLE_MASTER &&
+			       mle->type != DLM_MLE_MIGRATION);
+
+			/* MASTER mles are initiated locally. The waiting
+			 * process will notice the node map change shortly.
+			 * Let that happen as normal. */
+			if (mle->type == DLM_MLE_MASTER)
+				continue;
+
+			/* BLOCK mles are initiated by other nodes. Need to
+			 * clean up if the dead node would have been the
+			 * master. */
+			if (mle->type == DLM_MLE_BLOCK) {
+				dlm_clean_block_mle(dlm, mle, dead_node);
+				continue;
+			}
+
+			/* Everything else is a MIGRATION mle */
+
+			/* The rule for MIGRATION mles is that the master
+			 * becomes UNKNOWN if *either* the original or the new
+			 * master dies. All UNKNOWN lockres' are sent to
+			 * whichever node becomes the recovery master. The new
+			 * master is responsible for determining if there is
+			 * still a master for this lockres, or if he needs to
+			 * take over mastery. Either way, this node should
+			 * expect another message to resolve this. */
+
+			if (mle->master != dead_node &&
+			    mle->new_master != dead_node)
+				continue;
+
+			/* If we have reached this point, this mle needs to be
+			 * removed from the list and freed. */
+			dlm_clean_migration_mle(dlm, mle);
+
+			mlog(0, "%s: node %u died during migration from "
+			     "%u to %u!\n", dlm->name, dead_node, mle->master,
+			     mle->new_master);
+
+			/* If we find a lockres associated with the mle, we've
+			 * hit this rare case that messes up our lock ordering.
+			 * If so, we need to drop the master lock so that we can
+			 * take the lockres lock, meaning that we will have to
+			 * restart from the head of list. */
+			res = dlm_reset_mleres_owner(dlm, mle);
+			if (res)
+				/* restart */
+				goto top;
+
+			/* This may be the last reference */
+			__dlm_put_mle(mle);
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+}
 
-
 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 u8 old_master)
 {
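
Why the goto top: dlm_reset_mleres_owner() returns non-NULL exactly when it had to drop dlm->master_lock to take the lockres spinlock, which invalidates the in-progress hash walk. The control-flow skeleton of the refactored function above, condensed for reference (a sketch; filtering and bookkeeping elided):

	top:
		spin_lock(&dlm->master_lock);
		for (i = 0; i < DLM_HASH_BUCKETS; i++) {
			hlist_for_each(list, dlm_master_hash(dlm, i)) {
				/* ... skip MASTER mles, clean BLOCK mles ... */
				res = dlm_reset_mleres_owner(dlm, mle);
				if (res)
					goto top;	/* master_lock was dropped;
							 * reacquire and rescan */
				__dlm_put_mle(mle);
			}
		}
		spin_unlock(&dlm->master_lock);
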
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	if (!__dlm_lockres_unused(res)) {
-		spin_unlock(&res->spinlock);
 		mlog(0, "%s:%.*s: tried to purge but not unused\n",
 		     dlm->name, res->lockname.len, res->lockname.name);
-		return -ENOTEMPTY;
+		__dlm_print_one_lock_resource(res);
+		spin_unlock(&res->spinlock);
+		BUG();
 	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
+		     "being remastered\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		/* Re-add the lockres to the end of the purge list */
+		if (!list_empty(&res->purge)) {
+			list_del_init(&res->purge);
+			list_add_tail(&res->purge, &dlm->purge_list);
+		}
+		spin_unlock(&res->spinlock);
+		return 0;
+	}
+
 	master = (res->owner == dlm->node_num);
+
 	if (!master)
 		res->state |= DLM_LOCK_RES_DROPPING_REF;
 	spin_unlock(&res->spinlock);
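
The two-step requeue above (list_del_init() followed by list_add_tail()) is the open-coded form of list_move_tail() from linux/list.h; an equivalent sketch under the same res->spinlock:

	/* Rotate this lockres to the tail so other candidates purge first. */
	if (!list_empty(&res->purge))
		list_move_tail(&res->purge, &dlm->purge_list);
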
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+	.flags		= 0,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
 			    &ocfs2_rename_lops, osb);
 }
 
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+					 struct ocfs2_super *osb)
+{
+	/* nfs_sync lockres doesn't come from a slab so we call init
+	 * once on it manually. */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+				   &ocfs2_nfs_sync_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
 	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+				    0, 0);
+	if (status < 0)
+		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+	return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres,
+				     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
 	int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
 
 	osb->cconn = conn;
 
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
 
 	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
 	osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
 {
 	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
 	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
 }
 
 int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
 		       int ex);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
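
The expected pairing of the two new calls, as ocfs2_get_dentry() uses them in the export.c hunk below (a sketch; ex = 1 requests the exclusive mode, which maps to LKM_EXMODE):

	status = ocfs2_nfs_sync_lock(osb, 1);
	if (status < 0)
		return status;	/* also -EROFS on hard-readonly mounts */

	/* ... inode-bit test and ocfs2_iget() happen under the lock ... */

	ocfs2_nfs_sync_unlock(osb, 1);
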
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dcache.h"
@@ -38,6 +39,7 @@
 #include "inode.h"
 
 #include "buffer_head_io.h"
+#include "suballoc.h"
 
 struct ocfs2_inode_handle
 {
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 				       struct ocfs2_inode_handle *handle)
 {
 	struct inode *inode;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 blkno = handle->ih_blkno;
+	int status, set;
 	struct dentry *result;
 
 	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
 
-	if (handle->ih_blkno == 0) {
-		mlog_errno(-ESTALE);
-		return ERR_PTR(-ESTALE);
+	if (blkno == 0) {
+		mlog(0, "nfs wants inode with blkno: 0\n");
+		result = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	inode = ocfs2_ilookup(sb, blkno);
+	/*
+	 * If the inode exists in memory, we only need to check it's
+	 * generation number
+	 */
+	if (inode)
+		goto check_gen;
+
+	/*
+	 * This will synchronize us against ocfs2_delete_inode() on
+	 * all nodes
+	 */
+	status = ocfs2_nfs_sync_lock(osb, 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		goto check_err;
+	}
+
+	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL) {
+			/*
+			 * The blkno NFS gave us doesn't even show up
+			 * as an inode, we return -ESTALE to be
+			 * nice
+			 */
+			mlog(0, "test inode bit failed %d\n", status);
+			status = -ESTALE;
+		} else {
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		}
+		goto unlock_nfs_sync;
+	}
+
+	/* If the inode allocator bit is clear, this inode must be stale */
+	if (!set) {
+		mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+		status = -ESTALE;
+		goto unlock_nfs_sync;
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+	inode = ocfs2_iget(osb, blkno, 0, 0);
 
-	if (IS_ERR(inode))
-		return (void *)inode;
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(osb, 1);
 
+check_err:
+	if (status < 0) {
+		if (status == -ESTALE) {
+			mlog(0, "stale inode ino: %llu generation: %u\n",
+			     blkno, handle->ih_generation);
+		}
+		result = ERR_PTR(status);
+		goto bail;
+	}
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		result = (void *)inode;
+		goto bail;
+	}
+
+check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		return ERR_PTR(-ESTALE);
+		mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+		     handle->ih_generation);
+		result = ERR_PTR(-ESTALE);
+		goto bail;
 	}
 
 	result = d_obtain_alias(inode);
 	if (!IS_ERR(result))
 		result->d_op = &ocfs2_dentry_ops;
+	else
+		mlog_errno(PTR_ERR(result));
 
+bail:
 	mlog_exit_ptr(result);
 	return result;
 }
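
(Aside: the reworked ocfs2_get_dentry() above reduces to the control flow below. This is a condensed, hand-written sketch for orientation only, using only the helpers introduced by this patch; error handling, logging, and the ERR_PTR plumbing are elided.)

	inode = ocfs2_ilookup(sb, blkno);	/* fast path: inode already in memory */
	if (!inode) {
		ocfs2_nfs_sync_lock(osb, 1);	/* EX: fences ocfs2_delete_inode() on all nodes */
		status = ocfs2_test_inode_bit(osb, blkno, &set);
		if (status >= 0 && set)
			inode = ocfs2_iget(osb, blkno, 0, 0);	/* allocator bit set: safe to load */
		ocfs2_nfs_sync_unlock(osb, 1);
	}
	/* whether found in memory or freshly loaded, a generation
	 * mismatch against handle->ih_generation still yields -ESTALE */
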
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8a..8672b9536039 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		   out->f_path.dentry->d_name.len,
 		   out->f_path.dentry->d_name.name);
 
-	inode_double_lock(inode, pipe->inode);
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
 
 	ret = ocfs2_rw_lock(inode, 1);
 	if (ret < 0) {
@@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out_unlock;
 	}
 
+	if (pipe->inode)
+		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
 	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
 
 out_unlock:
 	ocfs2_rw_unlock(inode, 1);
 out:
-	inode_double_unlock(inode, pipe->inode);
+	mutex_unlock(&inode->i_mutex);
 
 	mlog_exit(ret);
 	return ret;
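
(Aside: the splice hunks above replace inode_double_lock(), which ordered the two i_mutexes by address, with an explicit parent/child hierarchy. A minimal sketch of the resulting lock order, assuming the pipe has a backing inode:)

	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);	/* destination file, always first */
	ocfs2_rw_lock(inode, 1);				/* cluster lock in between */
	mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);	/* pipe, strictly second */
	/* ... write ... */
	mutex_unlock(&pipe->inode->i_mutex);
	ocfs2_rw_unlock(inode, 1);
	mutex_unlock(&inode->i_mutex);
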
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dir.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
 		oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
 
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+	struct ocfs2_find_inode_args args;
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+	args.fi_sysfile_type = 0;
+
+	return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 			 int sysfile_type)
 {
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		    (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		    (unsigned long long)le64_to_cpu(fe->i_blkno));
 
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	ocfs2_set_inode_flags(inode);
 
+	OCFS2_I(inode)->ip_last_used_slot = 0;
+	OCFS2_I(inode)->ip_last_used_group = 0;
 	mlog_exit_void();
 }
 
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
-				   ocfs2_quota_trans_credits(inode->i_sb));
+					ocfs2_quota_trans_credits(inode->i_sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/* Remove any dir index tree */
+	if (S_ISDIR(inode->i_mode)) {
+		status = ocfs2_dx_dir_truncate(inode, di_bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail_unlock_dir;
+		}
+	}
+
 	/*Free extended attribute resources associated with this inode.*/
 	status = ocfs2_xattr_remove(inode, di_bh);
 	if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	/*
+	 * Synchronize us against ocfs2_get_dentry. We take this in
+	 * shared mode so that all nodes can still concurrently
+	 * process deletes.
+	 */
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
 	/* Lock down the inode. This gives us an up to date view of
 	 * it's metadata (for verification), and allows us to
 	 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
 		if (status != -ENOENT)
 			mlog_errno(status);
 		ocfs2_cleanup_delete_inode(inode, 0);
-		goto bail_unblock;
+		goto bail_unlock_nfs_sync;
 	}
 
 	/* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
 bail_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
+
+bail_unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
 	if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	fe->i_size = cpu_to_le64(i_size_read(inode));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_uid = cpu_to_le32(inode->i_uid);
 	fe->i_gid = cpu_to_le32(inode->i_gid);
 	fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
 	ocfs2_set_inode_flags(inode);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
 	inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
 
 	struct inode			vfs_inode;
 	struct jbd2_inode		ip_jinode;
+
+	/* Only valid if the inode is the dir. */
+	u32				ip_last_used_slot;
+	u64				ip_last_used_group;
 };
 
 /*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
 	return __ocfs2_wait_on_mount(osb, 1);
 }
 
-
-
 /*
- * The recovery_list is a simple linked list of node numbers to recover.
- * It is protected by the recovery_lock.
+ * This replay_map is to track online/offline slots, so we could recover
+ * offline slots during recovery and mount
  */
 
-struct ocfs2_recovery_map {
-	unsigned int rm_used;
-	unsigned int *rm_entries;
+enum ocfs2_replay_state {
+	REPLAY_UNNEEDED = 0,	/* Replay is not needed, so ignore this map */
+	REPLAY_NEEDED,		/* Replay slots marked in rm_replay_slots */
+	REPLAY_DONE		/* Replay was already queued */
 };
 
+struct ocfs2_replay_map {
+	unsigned int rm_slots;
+	enum ocfs2_replay_state rm_state;
+	unsigned char rm_replay_slots[0];
+};
+
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+	if (!osb->replay_map)
+		return;
+
+	/* If we've already queued the replay, we don't have any more to do */
+	if (osb->replay_map->rm_state == REPLAY_DONE)
+		return;
+
+	osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map;
+	int i, node_num;
+
+	/* If replay map is already set, we don't do it again */
+	if (osb->replay_map)
+		return 0;
+
+	replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+			     (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+	if (!replay_map) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	spin_lock(&osb->osb_lock);
+
+	replay_map->rm_slots = osb->max_slots;
+	replay_map->rm_state = REPLAY_UNNEEDED;
+
+	/* set rm_replay_slots for offline slot(s) */
+	for (i = 0; i < replay_map->rm_slots; i++) {
+		if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+			replay_map->rm_replay_slots[i] = 1;
+	}
+
+	osb->replay_map = replay_map;
+	spin_unlock(&osb->osb_lock);
+	return 0;
+}
+
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+	int i;
+
+	if (!replay_map)
+		return;
+
+	if (replay_map->rm_state != REPLAY_NEEDED)
+		return;
+
+	for (i = 0; i < replay_map->rm_slots; i++)
+		if (replay_map->rm_replay_slots[i])
+			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+							NULL, NULL);
+	replay_map->rm_state = REPLAY_DONE;
+}
+
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+
+	if (!osb->replay_map)
+		return;
+
+	kfree(replay_map);
+	osb->replay_map = NULL;
+}
+
 int ocfs2_recovery_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_recovery_map *rm;
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
 	},
 };
 
+static struct ocfs2_triggers dr_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
 static int __ocfs2_journal_access(handle_t *handle,
 				  struct inode *inode,
 				  struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 				    type);
 }
 
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
+				      type);
+}
+
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type)
 {
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 }
 
 /* Called by the mount code to queue recovery the last part of
- * recovery for it's own slot. */
+ * recovery for it's own and offline slot(s). */
 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 {
 	struct ocfs2_journal *journal = osb->journal;
 
-	if (osb->dirty) {
-		/* No need to queue up our truncate_log as regular
-		 * cleanup will catch that. */
-		ocfs2_queue_recovery_completion(journal,
-						osb->slot_num,
-						osb->local_alloc_copy,
-						NULL,
-						NULL);
-		ocfs2_schedule_truncate_log_flush(osb, 0);
+	/* No need to queue up our truncate_log as regular cleanup will catch
+	 * that */
+	ocfs2_queue_recovery_completion(journal, osb->slot_num,
+					osb->local_alloc_copy, NULL, NULL);
+	ocfs2_schedule_truncate_log_flush(osb, 0);
 
-		osb->local_alloc_copy = NULL;
-		osb->dirty = 0;
-	}
+	osb->local_alloc_copy = NULL;
+	osb->dirty = 0;
+
+	/* queue to recover orphan slots for all offline slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+	ocfs2_queue_replay_slots(osb);
+	ocfs2_free_replay_slots(osb);
 }
 
 void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
 		goto bail;
 	}
 
+	status = ocfs2_compute_replay_slots(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* queue recovery for our own slot */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL, NULL);
+
 	spin_lock(&osb->osb_lock);
 	while (rm->rm_used) {
 		/* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
 
 	ocfs2_super_unlock(osb, 1);
 
-	/* We always run recovery on our own orphan dir - the dead
-	 * node(s) may have disallowd a previos inode delete. Re-processing
-	 * is therefore required. */
-	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num,
-					NULL, NULL);
+	/* queue recovery for offline slots */
+	ocfs2_queue_replay_slots(osb);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
 		goto restart;
 	}
 
+	ocfs2_free_replay_slots(osb);
 	osb->recovery_thread_task = NULL;
 	mb(); /* sync with ocfs2_recovery_thread_running */
 	wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		goto done;
 	}
 
+	/* we need to run complete recovery for offline orphan slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
 	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
 	     node_num, slot_num,
 	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
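
(Aside: the replay map introduced above amounts to a small per-mount state machine over a recovery pass. Roughly, the calls in this patch drive it in the order sketched below; locking and error paths are elided.)

	ocfs2_compute_replay_slots(osb);	/* mark currently-offline slots; state = REPLAY_UNNEEDED */
	/* ... a dead node's journal actually gets replayed ... */
	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
	/* ... after ocfs2_super_unlock() ... */
	ocfs2_queue_replay_slots(osb);		/* queue recovery completion once per marked slot */
	ocfs2_free_replay_slots(osb);		/* state REPLAY_DONE prevents requeueing in between */
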
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
 struct ocfs2_super;
 struct ocfs2_dinode;
 
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	unsigned int rm_used;
+	unsigned int *rm_entries;
+};
+
+
 struct ocfs2_journal {
 	enum ocfs2_journal_state   j_state;    /* Journals current state   */
 
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 int ocfs2_recovery_init(struct ocfs2_super *osb);
 void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
 /*
  * Journal Control:
  * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 /* dirblock */
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
 			    struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
 /* Anything that has no ecc */
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 }
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+ * bitmap block for the new bit) dx_root update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+	/* 1 block for index, 2 allocs (data, metadata), 1 clusters
+	 * worth of blocks for initial extent. */
+	return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+		ocfs2_clusters_to_blocks(sb, 1);
+}
 
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks + quota update */
-static inline int ocfs2_mknod_credits(struct super_block *sb)
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+				      int xattr_credits)
 {
-	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+	if (is_dir)
+		dir_credits += ocfs2_add_dir_index_credits(sb);
+
+	return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
 	       ocfs2_quota_trans_credits(sb);
 }
 
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir */
+ * update on dir + index leaf + dx root update for free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
 	       ocfs2_quota_trans_credits(sb);
 }
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
+ * dir inode link + dir inode index leaf + dir index root */
 static inline int ocfs2_unlink_credits(struct super_block *sb)
 {
 	/* The quota update from ocfs2_link_credits is unused here... */
-	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
}
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+ * inode alloc group descriptor + orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
 
 /* dinode update, old dir dinode update, new dir dinode update, old
 * dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
+ * directory + target unlink + 3 x dir index leaves */
 static inline int ocfs2_rename_credits(struct super_block *sb)
 {
-	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
 }
 
 /* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
 				     + OCFS2_INODE_UPDATE_CREDITS \
 				     + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
 
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +	\
+				      OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+	int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+	credits += ocfs2_clusters_to_blocks(sb, 1);
+	credits += ocfs2_quota_trans_credits(sb);
+
+	return credits;
+}
+
 /*
 * Please note that the caller must make sure that root_el is the root
 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-	int blocks = ocfs2_mknod_credits(sb);
+	int blocks = ocfs2_mknod_credits(sb, 0, 0);
 
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
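
(Aside: as a worked instance of the revised credit arithmetic above — OCFS2_SUBALLOC_ALLOC and the quota term are left symbolic here, since they are defined elsewhere in the tree:)

	/* non-directory, no xattrs:
	 *   ocfs2_mknod_credits(sb, 0, 0)
	 *     = 4 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS
	 *       + 0 + ocfs2_quota_trans_credits(sb)
	 *     = 4 + OCFS2_SUBALLOC_ALLOC + (1 + 2 + 1) + ocfs2_quota_trans_credits(sb)
	 *     = 8 + OCFS2_SUBALLOC_ALLOC + ocfs2_quota_trans_credits(sb)
	 */
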
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
-#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-#ifdef CONFIG_OCFS2_FS_STATS
-
-static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
-#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
-#define LA_DEBUG_VER	1
-static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
-				   size_t count, loff_t *ppos)
-{
-	static DEFINE_MUTEX(la_debug_mutex);
-	struct ocfs2_super *osb = file->private_data;
-	int written, ret;
-	char *buf = osb->local_alloc_debug_buf;
-
-	mutex_lock(&la_debug_mutex);
-	memset(buf, 0, LA_DEBUG_BUF_SZ);
-
-	written = snprintf(buf, LA_DEBUG_BUF_SZ,
-			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
-			   LA_DEBUG_VER,
-			   (unsigned long long)osb->la_last_gd,
-			   osb->local_alloc_default_bits,
-			   osb->local_alloc_bits, osb->local_alloc_state);
-
-	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
-
-	mutex_unlock(&la_debug_mutex);
-	return ret;
-}
-
-static const struct file_operations ocfs2_la_debug_fops = {
-	.open = ocfs2_la_debug_open,
-	.read = ocfs2_la_debug_read,
-};
-
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
-	if (!osb->local_alloc_debug_buf)
-		return;
-
-	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
-						     S_IFREG|S_IRUSR,
-						     osb->osb_debug_root,
-						     osb,
-						     &ocfs2_la_debug_fops);
-	if (!osb->local_alloc_debug) {
-		kfree(osb->local_alloc_debug_buf);
-		osb->local_alloc_debug_buf = NULL;
-	}
-}
-
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	if (osb->local_alloc_debug)
-		debugfs_remove(osb->local_alloc_debug);
-
-	if (osb->local_alloc_debug_buf)
-		kfree(osb->local_alloc_debug_buf);
-
-	osb->local_alloc_debug_buf = NULL;
-	osb->local_alloc_debug = NULL;
-}
-#else	/* CONFIG_OCFS2_FS_STATS */
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-#endif
-
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	ocfs2_init_la_debug(osb);
-
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -299,9 +218,6 @@ bail:
 	if (inode)
 		iput(inode);
 
-	if (status < 0)
-		ocfs2_shutdown_la_debug(osb);
-
 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
-	ocfs2_shutdown_la_debug(osb);
-
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
-				    struct buffer_head **de_bh);
+				    struct ocfs2_dir_lookup_result *lookup);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
-			    struct buffer_head *de_bh,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dirfe;
 	struct buffer_head *new_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *xattr_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	int want_clusters = 0;
+	int want_meta = 0;
 	int xattr_credits = 0;
 	struct ocfs2_security_xattr_info si = {
 		.enable = 1,
 	};
 	int did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
 		return status;
 	}
 
-	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+	if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
 		status = -EMLINK;
 		goto leave;
 	}
 
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	if (!dirfe->i_links_count) {
+	if (!ocfs2_read_links_count(dirfe)) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
 	/* get a spot inside the dir. */
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
-					      dentry->d_name.len, &de_bh);
+					      dentry->d_name.len, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
 
 	/* calculate meta data/clusters for setting security and acl xattr */
 	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
 				       &si, &want_clusters,
-				       &xattr_credits, &xattr_ac);
+				       &xattr_credits, &want_meta);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
 		want_clusters += 1;
 
+		/* Dir indexing requires extra space as well */
+		if (ocfs2_supports_indexed_dirs(osb))
+			want_meta++;
+	}
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
 	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
-				   xattr_credits);
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+							    S_ISDIR(mode),
+							    xattr_credits));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
-					    new_fe_bh, data_ac);
+					    new_fe_bh, data_ac, meta_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		le16_add_cpu(&dirfe->i_links_count, 1);
+		ocfs2_add_links_count(dirfe, 1);
 		status = ocfs2_journal_dirty(handle, parent_fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 
 	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-				xattr_ac, data_ac);
+				meta_ac, data_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (si.enable) {
 		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
-						 xattr_ac, data_ac);
+						 meta_ac, data_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-				 de_bh);
+				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -423,11 +437,12 @@ leave:
 		mlog(0, "Disk is full\n");
 
 	brelse(new_fe_bh);
-	brelse(de_bh);
 	brelse(parent_fe_bh);
 	kfree(si.name);
 	kfree(si.value);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
@@ -439,8 +454,8 @@ leave:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
-	if (xattr_ac)
-		ocfs2_free_alloc_context(xattr_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
 
 	mlog_exit(status);
 
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
+	u16 feat;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
 		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 	*new_fe_bh = NULL;
 
-	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
-				       &fe_blkno);
+	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_bit, &fe_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_mode = cpu_to_le16(inode->i_mode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_dtime = 0;
 
 	/*
-	 * If supported, directories start with inline data.
+	 * If supported, directories start with inline data. If inline
+	 * isn't supported, but indexing is, we start them as indexed.
 	 */
+	feat = le16_to_cpu(fe->i_dyn_features);
 	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
-		u16 feat = le16_to_cpu(fe->i_dyn_features);
-
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 
 		fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 	int err;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					   dentry->d_name.name,
-					   dentry->d_name.len, &de_bh);
+					   dentry->d_name.len, &lookup);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	}
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+	if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
 		err = -EMLINK;
 		goto out_unlock_inode;
 	}
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_add_entry(handle, dentry, inode,
 			      OCFS2_I(inode)->ip_blkno,
-			      parent_fe_bh, de_bh);
+			      parent_fe_bh, &lookup);
 	if (err) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	brelse(de_bh);
 	brelse(fe_bh);
 	brelse(parent_fe_bh);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	mlog_exit(err);
 
 	return err;
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_node_bh = NULL;
 	handle_t *handle = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
-	struct buffer_head *dirent_bh = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-	struct buffer_head *orphan_entry_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	status = ocfs2_find_files_on_disk(dentry->d_name.name,
-					  dentry->d_name.len, &blkno,
-					  dir, &dirent_bh, &dirent);
+					  dentry->d_name.len, &blkno, dir,
+					  &lookup);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
 	child_locked = 1;
 
 	if (S_ISDIR(inode->i_mode)) {
-		if (!ocfs2_empty_dir(inode)) {
-			status = -ENOTEMPTY;
-			goto leave;
-		} else if (inode->i_nlink != 2) {
+		if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
 			status = -ENOTEMPTY;
 			goto leave;
 		}
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
-						  orphan_name,
-						  &orphan_entry_bh);
+						  orphan_name, &orphan_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
-					  orphan_entry_bh, orphan_dir);
+					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	/* delete the name from the parent dir */
-	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	status = ocfs2_delete_entry(handle, dir, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(inode);
 	drop_nlink(inode);
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
@@ -916,9 +929,10 @@ leave:
 	}
 
 	brelse(fe_bh);
-	brelse(dirent_bh);
 	brelse(parent_node_bh);
-	brelse(orphan_entry_bh);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(status);
 
@@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
 			struct inode *new_dir,
 			struct dentry *new_dentry)
 {
-	int status = 0, rename_lock = 0, parents_locked = 0;
-	int old_child_locked = 0, new_child_locked = 0;
+	int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+	int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *orphan_dir = NULL;
@@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
-	struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
-		*new_de = NULL;
-	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
-	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
-						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink;
 	struct ocfs2_dinode *old_di;
+	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+	struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct ocfs2_dir_lookup_result target_insert = { NULL, };
 
 	/* At some point it might be nice to break this function up a
 	 * bit. */
@@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
 	if (S_ISDIR(old_inode->i_mode)) {
 		u64 old_inode_parent;
 
+		update_dot_dot = 1;
 		status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
-						  old_inode, &old_inode_de_bh,
-						  &old_inode_dot_dot_de);
+						  old_inode,
+						  &old_inode_dot_dot_res);
 		if (status) {
 			status = -EIO;
 			goto bail;
@@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	}
 
 	if (!new_inode && new_dir != old_dir &&
-	    new_dir->i_nlink >= OCFS2_LINK_MAX) {
+	    new_dir->i_nlink >= ocfs2_link_max(osb)) {
 		status = -EMLINK;
 		goto bail;
 	}
@@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * to delete it */
 	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
 					  new_dentry->d_name.len,
-					  &newfe_blkno, new_dir, &new_de_bh,
-					  &new_de);
+					  &newfe_blkno, new_dir,
+					  &target_lookup_res);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		mlog_errno(status);
 		goto bail;
 	}
+	if (status == 0)
+		target_exists = 1;
 
-	if (!new_de && new_inode) {
+	if (!target_exists && new_inode) {
 		/*
 		 * Target was unlinked by another node while we were
 		 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
-	if (new_de) {
+	if (target_exists) {
 		/* VFS didn't think there existed an inode here, but
 		 * someone else in the cluster must have raced our
 		 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-		mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
-		     "newfebh=%p bhblocknr=%llu\n", new_de,
+		mlog(0, "aha rename over existing... new_blkno=%llu "
+		     "newfebh=%p bhblocknr=%llu\n",
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 							  new_inode,
 							  orphan_name,
-							  &orphan_entry_bh);
+							  &orphan_insert);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
 						      new_dentry->d_name.name,
 						      new_dentry->d_name.len,
-						      &insert_entry_bh);
+						      &target_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		goto bail;
 	}
 
-	if (new_de) {
+	if (target_exists) {
 		if (S_ISDIR(new_inode->i_mode)) {
-			if (!ocfs2_empty_dir(new_inode) ||
-			    new_inode->i_nlink != 2) {
+			if (new_inode->i_nlink != 2 ||
+			    !ocfs2_empty_dir(new_inode)) {
 				status = -ENOTEMPTY;
 				goto bail;
 			}
@@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		if (S_ISDIR(new_inode->i_mode) ||
-		    (newfe->i_links_count == cpu_to_le16(1))){
+		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
 						  newfe, orphan_name,
-						  orphan_entry_bh, orphan_dir);
+						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		/* change the dirent to point to the correct inode */
-		status = ocfs2_update_entry(new_dir, handle, new_de_bh,
-					    new_de, old_inode);
+		status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+					    old_inode);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_dir->i_version++;
 
 		if (S_ISDIR(new_inode->i_mode))
-			newfe->i_links_count = 0;
+			ocfs2_set_links_count(newfe, 0);
 		else
-			le16_add_cpu(&newfe->i_links_count, -1);
+			ocfs2_add_links_count(newfe, -1);
 
 		status = ocfs2_journal_dirty(handle, newfe_bh);
 		if (status < 0) {
@@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs2_add_entry(handle, new_dentry, old_inode,
 					 OCFS2_I(old_inode)->ip_blkno,
-					 new_dir_bh, insert_entry_bh);
+					 new_dir_bh, &target_insert);
 	}
 
 	old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * because the insert might have changed the type of directory
 	 * we're dealing with.
 	 */
-	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
-				     old_dentry->d_name.len,
-				     old_dir, &old_de);
-	if (!old_de_bh) {
-		status = -EIO;
+	status = ocfs2_find_entry(old_dentry->d_name.name,
				  old_dentry->d_name.len, old_dir,
+				  &old_entry_lookup);
+	if (status)
 		goto bail;
-	}
 
-	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_inode->i_ctime = CURRENT_TIME;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-	if (old_inode_de_bh) {
-		status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
-					    old_inode_dot_dot_de, new_dir);
+
+	if (update_dot_dot) {
+		status = ocfs2_update_entry(old_inode, handle,
+					    &old_inode_dot_dot_res, new_dir);
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
@@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	} else {
 		struct ocfs2_dinode *fe;
 		status = ocfs2_journal_access_di(handle, old_dir,
 						 old_dir_bh,
 						 OCFS2_JOURNAL_ACCESS_WRITE);
 		fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1397 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1398 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 status = ocfs2_journal_dirty(handle, old_dir_bh);
1399 } 1415 }
1400 } 1416 }
1401
1402 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1403 status = 0; 1418 status = 0;
1404bail: 1419bail:
@@ -1429,13 +1444,17 @@ bail:
1429 1444
1430 if (new_inode) 1445 if (new_inode)
1431 iput(new_inode); 1446 iput(new_inode);
1447
1448 ocfs2_free_dir_lookup_result(&target_lookup_res);
1449 ocfs2_free_dir_lookup_result(&old_entry_lookup);
1450 ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
1451 ocfs2_free_dir_lookup_result(&orphan_insert);
1452 ocfs2_free_dir_lookup_result(&target_insert);
1453
1432 brelse(newfe_bh); 1454 brelse(newfe_bh);
1433 brelse(old_inode_bh); 1455 brelse(old_inode_bh);
1434 brelse(old_dir_bh); 1456 brelse(old_dir_bh);
1435 brelse(new_dir_bh); 1457 brelse(new_dir_bh);
1436 brelse(new_de_bh);
1437 brelse(old_de_bh);
1438 brelse(old_inode_de_bh);
1439 brelse(orphan_entry_bh); 1458 brelse(orphan_entry_bh);
1440 brelse(insert_entry_bh); 1459 brelse(insert_entry_bh);
1441 1460
@@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
1558 struct inode *inode = NULL; 1577 struct inode *inode = NULL;
1559 struct super_block *sb; 1578 struct super_block *sb;
1560 struct buffer_head *new_fe_bh = NULL; 1579 struct buffer_head *new_fe_bh = NULL;
1561 struct buffer_head *de_bh = NULL;
1562 struct buffer_head *parent_fe_bh = NULL; 1580 struct buffer_head *parent_fe_bh = NULL;
1563 struct ocfs2_dinode *fe = NULL; 1581 struct ocfs2_dinode *fe = NULL;
1564 struct ocfs2_dinode *dirfe; 1582 struct ocfs2_dinode *dirfe;
@@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
1572 .enable = 1, 1590 .enable = 1,
1573 }; 1591 };
1574 int did_quota = 0, did_quota_inode = 0; 1592 int did_quota = 0, did_quota_inode = 0;
1593 struct ocfs2_dir_lookup_result lookup = { NULL, };
1575 1594
1576 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1595 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1577 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1596 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
1592 } 1611 }
1593 1612
1594 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1613 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1595 if (!dirfe->i_links_count) { 1614 if (!ocfs2_read_links_count(dirfe)) {
1596 /* can't make a file in a deleted directory. */ 1615 /* can't make a file in a deleted directory. */
1597 status = -ENOENT; 1616 status = -ENOENT;
1598 goto bail; 1617 goto bail;
@@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
1605 1624
1606 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1625 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1607 dentry->d_name.name, 1626 dentry->d_name.name,
1608 dentry->d_name.len, &de_bh); 1627 dentry->d_name.len, &lookup);
1609 if (status < 0) { 1628 if (status < 0) {
1610 mlog_errno(status); 1629 mlog_errno(status);
1611 goto bail; 1630 goto bail;
@@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
1744 1763
1745 status = ocfs2_add_entry(handle, dentry, inode, 1764 status = ocfs2_add_entry(handle, dentry, inode,
1746 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1765 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1747 de_bh); 1766 &lookup);
1748 if (status < 0) { 1767 if (status < 0) {
1749 mlog_errno(status); 1768 mlog_errno(status);
1750 goto bail; 1769 goto bail;
@@ -1772,9 +1791,9 @@ bail:
1772 1791
1773 brelse(new_fe_bh); 1792 brelse(new_fe_bh);
1774 brelse(parent_fe_bh); 1793 brelse(parent_fe_bh);
1775 brelse(de_bh);
1776 kfree(si.name); 1794 kfree(si.name);
1777 kfree(si.value); 1795 kfree(si.value);
1796 ocfs2_free_dir_lookup_result(&lookup);
1778 if (inode_ac) 1797 if (inode_ac)
1779 ocfs2_free_alloc_context(inode_ac); 1798 ocfs2_free_alloc_context(inode_ac);
1780 if (data_ac) 1799 if (data_ac)
@@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1826 struct inode **ret_orphan_dir, 1845 struct inode **ret_orphan_dir,
1827 struct inode *inode, 1846 struct inode *inode,
1828 char *name, 1847 char *name,
1829 struct buffer_head **de_bh) 1848 struct ocfs2_dir_lookup_result *lookup)
1830{ 1849{
1831 struct inode *orphan_dir_inode; 1850 struct inode *orphan_dir_inode;
1832 struct buffer_head *orphan_dir_bh = NULL; 1851 struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1857 1876
1858 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1877 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1859 orphan_dir_bh, name, 1878 orphan_dir_bh, name,
1860 OCFS2_ORPHAN_NAMELEN, de_bh); 1879 OCFS2_ORPHAN_NAMELEN, lookup);
1861 if (status < 0) { 1880 if (status < 0) {
1862 ocfs2_inode_unlock(orphan_dir_inode, 1); 1881 ocfs2_inode_unlock(orphan_dir_inode, 1);
1863 1882
@@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1884 struct inode *inode, 1903 struct inode *inode,
1885 struct ocfs2_dinode *fe, 1904 struct ocfs2_dinode *fe,
1886 char *name, 1905 char *name,
1887 struct buffer_head *de_bh, 1906 struct ocfs2_dir_lookup_result *lookup,
1888 struct inode *orphan_dir_inode) 1907 struct inode *orphan_dir_inode)
1889{ 1908{
1890 struct buffer_head *orphan_dir_bh = NULL; 1909 struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 * underneath us... */ 1929 * underneath us... */
1911 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1930 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1912 if (S_ISDIR(inode->i_mode)) 1931 if (S_ISDIR(inode->i_mode))
1913 le16_add_cpu(&orphan_fe->i_links_count, 1); 1932 ocfs2_add_links_count(orphan_fe, 1);
1914 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1933 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1915 1934
1916 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1935 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1917 if (status < 0) { 1936 if (status < 0) {
@@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1922 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1941 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1923 OCFS2_ORPHAN_NAMELEN, inode, 1942 OCFS2_ORPHAN_NAMELEN, inode,
1924 OCFS2_I(inode)->ip_blkno, 1943 OCFS2_I(inode)->ip_blkno,
1925 orphan_dir_bh, de_bh); 1944 orphan_dir_bh, lookup);
1926 if (status < 0) { 1945 if (status < 0) {
1927 mlog_errno(status); 1946 mlog_errno(status);
1928 goto leave; 1947 goto leave;
@@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1955 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1974 char name[OCFS2_ORPHAN_NAMELEN + 1];
1956 struct ocfs2_dinode *orphan_fe; 1975 struct ocfs2_dinode *orphan_fe;
1957 int status = 0; 1976 int status = 0;
1958 struct buffer_head *target_de_bh = NULL; 1977 struct ocfs2_dir_lookup_result lookup = { NULL, };
1959 struct ocfs2_dir_entry *target_de = NULL;
1960 1978
1961 mlog_entry_void(); 1979 mlog_entry_void();
1962 1980
@@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1971 OCFS2_ORPHAN_NAMELEN); 1989 OCFS2_ORPHAN_NAMELEN);
1972 1990
1973	/* find its spot in the orphan directory */ 1991	/* find its spot in the orphan directory */
1974 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1992 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1975 orphan_dir_inode, &target_de); 1993 &lookup);
1976 if (!target_de_bh) { 1994 if (status) {
1977 status = -ENOENT;
1978 mlog_errno(status); 1995 mlog_errno(status);
1979 goto leave; 1996 goto leave;
1980 } 1997 }
1981 1998
1982 /* remove it from the orphan directory */ 1999 /* remove it from the orphan directory */
1983 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 2000 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1984 target_de_bh);
1985 if (status < 0) { 2001 if (status < 0) {
1986 mlog_errno(status); 2002 mlog_errno(status);
1987 goto leave; 2003 goto leave;
@@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1997 /* do the i_nlink dance! :) */ 2013 /* do the i_nlink dance! :) */
1998 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2014 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1999 if (S_ISDIR(inode->i_mode)) 2015 if (S_ISDIR(inode->i_mode))
2000 le16_add_cpu(&orphan_fe->i_links_count, -1); 2016 ocfs2_add_links_count(orphan_fe, -1);
2001 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2017 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2002 2018
2003 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2019 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2004 if (status < 0) { 2020 if (status < 0) {
@@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2007 } 2023 }
2008 2024
2009leave: 2025leave:
2010 brelse(target_de_bh); 2026 ocfs2_free_dir_lookup_result(&lookup);
2011 2027
2012 mlog_exit(status); 2028 mlog_exit(status);
2013 return status; 2029 return status;
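
The namei.c hunks above all follow one pattern: the old buffer_head/ocfs2_dir_entry pairs are folded into struct ocfs2_dir_lookup_result, which is zero-initialized on the stack, filled by ocfs2_find_entry() or ocfs2_prepare_dir_for_insert(), handed to ocfs2_update_entry()/ocfs2_delete_entry()/ocfs2_add_entry(), and released with ocfs2_free_dir_lookup_result(). A minimal sketch of that lifecycle, using only the ocfs2_* calls shown in the hunks (the wrapper name is hypothetical):

	static int example_remove_name(handle_t *handle, struct inode *dir,
				       const char *name, int namelen)
	{
		struct ocfs2_dir_lookup_result lookup = { NULL, };
		int status;

		/* ocfs2_find_entry() now returns a status and fills the
		 * lookup result instead of handing back a buffer_head. */
		status = ocfs2_find_entry(name, namelen, dir, &lookup);
		if (status)
			goto out;

		/* Consumers take the result in place of a bh/dirent pair. */
		status = ocfs2_delete_entry(handle, dir, &lookup);
	out:
		/* Safe even when the lookup never succeeded. */
		ocfs2_free_dir_lookup_result(&lookup);
		return status;
	}
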
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
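
The new helpers in ocfs2.h split a 32-bit link count across the existing 16-bit i_links_count (low word) and the reclaimed i_links_count_hi (high word, OCFS2_LINKS_HI_SHIFT being 16). An illustrative round trip, assuming OCFS2_INDEXED_DIR_FL is set on the inode (not real kernel code; a dinode would never sit on the stack like this):

	struct ocfs2_dinode di = {
		.i_dyn_features = cpu_to_le16(OCFS2_INDEXED_DIR_FL),
	};

	ocfs2_set_links_count(&di, 70000);
	/* i_links_count    = cpu_to_le16(70000 & 0xffff) -> 4464
	 * i_links_count_hi = cpu_to_le16(70000 >> 16)    -> 1    */
	WARN_ON(ocfs2_read_links_count(&di) != 70000);
	/* Without OCFS2_INDEXED_DIR_FL, only the low word (4464) is read
	 * back, preserving the old 16-bit on-disk semantics. */
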
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
 157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
800 * We also store name_len here so as to reduce the number of leaf blocks we
801 * need to search in case of collisions.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1113} 1209}
1114 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1222{
1117 int size; 1223 int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1133} 1239}
1134 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1262{
1137 u16 size; 1263 u16 size;
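
For a sense of the index's fan-out, ocfs2_dx_entries_per_leaf() works out as follows on a 4K block; the field sizes below are back-of-the-envelope assumptions (including an 8-byte struct ocfs2_block_check), not values taken from this diff:

	/*
	 * offsetof(struct ocfs2_dx_leaf, dl_list.de_entries)
	 *   = 8 (dl_signature) + 8 (dl_check) + 8 (dl_blkno)
	 *   + 4 (dl_fs_generation) + 4 (dl_reserved0) + 8 (dl_reserved1)
	 *   + 8 (dl_list header: de_reserved + de_count + de_num_used)
	 *   = 48 bytes
	 *
	 * sizeof(struct ocfs2_dx_entry) = 4 + 4 + 8 = 16 bytes
	 *
	 * entries per leaf = (4096 - 48) / 16 = 253
	 */
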
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
1662 * If the parent dir has recorded the last group used in allocation,
 1663	 * cool, use it. Otherwise, if we are allocating the new inode from
 1664	 * the same slot the parent dir belongs to, use the same chunk.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", blkno);
2201
2202 /* dirty read disk */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
2206 goto bail;
2207 }
2208
2209 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2210 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2211 mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
2212 status = -EINVAL;
2213 goto bail;
2214 }
2215
2216 if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
2217 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2218 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2219 blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2220 status = -EINVAL;
2221 goto bail;
2222 }
2223
2224 if (suballoc_slot)
2225 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2226 if (suballoc_bit)
2227 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2228
2229bail:
2230 brelse(inode_bh);
2231
2232 mlog_exit(status);
2233 return status;
2234}
2235
2236/*
 2237	 * Test whether the bit is SET in the allocator bitmap. On success, 0
 2238	 * is returned and *res is 1 for SET, 0 otherwise. On failure, a negative
 2239	 * errno is returned and *res is meaningless. Call this after you have
 2240	 * taken the cluster lock on the suballocator, or you may get a result
 2241	 * based on non-up2date contents.
2242 */
2243static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2244 struct inode *suballoc,
2245 struct buffer_head *alloc_bh, u64 blkno,
2246 u16 bit, int *res)
2247{
2248 struct ocfs2_dinode *alloc_fe;
2249 struct ocfs2_group_desc *group;
2250 struct buffer_head *group_bh = NULL;
2251 u64 bg_blkno;
2252 int status;
2253
2254 mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
2255
2256 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2257 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2258 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2259 (unsigned int)bit,
2260 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2261 status = -EINVAL;
2262 goto bail;
2263 }
2264
2265 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2266 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2267 &group_bh);
2268 if (status < 0) {
2269 mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
2270 goto bail;
2271 }
2272
2273 group = (struct ocfs2_group_desc *) group_bh->b_data;
2274 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2275
2276bail:
2277 brelse(group_bh);
2278
2279 mlog_exit(status);
2280 return status;
2281}
2282
2283/*
2284 * Test if the bit representing this inode (blkno) is set in the
2285 * suballocator.
2286 *
2287 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2288 *
2289 * In the event of failure, a negative value is returned and *res is
2290 * meaningless.
2291 *
2292 * Callers must make sure to hold nfs_sync_lock to prevent
2293 * ocfs2_delete_inode() on another node from accessing the same
2294 * suballocator concurrently.
2295 */
2296int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2297{
2298 int status;
2299 u16 suballoc_bit = 0, suballoc_slot = 0;
2300 struct inode *inode_alloc_inode;
2301 struct buffer_head *alloc_bh = NULL;
2302
2303 mlog_entry("blkno: %llu", blkno);
2304
2305 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2306 &suballoc_bit);
2307 if (status < 0) {
2308 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2309 goto bail;
2310 }
2311
2312 inode_alloc_inode =
2313 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2314 suballoc_slot);
2315 if (!inode_alloc_inode) {
2316 /* the error code could be inaccurate, but we are not able to
2317 * get the correct one. */
2318 status = -EINVAL;
2319 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2320 (u32)suballoc_slot);
2321 goto bail;
2322 }
2323
2324 mutex_lock(&inode_alloc_inode->i_mutex);
2325 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2326 if (status < 0) {
2327 mutex_unlock(&inode_alloc_inode->i_mutex);
2328 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2329 (u32)suballoc_slot, status);
2330 goto bail;
2331 }
2332
2333 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2334 blkno, suballoc_bit, res);
2335 if (status < 0)
2336 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2337
2338 ocfs2_inode_unlock(inode_alloc_inode, 0);
2339 mutex_unlock(&inode_alloc_inode->i_mutex);
2340
2341 iput(inode_alloc_inode);
2342 brelse(alloc_bh);
2343bail:
2344 mlog_exit(status);
2345 return status;
2346}
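
ocfs2_test_inode_bit() is exported above so code outside suballoc.c can check whether a block number still refers to a live inode; per its comment, callers hold the new NFS sync lock around the call. A hypothetical caller sketch (the wrapper and the -ESTALE policy are illustrative, not part of this diff):

	static int example_validate_inode(struct ocfs2_super *osb, u64 blkno)
	{
		int set = 0;
		int status;

		/* Caller is assumed to hold the NFS sync lock here. */
		status = ocfs2_test_inode_bit(osb, blkno, &set);
		if (status < 0)
			return status;

		/* status == 0 with set == 0 means the suballocator bit is
		 * clear, i.e. the inode behind blkno has been freed. */
		return set ? 0 : -ESTALE;
	}
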
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
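
With this change ocfs2_calc_xattr_init() no longer reserves metadata itself; it bumps a caller-supplied want_meta counter so the caller can make one ocfs2_reserve_new_metadata_blocks() call covering all of its needs. A sketch of the new calling convention, using only the two ocfs2_* calls shown in this patch (the wrapper is hypothetical):

	static int example_reserve_xattr_meta(struct ocfs2_super *osb,
					      struct inode *dir,
					      struct buffer_head *dir_bh,
					      int mode,
					      struct ocfs2_security_xattr_info *si,
					      struct ocfs2_alloc_context **meta_ac)
	{
		int want_clusters = 0, want_meta = 0, xattr_credits = 0;
		int status;

		/* The helper now only *counts* the metadata blocks needed. */
		status = ocfs2_calc_xattr_init(dir, dir_bh, mode, si,
					       &want_clusters, &xattr_credits,
					       &want_meta);
		if (status < 0)
			return status;

		/* One reservation then covers everything counted above. */
		if (want_meta)
			status = ocfs2_reserve_new_metadata_blocks(osb,
								   want_meta,
								   meta_ac);
		return status;
	}
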
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
262{ 262{
263 struct super_block *s = dentry->d_sb; 263 struct super_block *s = dentry->d_sb;
264 struct omfs_sb_info *sbi = OMFS_SB(s); 264 struct omfs_sb_info *sbi = OMFS_SB(s);
265 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
266
265 buf->f_type = OMFS_MAGIC; 267 buf->f_type = OMFS_MAGIC;
266 buf->f_bsize = sbi->s_blocksize; 268 buf->f_bsize = sbi->s_blocksize;
267 buf->f_blocks = sbi->s_num_blocks; 269 buf->f_blocks = sbi->s_num_blocks;
268 buf->f_files = sbi->s_num_blocks; 270 buf->f_files = sbi->s_num_blocks;
269 buf->f_namelen = OMFS_NAMELEN; 271 buf->f_namelen = OMFS_NAMELEN;
272 buf->f_fsid.val[0] = (u32)id;
273 buf->f_fsid.val[1] = (u32)(id >> 32);
270 274
271 buf->f_bfree = buf->f_bavail = buf->f_ffree = 275 buf->f_bfree = buf->f_bavail = buf->f_ffree =
272 omfs_count_free(s); 276 omfs_count_free(s);
277
273 return 0; 278 return 0;
274} 279}
275 280
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
421 426
422 sbi->s_uid = current_uid(); 427 sbi->s_uid = current_uid();
423 sbi->s_gid = current_gid(); 428 sbi->s_gid = current_gid();
424 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 429 sbi->s_dmask = sbi->s_fmask = current_umask();
425 430
426 if (!parse_options((char *) data, sbi)) 431 if (!parse_options((char *) data, sbi))
427 goto end; 432 goto end;
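
The f_fsid lines added here (and in the qnx4 hunk further down) follow one pattern: encode the backing device number as a 64-bit value and split it across the two 32-bit halves of f_fsid. A standalone sketch of the split, with a made-up value standing in for what huge_encode_dev() would return:

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint64_t id = 0x123456789abcdef0ULL; /* pretend encoded dev number */
                uint32_t val0 = (uint32_t)id;        /* low half  -> f_fsid.val[0] */
                uint32_t val1 = (uint32_t)(id >> 32);/* high half -> f_fsid.val[1] */

                printf("f_fsid = { %#x, %#x }\n", val0, val1);
                return 0;
        }
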
diff --git a/fs/open.c b/fs/open.c
index 75b61677daaf..377eb25b6abf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/audit.h> 30#include <linux/audit.h>
31#include <linux/falloc.h> 31#include <linux/falloc.h>
32#include <linux/fs_struct.h>
32 33
33int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
34{ 35{
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 38e337d51ced..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/genhd.h> 21#include <linux/genhd.h>
22#include <linux/blktrace_api.h>
22 23
23#include "check.h" 24#include "check.h"
24 25
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
294 295
295static struct attribute_group *part_attr_groups[] = { 296static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 297 &part_attr_group,
298#ifdef CONFIG_BLK_DEV_IO_TRACE
299 &blk_trace_attr_group,
300#endif
297 NULL 301 NULL
298}; 302};
299 303
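
The hunk relies on a common sysfs idiom: a NULL-terminated array of group pointers where optional entries are compiled in only under the matching config symbol, so the code that walks the array needs no #ifdefs of its own. A reduced standalone sketch of that shape, with strings standing in for the attribute groups:

        #include <stdio.h>

        #define CONFIG_BLK_DEV_IO_TRACE 1       /* pretend the option is enabled */

        static const char *part_attr_groups[] = {
                "part",
        #ifdef CONFIG_BLK_DEV_IO_TRACE
                "blktrace",
        #endif
                NULL,                           /* terminator the walker stops at */
        };

        int main(void)
        {
                for (const char **g = part_attr_groups; *g; g++)
                        printf("registering group: %s\n", *g);
                return 0;
        }
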
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b688..f71559784bfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
80#include <linux/oom.h> 80#include <linux/oom.h>
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h>
83#include "internal.h" 84#include "internal.h"
84 85
85/* NOTE: 86/* NOTE:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..74ea974f5ca6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
120 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
121#endif 121#endif
122#ifndef CONFIG_MMU 122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)), 123 K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
124#endif 124#endif
125 K(i.totalswap), 125 K(i.totalswap),
126 K(i.freeswap), 126 K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
76 76
77/* 77/*
78 * display a list of all the REGIONs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_region_list_show(struct seq_file *m, void *_p) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801f..39e4ad4f59f4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
204 struct file *file = vma->vm_file; 204 struct file *file = vma->vm_file;
205 int flags = vma->vm_flags; 205 int flags = vma->vm_flags;
206 unsigned long ino = 0; 206 unsigned long ino = 0;
207 unsigned long long pgoff = 0;
207 dev_t dev = 0; 208 dev_t dev = 0;
208 int len; 209 int len;
209 210
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 212 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
212 dev = inode->i_sb->s_dev; 213 dev = inode->i_sb->s_dev;
213 ino = inode->i_ino; 214 ino = inode->i_ino;
215 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
214 } 216 }
215 217
216 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 218 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
220 flags & VM_WRITE ? 'w' : '-', 222 flags & VM_WRITE ? 'w' : '-',
221 flags & VM_EXEC ? 'x' : '-', 223 flags & VM_EXEC ? 'x' : '-',
222 flags & VM_MAYSHARE ? 's' : 'p', 224 flags & VM_MAYSHARE ? 's' : 'p',
223 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 225 pgoff,
224 MAJOR(dev), MINOR(dev), ino, &len); 226 MAJOR(dev), MINOR(dev), ino, &len);
225 227
226 /* 228 /*
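
The point of the new pgoff variable: vm_pgoff is a page count held in an unsigned long, so it must be widened before the shift, and it should read as 0 for anonymous mappings rather than whatever the field happens to hold. A standalone sketch of the widening:

        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned long vm_pgoff = 0x100000;  /* page offset into the file */
                unsigned long long pgoff = 0;       /* default for anonymous vmas */
                int file_backed = 1;

                if (file_backed)            /* only meaningful when there is a file */
                        pgoff = (unsigned long long)vm_pgoff << PAGE_SHIFT;

                /* on 32-bit, vm_pgoff << 12 without the cast would truncate */
                printf("%08llx\n", pgoff);
                return 0;
        }
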
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..12c20377772d 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/fdtable.h> 4#include <linux/fdtable.h>
5#include <linux/fs_struct.h>
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/ptrace.h> 7#include <linux/ptrace.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
49 else 50 else
50 bytes += kobjsize(mm); 51 bytes += kobjsize(mm);
51 52
52 if (current->fs && atomic_read(&current->fs->count) > 1) 53 if (current->fs && current->fs->users > 1)
53 sbytes += kobjsize(current->fs); 54 sbytes += kobjsize(current->fs);
54 else 55 else
55 bytes += kobjsize(current->fs); 56 bytes += kobjsize(current->fs);
@@ -125,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
125 struct file *file; 126 struct file *file;
126 dev_t dev = 0; 127 dev_t dev = 0;
127 int flags, len; 128 int flags, len;
129 unsigned long long pgoff = 0;
128 130
129 flags = vma->vm_flags; 131 flags = vma->vm_flags;
130 file = vma->vm_file; 132 file = vma->vm_file;
@@ -133,17 +135,18 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
133 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
134 dev = inode->i_sb->s_dev; 136 dev = inode->i_sb->s_dev;
135 ino = inode->i_ino; 137 ino = inode->i_ino;
 138                pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
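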
136 } 139 }
137 140
138 seq_printf(m, 141 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 142 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
140 vma->vm_start, 143 vma->vm_start,
141 vma->vm_end, 144 vma->vm_end,
142 flags & VM_READ ? 'r' : '-', 145 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-', 146 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-', 147 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 148 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT, 149 pgoff,
147 MAJOR(dev), MINOR(dev), ino, &len); 150 MAJOR(dev), MINOR(dev), ino, &len);
148 151
149 if (file) { 152 if (file) {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) 282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
283{ 283{
284 struct super_block *sb = dentry->d_sb; 284 struct super_block *sb = dentry->d_sb;
285 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
285 286
286 lock_kernel(); 287 lock_kernel();
287 288
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
291 buf->f_bfree = qnx4_count_free_blocks(sb); 292 buf->f_bfree = qnx4_count_free_blocks(sb);
292 buf->f_bavail = buf->f_bfree; 293 buf->f_bavail = buf->f_bfree;
293 buf->f_namelen = QNX4_NAME_MAX; 294 buf->f_namelen = QNX4_NAME_MAX;
295 buf->f_fsid.val[0] = (u32)id;
296 buf->f_fsid.val[1] = (u32)(id >> 32);
294 297
295 unlock_kernel(); 298 unlock_kernel();
296 299
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2ca967a5ef77..607c579e5eca 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
823 823
824 spin_lock(&inode_lock); 824 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 827 continue;
828 if (!atomic_read(&inode->i_writecount)) 828 if (!atomic_read(&inode->i_writecount))
829 continue; 829 continue;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a404fb88e456..3a6b193d8444 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
221 save_mount_options(sb, data); 221 save_mount_options(sb, data);
222 222
223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); 223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
224 sb->s_fs_info = fsi;
224 if (!fsi) { 225 if (!fsi) {
225 err = -ENOMEM; 226 err = -ENOMEM;
226 goto fail; 227 goto fail;
227 } 228 }
228 sb->s_fs_info = fsi;
229 229
230 err = ramfs_parse_options(data, &fsi->mount_opts); 230 err = ramfs_parse_options(data, &fsi->mount_opts);
231 if (err) 231 if (err)
232 goto fail; 232 goto fail;
233 233
234 sb->s_maxbytes = MAX_LFS_FILESIZE; 234 sb->s_maxbytes = MAX_LFS_FILESIZE;
235 sb->s_blocksize = PAGE_CACHE_SIZE; 235 sb->s_blocksize = PAGE_CACHE_SIZE;
236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
237 sb->s_magic = RAMFS_MAGIC; 237 sb->s_magic = RAMFS_MAGIC;
238 sb->s_op = &ramfs_ops; 238 sb->s_op = &ramfs_ops;
239 sb->s_time_gran = 1; 239 sb->s_time_gran = 1;
240
240 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 241 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
241 if (!inode) { 242 if (!inode) {
242 err = -ENOMEM; 243 err = -ENOMEM;
@@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
244 } 245 }
245 246
246 root = d_alloc_root(inode); 247 root = d_alloc_root(inode);
248 sb->s_root = root;
247 if (!root) { 249 if (!root) {
248 err = -ENOMEM; 250 err = -ENOMEM;
249 goto fail; 251 goto fail;
250 } 252 }
251 sb->s_root = root; 253
252 return 0; 254 return 0;
253fail: 255fail:
254 kfree(fsi); 256 kfree(fsi);
257 sb->s_fs_info = NULL;
255 iput(inode); 258 iput(inode);
256 return err; 259 return err;
257} 260}
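
The reordering above is about teardown safety: s_fs_info is now published (or re-set to NULL) at every point where fsi's lifetime changes, so a later put_super/kill path can never see a pointer to freed memory. A minimal sketch of the pattern, with plain malloc/free standing in for the kernel allocators:

        #include <stdlib.h>

        struct sb { void *s_fs_info; };

        static int fill_super(struct sb *sb, int force_fail)
        {
                void *fsi = malloc(64);

                sb->s_fs_info = fsi;    /* publish even before the NULL check */
                if (!fsi)
                        return -1;      /* -ENOMEM in the kernel */
                if (force_fail)
                        goto fail;
                return 0;

        fail:
                free(fsi);
                sb->s_fs_info = NULL;   /* teardown must not see freed fsi */
                return -1;
        }

        int main(void)
        {
                struct sb sb;
                return fill_super(&sb, 0) ? 1 : 0;
        }
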
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
731 return ret; 731 return ret;
732} 732}
733 733
734static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
735{
736#define HALF_LONG_BITS (BITS_PER_LONG / 2)
737 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
738}
739
740SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
741 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
742{
743 loff_t pos = pos_from_hilo(pos_h, pos_l);
744 struct file *file;
745 ssize_t ret = -EBADF;
746 int fput_needed;
747
748 if (pos < 0)
749 return -EINVAL;
750
751 file = fget_light(fd, &fput_needed);
752 if (file) {
753 ret = -ESPIPE;
754 if (file->f_mode & FMODE_PREAD)
755 ret = vfs_readv(file, vec, vlen, &pos);
756 fput_light(file, fput_needed);
757 }
758
759 if (ret > 0)
760 add_rchar(current, ret);
761 inc_syscr(current);
762 return ret;
763}
764
765SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
766 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
767{
768 loff_t pos = pos_from_hilo(pos_h, pos_l);
769 struct file *file;
770 ssize_t ret = -EBADF;
771 int fput_needed;
772
773 if (pos < 0)
774 return -EINVAL;
775
776 file = fget_light(fd, &fput_needed);
777 if (file) {
778 ret = -ESPIPE;
779 if (file->f_mode & FMODE_PWRITE)
780 ret = vfs_writev(file, vec, vlen, &pos);
781 fput_light(file, fput_needed);
782 }
783
784 if (ret > 0)
785 add_wchar(current, ret);
786 inc_syscw(current);
787 return ret;
788}
789
734static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 790static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
735 size_t count, loff_t max) 791 size_t count, loff_t max)
736{ 792{
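
The odd-looking double shift in pos_from_hilo() is deliberate: shifting an unsigned long by BITS_PER_LONG would be undefined, and on 64-bit pos_l already carries the whole offset with pos_h being 0. A userspace-side sketch of the split and the reassembly (the syscall itself would normally be reached through the preadv() wrapper):

        #include <stdint.h>
        #include <stdio.h>

        #define BITS_PER_LONG  (8 * (int)sizeof(unsigned long))
        #define HALF_LONG_BITS (BITS_PER_LONG / 2)

        static int64_t pos_from_hilo(unsigned long high, unsigned long low)
        {
                /* two half-width shifts: defined even when high is full width */
                return (((int64_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
        }

        int main(void)
        {
                int64_t pos = 0x123456789aLL;   /* an offset above 4 GiB */
                unsigned long pos_l = (unsigned long)pos;
                unsigned long pos_h =
                        (unsigned long)((pos >> HALF_LONG_BITS) >> HALF_LONG_BITS);

                printf("rebuilt: %llx\n",
                       (unsigned long long)pos_from_hilo(pos_h, pos_l));
                return 0;
        }
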
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
1config REISERFS_FS 1config REISERFS_FS
2 tristate "Reiserfs support" 2 tristate "Reiserfs support"
3 select CRC32
3 help 4 help
4 Stores not just filenames but the files themselves in a balanced 5 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling. 6 tree. Uses journalling.
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 972250c62896..0ae6486d9046 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h>
30 31
31struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
32 33
@@ -1904,6 +1905,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1904 buf->f_bsize = dentry->d_sb->s_blocksize; 1905 buf->f_bsize = dentry->d_sb->s_blocksize;
1905 /* changed to accommodate gcc folks. */ 1906 /* changed to accommodate gcc folks. */
1906 buf->f_type = REISERFS_SUPER_MAGIC; 1907 buf->f_type = REISERFS_SUPER_MAGIC;
1908 buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
1909 buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
1910 sizeof(rs->s_uuid)/2);
1911
1907 return 0; 1912 return 0;
1908} 1913}
1909 1914
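
The new fsid is derived by running crc32_le() over each half of the 16-byte on-disk UUID, giving two stable 32-bit words. A standalone sketch with a minimal bitwise CRC; the kernel's lib/crc32 implementation is table-driven but computes the same little-endian polynomial:

        #include <stddef.h>
        #include <stdint.h>
        #include <stdio.h>

        /* bit-at-a-time equivalent of the kernel's crc32_le() */
        static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
        {
                while (len--) {
                        crc ^= *p++;
                        for (int i = 0; i < 8; i++)
                                crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
                }
                return crc;
        }

        int main(void)
        {
                uint8_t s_uuid[16] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4,
                                       5, 6, 7, 8, 9, 10, 11, 12 };
                uint32_t val0 = crc32_le(0, s_uuid, sizeof(s_uuid) / 2);
                uint32_t val1 = crc32_le(0, s_uuid + sizeof(s_uuid) / 2,
                                         sizeof(s_uuid) / 2);

                printf("f_fsid = { %#x, %#x }\n", val0, val1);
                return 0;
        }
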
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d1..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
428 } else { 428 } else {
429 apply_umask: 429 apply_umask:
430 /* no ACL, apply umask */ 430 /* no ACL, apply umask */
431 inode->i_mode &= ~current->fs->umask; 431 inode->i_mode &= ~current_umask();
432 } 432 }
433 433
434 return err; 434 return err;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
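
One invariant both the deleted code above and its replacement below rely on: the first 512 bytes of the image, read as big-endian 32-bit words, must sum to zero, because mkfs stores the negated sum in the superblock's checksum field. A standalone sketch of checking and fixing up a toy header:

        #include <stdint.h>
        #include <stdio.h>

        static uint32_t romfs_checksum(const void *data, int size)
        {
                const uint8_t *p = data;
                uint32_t sum = 0;

                for (size >>= 2; size > 0; size--, p += 4)
                        sum += (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
                return sum;
        }

        int main(void)
        {
                uint8_t img[16] = "-rom1fs-";   /* word0/word1 magic */
                img[11] = 16;                   /* big-endian size field = 16 */

                /* store the negated sum in the checksum field (bytes 12..15) */
                uint32_t fix = -romfs_checksum(img, 16);
                img[12] = fix >> 24;
                img[13] = fix >> 16;
                img[14] = fix >> 8;
                img[15] = fix;

                printf("checksum ok: %d\n", romfs_checksum(img, 16) == 0);
                return 0;
        }
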
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
 18 * - only supported for NOMMU at the moment (an MMU kernel doesn't copy
 19 *   private mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
 60 * permit an R/O mapping to be made directly onto an MTD device if
 61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
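
The size check in romfs_get_unmapped_area() is written to be overflow-proof: it never forms offset + len, which could wrap and slip past a naive comparison. A standalone demonstration of the difference:

        #include <stdio.h>

        int main(void)
        {
                unsigned long isize  = 4096;
                unsigned long offset = 0x1000;
                unsigned long len    = (unsigned long)-0x800; /* absurdly large */

                /* naive: offset + len wraps to 0x800 and the check passes */
                printf("naive check rejects: %d\n", offset + len > isize);

                /* wrap-free: bound each operand first, then subtract */
                int bad = offset > isize || len > isize || offset > isize - len;
                printf("safe check rejects:  %d\n", bad);
                return 0;
        }
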
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
 25 * read data from a romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[16];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time */
78 while (size > 0) {
79 segment = min_t(size_t, size, 16);
80 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
81 if (ret < 0)
82 return ret;
83 if (memcmp(buf, str, len) != 0)
84 return 0;
85 size -= len;
86 pos += len;
87 str += len;
88 }
89
90 return 1;
91}
92#endif /* CONFIG_ROMFS_ON_MTD */
93
94#ifdef CONFIG_ROMFS_ON_BLOCK
95/*
 96 * read data from a romfs image on a block device
97 */
98static int romfs_blk_read(struct super_block *sb, unsigned long pos,
99 void *buf, size_t buflen)
100{
101 struct buffer_head *bh;
102 unsigned long offset;
103 size_t segment;
104
105 /* copy the string up to blocksize bytes at a time */
106 while (buflen > 0) {
107 offset = pos & (ROMBSIZE - 1);
108 segment = min_t(size_t, buflen, ROMBSIZE - offset);
109 bh = sb_bread(sb, pos >> ROMBSBITS);
110 if (!bh)
111 return -EIO;
112 memcpy(buf, bh->b_data + offset, segment);
113 brelse(bh);
114 buflen -= segment;
115 pos += segment;
116 }
117
118 return 0;
119}
120
121/*
122 * determine the length of a string in romfs on a block device
123 */
124static ssize_t romfs_blk_strnlen(struct super_block *sb,
125 unsigned long pos, size_t limit)
126{
127 struct buffer_head *bh;
128 unsigned long offset;
129 ssize_t n = 0;
130 size_t segment;
131 u_char *buf, *p;
132
133 /* scan the string up to blocksize bytes at a time */
134 while (limit > 0) {
135 offset = pos & (ROMBSIZE - 1);
136 segment = min_t(size_t, limit, ROMBSIZE - offset);
137 bh = sb_bread(sb, pos >> ROMBSBITS);
138 if (!bh)
139 return -EIO;
140 buf = bh->b_data + offset;
141 p = memchr(buf, 0, segment);
142 brelse(bh);
143 if (p)
144 return n + (p - buf);
145 limit -= segment;
146 pos += segment;
147 n += segment;
148 }
149
150 return n;
151}
152
153/*
154 * compare a string to one in a romfs image on a block device
155 * - return 1 if matched, 0 if differ, -ve if error
156 */
157static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
158 const char *str, size_t size)
159{
160 struct buffer_head *bh;
161 unsigned long offset;
162 size_t segment;
163 bool x;
164
165 /* scan the string up to 16 bytes at a time */
166 while (size > 0) {
167 offset = pos & (ROMBSIZE - 1);
168 segment = min_t(size_t, size, ROMBSIZE - offset);
169 bh = sb_bread(sb, pos >> ROMBSBITS);
170 if (!bh)
171 return -EIO;
172 x = (memcmp(bh->b_data + offset, str, segment) != 0);
173 brelse(bh);
174 if (x)
175 return 0;
176 size -= segment;
177 pos += segment;
178 str += segment;
179 }
180
181 return 1;
182}
183#endif /* CONFIG_ROMFS_ON_BLOCK */
184
185/*
186 * read data from the romfs image
187 */
188int romfs_dev_read(struct super_block *sb, unsigned long pos,
189 void *buf, size_t buflen)
190{
191 size_t limit;
192
193 limit = romfs_maxsize(sb);
194 if (pos >= limit)
195 return -EIO;
196 if (buflen > limit - pos)
197 buflen = limit - pos;
198
199#ifdef CONFIG_ROMFS_ON_MTD
200 if (sb->s_mtd)
201 return romfs_mtd_read(sb, pos, buf, buflen);
202#endif
203#ifdef CONFIG_ROMFS_ON_BLOCK
204 if (sb->s_bdev)
205 return romfs_blk_read(sb, pos, buf, buflen);
206#endif
207 return -EIO;
208}
209
210/*
211 * determine the length of a string in romfs
212 */
213ssize_t romfs_dev_strnlen(struct super_block *sb,
214 unsigned long pos, size_t maxlen)
215{
216 size_t limit;
217
218 limit = romfs_maxsize(sb);
219 if (pos >= limit)
220 return -EIO;
221 if (maxlen > limit - pos)
222 maxlen = limit - pos;
223
224#ifdef CONFIG_ROMFS_ON_MTD
225 if (sb->s_mtd)
226                return romfs_mtd_strnlen(sb, pos, maxlen);
227#endif
228#ifdef CONFIG_ROMFS_ON_BLOCK
229 if (sb->s_bdev)
230                return romfs_blk_strnlen(sb, pos, maxlen);
231#endif
232 return -EIO;
233}
234
235/*
236 * compare a string to one in romfs
237 * - return 1 if matched, 0 if differ, -ve if error
238 */
239int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
240 const char *str, size_t size)
241{
242 size_t limit;
243
244 limit = romfs_maxsize(sb);
245 if (pos >= limit)
246 return -EIO;
247 if (size > ROMFS_MAXFN)
248 return -ENAMETOOLONG;
249 if (size > limit - pos)
250 return -EIO;
251
252#ifdef CONFIG_ROMFS_ON_MTD
253 if (sb->s_mtd)
254 return romfs_mtd_strncmp(sb, pos, str, size);
255#endif
256#ifdef CONFIG_ROMFS_ON_BLOCK
257 if (sb->s_bdev)
258 return romfs_blk_strncmp(sb, pos, str, size);
259#endif
260 return -EIO;
261}
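
The block-device helpers above all share one piece of arithmetic: an arbitrary (pos, length) request is cut into segments that never cross a ROMBSIZE block boundary, so each iteration touches exactly one buffer_head. A standalone sketch that prints the segments instead of calling sb_bread():

        #include <stdio.h>

        #define ROMBSIZE 1024UL

        int main(void)
        {
                unsigned long pos = 1000, buflen = 2000;

                while (buflen > 0) {
                        unsigned long offset  = pos & (ROMBSIZE - 1);
                        unsigned long segment = buflen < ROMBSIZE - offset ?
                                                buflen : ROMBSIZE - offset;

                        printf("block %lu: offset %lu, %lu bytes\n",
                               pos / ROMBSIZE, offset, segment);
                        buflen -= segment;
                        pos += segment;
                }
                return 0;
        }
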
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..10ca7d984a8b
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,653 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * in readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
77static struct kmem_cache *romfs_inode_cachep;
78
79static const umode_t romfs_modemap[8] = {
80 0, /* hard link */
81 S_IFDIR | 0644, /* directory */
82 S_IFREG | 0644, /* regular file */
83 S_IFLNK | 0777, /* symlink */
84 S_IFBLK | 0600, /* blockdev */
85 S_IFCHR | 0600, /* chardev */
86 S_IFSOCK | 0644, /* socket */
87 S_IFIFO | 0644 /* FIFO */
88};
89
90static const unsigned char romfs_dtype_table[] = {
91 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
92};
93
94static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
141static const struct address_space_operations romfs_aops = {
142 .readpage = romfs_readpage
143};
144
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
270 * (Although as I see, it only matters on writable file
271 * systems).
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301        unsigned nextfh;
302        int ret;
302 umode_t mode;
303
304 /* we might have to traverse a chain of "hard link" file entries to get
305 * to the actual file */
306 for (;;) {
307 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
308 if (ret < 0)
309 goto error;
310
311 /* XXX: do romfs_checksum here too (with name) */
312
313 nextfh = be32_to_cpu(ri.next);
314 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
315 break;
316
317 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
318 }
319
320 /* determine the length of the filename */
321 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
322 if (IS_ERR_VALUE(nlen))
323 goto eio;
324
325 /* get an inode for this image position */
326 i = iget_locked(sb, pos);
327 if (!i)
328 return ERR_PTR(-ENOMEM);
329
330 if (!(i->i_state & I_NEW))
331 return i;
332
333 /* precalculate the data offset */
334 inode = ROMFS_I(i);
335 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
336 inode->i_dataoffset = pos + inode->i_metasize;
337
338 i->i_nlink = 1; /* Hard to decide.. */
339 i->i_size = be32_to_cpu(ri.size);
340 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
341 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
342
343 /* set up mode and ops */
344 mode = romfs_modemap[nextfh & ROMFH_TYPE];
345
346 switch (nextfh & ROMFH_TYPE) {
347 case ROMFH_DIR:
348 i->i_size = ROMFS_I(i)->i_metasize;
349 i->i_op = &romfs_dir_inode_operations;
350 i->i_fop = &romfs_dir_operations;
351 if (nextfh & ROMFH_EXEC)
352 mode |= S_IXUGO;
353 break;
354 case ROMFH_REG:
355 i->i_fop = &romfs_ro_fops;
356 i->i_data.a_ops = &romfs_aops;
357 if (i->i_sb->s_mtd)
358 i->i_data.backing_dev_info =
359 i->i_sb->s_mtd->backing_dev_info;
360 if (nextfh & ROMFH_EXEC)
361 mode |= S_IXUGO;
362 break;
363 case ROMFH_SYM:
364 i->i_op = &page_symlink_inode_operations;
365 i->i_data.a_ops = &romfs_aops;
366 mode |= S_IRWXUGO;
367 break;
368 default:
369 /* depending on MBZ for sock/fifos */
370 nextfh = be32_to_cpu(ri.spec);
371 init_special_inode(i, mode, MKDEV(nextfh >> 16,
372 nextfh & 0xffff));
373 break;
374 }
375
376 i->i_mode = mode;
377
378 unlock_new_inode(i);
379 return i;
380
381eio:
382 ret = -EIO;
383error:
384 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
385 return ERR_PTR(ret);
386}
387
388/*
389 * allocate a new inode
390 */
391static struct inode *romfs_alloc_inode(struct super_block *sb)
392{
393 struct romfs_inode_info *inode;
394 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
395 return inode ? &inode->vfs_inode : NULL;
396}
397
398/*
399 * return a spent inode to the slab cache
400 */
401static void romfs_destroy_inode(struct inode *inode)
402{
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404}
405
406/*
407 * get filesystem statistics
408 */
409static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
410{
411 struct super_block *sb = dentry->d_sb;
412 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
413
414 buf->f_type = ROMFS_MAGIC;
415 buf->f_namelen = ROMFS_MAXFN;
416 buf->f_bsize = ROMBSIZE;
417	buf->f_bfree = buf->f_bavail = buf->f_ffree = 0;
418 buf->f_blocks =
419 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
420 buf->f_fsid.val[0] = (u32)id;
421 buf->f_fsid.val[1] = (u32)(id >> 32);
422 return 0;
423}
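The two f_fsid words are simply the 64-bit encoded device number split in half; the same packing appears in the squashfs and sysv hunks later in this diff. A quick user-space sketch of the round trip (the id value is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t id = 0x123456789abcdef0ULL;   /* stand-in for huge_encode_dev() */
            uint32_t val0 = (uint32_t)id;          /* f_fsid.val[0]: low 32 bits  */
            uint32_t val1 = (uint32_t)(id >> 32);  /* f_fsid.val[1]: high 32 bits */
            uint64_t back = ((uint64_t)val1 << 32) | val0;

            printf("%s\n", back == id ? "round-trips" : "broken");
            return 0;
    }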
424
425/*
426 * remounting must involve read-only
427 */
428static int romfs_remount(struct super_block *sb, int *flags, char *data)
429{
430 *flags |= MS_RDONLY;
431 return 0;
432}
433
434static const struct super_operations romfs_super_ops = {
435 .alloc_inode = romfs_alloc_inode,
436 .destroy_inode = romfs_destroy_inode,
437 .statfs = romfs_statfs,
438 .remount_fs = romfs_remount,
439};
440
441/*
442 * checksum check on part of a romfs filesystem
443 */
444static __u32 romfs_checksum(const void *data, int size)
445{
446 const __be32 *ptr = data;
447 __u32 sum;
448
449 sum = 0;
450 size >>= 2;
451 while (size > 0) {
452 sum += be32_to_cpu(*ptr++);
453 size--;
454 }
455 return sum;
456}
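The checksum is a plain sum of big-endian 32-bit words, so an image generator can balance a volume by storing the negated sum of everything else in the superblock's checksum field; a valid image then sums to zero over its first 512 bytes, which is exactly what romfs_fill_super() below verifies. A user-space sketch of that convention, assuming the usual generator layout (the balance() helper is illustrative):

    #include <stdint.h>
    #include <arpa/inet.h>  /* ntohl()/htonl(): romfs words are big-endian */

    /* the same algorithm as romfs_checksum() above */
    static uint32_t checksum(const void *data, int size)
    {
            const uint32_t *ptr = data;
            uint32_t sum = 0;

            for (size >>= 2; size > 0; size--)
                    sum += ntohl(*ptr++);
            return sum;
    }

    /* balance the image: word 3 is the checksum field, so zero it, then
     * store -sum there so that checksum(image, 512) becomes 0 */
    static void balance(uint32_t *image)
    {
            image[3] = 0;
            image[3] = htonl(-checksum(image, 512));
    }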
457
458/*
459 * fill in the superblock
460 */
461static int romfs_fill_super(struct super_block *sb, void *data, int silent)
462{
463 struct romfs_super_block *rsb;
464 struct inode *root;
465 unsigned long pos, img_size;
466 const char *storage;
467 size_t len;
468 int ret;
469
470#ifdef CONFIG_BLOCK
471 if (!sb->s_mtd) {
472 sb_set_blocksize(sb, ROMBSIZE);
473 } else {
474 sb->s_blocksize = ROMBSIZE;
475 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
476 }
477#endif
478
479 sb->s_maxbytes = 0xFFFFFFFF;
480 sb->s_magic = ROMFS_MAGIC;
481 sb->s_flags |= MS_RDONLY | MS_NOATIME;
482 sb->s_op = &romfs_super_ops;
483
484 /* read the image superblock and check it */
485 rsb = kmalloc(512, GFP_KERNEL);
486 if (!rsb)
487 return -ENOMEM;
488
489	sb->s_fs_info = (void *) 512; /* provisional image size for the first read */
490 ret = romfs_dev_read(sb, 0, rsb, 512);
491 if (ret < 0)
492 goto error_rsb;
493
494 img_size = be32_to_cpu(rsb->size);
495
496 if (sb->s_mtd && img_size > sb->s_mtd->size)
497 goto error_rsb_inval;
498
499 sb->s_fs_info = (void *) img_size;
500
501 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
502 img_size < ROMFH_SIZE) {
503 if (!silent)
504 printk(KERN_WARNING "VFS:"
505 " Can't find a romfs filesystem on dev %s.\n",
506 sb->s_id);
507 goto error_rsb_inval;
508 }
509
510 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
511 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
512 sb->s_id);
513 goto error_rsb_inval;
514 }
515
516 storage = sb->s_mtd ? "MTD" : "the block layer";
517
518 len = strnlen(rsb->name, ROMFS_MAXFN);
519 if (!silent)
520 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
521 (unsigned) len, (unsigned) len, rsb->name, storage);
522
523 kfree(rsb);
524 rsb = NULL;
525
526 /* find the root directory */
527 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
528
529 root = romfs_iget(sb, pos);
530	if (IS_ERR(root))
531 goto error;
532
533 sb->s_root = d_alloc_root(root);
534 if (!sb->s_root)
535 goto error_i;
536
537 return 0;
538
539error_i:
540 iput(root);
541error:
542 return -EINVAL;
543error_rsb_inval:
544 ret = -EINVAL;
545error_rsb:
546 return ret;
547}
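For reference, the 512 bytes read and verified above begin with a fixed header; a sketch of that on-disk layout (field names mirror struct romfs_super_block in the romfs headers):

    struct romfs_super_block_layout {
            __be32 word0;      /* "-rom" (ROMSB_WORD0) */
            __be32 word1;      /* "1fs-" (ROMSB_WORD1) */
            __be32 size;       /* image size in bytes */
            __be32 checksum;   /* balances the first 512 bytes to zero */
            char name[0];      /* NUL-terminated volume name, 16-byte padded */
    };

The root directory's first file header then starts at the next 16-byte boundary after the name, which is the pos computed above.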
548
549/*
550 * get a superblock for mounting
551 */
552static int romfs_get_sb(struct file_system_type *fs_type,
553 int flags, const char *dev_name,
554 void *data, struct vfsmount *mnt)
555{
556 int ret = -EINVAL;
557
558#ifdef CONFIG_ROMFS_ON_MTD
559 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
560 mnt);
561#endif
562#ifdef CONFIG_ROMFS_ON_BLOCK
563 if (ret == -EINVAL)
564 ret = get_sb_bdev(fs_type, flags, dev_name, data,
565 romfs_fill_super, mnt);
566#endif
567 return ret;
568}
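Because romfs_fs_type below sets FS_REQUIRES_DEV, user space always names a backing device, and the probe order above tries an MTD attachment before falling back to the block layer. A hedged user-space sketch of mounting an image (device and mount-point paths are examples):

    #include <sys/mount.h>
    #include <stdio.h>

    int main(void)
    {
            /* works for an MTD device or a block device alike; the kernel
             * picks the backing store in romfs_get_sb() */
            if (mount("/dev/mtdblock0", "/mnt/rom", "romfs", MS_RDONLY, NULL))
                    perror("mount");
            return 0;
    }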
569
570/*
571 * destroy a romfs superblock in the appropriate manner
572 */
573static void romfs_kill_sb(struct super_block *sb)
574{
575#ifdef CONFIG_ROMFS_ON_MTD
576 if (sb->s_mtd) {
577 kill_mtd_super(sb);
578 return;
579 }
580#endif
581#ifdef CONFIG_ROMFS_ON_BLOCK
582 if (sb->s_bdev) {
583 kill_block_super(sb);
584 return;
585 }
586#endif
587}
588
589static struct file_system_type romfs_fs_type = {
590 .owner = THIS_MODULE,
591 .name = "romfs",
592 .get_sb = romfs_get_sb,
593 .kill_sb = romfs_kill_sb,
594 .fs_flags = FS_REQUIRES_DEV,
595};
596
597/*
598 * inode storage initialiser
599 */
600static void romfs_i_init_once(void *_inode)
601{
602 struct romfs_inode_info *inode = _inode;
603
604 inode_init_once(&inode->vfs_inode);
605}
606
607/*
608 * romfs module initialisation
609 */
610static int __init init_romfs_fs(void)
611{
612 int ret;
613
614 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
615
616 romfs_inode_cachep =
617 kmem_cache_create("romfs_i",
618 sizeof(struct romfs_inode_info), 0,
619 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
620 romfs_i_init_once);
621
622 if (!romfs_inode_cachep) {
623 printk(KERN_ERR
624 "ROMFS error: Failed to initialise inode cache\n");
625 return -ENOMEM;
626 }
627 ret = register_filesystem(&romfs_fs_type);
628 if (ret) {
629 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
630 goto error_register;
631 }
632 return 0;
633
634error_register:
635 kmem_cache_destroy(romfs_inode_cachep);
636 return ret;
637}
638
639/*
640 * romfs module removal
641 */
642static void __exit exit_romfs_fs(void)
643{
644 unregister_filesystem(&romfs_fs_type);
645 kmem_cache_destroy(romfs_inode_cachep);
646}
647
648module_init(init_romfs_fs);
649module_exit(exit_romfs_fs);
650
651MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
652MODULE_AUTHOR("Red Hat, Inc.");
653MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..c18aa7e03e2b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
59 */ 59 */
60 wait_on_page_writeback(page); 60 wait_on_page_writeback(page);
61 61
62 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 62 if (page_has_private(page) &&
63 !try_to_release_page(page, GFP_KERNEL))
63 goto out_unlock; 64 goto out_unlock;
64 65
65 /* 66 /*
@@ -736,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
736 * ->write_end. Most of the time, these expect i_mutex to 737 * ->write_end. Most of the time, these expect i_mutex to
737 * be held. Since this may result in an ABBA deadlock with 738 * be held. Since this may result in an ABBA deadlock with
738 * pipe->inode, we have to order lock acquiry here. 739 * pipe->inode, we have to order lock acquiry here.
740 *
741 * Outer lock must be inode->i_mutex, as pipe_wait() will
742 * release and reacquire pipe->inode->i_mutex, AND inode must
743 * never be a pipe.
739 */ 744 */
740 inode_double_lock(inode, pipe->inode); 745 WARN_ON(S_ISFIFO(inode->i_mode));
746 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
747 if (pipe->inode)
748 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
741 ret = __splice_from_pipe(pipe, &sd, actor); 749 ret = __splice_from_pipe(pipe, &sd, actor);
742 inode_double_unlock(inode, pipe->inode); 750 if (pipe->inode)
751 mutex_unlock(&pipe->inode->i_mutex);
752 mutex_unlock(&inode->i_mutex);
743 753
744 return ret; 754 return ret;
745} 755}
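The replacement for inode_double_lock() pins down a strict parent-before-child order: the file's i_mutex is always taken before the pipe's, with lockdep subclasses documenting the nesting, so pipe_wait()'s drop-and-retake of the pipe mutex can never invert the order. The same discipline as a runnable user-space analogue (names are illustrative):

    #include <pthread.h>

    static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;  /* "parent" */
    static pthread_mutex_t pipe_lock = PTHREAD_MUTEX_INITIALIZER;  /* "child"  */

    static void splice_to_file(void)
    {
            /* every path takes the outer (file) lock first... */
            pthread_mutex_lock(&file_lock);
            /* ...and the inner (pipe) lock second; one global order
             * makes the ABBA deadlock impossible */
            pthread_mutex_lock(&pipe_lock);

            /* move the data here */

            pthread_mutex_unlock(&pipe_lock);
            pthread_mutex_unlock(&file_lock);
    }

    int main(void)
    {
            splice_to_file();
            return 0;
    }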
@@ -830,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
830 }; 840 };
831 ssize_t ret; 841 ssize_t ret;
832 842
833 inode_double_lock(inode, pipe->inode); 843 WARN_ON(S_ISFIFO(inode->i_mode));
844 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
834 ret = file_remove_suid(out); 845 ret = file_remove_suid(out);
835 if (likely(!ret)) 846 if (likely(!ret)) {
847 if (pipe->inode)
848 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 849 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
837 inode_double_unlock(inode, pipe->inode); 850 if (pipe->inode)
851 mutex_unlock(&pipe->inode->i_mutex);
852 }
853 mutex_unlock(&inode->i_mutex);
838 if (ret > 0) { 854 if (ret > 0) {
839 unsigned long nr_pages; 855 unsigned long nr_pages;
840 856
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 681ec0d83799..ffa6edcd2d0c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -301,6 +301,7 @@ failure:
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) 301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{ 302{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; 303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
304 u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
304 305
305 TRACE("Entered squashfs_statfs\n"); 306 TRACE("Entered squashfs_statfs\n");
306 307
@@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
311 buf->f_files = msblk->inodes; 312 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0; 313 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN; 314 buf->f_namelen = SQUASHFS_NAME_LEN;
315 buf->f_fsid.val[0] = (u32)id;
316 buf->f_fsid.val[1] = (u32)(id >> 32);
314 317
315 return 0; 318 return 0;
316} 319}
diff --git a/fs/super.c b/fs/super.c
index 2ba481518ba7..786fe7d72790 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb)
287 __fsync_super(sb); 287 __fsync_super(sb);
288 return sync_blockdev(sb->s_bdev); 288 return sync_blockdev(sb->s_bdev);
289} 289}
290EXPORT_SYMBOL_GPL(fsync_super);
290 291
291/** 292/**
292 * generic_shutdown_super - common helper for ->kill_sb() 293 * generic_shutdown_super - common helper for ->kill_sb()
@@ -770,6 +771,46 @@ void kill_litter_super(struct super_block *sb)
770 771
771EXPORT_SYMBOL(kill_litter_super); 772EXPORT_SYMBOL(kill_litter_super);
772 773
774static int ns_test_super(struct super_block *sb, void *data)
775{
776 return sb->s_fs_info == data;
777}
778
779static int ns_set_super(struct super_block *sb, void *data)
780{
781 sb->s_fs_info = data;
782 return set_anon_super(sb, NULL);
783}
784
785int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
786 int (*fill_super)(struct super_block *, void *, int),
787 struct vfsmount *mnt)
788{
789 struct super_block *sb;
790
791 sb = sget(fs_type, ns_test_super, ns_set_super, data);
792 if (IS_ERR(sb))
793 return PTR_ERR(sb);
794
795 if (!sb->s_root) {
796 int err;
797 sb->s_flags = flags;
798 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
799 if (err) {
800 up_write(&sb->s_umount);
801 deactivate_super(sb);
802 return err;
803 }
804
805 sb->s_flags |= MS_ACTIVE;
806 }
807
808 simple_set_mnt(mnt, sb);
809 return 0;
810}
811
812EXPORT_SYMBOL(get_sb_ns);
813
773#ifdef CONFIG_BLOCK 814#ifdef CONFIG_BLOCK
774static int set_bdev_super(struct super_block *s, void *data) 815static int set_bdev_super(struct super_block *s, void *data)
775{ 816{
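get_sb_ns() keys superblocks by an opaque namespace pointer stashed in s_fs_info, yielding one superblock per namespace. A hedged sketch of how a filesystem might call it — myfs_fill_super and the choice of the network namespace as the key are assumptions for illustration, not part of this patch:

    static int myfs_get_sb(struct file_system_type *fs_type, int flags,
                           const char *dev_name, void *data,
                           struct vfsmount *mnt)
    {
            /* sget() reuses the superblock whose s_fs_info already equals
             * this namespace pointer, or sets up a fresh one */
            return get_sb_ns(fs_type, flags, current->nsproxy->net_ns,
                             myfs_fill_super, mnt);
    }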
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
90{ 90{
91 struct super_block *sb = dentry->d_sb; 91 struct super_block *sb = dentry->d_sb;
92 struct sysv_sb_info *sbi = SYSV_SB(sb); 92 struct sysv_sb_info *sbi = SYSV_SB(sb);
93 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
93 94
94 buf->f_type = sb->s_magic; 95 buf->f_type = sb->s_magic;
95 buf->f_bsize = sb->s_blocksize; 96 buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
98 buf->f_files = sbi->s_ninodes; 99 buf->f_files = sbi->s_ninodes;
99 buf->f_ffree = sysv_count_free_inodes(sb); 100 buf->f_ffree = sysv_count_free_inodes(sb);
100 buf->f_namelen = SYSV_NAMELEN; 101 buf->f_namelen = SYSV_NAMELEN;
102 buf->f_fsid.val[0] = (u32)id;
103 buf->f_fsid.val[1] = (u32)(id >> 32);
101 return 0; 104 return 0;
102} 105}
103 106
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
22 depends on UBIFS_FS 22 depends on UBIFS_FS
23 help 23 help
24 This option allows to explicitly choose which compressions, if any, 24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inbility to read 25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems. 26 existing file systems.
27 27
28 If unsure, say 'N'. 28 If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
32 depends on UBIFS_FS 32 depends on UBIFS_FS
33 default y 33 default y
34 help 34 help
35 LZO compressor is generally faster then zlib but compresses worse. 35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure. 36 Say 'Y' if unsure.
37 37
38config UBIFS_FS_ZLIB 38config UBIFS_FS_ZLIB
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
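The new form is a plain ceiling division: triple the budgeted index size, divide by the per-LEB index capacity rounding up, and add one LEB for the in-the-gaps head. A worked user-space sketch (both byte counts are invented example values):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int64_t idx_size = 100000;      /* budgeted index bytes (example) */
            int64_t idx_leb_size = 126976;  /* usable bytes per index LEB (example) */
            int64_t lebs;

            idx_size += idx_size << 1;                            /* thrice the index size */
            lebs = (idx_size + idx_leb_size - 1) / idx_leb_size;  /* ceiling division */
            lebs += 1;                      /* the in-the-gaps head is not available */

            printf("min_idx_lebs = %lld\n", (long long)lebs);     /* prints 4 */
            return 0;
    }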
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
701 * it would break user expectations about what free space is. Users seem 698 * break user expectations about what free space is. Users seem accustomed
702 * accustomed to assume that if the file-system reports N bytes of free space, 699 * to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
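Bare printk() continuations are fragile because the logging code treats an un-annotated call as the start of a new message; KERN_CONT marks the call as a continuation of the current line. The corrected pattern in isolation (i, nlen and name are assumed to be in scope):

    /* emit a name character by character on a single log line */
    printk(KERN_DEBUG "dent name: ");
    for (i = 0; i < nlen && name[i]; i++)
            printk(KERN_CONT "%c", name[i]);
    printk(KERN_CONT "\n");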
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent than the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
949 * whole index and correct all inode sizes, which is long and unacceptable. 959 * whole index and correct all inode sizes, which is long and unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
490 * failed to find a LEB with @min_space bytes of free space and other negative 490 * failed to find a LEB with @min_space bytes of free space and other negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
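The find.c interface change swaps one quantity for its complement: callers now receive the offset where free space starts instead of the number of free bytes. The conversions, for reference (a fragment in the function's own terms):

    /* the free byte count and the start offset are complements in a LEB */
    offs = c->leb_size - lprops->free;   /* what the function returns now */
    free = c->leb_size - offs;           /* recovering the old return value */
    /* hence "*offs == 0" replaces "*free == c->leb_size" as the
     * completely-empty-LEB test */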
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
159 204
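list_sort() accepts any comparator that returns a value <= 0 when its first argument should sort earlier; the two comparators defined below use that contract to encode a multi-key order (inode number, then block number or hash). A minimal hedged usage sketch (struct item is illustrative, not from this patch):

    struct item {
            int len;
            struct list_head list;
    };

    /* order items by ascending length; returning <= 0 keeps @a first */
    static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
    {
            struct item *ia = list_entry(a, struct item, list);
            struct item *ib = list_entry(b, struct item, list);

            return ia->len - ib->len;
    }

    /* then simply: list_sort(NULL, &my_list, item_cmp); */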
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
241/**
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
248 * first and are sorted by length in descending order. Directory entry nodes go
249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of inodes to garbage collect. First of all, it
166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes to the
299 * @sleb->nodes and @nondata lists correspondingly.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
304 * nodes with higher block number;
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
361 * @snod: the mode to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
139 * Write buffer wasn't seek'ed or there is no enough space - look for an 139 * Write buffer wasn't seek'ed or there is no enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
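The remove_buds() change is a pure simplification; list_move() is the canonical unlink-and-relink helper and performs the same two operations:

    /* before: two steps */
    list_del(&bud->list);
    list_add(&bud->list, &c->old_buds);

    /* after: the same pointer surgery in one helper */
    list_move(&bud->list, &c->old_buds);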
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes.
1766 * o %4 - wasted @len bytes;
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if more data is not found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
483/** 459/**
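The rewritten no_more_nodes() above rests on two facts: flash is programmed in
min_io_size units, so anything beyond the last write is still erased (0xFF),
and a corrupt node whose common header survived still records its own length.
A condensed sketch of the same decision, assuming an is_empty() helper that
returns true for all-0xFF space:

	/* Returns 1 if the corruption at @offs is plausibly the last
	 * thing written to the LEB, 0 if valid-looking data follows. */
	static int corruption_is_last_write(const struct ubifs_info *c,
					    void *buf, int len, int offs)
	{
		struct ubifs_ch *ch = buf;
		int skip;

		/* Erased space right after the aligned common header? */
		skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
		if (is_empty(buf + skip, len - skip))
			return 1;

		/* Otherwise trust the header's length field, skip the
		 * whole corrupt node, and expect erased space there. */
		skip = ALIGN(offs + le32_to_cpu(ch->len),
			     c->min_io_size) - offs;
		return is_empty(buf + skip, len - skip);
	}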
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
 147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
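The version handling added above amounts to a small decision table: an image
written by a newer format is still mountable read-only when its R/O
compatibility version is one this code understands. A hedged sketch of just
that decision (0 = mount R/W, 1 = mount R/O only, negative = refuse):

	/* Assumes UBIFS_FORMAT_VERSION / UBIFS_RO_COMPAT_VERSION as
	 * defined in ubifs-media.h; the errno choice mirrors the hunk. */
	static int check_format(int fmt, int ro_compat, int mounting_ro)
	{
		if (fmt <= UBIFS_FORMAT_VERSION)
			return 0;		/* fully supported */
		if (mounting_ro && ro_compat <= UBIFS_RO_COMPAT_VERSION)
			return 1;		/* newer, but R/O-compatible */
		/* -EROFS hints that a R/O mount would have succeeded */
		return ro_compat <= UBIFS_RO_COMPAT_VERSION ? -EROFS
							    : -EINVAL;
	}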
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
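list_move_tail() is an exact drop-in for the deleted
list_del()/list_add_tail() pair; a small sketch of the equivalence against
<linux/list.h>:

	#include <linux/list.h>

	/* Open-coded: unlink, then append at the tail. */
	static void demote_open_coded(struct list_head *e, struct list_head *h)
	{
		list_del(e);
		list_add_tail(e, h);
	}

	/* Idiomatic: one call with the same effect. */
	static void demote(struct list_head *e, struct list_head *h)
	{
		list_move_tail(e, h);
	}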
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
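This hunk, like the kthread_create() hunks later in this file, closes the same
hole: text that is not a compile-time constant must never be passed as a
printf-style format string. A schematic sketch (show_compr() is a made-up
wrapper):

	static void show_compr(struct seq_file *s, int type)
	{
		/* BAD: if the name ever contained a '%' conversion,
		 * vsnprintf would consume varargs that were never passed. */
		seq_printf(s, ubifs_compr_name(type));

		/* GOOD: constant format, data passed as an argument. */
		seq_printf(s, "%s", ubifs_compr_name(type));
	}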
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
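The new idx_leb_size is budgeting arithmetic: an index node that does not fit
in a LEB's tail is written to the next LEB and the tail is wasted, so only
leb_size - max_idx_node_sz bytes per index LEB can be relied on. A worked
example with assumed sizes:

	/* Assumed: 128 KiB LEBs and a 512-byte worst-case index node. */
	int leb_size        = 128 * 1024;                  /* 131072 */
	int max_idx_node_sz = 512;
	int idx_leb_size    = leb_size - max_idx_node_sz;  /* 130560 */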
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
 41 * format changes. If this happens, UBIFS will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders which only need R/O mounting. With
 59 * this field it is possible to do UBIFS format changes without needing to
 60 * update boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
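Note that the superblock change above is size-neutral: the new __le32
ro_compat_version is carved out of padding2 (3972 -> 3968 bytes), and since
padding is zero-filled, images written by older code read back with
ro_compat_version == 0, which matches UBIFS_RO_COMPAT_VERSION. A hypothetical
compile-time check of that invariant:

	/* 4 new bytes + 3968 remaining padding == old 3972-byte padding,
	 * so sizeof(struct ubifs_sb_node) is unchanged on flash. */
	static inline void sb_layout_check(void)
	{
		BUILD_BUG_ON(sizeof(__le32) + 3968 != 3972);
	}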
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 2bb788a2acb1..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
87{ 87{
88 struct buffer_head *bh = NULL; 88 struct buffer_head *bh = NULL;
89 int retval = 0; 89 int retval = 0;
90 kernel_lb_addr loc; 90 struct kernel_lb_addr loc;
91 91
92 loc.logicalBlockNum = bitmap->s_extPosition; 92 loc.logicalBlockNum = bitmap->s_extPosition;
93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
94 94
95 bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); 95 bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
96 if (!bh) 96 if (!bh)
97 retval = -EIO; 97 retval = -EIO;
98 98
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
140 return slot; 140 return slot;
141} 141}
142 142
143static bool udf_add_free_space(struct udf_sb_info *sbi, 143static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
144 u16 partition, u32 cnt)
145{ 144{
145 struct udf_sb_info *sbi = UDF_SB(sb);
146 struct logicalVolIntegrityDesc *lvid; 146 struct logicalVolIntegrityDesc *lvid;
147 147
148 if (sbi->s_lvid_bh == NULL) 148 if (!sbi->s_lvid_bh)
149 return false; 149 return;
150 150
151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); 152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
153 return true; 153 udf_updated_lvid(sb);
154} 154}
155 155
156static void udf_bitmap_free_blocks(struct super_block *sb, 156static void udf_bitmap_free_blocks(struct super_block *sb,
157 struct inode *inode, 157 struct inode *inode,
158 struct udf_bitmap *bitmap, 158 struct udf_bitmap *bitmap,
159 kernel_lb_addr bloc, uint32_t offset, 159 struct kernel_lb_addr *bloc,
160 uint32_t offset,
160 uint32_t count) 161 uint32_t count)
161{ 162{
162 struct udf_sb_info *sbi = UDF_SB(sb); 163 struct udf_sb_info *sbi = UDF_SB(sb);
163 struct buffer_head *bh = NULL; 164 struct buffer_head *bh = NULL;
165 struct udf_part_map *partmap;
164 unsigned long block; 166 unsigned long block;
165 unsigned long block_group; 167 unsigned long block_group;
166 unsigned long bit; 168 unsigned long bit;
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
169 unsigned long overflow; 171 unsigned long overflow;
170 172
171 mutex_lock(&sbi->s_alloc_mutex); 173 mutex_lock(&sbi->s_alloc_mutex);
172 if (bloc.logicalBlockNum < 0 || 174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
173 (bloc.logicalBlockNum + count) > 175 if (bloc->logicalBlockNum < 0 ||
174 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 176 (bloc->logicalBlockNum + count) >
177 partmap->s_partition_len) {
175 udf_debug("%d < %d || %d + %d > %d\n", 178 udf_debug("%d < %d || %d + %d > %d\n",
176 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
177 sbi->s_partmaps[bloc.partitionReferenceNum]. 180 count, partmap->s_partition_len);
178 s_partition_len);
179 goto error_return; 181 goto error_return;
180 } 182 }
181 183
182 block = bloc.logicalBlockNum + offset + 184 block = bloc->logicalBlockNum + offset +
183 (sizeof(struct spaceBitmapDesc) << 3); 185 (sizeof(struct spaceBitmapDesc) << 3);
184 186
185 do { 187 do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
207 } else { 209 } else {
208 if (inode) 210 if (inode)
209 vfs_dq_free_block(inode, 1); 211 vfs_dq_free_block(inode, 1);
210 udf_add_free_space(sbi, sbi->s_partition, 1); 212 udf_add_free_space(sb, sbi->s_partition, 1);
211 } 213 }
212 } 214 }
213 mark_buffer_dirty(bh); 215 mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
218 } while (overflow); 220 } while (overflow);
219 221
220error_return: 222error_return:
221 sb->s_dirt = 1;
222 if (sbi->s_lvid_bh)
223 mark_buffer_dirty(sbi->s_lvid_bh);
224 mutex_unlock(&sbi->s_alloc_mutex); 223 mutex_unlock(&sbi->s_alloc_mutex);
225} 224}
226 225
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
277 } while (block_count > 0); 276 } while (block_count > 0);
278 277
279out: 278out:
280 if (udf_add_free_space(sbi, partition, -alloc_count)) 279 udf_add_free_space(sb, partition, -alloc_count);
281 mark_buffer_dirty(sbi->s_lvid_bh);
282 sb->s_dirt = 1;
283 mutex_unlock(&sbi->s_alloc_mutex); 280 mutex_unlock(&sbi->s_alloc_mutex);
284 return alloc_count; 281 return alloc_count;
285} 282}
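The refactor above moves the LVID bookkeeping into udf_add_free_space() itself
(via udf_updated_lvid()), so every caller shrinks from a test-and-dirty dance
to a single call. Schematically:

	/* Before: callers had to remember the dirtying side effects. */
	if (udf_add_free_space(sbi, partition, -alloc_count))
		mark_buffer_dirty(sbi->s_lvid_bh);
	sb->s_dirt = 1;

	/* After: the helper updates the LVID and dirties state itself. */
	udf_add_free_space(sb, partition, -alloc_count);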
@@ -409,9 +406,7 @@ got_block:
409 406
410 mark_buffer_dirty(bh); 407 mark_buffer_dirty(bh);
411 408
412 if (udf_add_free_space(sbi, partition, -1)) 409 udf_add_free_space(sb, partition, -1);
413 mark_buffer_dirty(sbi->s_lvid_bh);
414 sb->s_dirt = 1;
415 mutex_unlock(&sbi->s_alloc_mutex); 410 mutex_unlock(&sbi->s_alloc_mutex);
416 *err = 0; 411 *err = 0;
417 return newblock; 412 return newblock;
@@ -425,26 +420,28 @@ error_return:
425static void udf_table_free_blocks(struct super_block *sb, 420static void udf_table_free_blocks(struct super_block *sb,
426 struct inode *inode, 421 struct inode *inode,
427 struct inode *table, 422 struct inode *table,
428 kernel_lb_addr bloc, uint32_t offset, 423 struct kernel_lb_addr *bloc,
424 uint32_t offset,
429 uint32_t count) 425 uint32_t count)
430{ 426{
431 struct udf_sb_info *sbi = UDF_SB(sb); 427 struct udf_sb_info *sbi = UDF_SB(sb);
428 struct udf_part_map *partmap;
432 uint32_t start, end; 429 uint32_t start, end;
433 uint32_t elen; 430 uint32_t elen;
434 kernel_lb_addr eloc; 431 struct kernel_lb_addr eloc;
435 struct extent_position oepos, epos; 432 struct extent_position oepos, epos;
436 int8_t etype; 433 int8_t etype;
437 int i; 434 int i;
438 struct udf_inode_info *iinfo; 435 struct udf_inode_info *iinfo;
439 436
440 mutex_lock(&sbi->s_alloc_mutex); 437 mutex_lock(&sbi->s_alloc_mutex);
441 if (bloc.logicalBlockNum < 0 || 438 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
442 (bloc.logicalBlockNum + count) > 439 if (bloc->logicalBlockNum < 0 ||
443 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 440 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) {
444 udf_debug("%d < %d || %d + %d > %d\n", 442 udf_debug("%d < %d || %d + %d > %d\n",
 445 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 443 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
446 sbi->s_partmaps[bloc.partitionReferenceNum]. 444 partmap->s_partition_len);
447 s_partition_len);
448 goto error_return; 445 goto error_return;
449 } 446 }
450 447
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
 453 could occur, but.. oh well */ 450 could occur, but.. oh well */
454 if (inode) 451 if (inode)
455 vfs_dq_free_block(inode, count); 452 vfs_dq_free_block(inode, count);
456 if (udf_add_free_space(sbi, sbi->s_partition, count)) 453 udf_add_free_space(sb, sbi->s_partition, count);
457 mark_buffer_dirty(sbi->s_lvid_bh);
458 454
459 start = bloc.logicalBlockNum + offset; 455 start = bloc->logicalBlockNum + offset;
460 end = bloc.logicalBlockNum + offset + count - 1; 456 end = bloc->logicalBlockNum + offset + count - 1;
461 457
462 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); 458 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
463 elen = 0; 459 elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
483 start += count; 479 start += count;
484 count = 0; 480 count = 0;
485 } 481 }
486 udf_write_aext(table, &oepos, eloc, elen, 1); 482 udf_write_aext(table, &oepos, &eloc, elen, 1);
487 } else if (eloc.logicalBlockNum == (end + 1)) { 483 } else if (eloc.logicalBlockNum == (end + 1)) {
488 if ((0x3FFFFFFF - elen) < 484 if ((0x3FFFFFFF - elen) <
489 (count << sb->s_blocksize_bits)) { 485 (count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
502 end -= count; 498 end -= count;
503 count = 0; 499 count = 0;
504 } 500 }
505 udf_write_aext(table, &oepos, eloc, elen, 1); 501 udf_write_aext(table, &oepos, &eloc, elen, 1);
506 } 502 }
507 503
508 if (epos.bh != oepos.bh) { 504 if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
532 */ 528 */
533 529
534 int adsize; 530 int adsize;
535 short_ad *sad = NULL; 531 struct short_ad *sad = NULL;
536 long_ad *lad = NULL; 532 struct long_ad *lad = NULL;
537 struct allocExtDesc *aed; 533 struct allocExtDesc *aed;
538 534
539 eloc.logicalBlockNum = start; 535 eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
541 (count << sb->s_blocksize_bits); 537 (count << sb->s_blocksize_bits);
542 538
543 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 539 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
544 adsize = sizeof(short_ad); 540 adsize = sizeof(struct short_ad);
545 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 541 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
546 adsize = sizeof(long_ad); 542 adsize = sizeof(struct long_ad);
547 else { 543 else {
548 brelse(oepos.bh); 544 brelse(oepos.bh);
549 brelse(epos.bh); 545 brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
563 elen -= sb->s_blocksize; 559 elen -= sb->s_blocksize;
564 560
565 epos.bh = udf_tread(sb, 561 epos.bh = udf_tread(sb,
566 udf_get_lb_pblock(sb, epos.block, 0)); 562 udf_get_lb_pblock(sb, &epos.block, 0));
567 if (!epos.bh) { 563 if (!epos.bh) {
568 brelse(oepos.bh); 564 brelse(oepos.bh);
569 goto error_return; 565 goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
601 if (sbi->s_udfrev >= 0x0200) 597 if (sbi->s_udfrev >= 0x0200)
602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 598 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
603 3, 1, epos.block.logicalBlockNum, 599 3, 1, epos.block.logicalBlockNum,
604 sizeof(tag)); 600 sizeof(struct tag));
605 else 601 else
606 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
607 2, 1, epos.block.logicalBlockNum, 603 2, 1, epos.block.logicalBlockNum,
608 sizeof(tag)); 604 sizeof(struct tag));
609 605
610 switch (iinfo->i_alloc_type) { 606 switch (iinfo->i_alloc_type) {
611 case ICBTAG_FLAG_AD_SHORT: 607 case ICBTAG_FLAG_AD_SHORT:
612 sad = (short_ad *)sptr; 608 sad = (struct short_ad *)sptr;
613 sad->extLength = cpu_to_le32( 609 sad->extLength = cpu_to_le32(
614 EXT_NEXT_EXTENT_ALLOCDECS | 610 EXT_NEXT_EXTENT_ALLOCDECS |
615 sb->s_blocksize); 611 sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
617 cpu_to_le32(epos.block.logicalBlockNum); 613 cpu_to_le32(epos.block.logicalBlockNum);
618 break; 614 break;
619 case ICBTAG_FLAG_AD_LONG: 615 case ICBTAG_FLAG_AD_LONG:
620 lad = (long_ad *)sptr; 616 lad = (struct long_ad *)sptr;
621 lad->extLength = cpu_to_le32( 617 lad->extLength = cpu_to_le32(
622 EXT_NEXT_EXTENT_ALLOCDECS | 618 EXT_NEXT_EXTENT_ALLOCDECS |
623 sb->s_blocksize); 619 sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
635 631
636 /* It's possible that stealing the block emptied the extent */ 632 /* It's possible that stealing the block emptied the extent */
637 if (elen) { 633 if (elen) {
638 udf_write_aext(table, &epos, eloc, elen, 1); 634 udf_write_aext(table, &epos, &eloc, elen, 1);
639 635
640 if (!epos.bh) { 636 if (!epos.bh) {
641 iinfo->i_lenAlloc += adsize; 637 iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
653 brelse(oepos.bh); 649 brelse(oepos.bh);
654 650
655error_return: 651error_return:
656 sb->s_dirt = 1;
657 mutex_unlock(&sbi->s_alloc_mutex); 652 mutex_unlock(&sbi->s_alloc_mutex);
658 return; 653 return;
659} 654}
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 struct udf_sb_info *sbi = UDF_SB(sb); 661 struct udf_sb_info *sbi = UDF_SB(sb);
667 int alloc_count = 0; 662 int alloc_count = 0;
668 uint32_t elen, adsize; 663 uint32_t elen, adsize;
669 kernel_lb_addr eloc; 664 struct kernel_lb_addr eloc;
670 struct extent_position epos; 665 struct extent_position epos;
671 int8_t etype = -1; 666 int8_t etype = -1;
672 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
677 672
678 iinfo = UDF_I(table); 673 iinfo = UDF_I(table);
679 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 674 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
680 adsize = sizeof(short_ad); 675 adsize = sizeof(struct short_ad);
681 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 676 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
682 adsize = sizeof(long_ad); 677 adsize = sizeof(struct long_ad);
683 else 678 else
684 return 0; 679 return 0;
685 680
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
707 alloc_count = block_count; 702 alloc_count = block_count;
708 eloc.logicalBlockNum += alloc_count; 703 eloc.logicalBlockNum += alloc_count;
709 elen -= (alloc_count << sb->s_blocksize_bits); 704 elen -= (alloc_count << sb->s_blocksize_bits);
710 udf_write_aext(table, &epos, eloc, 705 udf_write_aext(table, &epos, &eloc,
711 (etype << 30) | elen, 1); 706 (etype << 30) | elen, 1);
712 } else 707 } else
713 udf_delete_aext(table, epos, eloc, 708 udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
718 713
719 brelse(epos.bh); 714 brelse(epos.bh);
720 715
721 if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { 716 if (alloc_count)
722 mark_buffer_dirty(sbi->s_lvid_bh); 717 udf_add_free_space(sb, partition, -alloc_count);
723 sb->s_dirt = 1;
724 }
725 mutex_unlock(&sbi->s_alloc_mutex); 718 mutex_unlock(&sbi->s_alloc_mutex);
726 return alloc_count; 719 return alloc_count;
727} 720}
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
735 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; 728 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
736 uint32_t newblock = 0, adsize; 729 uint32_t newblock = 0, adsize;
737 uint32_t elen, goal_elen = 0; 730 uint32_t elen, goal_elen = 0;
738 kernel_lb_addr eloc, uninitialized_var(goal_eloc); 731 struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
739 struct extent_position epos, goal_epos; 732 struct extent_position epos, goal_epos;
740 int8_t etype; 733 int8_t etype;
741 struct udf_inode_info *iinfo = UDF_I(table); 734 struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
743 *err = -ENOSPC; 736 *err = -ENOSPC;
744 737
745 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 738 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
746 adsize = sizeof(short_ad); 739 adsize = sizeof(struct short_ad);
747 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 740 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
748 adsize = sizeof(long_ad); 741 adsize = sizeof(struct long_ad);
749 else 742 else
750 return newblock; 743 return newblock;
751 744
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
814 } 807 }
815 808
816 if (goal_elen) 809 if (goal_elen)
817 udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); 810 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
818 else 811 else
819 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); 812 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
820 brelse(goal_epos.bh); 813 brelse(goal_epos.bh);
821 814
822 if (udf_add_free_space(sbi, partition, -1)) 815 udf_add_free_space(sb, partition, -1);
823 mark_buffer_dirty(sbi->s_lvid_bh);
824 816
825 sb->s_dirt = 1;
826 mutex_unlock(&sbi->s_alloc_mutex); 817 mutex_unlock(&sbi->s_alloc_mutex);
827 *err = 0; 818 *err = 0;
828 return newblock; 819 return newblock;
829} 820}
830 821
831inline void udf_free_blocks(struct super_block *sb, 822void udf_free_blocks(struct super_block *sb, struct inode *inode,
832 struct inode *inode, 823 struct kernel_lb_addr *bloc, uint32_t offset,
833 kernel_lb_addr bloc, uint32_t offset, 824 uint32_t count)
834 uint32_t count)
835{ 825{
836 uint16_t partition = bloc.partitionReferenceNum; 826 uint16_t partition = bloc->partitionReferenceNum;
837 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 827 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
838 828
839 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 829 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
840 return udf_bitmap_free_blocks(sb, inode, 830 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
841 map->s_uspace.s_bitmap, 831 bloc, offset, count);
842 bloc, offset, count);
843 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 832 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
844 return udf_table_free_blocks(sb, inode, 833 udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
845 map->s_uspace.s_table, 834 bloc, offset, count);
846 bloc, offset, count);
847 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 835 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
848 return udf_bitmap_free_blocks(sb, inode, 836 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
849 map->s_fspace.s_bitmap, 837 bloc, offset, count);
850 bloc, offset, count);
851 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 838 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
852 return udf_table_free_blocks(sb, inode, 839 udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
853 map->s_fspace.s_table, 840 bloc, offset, count);
854 bloc, offset, count);
855 } else {
856 return;
857 } 841 }
858} 842}
859 843
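The recurring conversion in this UDF series is from passing kernel_lb_addr by
value to passing it by pointer; the struct is small, but by-pointer avoids a
copy per call and lets one address flow unchanged through
udf_get_lb_pblock() and friends. In miniature, with hypothetical names:

	struct kernel_lb_addr {
		uint32_t logicalBlockNum;
		uint16_t partitionReferenceNum;
	};

	/* Before: the whole struct is copied at every call site. */
	int lookup_by_value(struct kernel_lb_addr loc);

	/* After: callers pass &loc and the callee dereferences it. */
	int lookup_by_pointer(const struct kernel_lb_addr *loc);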
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
53 struct buffer_head *tmp, *bha[16]; 53 struct buffer_head *tmp, *bha[16];
54 kernel_lb_addr eloc; 54 struct kernel_lb_addr eloc;
55 uint32_t elen; 55 uint32_t elen;
56 sector_t offset; 56 sector_t offset;
57 int i, num, ret = 0; 57 int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
80 ret = -ENOENT; 80 ret = -ENOENT;
81 goto out; 81 goto out;
82 } 82 }
83 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 83 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 epos.offset -= sizeof(short_ad); 86 epos.offset -= sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == 87 else if (iinfo->i_alloc_type ==
88 ICBTAG_FLAG_AD_LONG) 88 ICBTAG_FLAG_AD_LONG)
89 epos.offset -= sizeof(long_ad); 89 epos.offset -= sizeof(struct long_ad);
90 } else { 90 } else {
91 offset = 0; 91 offset = 0;
92 } 92 }
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
103 for (num = 0; i > 0; i--) { 103 for (num = 0; i > 0; i--) {
104 block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); 104 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
105 tmp = udf_tgetblk(dir->i_sb, block); 105 tmp = udf_tgetblk(dir->i_sb, block);
106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
107 bha[num++] = tmp; 107 bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
161 memcpy(fname, "..", flen); 161 memcpy(fname, "..", flen);
162 dt_type = DT_DIR; 162 dt_type = DT_DIR;
163 } else { 163 } else {
164 kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); 164 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
165 165
166 iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); 166 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
168 dt_type = DT_UNKNOWN; 168 dt_type = DT_UNKNOWN;
169 } 169 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
20 20
21#if 0 21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, 22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, kernel_lb_addr fe_loc, 23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh, 24 int *pos, int *offset, struct buffer_head **bh,
25 int *error) 25 int *error)
26{ 26{
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 75 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 76 struct fileIdentDesc *cfi,
77 struct extent_position *epos, 77 struct extent_position *epos,
78 kernel_lb_addr *eloc, uint32_t *elen, 78 struct kernel_lb_addr *eloc, uint32_t *elen,
79 sector_t *offset) 79 sector_t *offset)
80{ 80{
81 struct fileIdentDesc *fi; 81 struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
111 (EXT_RECORDED_ALLOCATED >> 30)) 111 (EXT_RECORDED_ALLOCATED >> 30))
112 return NULL; 112 return NULL;
113 113
114 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 114 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
115 115
116 (*offset)++; 116 (*offset)++;
117 117
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
131 if (i + *offset > (*elen >> blocksize_bits)) 131 if (i + *offset > (*elen >> blocksize_bits))
132 i = (*elen >> blocksize_bits)-*offset; 132 i = (*elen >> blocksize_bits)-*offset;
133 for (num = 0; i > 0; i--) { 133 for (num = 0; i > 0; i--) {
134 block = udf_get_lb_pblock(dir->i_sb, *eloc, 134 block = udf_get_lb_pblock(dir->i_sb, eloc,
135 *offset + i); 135 *offset + i);
136 tmp = udf_tgetblk(dir->i_sb, block); 136 tmp = udf_tgetblk(dir->i_sb, block);
137 if (tmp && !buffer_uptodate(tmp) && 137 if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
169 (EXT_RECORDED_ALLOCATED >> 30)) 169 (EXT_RECORDED_ALLOCATED >> 30))
170 return NULL; 170 return NULL;
171 171
172 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 172 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
173 173
174 (*offset)++; 174 (*offset)++;
175 175
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
249} 249}
250 250
251#if 0 251#if 0
252static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) 252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{ 253{
254 extent_ad *ext; 254 struct extent_ad *ext;
255 struct fileEntry *fe; 255 struct fileEntry *fe;
256 uint8_t *ptr; 256 uint8_t *ptr;
257 257
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) 274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset; 275 ptr += *offset;
276 276
277 ext = (extent_ad *)ptr; 277 ext = (struct extent_ad *)ptr;
278 278
279 *offset = *offset + sizeof(extent_ad); 279 *offset = *offset + sizeof(struct extent_ad);
280 return ext; 280 return ext;
281} 281}
282#endif 282#endif
283 283
284short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 285 int inc)
286{ 286{
287 short_ad *sa; 287 struct short_ad *sa;
288 288
289 if ((!ptr) || (!offset)) { 289 if ((!ptr) || (!offset)) {
 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalid parms\n"); 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalid parms\n");
291 return NULL; 291 return NULL;
292 } 292 }
293 293
294 if ((*offset + sizeof(short_ad)) > maxoffset) 294 if ((*offset + sizeof(struct short_ad)) > maxoffset)
295 return NULL; 295 return NULL;
296 else { 296 else {
297 sa = (short_ad *)ptr; 297 sa = (struct short_ad *)ptr;
298 if (sa->extLength == 0) 298 if (sa->extLength == 0)
299 return NULL; 299 return NULL;
300 } 300 }
301 301
302 if (inc) 302 if (inc)
303 *offset += sizeof(short_ad); 303 *offset += sizeof(struct short_ad);
304 return sa; 304 return sa;
305} 305}
306 306
307long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) 307struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
308{ 308{
309 long_ad *la; 309 struct long_ad *la;
310 310
311 if ((!ptr) || (!offset)) { 311 if ((!ptr) || (!offset)) {
 312 printk(KERN_ERR "udf: udf_get_filelongad() invalid parms\n"); 312 printk(KERN_ERR "udf: udf_get_filelongad() invalid parms\n");
313 return NULL; 313 return NULL;
314 } 314 }
315 315
316 if ((*offset + sizeof(long_ad)) > maxoffset) 316 if ((*offset + sizeof(struct long_ad)) > maxoffset)
317 return NULL; 317 return NULL;
318 else { 318 else {
319 la = (long_ad *)ptr; 319 la = (struct long_ad *)ptr;
320 if (la->extLength == 0) 320 if (la->extLength == 0)
321 return NULL; 321 return NULL;
322 } 322 }
323 323
324 if (inc) 324 if (inc)
325 *offset += sizeof(long_ad); 325 *offset += sizeof(struct long_ad);
326 return la; 326 return la;
327} 327}
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
38#define _ECMA_167_H 1 38#define _ECMA_167_H 1
39 39
40/* Character set specification (ECMA 167r3 1/7.2.1) */ 40/* Character set specification (ECMA 167r3 1/7.2.1) */
41typedef struct { 41struct charspec {
42 uint8_t charSetType; 42 uint8_t charSetType;
43 uint8_t charSetInfo[63]; 43 uint8_t charSetInfo[63];
44} __attribute__ ((packed)) charspec; 44} __attribute__ ((packed));
45 45
46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ 46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ 47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
57typedef uint8_t dstring; 57typedef uint8_t dstring;
58 58
59/* Timestamp (ECMA 167r3 1/7.3) */ 59/* Timestamp (ECMA 167r3 1/7.3) */
60typedef struct { 60struct timestamp {
61 __le16 typeAndTimezone; 61 __le16 typeAndTimezone;
62 __le16 year; 62 __le16 year;
63 uint8_t month; 63 uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
68 uint8_t centiseconds; 68 uint8_t centiseconds;
69 uint8_t hundredsOfMicroseconds; 69 uint8_t hundredsOfMicroseconds;
70 uint8_t microseconds; 70 uint8_t microseconds;
71} __attribute__ ((packed)) timestamp; 71} __attribute__ ((packed));
72 72
73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ 73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
74#define TIMESTAMP_TYPE_MASK 0xF000 74#define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF 78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF
79 79
80/* Entity identifier (ECMA 167r3 1/7.4) */ 80/* Entity identifier (ECMA 167r3 1/7.4) */
81typedef struct { 81struct regid {
82 uint8_t flags; 82 uint8_t flags;
83 uint8_t ident[23]; 83 uint8_t ident[23];
84 uint8_t identSuffix[8]; 84 uint8_t identSuffix[8];
85} __attribute__ ((packed)) regid; 85} __attribute__ ((packed));
86 86
87/* Flags (ECMA 167r3 1/7.4.1) */ 87/* Flags (ECMA 167r3 1/7.4.1) */
88#define ENTITYID_FLAGS_DIRTY 0x00 88#define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
126 126
127/* Boot Descriptor (ECMA 167r3 2/9.4) */ 127/* Boot Descriptor (ECMA 167r3 2/9.4) */
128struct bootDesc { 128struct bootDesc {
129 uint8_t structType; 129 uint8_t structType;
130 uint8_t stdIdent[VSD_STD_ID_LEN]; 130 uint8_t stdIdent[VSD_STD_ID_LEN];
131 uint8_t structVersion; 131 uint8_t structVersion;
132 uint8_t reserved1; 132 uint8_t reserved1;
133 regid archType; 133 struct regid archType;
134 regid bootIdent; 134 struct regid bootIdent;
135 __le32 bootExtLocation; 135 __le32 bootExtLocation;
136 __le32 bootExtLength; 136 __le32 bootExtLength;
137 __le64 loadAddress; 137 __le64 loadAddress;
138 __le64 startAddress; 138 __le64 startAddress;
139 timestamp descCreationDateAndTime; 139 struct timestamp descCreationDateAndTime;
140 __le16 flags; 140 __le16 flags;
141 uint8_t reserved2[32]; 141 uint8_t reserved2[32];
142 uint8_t bootUse[1906]; 142 uint8_t bootUse[1906];
143} __attribute__ ((packed)); 143} __attribute__ ((packed));
144 144
145/* Flags (ECMA 167r3 2/9.4.12) */ 145/* Flags (ECMA 167r3 2/9.4.12) */
146#define BOOT_FLAGS_ERASE 0x01 146#define BOOT_FLAGS_ERASE 0x01
147 147
148/* Extent Descriptor (ECMA 167r3 3/7.1) */ 148/* Extent Descriptor (ECMA 167r3 3/7.1) */
149typedef struct { 149struct extent_ad {
150 __le32 extLength; 150 __le32 extLength;
151 __le32 extLocation; 151 __le32 extLocation;
152} __attribute__ ((packed)) extent_ad; 152} __attribute__ ((packed));
153 153
154typedef struct { 154struct kernel_extent_ad {
155 uint32_t extLength; 155 uint32_t extLength;
156 uint32_t extLocation; 156 uint32_t extLocation;
157} kernel_extent_ad; 157};
158 158
159/* Descriptor Tag (ECMA 167r3 3/7.2) */ 159/* Descriptor Tag (ECMA 167r3 3/7.2) */
160typedef struct { 160struct tag {
161 __le16 tagIdent; 161 __le16 tagIdent;
162 __le16 descVersion; 162 __le16 descVersion;
163 uint8_t tagChecksum; 163 uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
166 __le16 descCRC; 166 __le16 descCRC;
167 __le16 descCRCLength; 167 __le16 descCRCLength;
168 __le32 tagLocation; 168 __le32 tagLocation;
169} __attribute__ ((packed)) tag; 169} __attribute__ ((packed));
170 170
171/* Tag Identifier (ECMA 167r3 3/7.2.1) */ 171/* Tag Identifier (ECMA 167r3 3/7.2.1) */
172#define TAG_IDENT_PVD 0x0001 172#define TAG_IDENT_PVD 0x0001
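The ecma_167.h hunks mechanically convert 'typedef struct { ... } name;' to a
plain 'struct name { ... };', per kernel style (new struct typedefs are
discouraged; checkpatch warns about them). The pattern in miniature, using
extent_ad from above:

	/* Before: an anonymous struct hidden behind a typedef. */
	typedef struct {
		__le32 extLength;
		__le32 extLocation;
	} __attribute__ ((packed)) extent_ad;

	/* After: a named tag; users now spell it 'struct extent_ad'. */
	struct extent_ad {
		__le32 extLength;
		__le32 extLocation;
	} __attribute__ ((packed));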
@@ -190,28 +190,28 @@ struct NSRDesc {
190 190
191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ 191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
192struct primaryVolDesc { 192struct primaryVolDesc {
193 tag descTag; 193 struct tag descTag;
194 __le32 volDescSeqNum; 194 __le32 volDescSeqNum;
195 __le32 primaryVolDescNum; 195 __le32 primaryVolDescNum;
196 dstring volIdent[32]; 196 dstring volIdent[32];
197 __le16 volSeqNum; 197 __le16 volSeqNum;
198 __le16 maxVolSeqNum; 198 __le16 maxVolSeqNum;
199 __le16 interchangeLvl; 199 __le16 interchangeLvl;
200 __le16 maxInterchangeLvl; 200 __le16 maxInterchangeLvl;
201 __le32 charSetList; 201 __le32 charSetList;
202 __le32 maxCharSetList; 202 __le32 maxCharSetList;
203 dstring volSetIdent[128]; 203 dstring volSetIdent[128];
204 charspec descCharSet; 204 struct charspec descCharSet;
205 charspec explanatoryCharSet; 205 struct charspec explanatoryCharSet;
206 extent_ad volAbstract; 206 struct extent_ad volAbstract;
207 extent_ad volCopyright; 207 struct extent_ad volCopyright;
208 regid appIdent; 208 struct regid appIdent;
209 timestamp recordingDateAndTime; 209 struct timestamp recordingDateAndTime;
210 regid impIdent; 210 struct regid impIdent;
211 uint8_t impUse[64]; 211 uint8_t impUse[64];
212 __le32 predecessorVolDescSeqLocation; 212 __le32 predecessorVolDescSeqLocation;
213 __le16 flags; 213 __le16 flags;
214 uint8_t reserved[22]; 214 uint8_t reserved[22];
215} __attribute__ ((packed)); 215} __attribute__ ((packed));
216 216
217/* Flags (ECMA 167r3 3/10.1.21) */ 217/* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
219 219
220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ 220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
221struct anchorVolDescPtr { 221struct anchorVolDescPtr {
222 tag descTag; 222 struct tag descTag;
223 extent_ad mainVolDescSeqExt; 223 struct extent_ad mainVolDescSeqExt;
224 extent_ad reserveVolDescSeqExt; 224 struct extent_ad reserveVolDescSeqExt;
225 uint8_t reserved[480]; 225 uint8_t reserved[480];
226} __attribute__ ((packed)); 226} __attribute__ ((packed));
227 227
228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ 228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
229struct volDescPtr { 229struct volDescPtr {
230 tag descTag; 230 struct tag descTag;
231 __le32 volDescSeqNum; 231 __le32 volDescSeqNum;
232 extent_ad nextVolDescSeqExt; 232 struct extent_ad nextVolDescSeqExt;
233 uint8_t reserved[484]; 233 uint8_t reserved[484];
234} __attribute__ ((packed)); 234} __attribute__ ((packed));
235 235
236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ 236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
237struct impUseVolDesc { 237struct impUseVolDesc {
238 tag descTag; 238 struct tag descTag;
239 __le32 volDescSeqNum; 239 __le32 volDescSeqNum;
240 regid impIdent; 240 struct regid impIdent;
241 uint8_t impUse[460]; 241 uint8_t impUse[460];
242} __attribute__ ((packed)); 242} __attribute__ ((packed));
243 243
244/* Partition Descriptor (ECMA 167r3 3/10.5) */ 244/* Partition Descriptor (ECMA 167r3 3/10.5) */
245struct partitionDesc { 245struct partitionDesc {
246 tag descTag; 246 struct tag descTag;
247 __le32 volDescSeqNum; 247 __le32 volDescSeqNum;
248 __le16 partitionFlags; 248 __le16 partitionFlags;
249 __le16 partitionNumber; 249 __le16 partitionNumber;
250 regid partitionContents; 250 struct regid partitionContents;
251 uint8_t partitionContentsUse[128]; 251 uint8_t partitionContentsUse[128];
252 __le32 accessType; 252 __le32 accessType;
253 __le32 partitionStartingLocation; 253 __le32 partitionStartingLocation;
254 __le32 partitionLength; 254 __le32 partitionLength;
255 regid impIdent; 255 struct regid impIdent;
256 uint8_t impUse[128]; 256 uint8_t impUse[128];
257 uint8_t reserved[156]; 257 uint8_t reserved[156];
258} __attribute__ ((packed)); 258} __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
278 278
279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ 279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
280struct logicalVolDesc { 280struct logicalVolDesc {
281 tag descTag; 281 struct tag descTag;
282 __le32 volDescSeqNum; 282 __le32 volDescSeqNum;
283 charspec descCharSet; 283 struct charspec descCharSet;
284 dstring logicalVolIdent[128]; 284 dstring logicalVolIdent[128];
285 __le32 logicalBlockSize; 285 __le32 logicalBlockSize;
286 regid domainIdent; 286 struct regid domainIdent;
287 uint8_t logicalVolContentsUse[16]; 287 uint8_t logicalVolContentsUse[16];
288 __le32 mapTableLength; 288 __le32 mapTableLength;
289 __le32 numPartitionMaps; 289 __le32 numPartitionMaps;
290 regid impIdent; 290 struct regid impIdent;
291 uint8_t impUse[128]; 291 uint8_t impUse[128];
292 extent_ad integritySeqExt; 292 struct extent_ad integritySeqExt;
293 uint8_t partitionMaps[0]; 293 uint8_t partitionMaps[0];
294} __attribute__ ((packed)); 294} __attribute__ ((packed));
295 295
296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ 296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
 
 /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
 struct unallocSpaceDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
 	__le32 numAllocDescs;
-	extent_ad allocDescs[0];
+	struct extent_ad allocDescs[0];
 } __attribute__ ((packed));
 
 /* Terminating Descriptor (ECMA 167r3 3/10.9) */
 struct terminatingDesc {
-	tag descTag;
+	struct tag descTag;
 	uint8_t reserved[496];
 } __attribute__ ((packed));
 
 /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
 struct logicalVolIntegrityDesc {
-	tag descTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct timestamp recordingDateAndTime;
 	__le32 integrityType;
-	extent_ad nextIntegrityExt;
+	struct extent_ad nextIntegrityExt;
 	uint8_t logicalVolContentsUse[32];
 	__le32 numOfPartitions;
 	__le32 lengthOfImpUse;
 	__le32 freeSpaceTable[0];
 	__le32 sizeTable[0];
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
 /* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
 #define LVID_INTEGRITY_TYPE_CLOSE	0x00000001
 
 /* Recorded Address (ECMA 167r3 4/7.1) */
-typedef struct {
+struct lb_addr {
 	__le32 logicalBlockNum;
 	__le16 partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
+} __attribute__ ((packed));
 
 /* ... and its in-core analog */
-typedef struct {
+struct kernel_lb_addr {
 	uint32_t logicalBlockNum;
 	uint16_t partitionReferenceNum;
-} kernel_lb_addr;
+};
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
-typedef struct {
+struct short_ad {
 	__le32 extLength;
 	__le32 extPosition;
-} __attribute__ ((packed)) short_ad;
+} __attribute__ ((packed));
 
 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
-typedef struct {
+struct long_ad {
 	__le32 extLength;
-	lb_addr extLocation;
+	struct lb_addr extLocation;
 	uint8_t impUse[6];
-} __attribute__ ((packed)) long_ad;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_long_ad {
 	uint32_t extLength;
-	kernel_lb_addr extLocation;
+	struct kernel_lb_addr extLocation;
 	uint8_t impUse[6];
-} kernel_long_ad;
+};
 
 /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
-typedef struct {
+struct ext_ad {
 	__le32 extLength;
 	__le32 recordedLength;
 	__le32 informationLength;
-	lb_addr extLocation;
-} __attribute__ ((packed)) ext_ad;
+	struct lb_addr extLocation;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_ext_ad {
 	uint32_t extLength;
 	uint32_t recordedLength;
 	uint32_t informationLength;
-	kernel_lb_addr extLocation;
-} kernel_ext_ad;
+	struct kernel_lb_addr extLocation;
+};
 
 /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
 
@@ -415,44 +415,44 @@ typedef struct {
 
 /* File Set Descriptor (ECMA 167r3 4/14.1) */
 struct fileSetDesc {
-	tag descTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct timestamp recordingDateAndTime;
 	__le16 interchangeLvl;
 	__le16 maxInterchangeLvl;
 	__le32 charSetList;
 	__le32 maxCharSetList;
 	__le32 fileSetNum;
 	__le32 fileSetDescNum;
-	charspec logicalVolIdentCharSet;
+	struct charspec logicalVolIdentCharSet;
 	dstring logicalVolIdent[128];
-	charspec fileSetCharSet;
+	struct charspec fileSetCharSet;
 	dstring fileSetIdent[32];
 	dstring copyrightFileIdent[32];
 	dstring abstractFileIdent[32];
-	long_ad rootDirectoryICB;
-	regid domainIdent;
-	long_ad nextExt;
-	long_ad streamDirectoryICB;
+	struct long_ad rootDirectoryICB;
+	struct regid domainIdent;
+	struct long_ad nextExt;
+	struct long_ad streamDirectoryICB;
 	uint8_t reserved[32];
 } __attribute__ ((packed));
 
 /* Partition Header Descriptor (ECMA 167r3 4/14.3) */
 struct partitionHeaderDesc {
-	short_ad unallocSpaceTable;
-	short_ad unallocSpaceBitmap;
-	short_ad partitionIntegrityTable;
-	short_ad freedSpaceTable;
-	short_ad freedSpaceBitmap;
+	struct short_ad unallocSpaceTable;
+	struct short_ad unallocSpaceBitmap;
+	struct short_ad partitionIntegrityTable;
+	struct short_ad freedSpaceTable;
+	struct short_ad freedSpaceBitmap;
 	uint8_t reserved[88];
 } __attribute__ ((packed));
 
 /* File Identifier Descriptor (ECMA 167r3 4/14.4) */
 struct fileIdentDesc {
-	tag descTag;
+	struct tag descTag;
 	__le16 fileVersionNum;
 	uint8_t fileCharacteristics;
 	uint8_t lengthFileIdent;
-	long_ad icb;
+	struct long_ad icb;
 	__le16 lengthOfImpUse;
 	uint8_t impUse[0];
 	uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
 
 /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
 struct allocExtDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 previousAllocExtLocation;
 	__le32 lengthAllocDescs;
 } __attribute__ ((packed));
 
 /* ICB Tag (ECMA 167r3 4/14.6) */
-typedef struct {
+struct icbtag {
 	__le32 priorRecordedNumDirectEntries;
 	__le16 strategyType;
 	__le16 strategyParameter;
 	__le16 numEntries;
 	uint8_t reserved;
 	uint8_t fileType;
-	lb_addr parentICBLocation;
+	struct lb_addr parentICBLocation;
 	__le16 flags;
-} __attribute__ ((packed)) icbtag;
+} __attribute__ ((packed));
 
 /* Strategy Type (ECMA 167r3 4/14.6.2) */
 #define ICBTAG_STRATEGY_TYPE_UNDEF	0x0000
@@ -528,41 +528,41 @@ typedef struct {
 
 /* Indirect Entry (ECMA 167r3 4/14.7) */
 struct indirectEntry {
-	tag descTag;
-	icbtag icbTag;
-	long_ad indirectICB;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct long_ad indirectICB;
 } __attribute__ ((packed));
 
 /* Terminal Entry (ECMA 167r3 4/14.8) */
 struct terminalEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 } __attribute__ ((packed));
 
 /* File Entry (ECMA 167r3 4/14.9) */
 struct fileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
-	long_ad extendedAttrICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
 
 /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 impAttrLocation;
 	__le32 appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 impUseLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 appUseLength;
-	regid appIdent;
+	struct regid appIdent;
 	uint8_t appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 lengthAllocDescs;
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 numOfBits;
 	__le32 numOfBytes;
 	uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-	tag descTag;
-	icbtag icbTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct timestamp recordingDateAndTime;
 	uint8_t integrityType;
 	uint8_t reserved[175];
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 objectSize;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp createTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp createTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
 	__le32 reserved;
-	long_ad extendedAttrICB;
-	long_ad streamDirectoryICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct long_ad streamDirectoryICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
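
The ecma_167.h hunks above all apply one pattern: the on-disk UDF descriptor types lose their typedef aliases and keep only plain struct tags, presumably to follow kernel style, which discourages typedefs for plain structs. Every user must now spell out the struct keyword, including in sizeof expressions. A minimal before/after sketch of the pattern, reusing short_ad from the header above:

	/* before: the typedef exposed a bare type name */
	typedef struct {
		__le32 extLength;
		__le32 extPosition;
	} __attribute__ ((packed)) short_ad;

	short_ad ad;			/* old spelling */

	/* after: only the struct tag remains */
	struct short_ad {
		__le32 extLength;
		__le32 extPosition;
	} __attribute__ ((packed));

	struct short_ad ad;		/* new spelling; likewise sizeof(struct short_ad) */

This is why the .c hunks below mechanically rewrite sizeof(tag), sizeof(long_ad), and friends as sizeof(struct tag), sizeof(struct long_ad), and so on.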
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 47dbe5613f90..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
-
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 
-	udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
 				dinfo->i_location.partitionReferenceNum;
-	inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
 	inode->i_blocks = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
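
Both ialloc.c hunks track API changes made elsewhere in the series: the logical volume integrity descriptor is now marked dirty through the udf_updated_lvid() helper instead of an open-coded mark_buffer_dirty(sbi->s_lvid_bh), and helpers that receive a struct kernel_lb_addr now take it by pointer rather than by value, so the 6-byte address is no longer copied at every call. The diff only shows the call sites; a hedged sketch of what the convention means for a callee (illustrative only, not necessarily the kernel's actual udf_get_lb_pblock() body):

	/* the callee dereferences the caller's address
	 * instead of receiving a struct copy on the stack */
	static inline uint32_t get_lb_pblock(struct super_block *sb,
					     struct kernel_lb_addr *loc,
					     uint32_t offset)
	{
		return udf_get_pblock(sb, loc->logicalBlockNum,
				      loc->partitionReferenceNum, offset);
	}

The same pointer convention recurs below in udf_free_blocks(), udf_add_aext(), udf_write_aext(), and udf_read_ptagged().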
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 					sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-			       kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
 	int newblock;
 	struct buffer_head *dbh = NULL;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, elen, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    kernel_long_ad *last_ext, sector_t blocks)
+		    struct kernel_long_ad *last_ext, sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
-	kernel_lb_addr prealloc_loc = {};
+	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	}
 
 	if (fake) {
-		udf_add_aext(inode, last_pos, last_ext->extLocation,
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
 	} else
-		udf_write_aext(inode, last_pos, last_ext->extLocation,
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 			       last_ext->extLength, 1);
 
 	/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	/* Create enough extents to cover the whole hole */
 	while (blocks > add) {
 		blocks -= add;
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	if (blocks) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			(blocks << sb->s_blocksize_bits);
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 out:
 	/* Do we have some preallocated blocks saved? */
 	if (prealloc_len) {
-		if (udf_add_aext(inode, last_pos, prealloc_loc,
+		if (udf_add_aext(inode, last_pos, &prealloc_loc,
 				 prealloc_len, 1) == -1)
 			return -1;
 		last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
 
 	/* last_pos should point to the last written extent... */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		last_pos->offset -= sizeof(short_ad);
+		last_pos->offset -= sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		last_pos->offset -= sizeof(long_ad);
+		last_pos->offset -= sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 {
 	static sector_t last_block;
 	struct buffer_head *result = NULL;
-	kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+	struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
 	struct extent_position prev_epos, cur_epos, next_epos;
 	int count = 0, startnum = 0, endnum = 0;
 	uint32_t elen = 0, tmpelen;
-	kernel_lb_addr eloc, tmpeloc;
+	struct kernel_lb_addr eloc, tmpeloc;
 	int c = 1;
 	loff_t lbcount = 0, b_off = 0;
 	uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 			elen = EXT_RECORDED_ALLOCATED |
 				((elen + inode->i_sb->s_blocksize - 1) &
 				 ~(inode->i_sb->s_blocksize - 1));
-			etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+			etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 		}
 		brelse(prev_epos.bh);
 		brelse(cur_epos.bh);
 		brelse(next_epos.bh);
-		newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 		*phys = newblock;
 		return NULL;
 	}
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	} else {
 		/* Create a fake extent when there's not one */
 		memset(&laarr[0].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
 		/* Will udf_extend_file() create real extent from
 		   a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			inode->i_sb->s_blocksize;
 		memset(&laarr[c].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		count++;
 		endnum++;
 	}
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
 			      int newblocknum,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 	if (offset) {
 		if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					laarr[curr].extLocation,
+					&laarr[curr].extLocation,
 					0, offset);
 			laarr[curr].extLength =
 				EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-				 kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 				 int *endnum)
 {
 	int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 				inode->i_sb->s_blocksize_bits);
 	else {
 		memmove(&laarr[c + 2], &laarr[c + 1],
-			sizeof(long_ad) * (*endnum - (c + 1)));
+			sizeof(struct long_ad) * (*endnum - (c + 1)));
 		(*endnum)++;
 		laarr[c + 1].extLocation.logicalBlockNum = next;
 		laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 			if (*endnum > (i + 1))
 				memmove(&laarr[i],
 					&laarr[i + 1],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 1)));
 			i--;
 			(*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 }
 
 static void udf_merge_extents(struct inode *inode,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 
 	for (i = 0; i < (*endnum - 1); i++) {
-		kernel_long_ad *li /*l[i]*/ = &laarr[i];
-		kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+		struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+		struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
 
 		if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
 		    (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
 			(EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
 			((lip1->extLength >> 30) ==
 			(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
-			udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+			udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
 					((li->extLength &
 					UDF_EXTENT_LENGTH_MASK) +
 					blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
 		} else if ((li->extLength >> 30) ==
 			   (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					li->extLocation, 0,
+					&li->extLocation, 0,
 					((li->extLength &
 					UDF_EXTENT_LENGTH_MASK) +
 					blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
 }
 
 static void udf_update_extents(struct inode *inode,
-			       kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			       int startnum, int endnum,
 			       struct extent_position *epos)
 {
 	int start = 0, i;
-	kernel_lb_addr tmploc;
+	struct kernel_lb_addr tmploc;
 	uint32_t tmplen;
 
 	if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
 
 	for (i = start; i < endnum; i++) {
 		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
-		udf_write_aext(inode, epos, laarr[i].extLocation,
+		udf_write_aext(inode, epos, &laarr[i].extLocation,
 			       laarr[i].extLength, 1);
 	}
 }
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
 	 *	i_nlink = 1
 	 *	i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
 		       inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
 				       &ident);
 		if (ident == TAG_IDENT_IE && ibh) {
 			struct buffer_head *nbh = NULL;
-			kernel_lb_addr loc;
+			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
 			if (ie->indirectICB.extLength &&
-			    (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+			    (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
 						    &ident))) {
 				if (ident == TAG_IDENT_FE ||
 				    ident == TAG_IDENT_EFE) {
 					memcpy(&iinfo->i_location,
 					       &loc,
-					       sizeof(kernel_lb_addr));
+					       sizeof(struct kernel_lb_addr));
 					brelse(bh);
 					brelse(ibh);
 					brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	inode->i_size = le64_to_cpu(fe->informationLength);
 	iinfo->i_lenExtents = inode->i_size;
 
-	inode->i_mode = udf_convert_permissions(fe);
-	inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+	    sbi->s_fmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_fmode;
+	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+		 sbi->s_dmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_dmode;
+	else
+		inode->i_mode = udf_convert_permissions(fe);
+	inode->i_mode &= ~sbi->s_umask;
 
 	if (iinfo->i_efe == 0) {
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	bh = udf_tread(inode->i_sb,
 		       udf_get_lb_pblock(inode->i_sb,
-					 iinfo->i_location, 0));
+					 &iinfo->i_location, 0));
 	if (!bh) {
 		udf_debug("bread failure\n");
 		return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
 		crclen = sizeof(struct unallocSpaceEntry) +
-				iinfo->i_lenAlloc - sizeof(tag);
+				iinfo->i_lenAlloc - sizeof(struct tag);
 		use->descTag.tagLocation = cpu_to_le32(
 						iinfo->i_location.
 							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
 		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-							     sizeof(tag),
+							     sizeof(struct tag),
 							     crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->informationLength = cpu_to_le64(inode->i_size);
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		regid *eid;
+		struct regid *eid;
 		struct deviceSpec *dsea =
 			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
 		if (!dsea) {
 			dsea = (struct deviceSpec *)
 				udf_add_extendedattr(inode,
 						     sizeof(struct deviceSpec) +
-						     sizeof(regid), 12, 0x3);
+						     sizeof(struct regid), 12, 0x3);
 			dsea->attrType = cpu_to_le32(12);
 			dsea->attrSubtype = 1;
 			dsea->attrLength = cpu_to_le32(
 						sizeof(struct deviceSpec) +
-						sizeof(regid));
-			dsea->impUseLength = cpu_to_le32(sizeof(regid));
+						sizeof(struct regid));
+			dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
 		}
-		eid = (regid *)dsea->impUse;
-		memset(eid, 0, sizeof(regid));
+		eid = (struct regid *)dsea->impUse;
+		memset(eid, 0, sizeof(struct regid));
 		strcpy(eid->ident, UDF_ID_DEVELOPER);
 		eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
 		eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 	udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
 	udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
-	memset(&(fe->impIdent), 0, sizeof(regid));
+	memset(&(fe->impIdent), 0, sizeof(struct regid));
 	strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
 	fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
 	udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
-	memset(&(efe->impIdent), 0, sizeof(regid));
+	memset(&(efe->impIdent), 0, sizeof(struct regid));
 	strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
 	efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->descTag.tagLocation = cpu_to_le32(
 					iinfo->i_location.logicalBlockNum);
 	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-		  sizeof(tag);
+		  sizeof(struct tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
-	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
 						    crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 		return NULL;
 
 	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
 		__udf_read_inode(inode);
 		unlock_new_inode(inode);
 	}
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 	if (is_bad_inode(inode))
 		goto out_iput;
 
-	if (ino.logicalBlockNum >= UDF_SB(sb)->
-	    s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+	if (ino->logicalBlockNum >= UDF_SB(sb)->
+	    s_partmaps[ino->partitionReferenceNum].s_partition_len) {
 		udf_debug("block=%d, partition=%d out of range\n",
-			  ino.logicalBlockNum, ino.partitionReferenceNum);
+			  ino->logicalBlockNum, ino->partitionReferenceNum);
 		make_bad_inode(inode);
 		goto out_iput;
 	}
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 }
 
 int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-		    kernel_lb_addr eloc, uint32_t elen, int inc)
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
-	short_ad *sad = NULL;
-	long_ad *lad = NULL;
+	struct short_ad *sad = NULL;
+	struct long_ad *lad = NULL;
 	struct allocExtDesc *aed;
 	int8_t etype;
 	uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		ptr = epos->bh->b_data + epos->offset;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
-		kernel_lb_addr obloc = epos->block;
+		struct kernel_lb_addr obloc = epos->block;
 
 		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
 						obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		if (!epos->block.logicalBlockNum)
 			return -1;
 		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-								 epos->block,
+								 &epos->block,
 								 0));
 		if (!nbh)
 			return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		}
 		if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-				    epos->block.logicalBlockNum, sizeof(tag));
+				    epos->block.logicalBlockNum, sizeof(struct tag));
 		else
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-				    epos->block.logicalBlockNum, sizeof(tag));
+				    epos->block.logicalBlockNum, sizeof(struct tag));
 		switch (iinfo->i_alloc_type) {
 		case ICBTAG_FLAG_AD_SHORT:
-			sad = (short_ad *)sptr;
+			sad = (struct short_ad *)sptr;
 			sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 						     inode->i_sb->s_blocksize);
 			sad->extPosition =
 				cpu_to_le32(epos->block.logicalBlockNum);
 			break;
 		case ICBTAG_FLAG_AD_LONG:
-			lad = (long_ad *)sptr;
+			lad = (struct long_ad *)sptr;
 			lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 						     inode->i_sb->s_blocksize);
 			lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-		      kernel_lb_addr eloc, uint32_t elen, int inc)
+		      struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)ptr;
+		sad = (struct short_ad *)ptr;
 		sad->extLength = cpu_to_le32(elen);
-		sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
-		adsize = sizeof(short_ad);
+		sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+		adsize = sizeof(struct short_ad);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)ptr;
+		lad = (struct long_ad *)ptr;
 		lad->extLength = cpu_to_le32(elen);
-		lad->extLocation = cpu_to_lelb(eloc);
+		lad->extLocation = cpu_to_lelb(*eloc);
 		memset(lad->impUse, 0x00, sizeof(lad->impUse));
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 		break;
 	default:
 		return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
-		     kernel_lb_addr *eloc, uint32_t *elen, int inc)
+		     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int8_t etype;
 
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 		epos->block = *eloc;
 		epos->offset = sizeof(struct allocExtDesc);
 		brelse(epos->bh);
-		block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+		block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
 		epos->bh = udf_tread(inode->i_sb, block);
 		if (!epos->bh) {
 			udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
-			kernel_lb_addr *eloc, uint32_t *elen, int inc)
+			struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int alen;
 	int8_t etype;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
 }
 
 static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
-			      kernel_lb_addr neloc, uint32_t nelen)
+			      struct kernel_lb_addr neloc, uint32_t nelen)
 {
-	kernel_lb_addr oeloc;
+	struct kernel_lb_addr oeloc;
 	uint32_t oelen;
 	int8_t etype;
 
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
 		get_bh(epos.bh);
 
 	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
-		udf_write_aext(inode, &epos, neloc, nelen, 1);
+		udf_write_aext(inode, &epos, &neloc, nelen, 1);
 		neloc = oeloc;
 		nelen = (etype << 30) | oelen;
 	}
-	udf_add_aext(inode, &epos, neloc, nelen, 1);
+	udf_add_aext(inode, &epos, &neloc, nelen, 1);
 	brelse(epos.bh);
 
 	return (nelen >> 30);
 }
 
 int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
-		       kernel_lb_addr eloc, uint32_t elen)
+		       struct kernel_lb_addr eloc, uint32_t elen)
 {
 	struct extent_position oepos;
 	int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 
 	iinfo = UDF_I(inode);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 		return -1;
 
 	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
-		udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
 		if (oepos.bh != epos.bh) {
 			oepos.block = epos.block;
 			brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			oepos.offset = epos.offset - adsize;
 		}
 	}
-	memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+	memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
 	elen = 0;
 
 	if (epos.bh != oepos.bh) {
-		udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= (adsize * 2);
 			mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	} else {
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= adsize;
 			mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 }
 
 int8_t inode_bmap(struct inode *inode, sector_t block,
-		  struct extent_position *pos, kernel_lb_addr *eloc,
+		  struct extent_position *pos, struct kernel_lb_addr *eloc,
 		  uint32_t *elen, sector_t *offset)
 {
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 
 long udf_block_map(struct inode *inode, sector_t block)
 {
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
 
 	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
 	    (EXT_RECORDED_ALLOCATED >> 30))
-		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	else
 		ret = 0;
 
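
Besides the mechanical struct/pointer conversions, inode.c picks up one behavioral change in udf_fill_inode(): the on-disk permissions can now be overridden per mount through sbi->s_fmode and sbi->s_dmode. The added ladder, reproduced with explanatory comments (the comments are annotation here, not part of the patch):

	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
	    sbi->s_fmode != UDF_INVALID_MODE)
		inode->i_mode = sbi->s_fmode;	/* forced mode for files */
	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
		 sbi->s_dmode != UDF_INVALID_MODE)
		inode->i_mode = sbi->s_dmode;	/* forced mode for directories */
	else
		inode->i_mode = udf_convert_permissions(fe);
	inode->i_mode &= ~sbi->s_umask;		/* umask applies in every case */

UDF_INVALID_MODE serves as the "option not given" sentinel, so mounts that set neither override keep the permissions recorded in the file entry, exactly as before.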
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		}
 	}
 	/* rewrite CRC + checksum of eahd */
-	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
 	eahd->descTag.descCRCLength = cpu_to_le16(crclen);
 	eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-						sizeof(tag), crclen));
+						sizeof(struct tag), crclen));
 	eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 	iinfo->i_lenEAttr += size;
 	return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
202struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, 202struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
203 uint32_t location, uint16_t *ident) 203 uint32_t location, uint16_t *ident)
204{ 204{
205 tag *tag_p; 205 struct tag *tag_p;
206 struct buffer_head *bh = NULL; 206 struct buffer_head *bh = NULL;
207 207
208 /* Read the block */ 208 /* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
216 return NULL; 216 return NULL;
217 } 217 }
218 218
219 tag_p = (tag *)(bh->b_data); 219 tag_p = (struct tag *)(bh->b_data);
220 220
221 *ident = le16_to_cpu(tag_p->tagIdent); 221 *ident = le16_to_cpu(tag_p->tagIdent);
222 222
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
241 } 241 }
242 242
243 /* Verify the descriptor CRC */ 243 /* Verify the descriptor CRC */
244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || 244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, 245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
246 bh->b_data + sizeof(tag), 246 bh->b_data + sizeof(struct tag),
247 le16_to_cpu(tag_p->descCRCLength))) 247 le16_to_cpu(tag_p->descCRCLength)))
248 return bh; 248 return bh;
249 249
@@ -255,27 +255,28 @@ error_out:
255 return NULL; 255 return NULL;
256} 256}
257 257
258struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc, 258struct buffer_head *udf_read_ptagged(struct super_block *sb,
259 struct kernel_lb_addr *loc,
259 uint32_t offset, uint16_t *ident) 260 uint32_t offset, uint16_t *ident)
260{ 261{
261 return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), 262 return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
262 loc.logicalBlockNum + offset, ident); 263 loc->logicalBlockNum + offset, ident);
263} 264}
264 265
265void udf_update_tag(char *data, int length) 266void udf_update_tag(char *data, int length)
266{ 267{
267 tag *tptr = (tag *)data; 268 struct tag *tptr = (struct tag *)data;
268 length -= sizeof(tag); 269 length -= sizeof(struct tag);
269 270
270 tptr->descCRCLength = cpu_to_le16(length); 271 tptr->descCRCLength = cpu_to_le16(length);
271 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length)); 272 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
272 tptr->tagChecksum = udf_tag_checksum(tptr); 273 tptr->tagChecksum = udf_tag_checksum(tptr);
273} 274}
274 275
275void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, 276void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
276 uint32_t loc, int length) 277 uint32_t loc, int length)
277{ 278{
278 tag *tptr = (tag *)data; 279 struct tag *tptr = (struct tag *)data;
279 tptr->tagIdent = cpu_to_le16(ident); 280 tptr->tagIdent = cpu_to_le16(ident);
280 tptr->descVersion = cpu_to_le16(version); 281 tptr->descVersion = cpu_to_le16(version);
281 tptr->tagSerialNum = cpu_to_le16(snum); 282 tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
283 udf_update_tag(data, length); 284 udf_update_tag(data, length);
284} 285}
285 286
286u8 udf_tag_checksum(const tag *t) 287u8 udf_tag_checksum(const struct tag *t)
287{ 288{
288 u8 *data = (u8 *)t; 289 u8 *data = (u8 *)t;
289 u8 checksum = 0; 290 u8 checksum = 0;
290 int i; 291 int i;
291 for (i = 0; i < sizeof(tag); ++i) 292 for (i = 0; i < sizeof(struct tag); ++i)
292 if (i != 4) /* position of checksum */ 293 if (i != 4) /* position of checksum */
293 checksum += data[i]; 294 checksum += data[i];
294 return checksum; 295 return checksum;
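udf_tag_checksum() above implements the ECMA-167 rule: the descriptor tag's checksum is the byte sum of the 16-byte tag with the checksum byte itself (offset 4) skipped, so the stored value never feeds into its own computation. A standalone sketch of the same rule — the field layout follows the spec, the names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 16-byte descriptor tag; layout per ECMA-167, with tagChecksum at
 * byte offset 4 exactly as the loop in udf_tag_checksum() assumes. */
struct tag {
	uint16_t tagIdent;
	uint16_t descVersion;
	uint8_t  tagChecksum;
	uint8_t  reserved;
	uint16_t tagSerialNum;
	uint16_t descCRC;
	uint16_t descCRCLength;
	uint32_t tagLocation;
} __attribute__((packed));

/* Same rule as the kernel routine: byte-sum the whole tag while
 * skipping the checksum byte itself. */
static uint8_t tag_checksum(const struct tag *t)
{
	const uint8_t *data = (const uint8_t *)t;
	uint8_t checksum = 0;
	int i;

	for (i = 0; i < (int)sizeof(struct tag); i++)
		if (i != 4)	/* position of tagChecksum */
			checksum += data[i];
	return checksum;
}

int main(void)
{
	struct tag t;

	memset(&t, 0xAB, sizeof(t));
	t.tagChecksum = tag_checksum(&t);
	printf("checksum = 0x%02x\n", (unsigned)t.tagChecksum);
	return 0;
}

The CRC is separate from the checksum: descCRC covers the descriptor body after the tag, which is why every crc_itu_t() call above starts at data + sizeof(struct tag).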
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
47 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, 47 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
48 uint8_t *impuse, uint8_t *fileident) 48 uint8_t *impuse, uint8_t *fileident)
49{ 49{
50 uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag); 50 uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
51 uint16_t crc; 51 uint16_t crc;
52 int offset; 52 int offset;
53 uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); 53 uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
99 memset(fibh->ebh->b_data, 0x00, padlen + offset); 99 memset(fibh->ebh->b_data, 0x00, padlen + offset);
100 } 100 }
101 101
102 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag), 102 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
103 sizeof(struct fileIdentDesc) - sizeof(tag)); 103 sizeof(struct fileIdentDesc) - sizeof(struct tag));
104 104
105 if (fibh->sbh == fibh->ebh) { 105 if (fibh->sbh == fibh->ebh) {
106 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, 106 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
107 crclen + sizeof(tag) - 107 crclen + sizeof(struct tag) -
108 sizeof(struct fileIdentDesc)); 108 sizeof(struct fileIdentDesc));
109 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { 109 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
110 crc = crc_itu_t(crc, fibh->ebh->b_data + 110 crc = crc_itu_t(crc, fibh->ebh->b_data +
111 sizeof(struct fileIdentDesc) + 111 sizeof(struct fileIdentDesc) +
112 fibh->soffset, 112 fibh->soffset,
113 crclen + sizeof(tag) - 113 crclen + sizeof(struct tag) -
114 sizeof(struct fileIdentDesc)); 114 sizeof(struct fileIdentDesc));
115 } else { 115 } else {
116 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, 116 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
157 kernel_lb_addr eloc; 157 struct kernel_lb_addr eloc;
158 uint32_t elen; 158 uint32_t elen;
159 sector_t offset; 159 sector_t offset;
160 struct extent_position epos = {}; 160 struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
171 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 171 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
172 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) 172 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
173 goto out_err; 173 goto out_err;
174 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 174 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
175 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 175 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
176 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 176 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
177 epos.offset -= sizeof(short_ad); 177 epos.offset -= sizeof(struct short_ad);
178 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 178 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
179 epos.offset -= sizeof(long_ad); 179 epos.offset -= sizeof(struct long_ad);
180 } else 180 } else
181 offset = 0; 181 offset = 0;
182 182
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
268#ifdef UDF_RECOVERY 268#ifdef UDF_RECOVERY
269 /* temporary shorthand for specifying files by inode number */ 269 /* temporary shorthand for specifying files by inode number */
270 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 270 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
271 kernel_lb_addr lb = { 271 struct kernel_lb_addr lb = {
272 .logicalBlockNum = 0, 272 .logicalBlockNum = 0,
273 .partitionReferenceNum = 273 .partitionReferenceNum =
274 simple_strtoul(dentry->d_name.name + 3, 274 simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
283#endif /* UDF_RECOVERY */ 283#endif /* UDF_RECOVERY */
284 284
285 if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { 285 if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
286 struct kernel_lb_addr loc;
287
286 if (fibh.sbh != fibh.ebh) 288 if (fibh.sbh != fibh.ebh)
287 brelse(fibh.ebh); 289 brelse(fibh.ebh);
288 brelse(fibh.sbh); 290 brelse(fibh.sbh);
289 291
290 inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation)); 292 loc = lelb_to_cpu(cfi.icb.extLocation);
293 inode = udf_iget(dir->i_sb, &loc);
291 if (!inode) { 294 if (!inode) {
292 unlock_kernel(); 295 unlock_kernel();
293 return ERR_PTR(-EACCES); 296 return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
313 uint8_t lfi; 316 uint8_t lfi;
314 uint16_t liu; 317 uint16_t liu;
315 int block; 318 int block;
316 kernel_lb_addr eloc; 319 struct kernel_lb_addr eloc;
317 uint32_t elen = 0; 320 uint32_t elen = 0;
318 sector_t offset; 321 sector_t offset;
319 struct extent_position epos = {}; 322 struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
351 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 354 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
352 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { 355 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
353 block = udf_get_lb_pblock(dir->i_sb, 356 block = udf_get_lb_pblock(dir->i_sb,
354 dinfo->i_location, 0); 357 &dinfo->i_location, 0);
355 fibh->soffset = fibh->eoffset = sb->s_blocksize; 358 fibh->soffset = fibh->eoffset = sb->s_blocksize;
356 goto add; 359 goto add;
357 } 360 }
358 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 361 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
359 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 362 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
360 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 363 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
361 epos.offset -= sizeof(short_ad); 364 epos.offset -= sizeof(struct short_ad);
362 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 365 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
363 epos.offset -= sizeof(long_ad); 366 epos.offset -= sizeof(struct long_ad);
364 } else 367 } else
365 offset = 0; 368 offset = 0;
366 369
@@ -409,10 +412,10 @@ add:
409 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { 412 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
410 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); 413 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
411 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 414 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
412 epos.offset -= sizeof(short_ad); 415 epos.offset -= sizeof(struct short_ad);
413 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 416 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
414 epos.offset -= sizeof(long_ad); 417 epos.offset -= sizeof(struct long_ad);
415 udf_write_aext(dir, &epos, eloc, elen, 1); 418 udf_write_aext(dir, &epos, &eloc, elen, 1);
416 } 419 }
417 f_pos += nfidlen; 420 f_pos += nfidlen;
418 421
@@ -494,10 +497,10 @@ add:
494 memset(cfi, 0, sizeof(struct fileIdentDesc)); 497 memset(cfi, 0, sizeof(struct fileIdentDesc));
495 if (UDF_SB(sb)->s_udfrev >= 0x0200) 498 if (UDF_SB(sb)->s_udfrev >= 0x0200)
496 udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, 499 udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
497 sizeof(tag)); 500 sizeof(struct tag));
498 else 501 else
499 udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, 502 udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
500 sizeof(tag)); 503 sizeof(struct tag));
501 cfi->fileVersionNum = cpu_to_le16(1); 504 cfi->fileVersionNum = cpu_to_le16(1);
502 cfi->lengthFileIdent = namelen; 505 cfi->lengthFileIdent = namelen;
503 cfi->lengthOfImpUse = cpu_to_le16(0); 506 cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
530 cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; 533 cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
531 534
532 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) 535 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
533 memset(&(cfi->icb), 0x00, sizeof(long_ad)); 536 memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
534 537
535 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); 538 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
536} 539}
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
710 loff_t f_pos; 713 loff_t f_pos;
711 loff_t size = udf_ext0_offset(dir) + dir->i_size; 714 loff_t size = udf_ext0_offset(dir) + dir->i_size;
712 int block; 715 int block;
713 kernel_lb_addr eloc; 716 struct kernel_lb_addr eloc;
714 uint32_t elen; 717 uint32_t elen;
715 sector_t offset; 718 sector_t offset;
716 struct extent_position epos = {}; 719 struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
724 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, 727 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
725 &epos, &eloc, &elen, &offset) == 728 &epos, &eloc, &elen, &offset) ==
726 (EXT_RECORDED_ALLOCATED >> 30)) { 729 (EXT_RECORDED_ALLOCATED >> 30)) {
727 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 730 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
728 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 731 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
729 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 732 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
730 epos.offset -= sizeof(short_ad); 733 epos.offset -= sizeof(struct short_ad);
731 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 734 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
732 epos.offset -= sizeof(long_ad); 735 epos.offset -= sizeof(struct long_ad);
733 } else 736 } else
734 offset = 0; 737 offset = 0;
735 738
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
778 struct inode *inode = dentry->d_inode; 781 struct inode *inode = dentry->d_inode;
779 struct udf_fileident_bh fibh; 782 struct udf_fileident_bh fibh;
780 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
781 kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
782 785
783 retval = -ENOENT; 786 retval = -ENOENT;
784 lock_kernel(); 787 lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
788 791
789 retval = -EIO; 792 retval = -EIO;
790 tloc = lelb_to_cpu(cfi.icb.extLocation); 793 tloc = lelb_to_cpu(cfi.icb.extLocation);
791 if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) 794 if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
792 goto end_rmdir; 795 goto end_rmdir;
793 retval = -ENOTEMPTY; 796 retval = -ENOTEMPTY;
794 if (!empty_dir(inode)) 797 if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
824 struct udf_fileident_bh fibh; 827 struct udf_fileident_bh fibh;
825 struct fileIdentDesc *fi; 828 struct fileIdentDesc *fi;
826 struct fileIdentDesc cfi; 829 struct fileIdentDesc cfi;
827 kernel_lb_addr tloc; 830 struct kernel_lb_addr tloc;
828 831
829 retval = -ENOENT; 832 retval = -ENOENT;
830 lock_kernel(); 833 lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
834 837
835 retval = -EIO; 838 retval = -EIO;
836 tloc = lelb_to_cpu(cfi.icb.extLocation); 839 tloc = lelb_to_cpu(cfi.icb.extLocation);
837 if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) 840 if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
838 goto end_unlink; 841 goto end_unlink;
839 842
840 if (!inode->i_nlink) { 843 if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
897 inode->i_op = &page_symlink_inode_operations; 900 inode->i_op = &page_symlink_inode_operations;
898 901
899 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 902 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
900 kernel_lb_addr eloc; 903 struct kernel_lb_addr eloc;
901 uint32_t bsize; 904 uint32_t bsize;
902 905
903 block = udf_new_block(inode->i_sb, inode, 906 block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
913 iinfo->i_location.partitionReferenceNum; 916 iinfo->i_location.partitionReferenceNum;
914 bsize = inode->i_sb->s_blocksize; 917 bsize = inode->i_sb->s_blocksize;
915 iinfo->i_lenExtents = bsize; 918 iinfo->i_lenExtents = bsize;
916 udf_add_aext(inode, &epos, eloc, bsize, 0); 919 udf_add_aext(inode, &epos, &eloc, bsize, 0);
917 brelse(epos.bh); 920 brelse(epos.bh);
918 921
919 block = udf_get_pblock(inode->i_sb, block, 922 block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1108 struct fileIdentDesc ocfi, ncfi; 1111 struct fileIdentDesc ocfi, ncfi;
1109 struct buffer_head *dir_bh = NULL; 1112 struct buffer_head *dir_bh = NULL;
1110 int retval = -ENOENT; 1113 int retval = -ENOENT;
1111 kernel_lb_addr tloc; 1114 struct kernel_lb_addr tloc;
1112 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1115 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1113 1116
1114 lock_kernel(); 1117 lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1119 brelse(ofibh.sbh); 1122 brelse(ofibh.sbh);
1120 } 1123 }
1121 tloc = lelb_to_cpu(ocfi.icb.extLocation); 1124 tloc = lelb_to_cpu(ocfi.icb.extLocation);
1122 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) 1125 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
1123 != old_inode->i_ino) 1126 != old_inode->i_ino)
1124 goto end_rename; 1127 goto end_rename;
1125 1128
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1158 if (!dir_fi) 1161 if (!dir_fi)
1159 goto end_rename; 1162 goto end_rename;
1160 tloc = lelb_to_cpu(dir_fi->icb.extLocation); 1163 tloc = lelb_to_cpu(dir_fi->icb.extLocation);
1161 if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != 1164 if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
1162 old_dir->i_ino) 1165 old_dir->i_ino)
1163 goto end_rename; 1166 goto end_rename;
1164 1167
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1187 */ 1190 */
1188 ncfi.fileVersionNum = ocfi.fileVersionNum; 1191 ncfi.fileVersionNum = ocfi.fileVersionNum;
1189 ncfi.fileCharacteristics = ocfi.fileCharacteristics; 1192 ncfi.fileCharacteristics = ocfi.fileCharacteristics;
1190 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); 1193 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
1191 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); 1194 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
1192 1195
1193 /* The old fid may have moved - find it again */ 1196 /* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
1242 1245
1243static struct dentry *udf_get_parent(struct dentry *child) 1246static struct dentry *udf_get_parent(struct dentry *child)
1244{ 1247{
1248 struct kernel_lb_addr tloc;
1245 struct inode *inode = NULL; 1249 struct inode *inode = NULL;
1246 struct qstr dotdot = {.name = "..", .len = 2}; 1250 struct qstr dotdot = {.name = "..", .len = 2};
1247 struct fileIdentDesc cfi; 1251 struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
1255 brelse(fibh.ebh); 1259 brelse(fibh.ebh);
1256 brelse(fibh.sbh); 1260 brelse(fibh.sbh);
1257 1261
1258 inode = udf_iget(child->d_inode->i_sb, 1262 tloc = lelb_to_cpu(cfi.icb.extLocation);
1259 lelb_to_cpu(cfi.icb.extLocation)); 1263 inode = udf_iget(child->d_inode->i_sb, &tloc);
1260 if (!inode) 1264 if (!inode)
1261 goto out_unlock; 1265 goto out_unlock;
1262 unlock_kernel(); 1266 unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1272 u16 partref, __u32 generation) 1276 u16 partref, __u32 generation)
1273{ 1277{
1274 struct inode *inode; 1278 struct inode *inode;
1275 kernel_lb_addr loc; 1279 struct kernel_lb_addr loc;
1276 1280
1277 if (block == 0) 1281 if (block == 0)
1278 return ERR_PTR(-ESTALE); 1282 return ERR_PTR(-ESTALE);
1279 1283
1280 loc.logicalBlockNum = block; 1284 loc.logicalBlockNum = block;
1281 loc.partitionReferenceNum = partref; 1285 loc.partitionReferenceNum = partref;
1282 inode = udf_iget(sb, loc); 1286 inode = udf_iget(sb, &loc);
1283 1287
1284 if (inode == NULL) 1288 if (inode == NULL)
1285 return ERR_PTR(-ENOMEM); 1289 return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1318{ 1322{
1319 int len = *lenp; 1323 int len = *lenp;
1320 struct inode *inode = de->d_inode; 1324 struct inode *inode = de->d_inode;
1321 kernel_lb_addr location = UDF_I(inode)->i_location; 1325 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1322 struct fid *fid = (struct fid *)fh; 1326 struct fid *fid = (struct fid *)fh;
1323 int type = FILEID_UDF_WITHOUT_PARENT; 1327 int type = FILEID_UDF_WITHOUT_PARENT;
1324 1328
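Several namei.c hunks (udf_lookup(), udf_get_parent()) add a local struct kernel_lb_addr just to hold lelb_to_cpu()'s result before calling udf_iget(). That is forced by C itself, as this sketch with stand-in helpers shows:

#include <stdint.h>

struct kernel_lb_addr {
	uint32_t logicalBlockNum;
	uint16_t partitionReferenceNum;
};

/* Stand-ins for the kernel helpers used in udf_lookup(). */
static struct kernel_lb_addr lelb_to_cpu(struct kernel_lb_addr le)
{
	return le;	/* byte swapping elided */
}

static void udf_iget(const struct kernel_lb_addr *loc)
{
	(void)loc;	/* inode lookup elided */
}

static void lookup_example(struct kernel_lb_addr on_disk)
{
	struct kernel_lb_addr loc;

	/* udf_iget(sb, &lelb_to_cpu(...)) would not compile: a function's
	 * return value is an rvalue and has no address in C.  Hence the
	 * named local the diff introduces. */
	loc = lelb_to_cpu(on_disk);
	udf_iget(&loc);
}

int main(void)
{
	struct kernel_lb_addr raw = { 42, 0 };

	lookup_example(raw);
	return 0;
}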
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ 85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
86/* Implementation Use (UDF 2.50 2.2.6.4) */ 86/* Implementation Use (UDF 2.50 2.2.6.4) */
87struct logicalVolIntegrityDescImpUse { 87struct logicalVolIntegrityDescImpUse {
88 regid impIdent; 88 struct regid impIdent;
89 __le32 numFiles; 89 __le32 numFiles;
90 __le32 numDirs; 90 __le32 numDirs;
91 __le16 minUDFReadRev; 91 __le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ 97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
98/* Implementation Use (UDF 2.50 2.2.7.2) */ 98/* Implementation Use (UDF 2.50 2.2.7.2) */
99struct impUseVolDescImpUse { 99struct impUseVolDescImpUse {
100 charspec LVICharset; 100 struct charspec LVICharset;
101 dstring logicalVolIdent[128]; 101 dstring logicalVolIdent[128];
102 dstring LVInfo1[36]; 102 dstring LVInfo1[36];
103 dstring LVInfo2[36]; 103 dstring LVInfo2[36];
104 dstring LVInfo3[36]; 104 dstring LVInfo3[36];
105 regid impIdent; 105 struct regid impIdent;
106 uint8_t impUse[128]; 106 uint8_t impUse[128];
107} __attribute__ ((packed)); 107} __attribute__ ((packed));
108 108
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
110 uint8_t partitionMapType; 110 uint8_t partitionMapType;
111 uint8_t partitionMapLength; 111 uint8_t partitionMapLength;
112 uint8_t reserved1[2]; 112 uint8_t reserved1[2];
113 regid partIdent; 113 struct regid partIdent;
114 __le16 volSeqNum; 114 __le16 volSeqNum;
115 __le16 partitionNum; 115 __le16 partitionNum;
116} __attribute__ ((packed)); 116} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
120 uint8_t partitionMapType; 120 uint8_t partitionMapType;
121 uint8_t partitionMapLength; 121 uint8_t partitionMapLength;
122 uint8_t reserved1[2]; 122 uint8_t reserved1[2];
123 regid partIdent; 123 struct regid partIdent;
124 __le16 volSeqNum; 124 __le16 volSeqNum;
125 __le16 partitionNum; 125 __le16 partitionNum;
126 uint8_t reserved2[24]; 126 uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
131 uint8_t partitionMapType; 131 uint8_t partitionMapType;
132 uint8_t partitionMapLength; 132 uint8_t partitionMapLength;
133 uint8_t reserved1[2]; 133 uint8_t reserved1[2];
134 regid partIdent; 134 struct regid partIdent;
135 __le16 volSeqNum; 135 __le16 volSeqNum;
136 __le16 partitionNum; 136 __le16 partitionNum;
137 __le16 packetLength; 137 __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
146 uint8_t partitionMapType; 146 uint8_t partitionMapType;
147 uint8_t partitionMapLength; 147 uint8_t partitionMapLength;
148 uint8_t reserved1[2]; 148 uint8_t reserved1[2];
149 regid partIdent; 149 struct regid partIdent;
150 __le16 volSeqNum; 150 __le16 volSeqNum;
151 __le16 partitionNum; 151 __le16 partitionNum;
152 __le32 metadataFileLoc; 152 __le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
161/* Virtual Allocation Table (UDF 1.5 2.2.10) */ 161/* Virtual Allocation Table (UDF 1.5 2.2.10) */
162struct virtualAllocationTable15 { 162struct virtualAllocationTable15 {
163 __le32 VirtualSector[0]; 163 __le32 VirtualSector[0];
164 regid vatIdent; 164 struct regid vatIdent;
165 __le32 previousVATICBLoc; 165 __le32 previousVATICBLoc;
166} __attribute__ ((packed)); 166} __attribute__ ((packed));
167 167
@@ -192,8 +192,8 @@ struct sparingEntry {
192} __attribute__ ((packed)); 192} __attribute__ ((packed));
193 193
194struct sparingTable { 194struct sparingTable {
195 tag descTag; 195 struct tag descTag;
196 regid sparingIdent; 196 struct regid sparingIdent;
197 __le16 reallocationTableLen; 197 __le16 reallocationTableLen;
198 __le16 reserved; 198 __le16 reserved;
199 __le32 sequenceNum; 199 __le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
206#define ICBTAG_FILE_TYPE_MIRROR 0xFB 206#define ICBTAG_FILE_TYPE_MIRROR 0xFB
207#define ICBTAG_FILE_TYPE_BITMAP 0xFC 207#define ICBTAG_FILE_TYPE_BITMAP 0xFC
208 208
209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ 209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
210struct allocDescImpUse { 210struct allocDescImpUse {
211 __le16 flags; 211 __le16 flags;
212 uint8_t impUse[4]; 212 uint8_t impUse[4];
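All of these osta_udf.h records are __attribute__((packed)) structs of fixed-width little-endian fields, so switching the member types from typedefs to struct tags cannot change the on-disk layout. A cut-down userspace sketch of the pattern — the field names are loosely modeled on struct sparingEntry, not the real definition:

#include <stdint.h>
#include <stdio.h>

struct sparing_entry {
	uint32_t orig_location;		/* __le32 on disk */
	uint8_t  mapped_location[4];	/* raw little-endian bytes */
} __attribute__((packed));

/* Assemble a little-endian 32-bit field from its raw bytes; in the
 * kernel le32_to_cpu() does this job. */
static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	struct sparing_entry e = {
		.orig_location = 0,
		.mapped_location = { 0x78, 0x56, 0x34, 0x12 },
	};

	/* packed => no padding, so sizeof matches the on-disk record */
	printf("sizeof = %zu, mapped = 0x%08x\n",
	       sizeof(e), (unsigned)get_le32(e.mapped_location));
	return 0;
}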
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
273{ 273{
274 struct super_block *sb = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map; 275 struct udf_part_map *map;
276 kernel_lb_addr eloc; 276 struct kernel_lb_addr eloc;
277 uint32_t elen; 277 uint32_t elen;
278 sector_t ext_offset; 278 sector_t ext_offset;
279 struct extent_position epos = {}; 279 struct extent_position epos = {};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
81/* These are the "meat" - everything else is stuffing */ 81/* These are the "meat" - everything else is stuffing */
82static int udf_fill_super(struct super_block *, void *, int); 82static int udf_fill_super(struct super_block *, void *, int);
83static void udf_put_super(struct super_block *); 83static void udf_put_super(struct super_block *);
84static void udf_write_super(struct super_block *); 84static int udf_sync_fs(struct super_block *, int);
85static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
86static int udf_check_valid(struct super_block *, int, int); 86static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
87static int udf_vrs(struct super_block *sb, int silent); 87static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88 struct kernel_lb_addr *);
89static void udf_find_anchor(struct super_block *);
90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
91 kernel_lb_addr *);
92static void udf_load_fileset(struct super_block *, struct buffer_head *, 89static void udf_load_fileset(struct super_block *, struct buffer_head *,
93 kernel_lb_addr *); 90 struct kernel_lb_addr *);
94static void udf_open_lvid(struct super_block *); 91static void udf_open_lvid(struct super_block *);
95static void udf_close_lvid(struct super_block *); 92static void udf_close_lvid(struct super_block *);
96static unsigned int udf_count_free(struct super_block *); 93static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
181 .delete_inode = udf_delete_inode, 178 .delete_inode = udf_delete_inode,
182 .clear_inode = udf_clear_inode, 179 .clear_inode = udf_clear_inode,
183 .put_super = udf_put_super, 180 .put_super = udf_put_super,
184 .write_super = udf_write_super, 181 .sync_fs = udf_sync_fs,
185 .statfs = udf_statfs, 182 .statfs = udf_statfs,
186 .remount_fs = udf_remount_fs, 183 .remount_fs = udf_remount_fs,
187 .show_options = udf_show_options, 184 .show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
201 mode_t umask; 198 mode_t umask;
202 gid_t gid; 199 gid_t gid;
203 uid_t uid; 200 uid_t uid;
201 mode_t fmode;
202 mode_t dmode;
204 struct nls_table *nls_map; 203 struct nls_table *nls_map;
205}; 204};
206 205
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
258 257
259 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) 258 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
260 seq_puts(seq, ",nostrict"); 259 seq_puts(seq, ",nostrict");
261 if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) 260 if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
262 seq_printf(seq, ",bs=%lu", sb->s_blocksize); 261 seq_printf(seq, ",bs=%lu", sb->s_blocksize);
263 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) 262 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
264 seq_puts(seq, ",unhide"); 263 seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
282 seq_printf(seq, ",gid=%u", sbi->s_gid); 281 seq_printf(seq, ",gid=%u", sbi->s_gid);
283 if (sbi->s_umask != 0) 282 if (sbi->s_umask != 0)
284 seq_printf(seq, ",umask=%o", sbi->s_umask); 283 seq_printf(seq, ",umask=%o", sbi->s_umask);
284 if (sbi->s_fmode != UDF_INVALID_MODE)
285 seq_printf(seq, ",mode=%o", sbi->s_fmode);
286 if (sbi->s_dmode != UDF_INVALID_MODE)
287 seq_printf(seq, ",dmode=%o", sbi->s_dmode);
285 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) 288 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
286 seq_printf(seq, ",session=%u", sbi->s_session); 289 seq_printf(seq, ",session=%u", sbi->s_session);
287 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) 290 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
288 seq_printf(seq, ",lastblock=%u", sbi->s_last_block); 291 seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
289 /* 292 if (sbi->s_anchor != 0)
290 * s_anchor[2] could be zeroed out in case there is no anchor 293 seq_printf(seq, ",anchor=%u", sbi->s_anchor);
291 * in the specified block, but then the "anchor=N" option
292 * originally given by the user wasn't effective, so it's OK
293 * if we don't show it.
294 */
295 if (sbi->s_anchor[2] != 0)
296 seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
297 /* 294 /*
298 * volume, partition, fileset and rootdir seem to be ignored 295 * volume, partition, fileset and rootdir seem to be ignored
299 * currently 296 * currently
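The udf_show_options() changes above follow one pattern: print an option only when it differs from its unset sentinel, so /proc/mounts stays minimal. A userspace mimic of that flow — UDF_INVALID_MODE mirrors the sentinel this commit introduces, and fprintf() stands in for seq_printf():

#include <stdio.h>

#define UDF_INVALID_MODE ((unsigned int)-1)

static void show_options(FILE *seq, unsigned int fmode, unsigned int dmode,
			 unsigned int anchor)
{
	if (fmode != UDF_INVALID_MODE)
		fprintf(seq, ",mode=%o", fmode);
	if (dmode != UDF_INVALID_MODE)
		fprintf(seq, ",dmode=%o", dmode);
	if (anchor != 0)	/* a single block now, not the old s_anchor[] */
		fprintf(seq, ",anchor=%u", anchor);
}

int main(void)
{
	/* only mode= was set, so only mode= is shown */
	show_options(stdout, 0644, UDF_INVALID_MODE, 0);
	putchar('\n');
	return 0;
}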
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
317 * 314 *
318 * gid= Set the default group. 315 * gid= Set the default group.
319 * umask= Set the default umask. 316 * umask= Set the default umask.
317 * mode= Set the default file permissions.
318 * dmode= Set the default directory permissions.
320 * uid= Set the default user. 319 * uid= Set the default user.
321 * bs= Set the block size. 320 * bs= Set the block size.
322 * unhide Show otherwise hidden files. 321 * unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
366 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 365 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
367 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 366 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
368 Opt_rootdir, Opt_utf8, Opt_iocharset, 367 Opt_rootdir, Opt_utf8, Opt_iocharset,
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 368 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
369 Opt_fmode, Opt_dmode
370}; 370};
371 371
372static const match_table_t tokens = { 372static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
395 {Opt_rootdir, "rootdir=%u"}, 395 {Opt_rootdir, "rootdir=%u"},
396 {Opt_utf8, "utf8"}, 396 {Opt_utf8, "utf8"},
397 {Opt_iocharset, "iocharset=%s"}, 397 {Opt_iocharset, "iocharset=%s"},
398 {Opt_fmode, "mode=%o"},
399 {Opt_dmode, "dmode=%o"},
398 {Opt_err, NULL} 400 {Opt_err, NULL}
399}; 401};
400 402
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
405 int option; 407 int option;
406 408
407 uopt->novrs = 0; 409 uopt->novrs = 0;
408 uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
409 uopt->partition = 0xFFFF; 410 uopt->partition = 0xFFFF;
410 uopt->session = 0xFFFFFFFF; 411 uopt->session = 0xFFFFFFFF;
411 uopt->lastblock = 0; 412 uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
428 switch (token) { 429 switch (token) {
429 case Opt_novrs: 430 case Opt_novrs:
430 uopt->novrs = 1; 431 uopt->novrs = 1;
432 break;
431 case Opt_bs: 433 case Opt_bs:
432 if (match_int(&args[0], &option)) 434 if (match_int(&args[0], &option))
433 return 0; 435 return 0;
434 uopt->blocksize = option; 436 uopt->blocksize = option;
437 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
435 break; 438 break;
436 case Opt_unhide: 439 case Opt_unhide:
437 uopt->flags |= (1 << UDF_FLAG_UNHIDE); 440 uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
531 case Opt_gforget: 534 case Opt_gforget:
532 uopt->flags |= (1 << UDF_FLAG_GID_FORGET); 535 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
533 break; 536 break;
537 case Opt_fmode:
538 if (match_octal(args, &option))
539 return 0;
540 uopt->fmode = option & 0777;
541 break;
542 case Opt_dmode:
543 if (match_octal(args, &option))
544 return 0;
545 uopt->dmode = option & 0777;
546 break;
534 default: 547 default:
535 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 printk(KERN_ERR "udf: bad mount option \"%s\" "
536 "or missing value\n", p); 549 "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
540 return 1; 553 return 1;
541} 554}
542 555
543static void udf_write_super(struct super_block *sb)
544{
545 lock_kernel();
546
547 if (!(sb->s_flags & MS_RDONLY))
548 udf_open_lvid(sb);
549 sb->s_dirt = 0;
550
551 unlock_kernel();
552}
553
554static int udf_remount_fs(struct super_block *sb, int *flags, char *options) 556static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
555{ 557{
556 struct udf_options uopt; 558 struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
560 uopt.uid = sbi->s_uid; 562 uopt.uid = sbi->s_uid;
561 uopt.gid = sbi->s_gid; 563 uopt.gid = sbi->s_gid;
562 uopt.umask = sbi->s_umask; 564 uopt.umask = sbi->s_umask;
565 uopt.fmode = sbi->s_fmode;
566 uopt.dmode = sbi->s_dmode;
563 567
564 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
565 return -EINVAL; 569 return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 sbi->s_uid = uopt.uid; 572 sbi->s_uid = uopt.uid;
569 sbi->s_gid = uopt.gid; 573 sbi->s_gid = uopt.gid;
570 sbi->s_umask = uopt.umask; 574 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode;
571 577
572 if (sbi->s_lvid_bh) { 578 if (sbi->s_lvid_bh) {
573 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
585 return 0; 591 return 0;
586} 592}
587 593
588static int udf_vrs(struct super_block *sb, int silent) 594/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
595/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
596static loff_t udf_check_vsd(struct super_block *sb)
589{ 597{
590 struct volStructDesc *vsd = NULL; 598 struct volStructDesc *vsd = NULL;
591 loff_t sector = 32768; 599 loff_t sector = 32768;
592 int sectorsize; 600 int sectorsize;
593 struct buffer_head *bh = NULL; 601 struct buffer_head *bh = NULL;
594 int iso9660 = 0;
595 int nsr02 = 0; 602 int nsr02 = 0;
596 int nsr03 = 0; 603 int nsr03 = 0;
597 struct udf_sb_info *sbi; 604 struct udf_sb_info *sbi;
598 605
599 /* Block size must be a multiple of 512 */
600 if (sb->s_blocksize & 511)
601 return 0;
602 sbi = UDF_SB(sb); 606 sbi = UDF_SB(sb);
603
604 if (sb->s_blocksize < sizeof(struct volStructDesc)) 607 if (sb->s_blocksize < sizeof(struct volStructDesc))
605 sectorsize = sizeof(struct volStructDesc); 608 sectorsize = sizeof(struct volStructDesc);
606 else 609 else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
627 break; 630 break;
628 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, 631 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
629 VSD_STD_ID_LEN)) { 632 VSD_STD_ID_LEN)) {
630 iso9660 = sector;
631 switch (vsd->structType) { 633 switch (vsd->structType) {
632 case 0: 634 case 0:
633 udf_debug("ISO9660 Boot Record found\n"); 635 udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
679 return 0; 681 return 0;
680} 682}
681 683
682/*
683 * Check whether there is an anchor block in the given block
684 */
685static int udf_check_anchor_block(struct super_block *sb, sector_t block)
686{
687 struct buffer_head *bh;
688 uint16_t ident;
689
690 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
691 udf_fixed_to_variable(block) >=
692 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
693 return 0;
694
695 bh = udf_read_tagged(sb, block, block, &ident);
696 if (!bh)
697 return 0;
698 brelse(bh);
699
700 return ident == TAG_IDENT_AVDP;
701}
702
703/* Search for an anchor volume descriptor pointer */
704static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
705{
706 sector_t last[6];
707 int i;
708 struct udf_sb_info *sbi = UDF_SB(sb);
709
710 last[0] = lastblock;
711 last[1] = last[0] - 1;
712 last[2] = last[0] + 1;
713 last[3] = last[0] - 2;
714 last[4] = last[0] - 150;
715 last[5] = last[0] - 152;
716
717 /* according to spec, anchor is in either:
718 * block 256
719 * lastblock-256
720 * lastblock
721 * however, if the disc isn't closed, it could be 512 */
722
723 for (i = 0; i < ARRAY_SIZE(last); i++) {
724 if (last[i] < 0)
725 continue;
726 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
727 sb->s_blocksize_bits)
728 continue;
729
730 if (udf_check_anchor_block(sb, last[i])) {
731 sbi->s_anchor[0] = last[i];
732 sbi->s_anchor[1] = last[i] - 256;
733 return last[i];
734 }
735
736 if (last[i] < 256)
737 continue;
738
739 if (udf_check_anchor_block(sb, last[i] - 256)) {
740 sbi->s_anchor[1] = last[i] - 256;
741 return last[i];
742 }
743 }
744
745 if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
746 sbi->s_anchor[0] = sbi->s_session + 256;
747 return last[0];
748 }
749 if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
750 sbi->s_anchor[0] = sbi->s_session + 512;
751 return last[0];
752 }
753 return 0;
754}
755
756/*
757 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
758 * be the last block on the media.
759 *
760 * Return 1 if not found, 0 if ok
761 *
762 */
763static void udf_find_anchor(struct super_block *sb)
764{
765 sector_t lastblock;
766 struct buffer_head *bh = NULL;
767 uint16_t ident;
768 int i;
769 struct udf_sb_info *sbi = UDF_SB(sb);
770
771 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
772 if (lastblock)
773 goto check_anchor;
774
775 /* No anchor found? Try VARCONV conversion of block numbers */
776 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
777 /* Firstly, we try to not convert number of the last block */
778 lastblock = udf_scan_anchors(sb,
779 udf_variable_to_fixed(sbi->s_last_block));
780 if (lastblock)
781 goto check_anchor;
782
783 /* Secondly, we try with converted number of the last block */
784 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
785 if (!lastblock) {
786 /* VARCONV didn't help. Clear it. */
787 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
788 }
789
790check_anchor:
791 /*
792 * Check located anchors and the anchor block supplied via
793 * mount options
794 */
795 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
796 if (!sbi->s_anchor[i])
797 continue;
798 bh = udf_read_tagged(sb, sbi->s_anchor[i],
799 sbi->s_anchor[i], &ident);
800 if (!bh)
801 sbi->s_anchor[i] = 0;
802 else {
803 brelse(bh);
804 if (ident != TAG_IDENT_AVDP)
805 sbi->s_anchor[i] = 0;
806 }
807 }
808
809 sbi->s_last_block = lastblock;
810}
811
812static int udf_find_fileset(struct super_block *sb, 684static int udf_find_fileset(struct super_block *sb,
813 kernel_lb_addr *fileset, 685 struct kernel_lb_addr *fileset,
814 kernel_lb_addr *root) 686 struct kernel_lb_addr *root)
815{ 687{
816 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
817 long lastblock; 689 long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
820 692
821 if (fileset->logicalBlockNum != 0xFFFFFFFF || 693 if (fileset->logicalBlockNum != 0xFFFFFFFF ||
822 fileset->partitionReferenceNum != 0xFFFF) { 694 fileset->partitionReferenceNum != 0xFFFF) {
823 bh = udf_read_ptagged(sb, *fileset, 0, &ident); 695 bh = udf_read_ptagged(sb, fileset, 0, &ident);
824 696
825 if (!bh) { 697 if (!bh) {
826 return 1; 698 return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
834 sbi = UDF_SB(sb); 706 sbi = UDF_SB(sb);
835 if (!bh) { 707 if (!bh) {
836 /* Search backwards through the partitions */ 708 /* Search backwards through the partitions */
837 kernel_lb_addr newfileset; 709 struct kernel_lb_addr newfileset;
838 710
839/* --> cvg: FIXME - is it reasonable? */ 711/* --> cvg: FIXME - is it reasonable? */
840 return 1; 712 return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
850 newfileset.logicalBlockNum = 0; 722 newfileset.logicalBlockNum = 0;
851 723
852 do { 724 do {
853 bh = udf_read_ptagged(sb, newfileset, 0, 725 bh = udf_read_ptagged(sb, &newfileset, 0,
854 &ident); 726 &ident);
855 if (!bh) { 727 if (!bh) {
856 newfileset.logicalBlockNum++; 728 newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
902static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 774static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
903{ 775{
904 struct primaryVolDesc *pvoldesc; 776 struct primaryVolDesc *pvoldesc;
905 struct ustr instr; 777 struct ustr *instr, *outstr;
906 struct ustr outstr;
907 struct buffer_head *bh; 778 struct buffer_head *bh;
908 uint16_t ident; 779 uint16_t ident;
780 int ret = 1;
781
782 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
783 if (!instr)
784 return 1;
785
786 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
787 if (!outstr)
788 goto out1;
909 789
910 bh = udf_read_tagged(sb, block, block, &ident); 790 bh = udf_read_tagged(sb, block, block, &ident);
911 if (!bh) 791 if (!bh)
912 return 1; 792 goto out2;
793
913 BUG_ON(ident != TAG_IDENT_PVD); 794 BUG_ON(ident != TAG_IDENT_PVD);
914 795
915 pvoldesc = (struct primaryVolDesc *)bh->b_data; 796 pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
917 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, 798 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
918 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
919#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
920 timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
921 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u"
922 " %02u:%02u (%x)\n", 803 " %02u:%02u (%x)\n",
923 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 804 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
925#endif 806#endif
926 } 807 }
927 808
928 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 809 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
929 if (udf_CS0toUTF8(&outstr, &instr)) { 810 if (udf_CS0toUTF8(outstr, instr)) {
930 strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, 811 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
931 outstr.u_len > 31 ? 31 : outstr.u_len); 812 outstr->u_len > 31 ? 31 : outstr->u_len);
932 udf_debug("volIdent[] = '%s'\n", 813 udf_debug("volIdent[] = '%s'\n",
933 UDF_SB(sb)->s_volume_ident); 814 UDF_SB(sb)->s_volume_ident);
934 } 815 }
935 816
936 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 817 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
937 if (udf_CS0toUTF8(&outstr, &instr)) 818 if (udf_CS0toUTF8(outstr, instr))
938 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 819 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
939 820
940 brelse(bh); 821 brelse(bh);
941 return 0; 822 ret = 0;
823out2:
824 kfree(outstr);
825out1:
826 kfree(instr);
827 return ret;
942} 828}
943 829
944static int udf_load_metadata_files(struct super_block *sb, int partition) 830static int udf_load_metadata_files(struct super_block *sb, int partition)
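udf_load_pvoldesc() above trades two large on-stack struct ustr buffers for kmalloc(..., GFP_NOFS) allocations with goto-label unwinding; GFP_NOFS keeps the allocator from recursing back into filesystem code while the mount is in flight. A userspace mimic of the unwinding shape — malloc() stands in for kmalloc(), and struct ustr is trimmed to a plausible shape:

#include <stdio.h>
#include <stdlib.h>

struct ustr {
	char u_name[128];
	int  u_len;
};

static int load_pvoldesc_sketch(void)
{
	struct ustr *instr, *outstr;
	int ret = 1;	/* pessimistic default, as in the diff */

	instr = malloc(sizeof(struct ustr));
	if (!instr)
		return 1;

	outstr = malloc(sizeof(struct ustr));
	if (!outstr)
		goto out1;	/* only instr exists yet */

	/* ... read the descriptor and convert identifiers here ... */
	ret = 0;

	free(outstr);
out1:
	free(instr);
	return ret;
}

int main(void)
{
	printf("ret = %d\n", load_pvoldesc_sketch());
	return 0;
}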
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
946 struct udf_sb_info *sbi = UDF_SB(sb); 832 struct udf_sb_info *sbi = UDF_SB(sb);
947 struct udf_part_map *map; 833 struct udf_part_map *map;
948 struct udf_meta_data *mdata; 834 struct udf_meta_data *mdata;
949 kernel_lb_addr addr; 835 struct kernel_lb_addr addr;
950 int fe_error = 0; 836 int fe_error = 0;
951 837
952 map = &sbi->s_partmaps[partition]; 838 map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
959 udf_debug("Metadata file location: block = %d part = %d\n", 845 udf_debug("Metadata file location: block = %d part = %d\n",
960 addr.logicalBlockNum, addr.partitionReferenceNum); 846 addr.logicalBlockNum, addr.partitionReferenceNum);
961 847
962 mdata->s_metadata_fe = udf_iget(sb, addr); 848 mdata->s_metadata_fe = udf_iget(sb, &addr);
963 849
964 if (mdata->s_metadata_fe == NULL) { 850 if (mdata->s_metadata_fe == NULL) {
965 udf_warning(sb, __func__, "metadata inode efe not found, " 851 udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
981 udf_debug("Mirror metadata file location: block = %d part = %d\n", 867 udf_debug("Mirror metadata file location: block = %d part = %d\n",
982 addr.logicalBlockNum, addr.partitionReferenceNum); 868 addr.logicalBlockNum, addr.partitionReferenceNum);
983 869
984 mdata->s_mirror_fe = udf_iget(sb, addr); 870 mdata->s_mirror_fe = udf_iget(sb, &addr);
985 871
986 if (mdata->s_mirror_fe == NULL) { 872 if (mdata->s_mirror_fe == NULL) {
987 if (fe_error) { 873 if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1013 udf_debug("Bitmap file location: block = %d part = %d\n", 899 udf_debug("Bitmap file location: block = %d part = %d\n",
1014 addr.logicalBlockNum, addr.partitionReferenceNum); 900 addr.logicalBlockNum, addr.partitionReferenceNum);
1015 901
1016 mdata->s_bitmap_fe = udf_iget(sb, addr); 902 mdata->s_bitmap_fe = udf_iget(sb, &addr);
1017 903
1018 if (mdata->s_bitmap_fe == NULL) { 904 if (mdata->s_bitmap_fe == NULL) {
1019 if (sb->s_flags & MS_RDONLY) 905 if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
1037} 923}
1038 924
1039static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 925static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1040 kernel_lb_addr *root) 926 struct kernel_lb_addr *root)
1041{ 927{
1042 struct fileSetDesc *fset; 928 struct fileSetDesc *fset;
1043 929
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1119 1005
1120 phd = (struct partitionHeaderDesc *)p->partitionContentsUse; 1006 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1121 if (phd->unallocSpaceTable.extLength) { 1007 if (phd->unallocSpaceTable.extLength) {
1122 kernel_lb_addr loc = { 1008 struct kernel_lb_addr loc = {
1123 .logicalBlockNum = le32_to_cpu( 1009 .logicalBlockNum = le32_to_cpu(
1124 phd->unallocSpaceTable.extPosition), 1010 phd->unallocSpaceTable.extPosition),
1125 .partitionReferenceNum = p_index, 1011 .partitionReferenceNum = p_index,
1126 }; 1012 };
1127 1013
1128 map->s_uspace.s_table = udf_iget(sb, loc); 1014 map->s_uspace.s_table = udf_iget(sb, &loc);
1129 if (!map->s_uspace.s_table) { 1015 if (!map->s_uspace.s_table) {
1130 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1016 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1131 p_index); 1017 p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1154 udf_debug("partitionIntegrityTable (part %d)\n", p_index); 1040 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1155 1041
1156 if (phd->freedSpaceTable.extLength) { 1042 if (phd->freedSpaceTable.extLength) {
1157 kernel_lb_addr loc = { 1043 struct kernel_lb_addr loc = {
1158 .logicalBlockNum = le32_to_cpu( 1044 .logicalBlockNum = le32_to_cpu(
1159 phd->freedSpaceTable.extPosition), 1045 phd->freedSpaceTable.extPosition),
1160 .partitionReferenceNum = p_index, 1046 .partitionReferenceNum = p_index,
1161 }; 1047 };
1162 1048
1163 map->s_fspace.s_table = udf_iget(sb, loc); 1049 map->s_fspace.s_table = udf_iget(sb, &loc);
1164 if (!map->s_fspace.s_table) { 1050 if (!map->s_fspace.s_table) {
1165 udf_debug("cannot load freedSpaceTable (part %d)\n", 1051 udf_debug("cannot load freedSpaceTable (part %d)\n",
1166 p_index); 1052 p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1192{ 1078{
1193 struct udf_sb_info *sbi = UDF_SB(sb); 1079 struct udf_sb_info *sbi = UDF_SB(sb);
1194 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1080 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1195 kernel_lb_addr ino; 1081 struct kernel_lb_addr ino;
1196 struct buffer_head *bh = NULL; 1082 struct buffer_head *bh = NULL;
1197 struct udf_inode_info *vati; 1083 struct udf_inode_info *vati;
1198 uint32_t pos; 1084 uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1201 /* VAT file entry is in the last recorded block */ 1087 /* VAT file entry is in the last recorded block */
1202 ino.partitionReferenceNum = type1_index; 1088 ino.partitionReferenceNum = type1_index;
1203 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1204 sbi->s_vat_inode = udf_iget(sb, ino); 1090 sbi->s_vat_inode = udf_iget(sb, &ino);
1205 if (!sbi->s_vat_inode) 1091 if (!sbi->s_vat_inode)
1206 return 1; 1092 return 1;
1207 1093
@@ -1322,7 +1208,7 @@ out_bh:
1322} 1208}
1323 1209
1324static int udf_load_logicalvol(struct super_block *sb, sector_t block, 1210static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1325 kernel_lb_addr *fileset) 1211 struct kernel_lb_addr *fileset)
1326{ 1212{
1327 struct logicalVolDesc *lvd; 1213 struct logicalVolDesc *lvd;
1328 int i, j, offset; 1214 int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1471 } 1357 }
1472 1358
1473 if (fileset) { 1359 if (fileset) {
1474 long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); 1360 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1475 1361
1476 *fileset = lelb_to_cpu(la->extLocation); 1362 *fileset = lelb_to_cpu(la->extLocation);
1477 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1363 udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
1490 * udf_load_logicalvolint 1376 * udf_load_logicalvolint
1491 * 1377 *
1492 */ 1378 */
1493static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) 1379static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
1494{ 1380{
1495 struct buffer_head *bh = NULL; 1381 struct buffer_head *bh = NULL;
1496 uint16_t ident; 1382 uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1533 * Written, tested, and released. 1419 * Written, tested, and released.
1534 */ 1420 */
1535static noinline int udf_process_sequence(struct super_block *sb, long block, 1421static noinline int udf_process_sequence(struct super_block *sb, long block,
1536 long lastblock, kernel_lb_addr *fileset) 1422 long lastblock, struct kernel_lb_addr *fileset)
1537{ 1423{
1538 struct buffer_head *bh = NULL; 1424 struct buffer_head *bh = NULL;
1539 struct udf_vds_record vds[VDS_POS_LENGTH]; 1425 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1655 return 0; 1541 return 0;
1656} 1542}
1657 1543
1544static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1545 struct kernel_lb_addr *fileset)
1546{
1547 struct anchorVolDescPtr *anchor;
1548 long main_s, main_e, reserve_s, reserve_e;
1549 struct udf_sb_info *sbi;
1550
1551 sbi = UDF_SB(sb);
1552 anchor = (struct anchorVolDescPtr *)bh->b_data;
1553
1554 /* Locate the main sequence */
1555 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
1556 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
1557 main_e = main_e >> sb->s_blocksize_bits;
1558 main_e += main_s;
1559
1560 /* Locate the reserve sequence */
1561 reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
1562 reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
1563 reserve_e = reserve_e >> sb->s_blocksize_bits;
1564 reserve_e += reserve_s;
1565
1566 /* Process the main & reserve sequences */
1567 /* responsible for finding the PartitionDesc(s) */
1568 if (!udf_process_sequence(sb, main_s, main_e, fileset))
1569 return 1;
1570 return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1571}
1572
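The new udf_load_sequence() converts each extent from the anchor's (location, byte length) pair into a block range before walking it: shift the length down by s_blocksize_bits and add the start block. A small sketch of that arithmetic with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ext_location = 256;	/* first block of the sequence */
	uint32_t ext_length = 32768;	/* bytes, from the anchor descriptor */
	unsigned int blocksize_bits = 11;	/* 2048-byte blocks */

	uint32_t main_s = ext_location;
	uint32_t main_e = (ext_length >> blocksize_bits) + main_s;

	/* blocks main_s..main_e bound the main volume descriptor sequence */
	printf("sequence spans blocks %u..%u\n",
	       (unsigned)main_s, (unsigned)main_e);
	return 0;
}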
1658/* 1573/*
1659 * udf_check_valid() 1574 * Check whether there is an anchor block in the given block and
1575 * load Volume Descriptor Sequence if so.
1660 */ 1576 */
1661static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1577static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1578 struct kernel_lb_addr *fileset)
1662{ 1579{
1663 long block; 1580 struct buffer_head *bh;
1664 struct udf_sb_info *sbi = UDF_SB(sb); 1581 uint16_t ident;
1582 int ret;
1665 1583
1666 if (novrs) { 1584 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1667 udf_debug("Validity check skipped because of novrs option\n"); 1585 udf_fixed_to_variable(block) >=
1586 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1587 return 0;
1588
1589 bh = udf_read_tagged(sb, block, block, &ident);
1590 if (!bh)
1591 return 0;
1592 if (ident != TAG_IDENT_AVDP) {
1593 brelse(bh);
1668 return 0; 1594 return 0;
1669 } 1595 }
1670 /* Check that it is NSR02 compliant */ 1596 ret = udf_load_sequence(sb, bh, fileset);
1671 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1597 brelse(bh);
1672 block = udf_vrs(sb, silent); 1598 return ret;
1673 if (block == -1)
1674 udf_debug("Failed to read byte 32768. Assuming open "
1675 "disc. Skipping validity check\n");
1676 if (block && !sbi->s_last_block)
1677 sbi->s_last_block = udf_get_last_block(sb);
1678 return !block;
1679} 1599}
1680 1600
1681static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) 1601/* Search for an anchor volume descriptor pointer */
1602static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1603 struct kernel_lb_addr *fileset)
1682{ 1604{
1683 struct anchorVolDescPtr *anchor; 1605 sector_t last[6];
1684 uint16_t ident;
1685 struct buffer_head *bh;
1686 long main_s, main_e, reserve_s, reserve_e;
1687 int i; 1606 int i;
1688 struct udf_sb_info *sbi; 1607 struct udf_sb_info *sbi = UDF_SB(sb);
1689 1608 int last_count = 0;
1690 if (!sb)
1691 return 1;
1692 sbi = UDF_SB(sb);
1693 1609
1694 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1610 /* First try user provided anchor */
1695 if (!sbi->s_anchor[i]) 1611 if (sbi->s_anchor) {
1612 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
1613 return lastblock;
1614 }
1615 /*
1616 * according to spec, anchor is in either:
1617 * block 256
1618 * lastblock-256
1619 * lastblock
1620 * however, if the disc isn't closed, it could be 512.
1621 */
1622 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
1623 return lastblock;
1624 /*
1625 * The trouble is which block is the last one. Drives often misreport
1626 * this so we try various possibilities.
1627 */
1628 last[last_count++] = lastblock;
1629 if (lastblock >= 1)
1630 last[last_count++] = lastblock - 1;
1631 last[last_count++] = lastblock + 1;
1632 if (lastblock >= 2)
1633 last[last_count++] = lastblock - 2;
1634 if (lastblock >= 150)
1635 last[last_count++] = lastblock - 150;
1636 if (lastblock >= 152)
1637 last[last_count++] = lastblock - 152;
1638
1639 for (i = 0; i < last_count; i++) {
1640 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1641 sb->s_blocksize_bits)
1696 continue; 1642 continue;
1697 1643 if (udf_check_anchor_block(sb, last[i], fileset))
1698 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1644 return last[i];
1699 &ident); 1645 if (last[i] < 256)
1700 if (!bh)
1701 continue; 1646 continue;
1647 if (udf_check_anchor_block(sb, last[i] - 256, fileset))
1648 return last[i];
1649 }
1702 1650
1703 anchor = (struct anchorVolDescPtr *)bh->b_data; 1651 /* Finally try block 512 in case media is open */
1652 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
1653 return last[0];
1654 return 0;
1655}
1704 1656
1705 /* Locate the main sequence */ 1657/*
1706 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); 1658 * Find an anchor volume descriptor and load Volume Descriptor Sequence from
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_lastblock to be the last
1708 main_e = main_e >> sb->s_blocksize_bits; 1660 * block on the media.
1709 main_e += main_s; 1661 *
1662 * Return 1 if ok, 0 if not found.
1663 *
1664 */
1665static int udf_find_anchor(struct super_block *sb,
1666 struct kernel_lb_addr *fileset)
1667{
1668 sector_t lastblock;
1669 struct udf_sb_info *sbi = UDF_SB(sb);
1710 1670
1711 /* Locate the reserve sequence */ 1671 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1712 reserve_s = le32_to_cpu( 1672 if (lastblock)
1713 anchor->reserveVolDescSeqExt.extLocation); 1673 goto out;
1714 reserve_e = le32_to_cpu(
1715 anchor->reserveVolDescSeqExt.extLength);
1716 reserve_e = reserve_e >> sb->s_blocksize_bits;
1717 reserve_e += reserve_s;
1718 1674
1719 brelse(bh); 1675 /* No anchor found? Try VARCONV conversion of block numbers */
1676 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1677 /* Firstly, we try to not convert number of the last block */
1678 lastblock = udf_scan_anchors(sb,
1679 udf_variable_to_fixed(sbi->s_last_block),
1680 fileset);
1681 if (lastblock)
1682 goto out;
1720 1683
1721 /* Process the main & reserve sequences */ 1684 /* Secondly, we try with converted number of the last block */
1722 /* responsible for finding the PartitionDesc(s) */ 1685 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1723 if (!(udf_process_sequence(sb, main_s, main_e, 1686 if (!lastblock) {
1724 fileset) && 1687 /* VARCONV didn't help. Clear it. */
1725 udf_process_sequence(sb, reserve_s, reserve_e, 1688 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1726 fileset))) 1689 return 0;
1727 break;
1728 } 1690 }
1691out:
1692 sbi->s_last_block = lastblock;
1693 return 1;
1694}
1729 1695
1730 if (i == ARRAY_SIZE(sbi->s_anchor)) { 1696/*
1731 udf_debug("No Anchor block found\n"); 1697 * Check Volume Structure Descriptor, find Anchor block and load Volume
1732 return 1; 1698 * Descriptor Sequence
1699 */
1700static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1701 int silent, struct kernel_lb_addr *fileset)
1702{
1703 struct udf_sb_info *sbi = UDF_SB(sb);
1704 loff_t nsr_off;
1705
1706 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1707 if (!silent)
1708 printk(KERN_WARNING "UDF-fs: Bad block size\n");
1709 return 0;
1710 }
1711 sbi->s_last_block = uopt->lastblock;
1712 if (!uopt->novrs) {
1713 /* Check that it is NSR02 compliant */
1714 nsr_off = udf_check_vsd(sb);
1715 if (!nsr_off) {
1716 if (!silent)
1717 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1718 return 0;
1719 }
1720 if (nsr_off == -1)
1721 udf_debug("Failed to read byte 32768. Assuming open "
1722 "disc. Skipping validity check\n");
1723 if (!sbi->s_last_block)
1724 sbi->s_last_block = udf_get_last_block(sb);
1725 } else {
1726 udf_debug("Validity check skipped because of novrs option\n");
1733 } 1727 }
1734 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1735 1728
1736 return 0; 1729 /* Look for anchor block and load Volume Descriptor Sequence */
1730 sbi->s_anchor = uopt->anchor;
1731 if (!udf_find_anchor(sb, fileset)) {
1732 if (!silent)
1733 printk(KERN_WARNING "UDF-fs: No anchor found\n");
1734 return 0;
1735 }
1736 return 1;
1737} 1737}
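
For readers tracing the new anchor discovery above, the sketch below restates the candidate order of udf_scan_anchors() as a standalone C function. It is an illustration only: probe() stands in for udf_check_anchor_block(), and nr_blocks for the device-size check against bd_inode->i_size.

#include <stdint.h>

typedef uint64_t sector_t;

/* Stand-in for udf_check_anchor_block(): nonzero means a valid AVDP
 * was found at the given block and the VDS was loaded. */
typedef int (*probe_fn)(sector_t block);

/* Candidate order mirrored from the hunk above: session+256 first,
 * then the reported last block and nearby guesses (drives often
 * misreport it), each retried 256 blocks earlier, and finally
 * session+512 for media that were never closed. */
static sector_t scan_anchors(sector_t session, sector_t lastblock,
                             sector_t nr_blocks, probe_fn probe)
{
    sector_t last[6];
    int last_count = 0, i;

    if (probe(session + 256))
        return lastblock;

    last[last_count++] = lastblock;
    if (lastblock >= 1)
        last[last_count++] = lastblock - 1;
    last[last_count++] = lastblock + 1;
    if (lastblock >= 2)
        last[last_count++] = lastblock - 2;
    if (lastblock >= 150)
        last[last_count++] = lastblock - 150;
    if (lastblock >= 152)
        last[last_count++] = lastblock - 152;

    for (i = 0; i < last_count; i++) {
        if (last[i] >= nr_blocks)       /* candidate past end of device */
            continue;
        if (probe(last[i]))
            return last[i];
        if (last[i] >= 256 && probe(last[i] - 256))
            return last[i];
    }

    if (probe(session + 512))
        return last[0];
    return 0;                           /* no anchor found */
}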
1738 1738
1739static void udf_open_lvid(struct super_block *sb) 1739static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
1742 struct buffer_head *bh = sbi->s_lvid_bh; 1742 struct buffer_head *bh = sbi->s_lvid_bh;
1743 struct logicalVolIntegrityDesc *lvid; 1743 struct logicalVolIntegrityDesc *lvid;
1744 struct logicalVolIntegrityDescImpUse *lvidiu; 1744 struct logicalVolIntegrityDescImpUse *lvidiu;
1745
1745 if (!bh) 1746 if (!bh)
1746 return; 1747 return;
1747
1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1749 lvidiu = udf_sb_lvidiu(sbi); 1749 lvidiu = udf_sb_lvidiu(sbi);
1750 1750
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1754 CURRENT_TIME); 1754 CURRENT_TIME);
1755 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; 1755 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1756 1756
1757 lvid->descTag.descCRC = cpu_to_le16( 1757 lvid->descTag.descCRC = cpu_to_le16(
1758 crc_itu_t(0, (char *)lvid + sizeof(tag), 1758 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1759 le16_to_cpu(lvid->descTag.descCRCLength))); 1759 le16_to_cpu(lvid->descTag.descCRCLength)));
1760 1760
1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1762 mark_buffer_dirty(bh); 1762 mark_buffer_dirty(bh);
1763 sbi->s_lvid_dirty = 0;
1763} 1764}
1764 1765
1765static void udf_close_lvid(struct super_block *sb) 1766static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
1773 return; 1774 return;
1774 1775
1775 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1776 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1776
1777 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1778 return;
1779
1780 lvidiu = udf_sb_lvidiu(sbi); 1777 lvidiu = udf_sb_lvidiu(sbi);
1781 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1778 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1782 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1779 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
1790 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1787 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1791 1788
1792 lvid->descTag.descCRC = cpu_to_le16( 1789 lvid->descTag.descCRC = cpu_to_le16(
1793 crc_itu_t(0, (char *)lvid + sizeof(tag), 1790 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1794 le16_to_cpu(lvid->descTag.descCRCLength))); 1791 le16_to_cpu(lvid->descTag.descCRCLength)));
1795 1792
1796 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1793 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1797 mark_buffer_dirty(bh); 1794 mark_buffer_dirty(bh);
1795 sbi->s_lvid_dirty = 0;
1798} 1796}
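
Both udf_open_lvid() and udf_close_lvid() end by refreshing descTag.descCRC via crc_itu_t() over the bytes that follow the tag. For reference, a bit-at-a-time sketch of that checksum (CRC16 ITU-T V.41, polynomial 0x1021, MSB first, zero seed; the kernel's lib/crc-itu-t.c is a table-driven equivalent):

#include <stdint.h>
#include <stddef.h>

/* CRC16 ITU-T V.41 (x^16 + x^12 + x^5 + 1), MSB-first. For a UDF
 * descriptor it runs over descCRCLength bytes starting just past the
 * 16-byte tag, with an initial value of 0. */
static uint16_t crc_itu_t_sketch(uint16_t crc, const uint8_t *buf, size_t len)
{
    size_t i;
    int bit;

    for (i = 0; i < len; i++) {
        crc ^= (uint16_t)buf[i] << 8;
        for (bit = 0; bit < 8; bit++)
            crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x1021)
                                 : (uint16_t)(crc << 1);
    }
    return crc;
}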
1799 1797
1800static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1798static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
1846static int udf_fill_super(struct super_block *sb, void *options, int silent) 1844static int udf_fill_super(struct super_block *sb, void *options, int silent)
1847{ 1845{
1848 int i; 1846 int i;
1847 int ret;
1849 struct inode *inode = NULL; 1848 struct inode *inode = NULL;
1850 struct udf_options uopt; 1849 struct udf_options uopt;
1851 kernel_lb_addr rootdir, fileset; 1850 struct kernel_lb_addr rootdir, fileset;
1852 struct udf_sb_info *sbi; 1851 struct udf_sb_info *sbi;
1853 1852
1854 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1853 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1855 uopt.uid = -1; 1854 uopt.uid = -1;
1856 uopt.gid = -1; 1855 uopt.gid = -1;
1857 uopt.umask = 0; 1856 uopt.umask = 0;
1857 uopt.fmode = UDF_INVALID_MODE;
1858 uopt.dmode = UDF_INVALID_MODE;
1858 1859
1859 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1860 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1860 if (!sbi) 1861 if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1892 sbi->s_uid = uopt.uid; 1893 sbi->s_uid = uopt.uid;
1893 sbi->s_gid = uopt.gid; 1894 sbi->s_gid = uopt.gid;
1894 sbi->s_umask = uopt.umask; 1895 sbi->s_umask = uopt.umask;
1896 sbi->s_fmode = uopt.fmode;
1897 sbi->s_dmode = uopt.dmode;
1895 sbi->s_nls_map = uopt.nls_map; 1898 sbi->s_nls_map = uopt.nls_map;
1896 1899
1897 /* Set the block size for all transfers */
1898 if (!sb_min_blocksize(sb, uopt.blocksize)) {
1899 udf_debug("Bad block size (%d)\n", uopt.blocksize);
1900 printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
1901 goto error_out;
1902 }
1903
1904 if (uopt.session == 0xFFFFFFFF) 1900 if (uopt.session == 0xFFFFFFFF)
1905 sbi->s_session = udf_get_last_session(sb); 1901 sbi->s_session = udf_get_last_session(sb);
1906 else 1902 else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1908 1904
1909 udf_debug("Multi-session=%d\n", sbi->s_session); 1905 udf_debug("Multi-session=%d\n", sbi->s_session);
1910 1906
1911 sbi->s_last_block = uopt.lastblock;
1912 sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
1913 sbi->s_anchor[2] = uopt.anchor;
1914
1915 if (udf_check_valid(sb, uopt.novrs, silent)) {
1916 /* read volume recognition sequences */
1917 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1918 goto error_out;
1919 }
1920
1921 udf_find_anchor(sb);
1922
1923 /* Fill in the rest of the superblock */ 1907 /* Fill in the rest of the superblock */
1924 sb->s_op = &udf_sb_ops; 1908 sb->s_op = &udf_sb_ops;
1925 sb->s_export_op = &udf_export_ops; 1909 sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1928 sb->s_magic = UDF_SUPER_MAGIC; 1912 sb->s_magic = UDF_SUPER_MAGIC;
1929 sb->s_time_gran = 1000; 1913 sb->s_time_gran = 1000;
1930 1914
1931 if (udf_load_sequence(sb, &fileset)) { 1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent)
1922 printk(KERN_NOTICE
1923 "UDF-fs: Rescanning with blocksize "
1924 "%d\n", UDF_DEFAULT_BLOCKSIZE);
1925 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
1926 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1927 }
1928 }
1929 if (!ret) {
1932 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1930 printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
1933 goto error_out; 1931 goto error_out;
1934 } 1932 }
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1978 } 1976 }
1979 1977
1980 if (!silent) { 1978 if (!silent) {
1981 timestamp ts; 1979 struct timestamp ts;
1982 udf_time_to_disk_stamp(&ts, sbi->s_record_time); 1980 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
1983 udf_info("UDF: Mounting volume '%s', " 1981 udf_info("UDF: Mounting volume '%s', "
1984 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 1982 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1991 /* Assign the root inode */ 1989 /* Assign the root inode */
1992 /* assign inodes by physical block number */ 1990 /* assign inodes by physical block number */
1993 /* perhaps it's not extensible enough, but for now ... */ 1991 /* perhaps it's not extensible enough, but for now ... */
1994 inode = udf_iget(sb, rootdir); 1992 inode = udf_iget(sb, &rootdir);
1995 if (!inode) { 1993 if (!inode) {
1996 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " 1994 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
1997 "partition=%d\n", 1995 "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
2081 sb->s_fs_info = NULL; 2079 sb->s_fs_info = NULL;
2082} 2080}
2083 2081
2082static int udf_sync_fs(struct super_block *sb, int wait)
2083{
2084 struct udf_sb_info *sbi = UDF_SB(sb);
2085
2086 mutex_lock(&sbi->s_alloc_mutex);
2087 if (sbi->s_lvid_dirty) {
2088 /*
2089 * Blockdevice will be synced later so we don't have to submit
2090 * the buffer for IO
2091 */
2092 mark_buffer_dirty(sbi->s_lvid_bh);
2093 sb->s_dirt = 0;
2094 sbi->s_lvid_dirty = 0;
2095 }
2096 mutex_unlock(&sbi->s_alloc_mutex);
2097
2098 return 0;
2099}
2100
2084static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) 2101static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2085{ 2102{
2086 struct super_block *sb = dentry->d_sb; 2103 struct super_block *sb = dentry->d_sb;
2087 struct udf_sb_info *sbi = UDF_SB(sb); 2104 struct udf_sb_info *sbi = UDF_SB(sb);
2088 struct logicalVolIntegrityDescImpUse *lvidiu; 2105 struct logicalVolIntegrityDescImpUse *lvidiu;
2106 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
2089 2107
2090 if (sbi->s_lvid_bh != NULL) 2108 if (sbi->s_lvid_bh != NULL)
2091 lvidiu = udf_sb_lvidiu(sbi); 2109 lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2101 le32_to_cpu(lvidiu->numDirs)) : 0) 2119 le32_to_cpu(lvidiu->numDirs)) : 0)
2102 + buf->f_bfree; 2120 + buf->f_bfree;
2103 buf->f_ffree = buf->f_bfree; 2121 buf->f_ffree = buf->f_bfree;
2104 /* __kernel_fsid_t f_fsid */
2105 buf->f_namelen = UDF_NAME_LEN - 2; 2122 buf->f_namelen = UDF_NAME_LEN - 2;
2123 buf->f_fsid.val[0] = (u32)id;
2124 buf->f_fsid.val[1] = (u32)(id >> 32);
2106 2125
2107 return 0; 2126 return 0;
2108} 2127}
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2114 unsigned int accum = 0; 2133 unsigned int accum = 0;
2115 int index; 2134 int index;
2116 int block = 0, newblock; 2135 int block = 0, newblock;
2117 kernel_lb_addr loc; 2136 struct kernel_lb_addr loc;
2118 uint32_t bytes; 2137 uint32_t bytes;
2119 uint8_t *ptr; 2138 uint8_t *ptr;
2120 uint16_t ident; 2139 uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2124 2143
2125 loc.logicalBlockNum = bitmap->s_extPosition; 2144 loc.logicalBlockNum = bitmap->s_extPosition;
2126 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2145 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2127 bh = udf_read_ptagged(sb, loc, 0, &ident); 2146 bh = udf_read_ptagged(sb, &loc, 0, &ident);
2128 2147
2129 if (!bh) { 2148 if (!bh) {
2130 printk(KERN_ERR "udf: udf_count_free failed\n"); 2149 printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2147 bytes -= cur_bytes; 2166 bytes -= cur_bytes;
2148 if (bytes) { 2167 if (bytes) {
2149 brelse(bh); 2168 brelse(bh);
2150 newblock = udf_get_lb_pblock(sb, loc, ++block); 2169 newblock = udf_get_lb_pblock(sb, &loc, ++block);
2151 bh = udf_tread(sb, newblock); 2170 bh = udf_tread(sb, newblock);
2152 if (!bh) { 2171 if (!bh) {
2153 udf_debug("read failed\n"); 2172 udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2170{ 2189{
2171 unsigned int accum = 0; 2190 unsigned int accum = 0;
2172 uint32_t elen; 2191 uint32_t elen;
2173 kernel_lb_addr eloc; 2192 struct kernel_lb_addr eloc;
2174 int8_t etype; 2193 int8_t etype;
2175 struct extent_position epos; 2194 struct extent_position epos;
2176 2195
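
A note on the udf_fill_super() flow above: when no blocksize option was given, the mount first probes with the device hardware sector size and rescans once with the UDF default of 2048 bytes. A hedged sketch of that retry shape (load_vrs() and struct opts are stand-ins for udf_load_vrs() and udf_options):

#include <stdio.h>

#define UDF_DEFAULT_BLOCKSIZE 2048

struct opts { int blocksize; };

/* Stand-in for udf_load_vrs(): 1 on success, 0 on failure. */
static int load_vrs(struct opts *o) { return o->blocksize == 2048; }

static int mount_with_retry(int user_blocksize, int hardsect_size)
{
    struct opts o;
    int ret;

    if (user_blocksize) {               /* explicit bs= option: use it */
        o.blocksize = user_blocksize;
        return load_vrs(&o);
    }
    o.blocksize = hardsect_size;        /* first try the sector size */
    ret = load_vrs(&o);
    if (!ret && o.blocksize != UDF_DEFAULT_BLOCKSIZE) {
        printf("rescanning with blocksize %d\n", UDF_DEFAULT_BLOCKSIZE);
        o.blocksize = UDF_DEFAULT_BLOCKSIZE;
        ret = load_vrs(&o);             /* one retry with the default */
    }
    return ret;
}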
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30static void extent_trunc(struct inode *inode, struct extent_position *epos, 30static void extent_trunc(struct inode *inode, struct extent_position *epos,
31 kernel_lb_addr eloc, int8_t etype, uint32_t elen, 31 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
32 uint32_t nelen) 32 uint32_t nelen)
33{ 33{
34 kernel_lb_addr neloc = {}; 34 struct kernel_lb_addr neloc = {};
35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >> 35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
36 inode->i_sb->s_blocksize_bits; 36 inode->i_sb->s_blocksize_bits;
37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> 37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
43 last_block); 43 last_block);
44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); 44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
45 } else 45 } else
46 neloc = eloc; 46 neloc = *eloc;
47 nelen = (etype << 30) | nelen; 47 nelen = (etype << 30) | nelen;
48 } 48 }
49 49
50 if (elen != nelen) { 50 if (elen != nelen) {
51 udf_write_aext(inode, epos, neloc, nelen, 0); 51 udf_write_aext(inode, epos, &neloc, nelen, 0);
52 if (last_block - first_block > 0) { 52 if (last_block - first_block > 0) {
53 if (etype == (EXT_RECORDED_ALLOCATED >> 30)) 53 if (etype == (EXT_RECORDED_ALLOCATED >> 30))
54 mark_inode_dirty(inode); 54 mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
68void udf_truncate_tail_extent(struct inode *inode) 68void udf_truncate_tail_extent(struct inode *inode)
69{ 69{
70 struct extent_position epos = {}; 70 struct extent_position epos = {};
71 kernel_lb_addr eloc; 71 struct kernel_lb_addr eloc;
72 uint32_t elen, nelen; 72 uint32_t elen, nelen;
73 uint64_t lbcount = 0; 73 uint64_t lbcount = 0;
74 int8_t etype = -1, netype; 74 int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
83 return; 83 return;
84 84
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 adsize = sizeof(short_ad); 86 adsize = sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
88 adsize = sizeof(long_ad); 88 adsize = sizeof(struct long_ad);
89 else 89 else
90 BUG(); 90 BUG();
91 91
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
106 (unsigned)elen); 106 (unsigned)elen);
107 nelen = elen - (lbcount - inode->i_size); 107 nelen = elen - (lbcount - inode->i_size);
108 epos.offset -= adsize; 108 epos.offset -= adsize;
109 extent_trunc(inode, &epos, eloc, etype, elen, nelen); 109 extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
110 epos.offset += adsize; 110 epos.offset += adsize;
111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) 111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
112 printk(KERN_ERR "udf_truncate_tail_extent(): " 112 printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
124void udf_discard_prealloc(struct inode *inode) 124void udf_discard_prealloc(struct inode *inode)
125{ 125{
126 struct extent_position epos = { NULL, 0, {0, 0} }; 126 struct extent_position epos = { NULL, 0, {0, 0} };
127 kernel_lb_addr eloc; 127 struct kernel_lb_addr eloc;
128 uint32_t elen; 128 uint32_t elen;
129 uint64_t lbcount = 0; 129 uint64_t lbcount = 0;
130 int8_t etype = -1, netype; 130 int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
136 return; 136 return;
137 137
138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
139 adsize = sizeof(short_ad); 139 adsize = sizeof(struct short_ad);
140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
141 adsize = sizeof(long_ad); 141 adsize = sizeof(struct long_ad);
142 else 142 else
143 adsize = 0; 143 adsize = 0;
144 144
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
153 epos.offset -= adsize; 153 epos.offset -= adsize;
154 lbcount -= elen; 154 lbcount -= elen;
155 extent_trunc(inode, &epos, eloc, etype, elen, 0); 155 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
156 if (!epos.bh) { 156 if (!epos.bh) {
157 iinfo->i_lenAlloc = 157 iinfo->i_lenAlloc =
158 epos.offset - 158 epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
200void udf_truncate_extents(struct inode *inode) 200void udf_truncate_extents(struct inode *inode)
201{ 201{
202 struct extent_position epos; 202 struct extent_position epos;
203 kernel_lb_addr eloc, neloc = {}; 203 struct kernel_lb_addr eloc, neloc = {};
204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; 204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
205 int8_t etype; 205 int8_t etype;
206 struct super_block *sb = inode->i_sb; 206 struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
210 struct udf_inode_info *iinfo = UDF_I(inode); 210 struct udf_inode_info *iinfo = UDF_I(inode);
211 211
212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
213 adsize = sizeof(short_ad); 213 adsize = sizeof(struct short_ad);
214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
215 adsize = sizeof(long_ad); 215 adsize = sizeof(struct long_ad);
216 else 216 else
217 BUG(); 217 BUG();
218 218
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
221 (inode->i_size & (sb->s_blocksize - 1)); 221 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 222 if (etype != -1) {
223 epos.offset -= adsize; 223 epos.offset -= adsize;
224 extent_trunc(inode, &epos, eloc, etype, elen, byte_offset); 224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
225 epos.offset += adsize; 225 epos.offset += adsize;
226 if (byte_offset) 226 if (byte_offset)
227 lenalloc = epos.offset; 227 lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
236 while ((etype = udf_current_aext(inode, &epos, &eloc, 236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) { 237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { 238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, neloc, nelen, 0); 239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) { 240 if (indirect_ext_len) {
241 /* We managed to free all extents in the 241 /* We managed to free all extents in the
242 * indirect extent - free it too */ 242 * indirect extent - free it too */
243 BUG_ON(!epos.bh); 243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, epos.block, 244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len); 245 0, indirect_ext_len);
246 } else if (!epos.bh) { 246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc; 247 iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
253 epos.offset = sizeof(struct allocExtDesc); 253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc; 254 epos.block = eloc;
255 epos.bh = udf_tread(sb, 255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, eloc, 0)); 256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen) 257 if (elen)
258 indirect_ext_len = 258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >> 259 (elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
261 else 261 else
262 indirect_ext_len = 1; 262 indirect_ext_len = 1;
263 } else { 263 } else {
264 extent_trunc(inode, &epos, eloc, etype, 264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0); 265 elen, 0);
266 epos.offset += adsize; 266 epos.offset += adsize;
267 } 267 }
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
269 269
270 if (indirect_ext_len) { 270 if (indirect_ext_len) {
271 BUG_ON(!epos.bh); 271 BUG_ON(!epos.bh);
272 udf_free_blocks(sb, inode, epos.block, 0, 272 udf_free_blocks(sb, inode, &epos.block, 0,
273 indirect_ext_len); 273 indirect_ext_len);
274 } else if (!epos.bh) { 274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc; 275 iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc); 278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) { 279 } else if (inode->i_size) {
280 if (byte_offset) { 280 if (byte_offset) {
281 kernel_long_ad extent; 281 struct kernel_long_ad extent;
282 282
283 /* 283 /*
284 * OK, there is not extent covering inode->i_size and 284 * OK, there is not extent covering inode->i_size and
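
extent_trunc() above computes first_block and last_block by rounding byte lengths up to whole filesystem blocks. The ceil-division idiom in isolation (in the kernel the shift count comes from sb->s_blocksize_bits):

#include <stdint.h>

/* (len + blocksize - 1) >> blocksize_bits, as used for last_block
 * and first_block in extent_trunc(). */
static uint64_t bytes_to_blocks(uint64_t len, unsigned blocksize_bits)
{
    uint64_t blocksize = 1ULL << blocksize_bits;
    return (len + blocksize - 1) >> blocksize_bits;
}

/* Example: with 2048-byte blocks (blocksize_bits == 11),
 * bytes_to_blocks(1, 11) == 1 and bytes_to_blocks(4096, 11) == 2,
 * so the blocks between first_block and last_block can be freed. */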
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
4struct udf_inode_info { 4struct udf_inode_info {
5 struct timespec i_crtime; 5 struct timespec i_crtime;
6 /* Physical address of inode */ 6 /* Physical address of inode */
7 kernel_lb_addr i_location; 7 struct kernel_lb_addr i_location;
8 __u64 i_unique; 8 __u64 i_unique;
9 __u32 i_lenEAttr; 9 __u32 i_lenEAttr;
10 __u32 i_lenAlloc; 10 __u32 i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
17 unsigned i_strat4096 : 1; 17 unsigned i_strat4096 : 1;
18 unsigned reserved : 26; 18 unsigned reserved : 26;
19 union { 19 union {
20 short_ad *i_sad; 20 struct short_ad *i_sad;
21 long_ad *i_lad; 21 struct long_ad *i_lad;
22 __u8 *i_data; 22 __u8 *i_data;
23 } i_ext; 23 } i_ext;
24 struct inode vfs_inode; 24 struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
30#define UDF_FLAG_GID_SET 16 30#define UDF_FLAG_GID_SET 16
31#define UDF_FLAG_SESSION_SET 17 31#define UDF_FLAG_SESSION_SET 17
32#define UDF_FLAG_LASTBLOCK_SET 18 32#define UDF_FLAG_LASTBLOCK_SET 18
33#define UDF_FLAG_BLOCKSIZE_SET 19
33 34
34#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 35#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
35#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 36#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
@@ -48,6 +49,8 @@
48#define UDF_SPARABLE_MAP15 0x1522U 49#define UDF_SPARABLE_MAP15 0x1522U
49#define UDF_METADATA_MAP25 0x2511U 50#define UDF_METADATA_MAP25 0x2511U
50 51
52#define UDF_INVALID_MODE ((mode_t)-1)
53
51#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ 54#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
52 55
53struct udf_meta_data { 56struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
114 117
115 /* Sector headers */ 118 /* Sector headers */
116 __s32 s_session; 119 __s32 s_session;
117 __u32 s_anchor[3]; 120 __u32 s_anchor;
118 __u32 s_last_block; 121 __u32 s_last_block;
119 122
120 struct buffer_head *s_lvid_bh; 123 struct buffer_head *s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
123 mode_t s_umask; 126 mode_t s_umask;
124 gid_t s_gid; 127 gid_t s_gid;
125 uid_t s_uid; 128 uid_t s_uid;
129 mode_t s_fmode;
130 mode_t s_dmode;
126 131
127 /* Root Info */ 132 /* Root Info */
128 struct timespec s_record_time; 133 struct timespec s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
143 struct inode *s_vat_inode; 148 struct inode *s_vat_inode;
144 149
145 struct mutex s_alloc_mutex; 150 struct mutex s_alloc_mutex;
151 /* Protected by s_alloc_mutex */
152 unsigned int s_lvid_dirty;
146}; 153};
147 154
148static inline struct udf_sb_info *UDF_SB(struct super_block *sb) 155static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
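
The new UDF_INVALID_MODE sentinel above marks "no fmode=/dmode= override given". A sketch of how such a sentinel is typically consumed when permissions are computed (the helper name is illustrative, not from the patch):

#include <sys/types.h>

#define UDF_INVALID_MODE ((mode_t)-1)

/* (mode_t)-1 cannot collide with a real mode, so it can flag
 * "keep the mode read from the media". */
static mode_t apply_mode_override(mode_t on_disk, mode_t override)
{
    return (override == UDF_INVALID_MODE) ? on_disk : override;
}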
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
62 return 0; 62 return 0;
63} 63}
64 64
65#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
66
67/* computes tag checksum */ 65/* computes tag checksum */
68u8 udf_tag_checksum(const tag *t); 66u8 udf_tag_checksum(const struct tag *t);
69 67
70struct dentry; 68struct dentry;
71struct inode; 69struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
95}; 93};
96 94
97struct generic_desc { 95struct generic_desc {
98 tag descTag; 96 struct tag descTag;
99 __le32 volDescSeqNum; 97 __le32 volDescSeqNum;
100}; 98};
101 99
@@ -108,11 +106,22 @@ struct ustr {
108struct extent_position { 106struct extent_position {
109 struct buffer_head *bh; 107 struct buffer_head *bh;
110 uint32_t offset; 108 uint32_t offset;
111 kernel_lb_addr block; 109 struct kernel_lb_addr block;
112}; 110};
113 111
114/* super.c */ 112/* super.c */
115extern void udf_warning(struct super_block *, const char *, const char *, ...); 113extern void udf_warning(struct super_block *, const char *, const char *, ...);
114static inline void udf_updated_lvid(struct super_block *sb)
115{
116 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
117
118 BUG_ON(!bh);
119 WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
120 bh->b_data)->integrityType !=
121 cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
122 sb->s_dirt = 1;
123 UDF_SB(sb)->s_lvid_dirty = 1;
124}
116 125
117/* namei.c */ 126/* namei.c */
118extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 127extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
124 unsigned long); 133 unsigned long);
125 134
126/* inode.c */ 135/* inode.c */
127extern struct inode *udf_iget(struct super_block *, kernel_lb_addr); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
128extern int udf_sync_inode(struct inode *); 137extern int udf_sync_inode(struct inode *);
129extern void udf_expand_file_adinicb(struct inode *, int, int *); 138extern void udf_expand_file_adinicb(struct inode *, int, int *);
130extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 139extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
136extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, int);
137extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
138extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
139 kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
140extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
141 kernel_lb_addr *, uint32_t *, sector_t *); 150 struct kernel_lb_addr *, uint32_t *, sector_t *);
142extern int8_t udf_add_aext(struct inode *, struct extent_position *, 151extern int8_t udf_add_aext(struct inode *, struct extent_position *,
143 kernel_lb_addr, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
144extern int8_t udf_write_aext(struct inode *, struct extent_position *, 153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
145 kernel_lb_addr, uint32_t, int); 154 struct kernel_lb_addr *, uint32_t, int);
146extern int8_t udf_delete_aext(struct inode *, struct extent_position, 155extern int8_t udf_delete_aext(struct inode *, struct extent_position,
147 kernel_lb_addr, uint32_t); 156 struct kernel_lb_addr, uint32_t);
148extern int8_t udf_next_aext(struct inode *, struct extent_position *, 157extern int8_t udf_next_aext(struct inode *, struct extent_position *,
149 kernel_lb_addr *, uint32_t *, int); 158 struct kernel_lb_addr *, uint32_t *, int);
150extern int8_t udf_current_aext(struct inode *, struct extent_position *, 159extern int8_t udf_current_aext(struct inode *, struct extent_position *,
151 kernel_lb_addr *, uint32_t *, int); 160 struct kernel_lb_addr *, uint32_t *, int);
152 161
153/* misc.c */ 162/* misc.c */
154extern struct buffer_head *udf_tgetblk(struct super_block *, int); 163extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
160extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, 169extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
161 uint32_t, uint16_t *); 170 uint32_t, uint16_t *);
162extern struct buffer_head *udf_read_ptagged(struct super_block *, 171extern struct buffer_head *udf_read_ptagged(struct super_block *,
163 kernel_lb_addr, uint32_t, 172 struct kernel_lb_addr *, uint32_t,
164 uint16_t *); 173 uint16_t *);
165extern void udf_update_tag(char *, int); 174extern void udf_update_tag(char *, int);
166extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); 175extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
182 uint32_t); 191 uint32_t);
183extern int udf_relocate_blocks(struct super_block *, long, long *); 192extern int udf_relocate_blocks(struct super_block *, long, long *);
184 193
194static inline uint32_t
195udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
196 uint32_t offset)
197{
198 return udf_get_pblock(sb, loc->logicalBlockNum,
199 loc->partitionReferenceNum, offset);
200}
201
185/* unicode.c */ 202/* unicode.c */
186extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); 203extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
187extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 204extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
200 217
201/* balloc.c */ 218/* balloc.c */
202extern void udf_free_blocks(struct super_block *, struct inode *, 219extern void udf_free_blocks(struct super_block *, struct inode *,
203 kernel_lb_addr, uint32_t, uint32_t); 220 struct kernel_lb_addr *, uint32_t, uint32_t);
204extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, 221extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
205 uint32_t, uint32_t); 222 uint32_t, uint32_t);
206extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
214 struct udf_fileident_bh *, 231 struct udf_fileident_bh *,
215 struct fileIdentDesc *, 232 struct fileIdentDesc *,
216 struct extent_position *, 233 struct extent_position *,
217 kernel_lb_addr *, uint32_t *, 234 struct kernel_lb_addr *, uint32_t *,
218 sector_t *); 235 sector_t *);
219extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, 236extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
220 int *offset); 237 int *offset);
221extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); 238extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
222extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); 239extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
223 240
224/* udftime.c */ 241/* udftime.c */
225extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, 242extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
226 timestamp src); 243 struct timestamp src);
227extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); 244extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
228 245
229#endif /* __UDF_DECL_H */ 246#endif /* __UDF_DECL_H */
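
One detail of the udfdecl.h hunk worth calling out: udf_get_lb_pblock() changes from a #define to a static inline, gaining type checking and a pointer argument. The same conversion in miniature (types and the stub mapping are illustrative):

#include <stdint.h>

struct lb { uint32_t block, part; };

/* Stub: the real udf_get_pblock() consults the partition map. */
static uint32_t get_pblock(uint32_t block, uint32_t part, uint32_t offset)
{
    (void)part;
    return block + offset;
}

/* Before: a macro, arguments unchecked and expanded textually. */
#define GET_LB_PBLOCK(loc, offset) \
    get_pblock((loc).block, (loc).part, (offset))

/* After: a static inline taking a pointer, so the compiler checks the
 * argument type and callers pass &loc like the rest of the new API. */
static inline uint32_t get_lb_pblock(struct lb *loc, uint32_t offset)
{
    return get_pblock(loc->block, loc->part, offset);
}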
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
4#include <asm/byteorder.h> 4#include <asm/byteorder.h>
5#include <linux/string.h> 5#include <linux/string.h>
6 6
7static inline kernel_lb_addr lelb_to_cpu(lb_addr in) 7static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
8{ 8{
9 kernel_lb_addr out; 9 struct kernel_lb_addr out;
10 10
11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); 11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); 12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
14 return out; 14 return out;
15} 15}
16 16
17static inline lb_addr cpu_to_lelb(kernel_lb_addr in) 17static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
18{ 18{
19 lb_addr out; 19 struct lb_addr out;
20 20
21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); 21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); 22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
24 return out; 24 return out;
25} 25}
26 26
27static inline short_ad lesa_to_cpu(short_ad in) 27static inline struct short_ad lesa_to_cpu(struct short_ad in)
28{ 28{
29 short_ad out; 29 struct short_ad out;
30 30
31 out.extLength = le32_to_cpu(in.extLength); 31 out.extLength = le32_to_cpu(in.extLength);
32 out.extPosition = le32_to_cpu(in.extPosition); 32 out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
34 return out; 34 return out;
35} 35}
36 36
37static inline short_ad cpu_to_lesa(short_ad in) 37static inline struct short_ad cpu_to_lesa(struct short_ad in)
38{ 38{
39 short_ad out; 39 struct short_ad out;
40 40
41 out.extLength = cpu_to_le32(in.extLength); 41 out.extLength = cpu_to_le32(in.extLength);
42 out.extPosition = cpu_to_le32(in.extPosition); 42 out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
44 return out; 44 return out;
45} 45}
46 46
47static inline kernel_long_ad lela_to_cpu(long_ad in) 47static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
48{ 48{
49 kernel_long_ad out; 49 struct kernel_long_ad out;
50 50
51 out.extLength = le32_to_cpu(in.extLength); 51 out.extLength = le32_to_cpu(in.extLength);
52 out.extLocation = lelb_to_cpu(in.extLocation); 52 out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
54 return out; 54 return out;
55} 55}
56 56
57static inline long_ad cpu_to_lela(kernel_long_ad in) 57static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
58{ 58{
59 long_ad out; 59 struct long_ad out;
60 60
61 out.extLength = cpu_to_le32(in.extLength); 61 out.extLength = cpu_to_le32(in.extLength);
62 out.extLocation = cpu_to_lelb(in.extLocation); 62 out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
64 return out; 64 return out;
65} 65}
66 66
67static inline kernel_extent_ad leea_to_cpu(extent_ad in) 67static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
68{ 68{
69 kernel_extent_ad out; 69 struct kernel_extent_ad out;
70 70
71 out.extLength = le32_to_cpu(in.extLength); 71 out.extLength = le32_to_cpu(in.extLength);
72 out.extLocation = le32_to_cpu(in.extLocation); 72 out.extLocation = le32_to_cpu(in.extLocation);
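
The udfend.h helpers above convert little-endian on-disk structures field by field. Outside the kernel the equivalent is explicit byte assembly, which stays correct on any host endianness; a sketch:

#include <stdint.h>

/* Userspace equivalents of le32_to_cpu()/le16_to_cpu(). */
static uint32_t get_le32(const uint8_t *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static uint16_t get_le16(const uint8_t *p)
{
    return (uint16_t)(p[0] | (p[1] << 8));
}

struct lb_sketch { uint32_t logicalBlockNum; uint16_t partitionReferenceNum; };

/* lelb_to_cpu() over a raw 6-byte on-disk lb_addr. */
static struct lb_sketch lelb_from_disk(const uint8_t *raw)
{
    struct lb_sketch out;
    out.logicalBlockNum = get_le32(raw);
    out.partitionReferenceNum = get_le16(raw + 4);
    return out;
}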
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
85#define SECS_PER_HOUR (60 * 60) 85#define SECS_PER_HOUR (60 * 60)
86#define SECS_PER_DAY (SECS_PER_HOUR * 24) 86#define SECS_PER_DAY (SECS_PER_HOUR * 24)
87 87
88struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) 88struct timespec *
89udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
89{ 90{
90 int yday; 91 int yday;
91 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); 92 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
116 return dest; 117 return dest;
117} 118}
118 119
119timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) 120struct timestamp *
121udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
120{ 122{
121 long int days, rem, y; 123 long int days, rem, y;
122 const unsigned short int *ip; 124 const unsigned short int *ip;
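
udf_disk_stamp_to_time() begins by pulling typeAndTimezone out of the on-disk timestamp. In ECMA-167 that field packs a 4-bit type with a signed 12-bit timezone offset in minutes, so decoding needs a sign extension; a sketch (the -2047 "not specified" convention is an assumption from the spec, not shown in this hunk):

/* Bits 15..12: type; bits 11..0: signed UTC offset in minutes
 * (-2047 conventionally means "offset not specified"). */
static int udf_tz_minutes(unsigned typeAndTimezone)
{
    int tz = typeAndTimezone & 0x0FFF;
    if (tz & 0x0800)
        tz -= 0x1000;   /* sign-extend the 12-bit field */
    return tz;
}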
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
254{ 254{
255 const uint8_t *ocu; 255 const uint8_t *ocu;
256 uint8_t cmp_id, ocu_len; 256 uint8_t cmp_id, ocu_len;
257 int i; 257 int i, len;
258 258
259 259
260 ocu_len = ocu_i->u_len; 260 ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
279 if (cmp_id == 16) 279 if (cmp_id == 16)
280 c = (c << 8) | ocu[i++]; 280 c = (c << 8) | ocu[i++];
281 281
282 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 282 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
283 UDF_NAME_LEN - utf_o->u_len); 283 UDF_NAME_LEN - utf_o->u_len);
284 /* Valid character? */
285 if (len >= 0)
286 utf_o->u_len += len;
287 else
288 utf_o->u_name[utf_o->u_len++] = '?';
284 } 289 }
285 utf_o->u_cmpID = 8; 290 utf_o->u_cmpID = 8;
286 291
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
290static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 295static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
291 int length) 296 int length)
292{ 297{
293 unsigned len, i, max_val; 298 int len;
299 unsigned i, max_val;
294 uint16_t uni_char; 300 uint16_t uni_char;
295 int u_len; 301 int u_len;
296 302
@@ -302,8 +308,13 @@ try_again:
302 u_len = 0U; 308 u_len = 0U;
303 for (i = 0U; i < uni->u_len; i++) { 309 for (i = 0U; i < uni->u_len; i++) {
304 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 310 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
305 if (len <= 0) 311 if (!len)
306 continue; 312 continue;
313 /* Invalid character, deal with it */
314 if (len < 0) {
315 len = 1;
316 uni_char = '?';
317 }
307 318
308 if (uni_char > max_val) { 319 if (uni_char > max_val) {
309 max_val = 0xffffU; 320 max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
324int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, 335int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
325 int flen) 336 int flen)
326{ 337{
327 struct ustr filename, unifilename; 338 struct ustr *filename, *unifilename;
328 int len; 339 int len = 0;
329 340
330 if (udf_build_ustr_exact(&unifilename, sname, flen)) 341 filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
342 if (!filename)
331 return 0; 343 return 0;
332 344
345 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
346 if (!unifilename)
347 goto out1;
348
349 if (udf_build_ustr_exact(unifilename, sname, flen))
350 goto out2;
351
333 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 352 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
334 if (!udf_CS0toUTF8(&filename, &unifilename)) { 353 if (!udf_CS0toUTF8(filename, unifilename)) {
335 udf_debug("Failed in udf_get_filename: sname = %s\n", 354 udf_debug("Failed in udf_get_filename: sname = %s\n",
336 sname); 355 sname);
337 return 0; 356 goto out2;
338 } 357 }
339 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 358 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
340 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, 359 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
341 &unifilename)) { 360 unifilename)) {
342 udf_debug("Failed in udf_get_filename: sname = %s\n", 361 udf_debug("Failed in udf_get_filename: sname = %s\n",
343 sname); 362 sname);
344 return 0; 363 goto out2;
345 } 364 }
346 } else 365 } else
347 return 0; 366 goto out2;
348 367
349 len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 368 len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
350 unifilename.u_name, unifilename.u_len); 369 unifilename->u_name, unifilename->u_len);
351 if (len) 370out2:
352 return len; 371 kfree(unifilename);
353 372out1:
354 return 0; 373 kfree(filename);
374 return len;
355} 375}
356 376
357int udf_put_filename(struct super_block *sb, const uint8_t *sname, 377int udf_put_filename(struct super_block *sb, const uint8_t *sname,
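
The unicode.c fix above makes both conversion loops tolerate characters the NLS table cannot map: a negative return is no longer added to the output length, and '?' is emitted instead. The same pattern standalone (convert() stands in for nls->uni2char()):

#include <stddef.h>

/* Stand-in for nls->uni2char(): bytes written, or -1 if the character
 * has no mapping in the target charset. */
static int convert(unsigned c, char *out, size_t room)
{
    if (room < 1 || c > 0x7F)
        return -1;      /* toy table: ASCII only */
    *out = (char)c;
    return 1;
}

static size_t to_charset(const unsigned *uni, size_t n, char *out, size_t room)
{
    size_t i, len = 0;

    for (i = 0; i < n && len < room; i++) {
        int ret = convert(uni[i], out + len, room - len);
        if (ret >= 0)
            len += (size_t)ret;         /* valid character */
        else
            out[len++] = '?';           /* invalid character, deal with it */
    }
    return len;
}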
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e1c1fc5ee239..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1268 struct ufs_super_block_first *usb1; 1268 struct ufs_super_block_first *usb1;
1269 struct ufs_super_block_second *usb2; 1269 struct ufs_super_block_second *usb2;
1270 struct ufs_super_block_third *usb3; 1270 struct ufs_super_block_third *usb3;
1271 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1271 1272
1272 lock_kernel(); 1273 lock_kernel();
1273 1274
@@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1290 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; 1291 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
1291 buf->f_files = uspi->s_ncg * uspi->s_ipg; 1292 buf->f_files = uspi->s_ncg * uspi->s_ipg;
1292 buf->f_namelen = UFS_MAXNAMLEN; 1293 buf->f_namelen = UFS_MAXNAMLEN;
1294 buf->f_fsid.val[0] = (u32)id;
1295 buf->f_fsid.val[1] = (u32)(id >> 32);
1293 1296
1294 unlock_kernel(); 1297 unlock_kernel();
1295 1298
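
Both the UDF and UFS statfs hunks in this series fill f_fsid the same way, by splitting the 64-bit encoded device number into two 32-bit halves:

#include <stdint.h>

struct fsid_sketch { uint32_t val[2]; };

/* As in the statfs hunks above: id comes from huge_encode_dev() on the
 * backing block device. */
static struct fsid_sketch fsid_from_dev(uint64_t id)
{
    struct fsid_sketch f;
    f.val[0] = (uint32_t)id;
    f.val[1] = (uint32_t)(id >> 32);
    return f;
}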
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c3dc491fff89..60f107e47fe9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
33 xfs_qm_syscalls.o \ 33 xfs_qm_syscalls.o \
34 xfs_qm_bhv.o \ 34 xfs_qm_bhv.o \
35 xfs_qm.o) 35 xfs_qm.o)
36xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36 37
37ifeq ($(CONFIG_XFS_QUOTA),y) 38ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o 39xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644
index 2a88d56c4dc2..000000000000
--- a/fs/xfs/linux-2.6/mutex.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_MUTEX_H__
19#define __XFS_SUPPORT_MUTEX_H__
20
21#include <linux/mutex.h>
22
23typedef struct mutex mutex_t;
24
25#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198f771e..c13f67300fe7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
1623 .bmap = xfs_vm_bmap, 1623 .bmap = xfs_vm_bmap,
1624 .direct_IO = xfs_vm_direct_IO, 1624 .direct_IO = xfs_vm_direct_IO,
1625 .migratepage = buffer_migrate_page, 1625 .migratepage = buffer_migrate_page,
1626 .is_partially_uptodate = block_is_partially_uptodate,
1626}; 1627};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4bd112313f33..d0b499418a7d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -34,6 +34,7 @@
34#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_ioctl.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_rtalloc.h" 40#include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
78 int hsize; 79 int hsize;
79 xfs_handle_t handle; 80 xfs_handle_t handle;
80 struct inode *inode; 81 struct inode *inode;
82 struct file *file = NULL;
83 struct path path;
84 int error;
85 struct xfs_inode *ip;
81 86
82 memset((char *)&handle, 0, sizeof(handle)); 87 if (cmd == XFS_IOC_FD_TO_HANDLE) {
83 88 file = fget(hreq->fd);
84 switch (cmd) { 89 if (!file)
85 case XFS_IOC_PATH_TO_FSHANDLE: 90 return -EBADF;
86 case XFS_IOC_PATH_TO_HANDLE: { 91 inode = file->f_path.dentry->d_inode;
87 struct path path; 92 } else {
88 int error = user_lpath((const char __user *)hreq->path, &path); 93 error = user_lpath((const char __user *)hreq->path, &path);
89 if (error) 94 if (error)
90 return error; 95 return error;
91 96 inode = path.dentry->d_inode;
92 ASSERT(path.dentry);
93 ASSERT(path.dentry->d_inode);
94 inode = igrab(path.dentry->d_inode);
95 path_put(&path);
96 break;
97 } 97 }
98 ip = XFS_I(inode);
98 99
99 case XFS_IOC_FD_TO_HANDLE: { 100 /*
100 struct file *file; 101 * We can only generate handles for inodes residing on a XFS filesystem,
101 102 * and only for regular files, directories or symbolic links.
102 file = fget(hreq->fd); 103 */
103 if (!file) 104 error = -EINVAL;
104 return -EBADF; 105 if (inode->i_sb->s_magic != XFS_SB_MAGIC)
106 goto out_put;
105 107
106 ASSERT(file->f_path.dentry); 108 error = -EBADF;
107 ASSERT(file->f_path.dentry->d_inode); 109 if (!S_ISREG(inode->i_mode) &&
108 inode = igrab(file->f_path.dentry->d_inode); 110 !S_ISDIR(inode->i_mode) &&
109 fput(file); 111 !S_ISLNK(inode->i_mode))
110 break; 112 goto out_put;
111 }
112 113
113 default:
114 ASSERT(0);
115 return -XFS_ERROR(EINVAL);
116 }
117 114
118 if (inode->i_sb->s_magic != XFS_SB_MAGIC) { 115 memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
119 /* we're not in XFS anymore, Toto */
120 iput(inode);
121 return -XFS_ERROR(EINVAL);
122 }
123 116
124 switch (inode->i_mode & S_IFMT) { 117 if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
125 case S_IFREG: 118 /*
126 case S_IFDIR: 119 * This handle only contains an fsid, zero the rest.
127 case S_IFLNK: 120 */
128 break; 121 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
129 default: 122 hsize = sizeof(xfs_fsid_t);
130 iput(inode); 123 } else {
131 return -XFS_ERROR(EBADF);
132 }
133
134 /* now we can grab the fsid */
135 memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
136 sizeof(xfs_fsid_t));
137 hsize = sizeof(xfs_fsid_t);
138
139 if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
140 xfs_inode_t *ip = XFS_I(inode);
141 int lock_mode; 124 int lock_mode;
142 125
143 /* need to get access to the xfs_inode to read the generation */
144 lock_mode = xfs_ilock_map_shared(ip); 126 lock_mode = xfs_ilock_map_shared(ip);
145
146 /* fill in fid section of handle from inode */
147 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 127 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
148 sizeof(handle.ha_fid.fid_len); 128 sizeof(handle.ha_fid.fid_len);
149 handle.ha_fid.fid_pad = 0; 129 handle.ha_fid.fid_pad = 0;
150 handle.ha_fid.fid_gen = ip->i_d.di_gen; 130 handle.ha_fid.fid_gen = ip->i_d.di_gen;
151 handle.ha_fid.fid_ino = ip->i_ino; 131 handle.ha_fid.fid_ino = ip->i_ino;
152
153 xfs_iunlock_map_shared(ip, lock_mode); 132 xfs_iunlock_map_shared(ip, lock_mode);
154 133
155 hsize = XFS_HSIZE(handle); 134 hsize = XFS_HSIZE(handle);
156 } 135 }
157 136
158 /* now copy our handle into the user buffer & write out the size */ 137 error = -EFAULT;
159 if (copy_to_user(hreq->ohandle, &handle, hsize) || 138 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
160 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { 139 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
161 iput(inode); 140 goto out_put;
162 return -XFS_ERROR(EFAULT);
163 }
164 141
165 iput(inode); 142 error = 0;
166 return 0; 143
144 out_put:
145 if (cmd == XFS_IOC_FD_TO_HANDLE)
146 fput(file);
147 else
148 path_put(&path);
149 return error;
167} 150}
168 151
169/* 152/*
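
The xfs_find_handle() rewrite above replaces per-branch iput()/return pairs with one out_put label that drops whichever reference the function took. The cleanup shape in stub form (all names here are illustrative, not XFS APIs):

#include <errno.h>
#include <string.h>

struct handle_sketch { char fsid[8]; char fid[24]; };

static int acquire(int from_fd, void **res) { (void)from_fd; *res = (void *)""; return 0; }
static void release_fd(void *res) { (void)res; }
static void release_path(void *res) { (void)res; }

static int find_handle(int use_fd, struct handle_sketch *out)
{
    void *res;
    int error;

    error = acquire(use_fd, &res);      /* fget() or user_lpath() */
    if (error)
        return error;

    error = -EINVAL;
    if (!out)                           /* all checks funnel to out_put */
        goto out_put;

    memset(out, 0, sizeof(*out));
    error = 0;

out_put:
    /* One exit point releases whichever reference was taken. */
    if (use_fd)
        release_fd(res);
    else
        release_path(res);
    return error;
}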
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67f..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -211,8 +211,13 @@ xfs_vn_mknod(
 	 * Irix uses Missed'em'V split, but doesn't want to see
 	 * the upper 5 bits of (14bit) major.
 	 */
-	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
-		return -EINVAL;
+	if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+			return -EINVAL;
+		rdev = sysv_encode_dev(rdev);
+	} else {
+		rdev = 0;
+	}
 
 	if (test_default_acl && test_default_acl(dir)) {
 		if (!_ACL_ALLOC(default_acl)) {
@@ -224,28 +229,11 @@ xfs_vn_mknod(
 		}
 	}
 
-	xfs_dentry_to_name(&name, dentry);
-
 	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
-
-	switch (mode & S_IFMT) {
-	case S_IFCHR:
-	case S_IFBLK:
-	case S_IFIFO:
-	case S_IFSOCK:
-		rdev = sysv_encode_dev(rdev);
-	case S_IFREG:
-		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
-		break;
-	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
-		break;
-	default:
-		error = EINVAL;
-		break;
-	}
+		mode &= ~current_umask();
 
+	xfs_dentry_to_name(&name, dentry);
+	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -416,7 +404,7 @@ xfs_vn_symlink(
 	mode_t		mode;
 
 	mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
@@ -553,9 +541,6 @@ xfs_vn_getattr(
 	stat->uid = ip->i_d.di_uid;
 	stat->gid = ip->i_d.di_gid;
 	stat->ino = ip->i_ino;
-#if XFS_BIG_INUMS
-	stat->ino += mp->m_inoadd;
-#endif
 	stat->atime = inode->i_atime;
 	stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 	stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 507492d6dccd..f65a53f8752f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -38,7 +38,6 @@
 #include <kmem.h>
 #include <mrlock.h>
 #include <sv.h>
-#include <mutex.h>
 #include <time.h>
 
 #include <support/ktrace.h>
@@ -51,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/file.h>
 #include <linux/swap.h>
 #include <linux/errno.h>
@@ -147,17 +147,6 @@
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
 
-/*
- * IRIX (BSD) quotactl makes use of separate commands for user/group,
- * whereas on Linux the syscall encodes this information into the cmd
- * field (see the QCMD macro in quota.h). These macros help keep the
- * code portable - they are not visible from the syscall interface.
- */
-#define Q_XSETGQLIM	XQM_CMD(8)	/* set groups disk limits */
-#define Q_XGETGQUOTA	XQM_CMD(9)	/* get groups disk limits */
-#define Q_XSETPQLIM	XQM_CMD(10)	/* set projects disk limits */
-#define Q_XGETPQUOTA	XQM_CMD(11)	/* get projects disk limits */
-
 #define dfltprid	0
 #define MAXPATHLEN	1024
 
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 000000000000..94d9a633d3d9
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_dmapi.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "quota/xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return XFS_DQ_USER;
+	case GRPQUOTA:
+		return XFS_DQ_GROUP;
+	default:
+		return XFS_DQ_PROJ;
+	}
+}
+
+STATIC int
+xfs_fs_quota_sync(
+	struct super_block	*sb,
+	int			type)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	return -xfs_sync_inodes(mp, SYNC_DELWRI);
+}
+
+STATIC int
+xfs_fs_get_xstate(
+	struct super_block	*sb,
+	struct fs_quota_stat	*fqs)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+	struct super_block	*sb,
+	unsigned int		uflags,
+	int			op)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+	unsigned int		flags = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (uflags & XFS_QUOTA_UDQ_ACCT)
+		flags |= XFS_UQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_PDQ_ACCT)
+		flags |= XFS_PQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_GDQ_ACCT)
+		flags |= XFS_GQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_UDQ_ENFD)
+		flags |= XFS_UQUOTA_ENFD;
+	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+		flags |= XFS_OQUOTA_ENFD;
+
+	switch (op) {
+	case Q_XQUOTAON:
+		return -xfs_qm_scall_quotaon(mp, flags);
+	case Q_XQUOTAOFF:
+		if (!XFS_IS_QUOTA_ON(mp))
+			return -EINVAL;
+		return -xfs_qm_scall_quotaoff(mp, flags);
+	case Q_XQUOTARM:
+		if (XFS_IS_QUOTA_ON(mp))
+			return -EINVAL;
+		return -xfs_qm_scall_trunc_qfiles(mp, flags);
+	}
+
+	return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_xquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+
+	return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_xquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+struct quotactl_ops xfs_quotactl_operations = {
+	.quota_sync		= xfs_fs_quota_sync,
+	.get_xstate		= xfs_fs_get_xstate,
+	.set_xstate		= xfs_fs_set_xstate,
+	.get_xquota		= xfs_fs_get_xquota,
+	.set_xquota		= xfs_fs_set_xquota,
+};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 32ae5028e96b..bb685269f832 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -68,7 +68,6 @@
 #include <linux/freezer.h>
 #include <linux/parser.h>
 
-static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_RTDEV	"rtdev"		/* realtime I/O device */
 #define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
 #define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
-#define MNTOPT_INO64	"ino64"		/* force inodes into 64-bit range */
 #define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
 #define MNTOPT_SWALLOC	"swalloc"	/* turn on stripe width allocation */
 #define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
 	int			dswidth = 0;
 	int			iosize = 0;
 	int			dmapi_implies_ikeep = 1;
-	uchar_t			iosizelog = 0;
+	__uint8_t		iosizelog = 0;
 
 	/*
 	 * Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
 		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
 			mp->m_flags |= XFS_MOUNT_NORECOVERY;
-		} else if (!strcmp(this_char, MNTOPT_INO64)) {
-#if XFS_BIG_INUMS
-			mp->m_flags |= XFS_MOUNT_INO64;
-			mp->m_inoadd = XFS_INO64_OFFSET;
-#else
-			cmn_err(CE_WARN,
-				"XFS: %s option not allowed on this system",
-				this_char);
-			return EINVAL;
-#endif
 		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
 			mp->m_flags |= XFS_MOUNT_NOALIGN;
 		} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
 		/* the few simple ones we can get from the mount struct */
 		{ XFS_MOUNT_IKEEP,		"," MNTOPT_IKEEP },
 		{ XFS_MOUNT_WSYNC,		"," MNTOPT_WSYNC },
-		{ XFS_MOUNT_INO64,		"," MNTOPT_INO64 },
 		{ XFS_MOUNT_NOALIGN,		"," MNTOPT_NOALIGN },
 		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
 		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
 	return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
-int
+STATIC int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
 	const char		*name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
 	return -error;
 }
 
-void
+STATIC void
 xfs_blkdev_put(
 	struct block_device	*bdev)
 {
@@ -872,7 +859,7 @@ xfsaild_wakeup(
 	wake_up_process(ailp->xa_task);
 }
 
-int
+STATIC int
 xfsaild(
 	void	*data)
 {
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
 	int			sync)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
 	int			error = 0;
-	int			flags = 0;
 
 	xfs_itrace_entry(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
 	if (sync) {
 		error = xfs_wait_on_pages(ip, 0, -1);
 		if (error)
-			goto out_error;
-		flags |= FLUSH_SYNC;
+			goto out;
 	}
-	error = xfs_inode_flush(ip, flags);
 
-out_error:
+	/*
+	 * Bypass inodes which have already been cleaned by
+	 * the inode flush clustering code inside xfs_iflush
+	 */
+	if (xfs_inode_clean(ip))
+		goto out;
+
+	/*
+	 * We make this non-blocking if the inode is contended, return
+	 * EAGAIN to indicate to the caller that they did not succeed.
+	 * This prevents the flush path from blocking on inodes inside
+	 * another operation right now, they get caught later by xfs_sync.
+	 */
+	if (sync) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		xfs_iflock(ip);
+
+		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+	} else {
+		error = EAGAIN;
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+			goto out;
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+			goto out_unlock;
+
+		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+	}
+
+ out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
 	/*
 	 * if we failed to write out the inode then mark
 	 * it dirty again so we'll try again later.
 	 */
 	if (error)
 		xfs_mark_inode_dirty_sync(ip);
-
 	return -error;
 }
 
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
 	statp->f_bfree = statp->f_bavail =
 				sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 	fakeinos = statp->f_bfree << sbp->sb_inopblog;
-#if XFS_BIG_INUMS
-	fakeinos += mp->m_inoadd;
-#endif
 	statp->f_files =
 	    MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
 	if (mp->m_maxicount)
-#if XFS_BIG_INUMS
-		if (!mp->m_inoadd)
-#endif
-			statp->f_files = min_t(typeof(statp->f_files),
-					       statp->f_files,
-					       mp->m_maxicount);
+		statp->f_files = min_t(typeof(statp->f_files),
+				       statp->f_files,
+				       mp->m_maxicount);
 	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	spin_unlock(&mp->m_sb_lock);
 
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
 	return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
 }
 
-STATIC int
-xfs_fs_quotasync(
-	struct super_block	*sb,
-	int			type)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
-}
-
-STATIC int
-xfs_fs_getxstate(
-	struct super_block	*sb,
-	struct fs_quota_stat	*fqs)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
-}
-
-STATIC int
-xfs_fs_setxstate(
-	struct super_block	*sb,
-	unsigned int		flags,
-	int			op)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
-}
-
-STATIC int
-xfs_fs_getxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb),
-				 (type == USRQUOTA) ? Q_XGETQUOTA :
-				  ((type == GRPQUOTA) ? Q_XGETGQUOTA :
-					Q_XGETPQUOTA), id, (caddr_t)fdq);
-}
-
-STATIC int
-xfs_fs_setxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb),
-				 (type == USRQUOTA) ? Q_XSETQLIM :
-				  ((type == GRPQUOTA) ? Q_XSETGQLIM :
-					Q_XSETPQLIM), id, (caddr_t)fdq);
-}
-
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
 	sb_min_blocksize(sb, BBSIZE);
 	sb->s_xattr = xfs_xattr_handlers;
 	sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
 	sb->s_qcop = &xfs_quotactl_operations;
+#endif
 	sb->s_op = &xfs_super_operations;
 
 	error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
 	.show_options		= xfs_fs_show_options,
 };
 
-static struct quotactl_ops xfs_quotactl_operations = {
-	.quota_sync		= xfs_fs_quotasync,
-	.get_xstate		= xfs_fs_getxstate,
-	.set_xstate		= xfs_fs_setxstate,
-	.get_xquota		= xfs_fs_getxquota,
-	.set_xquota		= xfs_fs_setxquota,
-};
-
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index d5d776d4cd67..5a2ea3a21781 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
 extern struct xattr_handler *xfs_xattr_handlers[];
+extern struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1efe1f6..04f058c848ae 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -19,6 +19,7 @@
 #define XFS_SYNC_H 1
 
 struct xfs_mount;
+struct xfs_perag;
 
 typedef struct bhv_vfs_sync_work {
 	struct list_head	w_list;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f65983a230d3..ad7fbead4c97 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -41,11 +41,6 @@ struct attrlist_cursor_kern;
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for xfs_inode_flush
- */
-#define FLUSH_SYNC		1	/* wait for flush to complete */
-
-/*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
 #define FI_NONE			0	/* none */
@@ -55,33 +50,6 @@ struct attrlist_cursor_kern;
 	the operation completes. */
 
 /*
- * Dealing with bad inodes
- */
-static inline int VN_BAD(struct inode *vp)
-{
-	return is_bad_inode(vp);
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
-{
-	bs_atime->tv_sec = vp->i_atime.tv_sec;
-	bs_atime->tv_nsec = vp->i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
-{
-	*ts = vp->i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
-{
-	*tt = vp->i_atime.tv_sec;
-}
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 6543c0b29753..e4babcc63423 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
 	uint			flist_locked;
 	xfs_dquot_t		*d;
 
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	ASSERT(mutex_is_locked(&qh->qh_lock));
 
 	flist_locked = B_FALSE;
 
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
 			/*
 			 * move the dquot to the front of the hashchain
 			 */
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			ASSERT(mutex_is_locked(&qh->qh_lock));
 			if (dqp->HL_PREVP != &qh->qh_next) {
 				xfs_dqtrace_entry(dqp,
 						  "DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
 			}
 			xfs_dqtrace_entry(dqp, "LOOKUP END");
 			*O_dqpp = dqp;
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			ASSERT(mutex_is_locked(&qh->qh_lock));
 			return (0);
 		}
 	}
 
 	*O_dqpp = NULL;
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	ASSERT(mutex_is_locked(&qh->qh_lock));
 	return (1);
 }
 
@@ -956,7 +956,7 @@ xfs_qm_dqget(
 		ASSERT(ip->i_gdquot == NULL);
 	}
 #endif
-	XFS_DQ_HASH_LOCK(h);
+	mutex_lock(&h->qh_lock);
 
 	/*
 	 * Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
 		 */
 		ASSERT(*O_dqpp);
 		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		XFS_DQ_HASH_UNLOCK(h);
+		mutex_unlock(&h->qh_lock);
 		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
 		return (0);	/* success */
 	}
@@ -991,7 +991,7 @@ xfs_qm_dqget(
 	 * we don't keep the lock across a disk read
 	 */
 	version = h->qh_version;
-	XFS_DQ_HASH_UNLOCK(h);
+	mutex_unlock(&h->qh_lock);
 
 	/*
 	 * Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
 	/*
 	 * Hashlock comes after ilock in lock order
 	 */
-	XFS_DQ_HASH_LOCK(h);
+	mutex_lock(&h->qh_lock);
 	if (version != h->qh_version) {
 		xfs_dquot_t *tmpdqp;
 		/*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
 			 * and start over.
 			 */
 			xfs_qm_dqput(tmpdqp);
-			XFS_DQ_HASH_UNLOCK(h);
+			mutex_unlock(&h->qh_lock);
 			xfs_qm_dqdestroy(dqp);
 			XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
 			goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
 	 * Put the dquot at the beginning of the hash-chain and mp's list
 	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
 	 */
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+	ASSERT(mutex_is_locked(&h->qh_lock));
 	dqp->q_hash = h;
 	XQM_HASHLIST_INSERT(h, dqp);
 
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
 	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
 
 	xfs_qm_mplist_unlock(mp);
-	XFS_DQ_HASH_UNLOCK(h);
+	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	xfs_dqtrace_entry(dqp, "DQGET DONE");
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
 	xfs_mount_t	*mp = dqp->q_mount;
 
 	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+	ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
 	xfs_dqlock(dqp);
 	/*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
 	 */
 	if (dqp->q_nrefs != 0) {
 		xfs_dqunlock(dqp);
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		mutex_unlock(&dqp->q_hash->qh_lock);
 		return (1);
 	}
 
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
 	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
-	XFS_DQ_HASH_UNLOCK(thishash);
+	mutex_unlock(&thishash->qh_lock);
 	return (0);
 }
 
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index d443e93b4331..de0f402ddb4c 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -34,7 +34,7 @@
  */
 typedef struct xfs_dqhash {
 	struct xfs_dquot *qh_next;
-	mutex_t		  qh_lock;
+	struct mutex	  qh_lock;
 	uint		  qh_version;	/* ever increasing version */
 	uint		  qh_nelems;	/* number of dquots on the list */
 } xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
 	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
 	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
 	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
-	mutex_t		 q_qlock;	/* quota lock */
+	struct mutex	 q_qlock;	/* quota lock */
 	struct completion q_flush;	/* flush completion queue */
 	atomic_t	 q_pincount;	/* dquot pin count */
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
 
 #define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
 
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
-	if (mutex_trylock(&dqp->q_qlock)) {
-		mutex_unlock(&dqp->q_qlock);
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-
 /*
  * Manage the q_flush completion queue embedded in the dquot. This completion
  * queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 	complete(&dqp->q_flush);
 }
 
+#define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7a2beb64314f..5b6695049e00 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,7 +55,7 @@
  * quota functionality, including maintaining the freelist and hash
  * tables of dquots.
  */
-mutex_t		xfs_Gqm_lock;
+struct mutex	xfs_Gqm_lock;
 struct xfs_qm	*xfs_Gqm;
 uint		ndquot;
 
@@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
 STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int	xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int	xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
 
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
 };
 
 #ifdef DEBUG
-extern mutex_t	qcheck_lock;
+extern struct mutex	qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
 	 * the structure could disappear between the entry to this routine and
 	 * a HOLD operation if not locked.
	 */
-	XFS_QM_LOCK(xfs_Gqm);
+	mutex_lock(&xfs_Gqm_lock);
 
 	if (xfs_Gqm == NULL)
 		xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
 	 * debugging and statistical purposes, but ...
 	 * Just take a reference and get out.
 	 */
-	XFS_QM_HOLD(xfs_Gqm);
-	XFS_QM_UNLOCK(xfs_Gqm);
+	xfs_Gqm->qm_nrefs++;
+	mutex_unlock(&xfs_Gqm_lock);
 
 	return 0;
 }
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
 	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
 	 * be restarted.
 	 */
-	XFS_QM_LOCK(xfs_Gqm);
-	XFS_QM_RELE(xfs_Gqm);
-	if (xfs_Gqm->qm_nrefs == 0) {
+	mutex_lock(&xfs_Gqm_lock);
+	if (--xfs_Gqm->qm_nrefs == 0) {
 		xfs_qm_destroy(xfs_Gqm);
 		xfs_Gqm = NULL;
 	}
-	XFS_QM_UNLOCK(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm_lock);
 }
 
 /*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
 			continue;
 		}
 
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
 			nrecl = XFS_QI_MPLRECLAIMS(mp);
 			xfs_qm_mplist_unlock(mp);
-			XFS_DQ_HASH_LOCK(dqp->q_hash);
+			mutex_lock(&dqp->q_hash->qh_lock);
 			xfs_qm_mplist_lock(mp);
 
 			/*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
 			 * this point, but somebody might be taking things off.
 			 */
 			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
-				XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+				mutex_unlock(&dqp->q_hash->qh_lock);
 				goto again;
 			}
 		}
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
 	xfs_dqid_t	id,
 	uint		type,
 	uint		doalloc,
-	uint		dolock,
 	xfs_dquot_t	*udqhint, /* hint */
 	xfs_dquot_t	**IO_idqpp)
 {
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	error = 0;
+
 	/*
 	 * See if we already have it in the inode itself. IO_idqpp is
 	 * &i_udquot or &i_gdquot. This made the code look weird, but
 	 * made the logic a lot simpler.
 	 */
-	if ((dqp = *IO_idqpp)) {
-		if (dolock)
-			xfs_dqlock(dqp);
+	dqp = *IO_idqpp;
+	if (dqp) {
 		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
-		goto done;
+		return 0;
 	}
 
 	/*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
 	 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
 	 * the user dquot.
 	 */
-	ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-	if (udqhint && !dolock)
-		xfs_dqlock(udqhint);
-
-	/*
-	 * No need to take dqlock to look at the id.
-	 * The ID can't change until it gets reclaimed, and it won't
-	 * be reclaimed as long as we have a ref from inode and we hold
-	 * the ilock.
-	 */
-	if (udqhint &&
-	    (dqp = udqhint->q_gdquot) &&
-	    (be32_to_cpu(dqp->q_core.d_id) == id)) {
-		ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-		xfs_dqlock(dqp);
-		XFS_DQHOLD(dqp);
-		ASSERT(*IO_idqpp == NULL);
-		*IO_idqpp = dqp;
-		if (!dolock) {
-			xfs_dqunlock(dqp);
-			xfs_dqunlock(udqhint);
-		}
-		goto done;
-	}
-	/*
-	 * We can't hold a dquot lock when we call the dqget code.
-	 * We'll deadlock in no time, because of (not conforming to)
-	 * lock ordering - the inodelock comes before any dquot lock,
-	 * and we may drop and reacquire the ilock in xfs_qm_dqget().
-	 */
-	if (udqhint)
-		xfs_dqunlock(udqhint);
+	if (udqhint) {
+		ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
+		xfs_dqlock(udqhint);
+
+		/*
+		 * No need to take dqlock to look at the id.
+		 *
+		 * The ID can't change until it gets reclaimed, and it won't
+		 * be reclaimed as long as we have a ref from inode and we
+		 * hold the ilock.
+		 */
+		dqp = udqhint->q_gdquot;
+		if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+			xfs_dqlock(dqp);
+			XFS_DQHOLD(dqp);
+			ASSERT(*IO_idqpp == NULL);
+			*IO_idqpp = dqp;
+
+			xfs_dqunlock(dqp);
+			xfs_dqunlock(udqhint);
+			return 0;
+		}
+
+		/*
+		 * We can't hold a dquot lock when we call the dqget code.
+		 * We'll deadlock in no time, because of (not conforming to)
+		 * lock ordering - the inodelock comes before any dquot lock,
+		 * and we may drop and reacquire the ilock in xfs_qm_dqget().
+		 */
+		xfs_dqunlock(udqhint);
+	}
+
 	/*
 	 * Find the dquot from somewhere. This bumps the
 	 * reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
 	 * disk and we didn't ask it to allocate;
 	 * ESRCH if quotas got turned off suddenly.
 	 */
-	if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
-				 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
-		if (udqhint && dolock)
-			xfs_dqlock(udqhint);
-		goto done;
-	}
+	error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+	if (error)
+		return error;
 
 	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+
 	/*
 	 * dqget may have dropped and re-acquired the ilock, but it guarantees
 	 * that the dquot returned is the one that should go in the inode.
 	 */
 	*IO_idqpp = dqp;
-	ASSERT(dqp);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! dolock) {
-		xfs_dqunlock(dqp);
-		goto done;
-	}
-	if (! udqhint)
-		goto done;
-
-	ASSERT(udqhint);
-	ASSERT(dolock);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! xfs_qm_dqlock_nowait(udqhint)) {
-		xfs_dqunlock(dqp);
-		xfs_dqlock(udqhint);
-		xfs_dqlock(dqp);
-	}
- done:
-#ifdef QUOTADEBUG
-	if (udqhint) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-	}
-	if (! error) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	}
-#endif
-	return error;
+	xfs_dqunlock(dqp);
+	return 0;
 }
 
 
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
 STATIC void
 xfs_qm_dqattach_grouphint(
 	xfs_dquot_t	*udq,
-	xfs_dquot_t	*gdq,
-	uint		locked)
+	xfs_dquot_t	*gdq)
 {
 	xfs_dquot_t	*tmp;
 
-#ifdef QUOTADEBUG
-	if (locked) {
-		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		ASSERT(XFS_DQ_IS_LOCKED(gdq));
-	}
-#endif
-	if (! locked)
-		xfs_dqlock(udq);
+	xfs_dqlock(udq);
 
 	if ((tmp = udq->q_gdquot)) {
 		if (tmp == gdq) {
-			if (! locked)
-				xfs_dqunlock(udq);
+			xfs_dqunlock(udq);
 			return;
 		}
 
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
 		 * because the freelist lock comes before dqlocks.
 		 */
 		xfs_dqunlock(udq);
-		if (locked)
-			xfs_dqunlock(gdq);
 		/*
 		 * we took a hard reference once upon a time in dqget,
 		 * so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
 
 	} else {
 		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		if (! locked) {
-			xfs_dqlock(gdq);
-		}
+		xfs_dqlock(gdq);
 	}
 
 	ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
 		XFS_DQHOLD(gdq);
 		udq->q_gdquot = gdq;
 	}
-	if (! locked) {
-		xfs_dqunlock(gdq);
-		xfs_dqunlock(udq);
-	}
+
+	xfs_dqunlock(gdq);
+	xfs_dqunlock(udq);
 }
 
 
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
  * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
  * into account.
 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
 * Inode may get unlocked and relocked in here, and the caller must deal with
 * the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
 	if (XFS_IS_UQUOTA_ON(mp)) {
 		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						NULL, &ip->i_udquot);
 		if (error)
 			goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
 		error = XFS_IS_GQUOTA_ON(mp) ?
 			xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						ip->i_udquot, &ip->i_gdquot) :
 			xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						ip->i_udquot, &ip->i_gdquot);
 	/*
 	 * Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
 		/*
 		 * Attach i_gdquot to the gdquot hint inside the i_udquot.
 		 */
-		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
-					  flags & XFS_QMOPT_DQLOCK);
+		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
 	}
 
  done:
 
 #ifdef QUOTADEBUG
 	if (! error) {
-		if (ip->i_udquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
-		}
-		if (ip->i_gdquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
-		}
 		if (XFS_IS_UQUOTA_ON(mp))
 			ASSERT(ip->i_udquot);
 		if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
 		 * a dqlookup process that holds the hashlock that is
 		 * waiting for the freelist lock.
 		 */
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
 			dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
 			/* XXX put a sentinel so that we can come back here */
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
-			XFS_DQ_HASH_UNLOCK(hash);
+			mutex_unlock(&hash->qh_lock);
 			xfs_qm_freelist_unlock(xfs_Gqm);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
 				return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
 		XQM_HASHLIST_REMOVE(hash, dqp);
 		xfs_dqfunlock(dqp);
 		xfs_qm_mplist_unlock(dqp->q_mount);
-		XFS_DQ_HASH_UNLOCK(hash);
+		mutex_unlock(&hash->qh_lock);
 
  off_freelist:
 		XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
 			continue;
 		}
 
-		if (! xfs_qm_dqhashlock_nowait(dqp))
+		if (!mutex_trylock(&dqp->q_hash->qh_lock))
 			goto mplistunlock;
 
 		ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
 		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
 		XQM_FREELIST_REMOVE(dqp);
 		dqpout = dqp;
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		mutex_unlock(&dqp->q_hash->qh_lock);
  mplistunlock:
 		xfs_qm_mplist_unlock(dqp->q_mount);
 		xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
 {
 	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
 }
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
-	xfs_dquot_t *dqp)
-{
-	int		locked;
-
-	locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
-	return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
-	xfs_qm_t *xqm)
-{
-	int		locked;
-
-	locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
-	return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
-	xfs_mount_t	*mp)
-{
-	int		locked;
-
-	ASSERT(mp->m_quotainfo);
-	locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
-	return locked;
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index ddf09166387c..a371954cae1b 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -27,7 +27,7 @@ struct xfs_qm;
 struct xfs_inode;
 
 extern uint		ndquot;
-extern mutex_t		xfs_Gqm_lock;
+extern struct mutex	xfs_Gqm_lock;
 extern struct xfs_qm	*xfs_Gqm;
 extern kmem_zone_t	*qm_dqzone;
 extern kmem_zone_t	*qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t xfs_dqlist_t;
 typedef struct xfs_frlist {
 	struct xfs_dquot *qh_next;
 	struct xfs_dquot *qh_prev;
-	mutex_t		 qh_lock;
+	struct mutex	 qh_lock;
 	uint		 qh_version;
 	uint		 qh_nelems;
 } xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
 	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for blks warnings */
 	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for inodes warnings */
 	xfs_qwarncnt_t	 qi_rtbwarnlimit;/* limit for rt blks warnings */
-	mutex_t		 qi_quotaofflock;/* to serialize quotaoff */
+	struct mutex	 qi_quotaofflock;/* to serialize quotaoff */
 	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
 	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
 	xfs_qcnt_t	 qi_bhardlimit;	 /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT	5
 #define XFS_QM_RTBWARNLIMIT	5
 
-#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
-
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern void		xfs_qm_mount_quotas(xfs_mount_t *);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *);
 extern int		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
+/* quota ops */
+extern int		xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+extern int		xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int		xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
 /* vop stuff */
 extern int		xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
 					uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
 /* list stuff */
 extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
 extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int		xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int		xfs_qm_quotactl(struct xfs_mount *, int, int,
-				xfs_caddr_t);
 
 #ifdef DEBUG
 extern int		xfs_qm_internalqcheck(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bc6c5cca3e12..63037c689a4b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
 	.xfs_dqvopchownresv	= xfs_qm_vop_chown_reserve,
 	.xfs_dqstatvfs		= xfs_qm_statvfs,
 	.xfs_dqsync		= xfs_qm_sync,
-	.xfs_quotactl		= xfs_qm_quotactl,
 	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
 };
 EXPORT_SYMBOL(xfs_qmcore_xfs);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68139b38aede..c7b66f6506ce 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -57,135 +57,16 @@
 # define qdprintk(s, args...)	do { } while (0)
 #endif
 
-STATIC int	xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int	xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
 					uint);
-STATIC uint	xfs_qm_import_flags(uint);
 STATIC uint	xfs_qm_export_flags(uint);
-STATIC uint	xfs_qm_import_qtype_flags(uint);
 STATIC uint	xfs_qm_export_qtype_flags(uint);
 STATIC void	xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
 					fs_disk_quota_t *);
 
 
 /*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
-	xfs_mount_t	*mp,
-	int		cmd,
-	int		id,
-	xfs_caddr_t	addr)
-{
-	int		error;
-
-	ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
-	/*
-	 * The following commands are valid even when quotaoff.
-	 */
-	switch (cmd) {
-	case Q_XQUOTARM:
-		/*
-		 * Truncate quota files. quota must be off.
-		 */
-		if (XFS_IS_QUOTA_ON(mp))
-			return XFS_ERROR(EINVAL);
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_trunc_qfiles(mp,
-			       xfs_qm_import_qtype_flags(*(uint *)addr)));
-
-	case Q_XGETQSTAT:
-		/*
-		 * Get quota status information.
-		 */
-		return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
-	case Q_XQUOTAON:
-		/*
-		 * QUOTAON - enabling quota enforcement.
-		 * Quota accounting must be turned on at mount time.
-		 */
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_quotaon(mp,
-					  xfs_qm_import_flags(*(uint *)addr)));
-
-	case Q_XQUOTAOFF:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		break;
-
-	case Q_XQUOTASYNC:
-		return xfs_sync_inodes(mp, SYNC_DELWRI);
-
-	default:
-		break;
-	}
-
-	if (! XFS_IS_QUOTA_ON(mp))
-		return XFS_ERROR(ESRCH);
-
-	switch (cmd) {
-	case Q_XQUOTAOFF:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_quotaoff(mp,
-					    xfs_qm_import_flags(*(uint *)addr),
-					    B_FALSE);
-		break;
-
-	case Q_XGETQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETGQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETPQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					(fs_disk_quota_t *)addr);
-		break;
-
-	case Q_XSETQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETGQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETPQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					     (fs_disk_quota_t *)addr);
-		break;
-
-	default:
-		error = XFS_ERROR(EINVAL);
-		break;
-	}
-
-	return (error);
-}
-
-/*
  * Turn off quota accounting and/or enforcement for all udquots and/or
  * gdquots. Called only at unmount time.
  *
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
 * incore, and modifies the ondisk dquot directly. Therefore, for example,
 * it is an error to call this twice, without purging the cache.
 */
-STATIC int
+int
 xfs_qm_scall_quotaoff(
 	xfs_mount_t		*mp,
-	uint			flags,
-	boolean_t		force)
+	uint			flags)
 {
 	uint			dqtype;
 	int			error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
 	xfs_qoff_logitem_t	*qoffstart;
 	int			nculprits;
 
-	if (!force && !capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
 	/*
 	 * No file system can have quotas enabled on disk but not in core.
 	 * Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
 	return (error);
 }
 
-STATIC int
+int
 xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
 	int		error = 0, error2 = 0;
 	xfs_inode_t	*qip;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
 * effect immediately.
 * (Switching on quota accounting must be done at mount time.)
 */
-STATIC int
+int
 xfs_qm_scall_quotaon(
 	xfs_mount_t	*mp,
 	uint		flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
 	uint		accflags;
 	__int64_t	sbflags;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
 	/*
 	 * Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
 /*
 * Return quota status information, such as uquota-off, enforcements, etc.
 */
-STATIC int
+int
 xfs_qm_scall_getqstat(
 	xfs_mount_t	*mp,
 	fs_quota_stat_t	*out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
 /*
 * Adjust quota limits, and start/stop timers accordingly.
 */
-STATIC int
+int
 xfs_qm_scall_setqlim(
 	xfs_mount_t		*mp,
 	xfs_dqid_t		id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
 	int			error;
 	xfs_qcnt_t		hard, soft;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
 	if ((newlim->d_fieldmask &
 	    (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
 		return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
 	return error;
 }
 
-STATIC int
+int
 xfs_qm_scall_getquota(
 	xfs_mount_t	*mp,
 	xfs_dqid_t	id,
@@ -935,30 +805,6 @@ xfs_qm_export_dquot(
935} 805}
936 806
937STATIC uint 807STATIC uint
938xfs_qm_import_qtype_flags(
939 uint uflags)
940{
941 uint oflags = 0;
942
943 /*
944 * Can't be more than one, or none.
945 */
946 if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
947 (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
948 ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
949 (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
950 ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
951 (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
952 ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
953 return (0);
954
955 oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
956 oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
957 oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
958 return oflags;
959}
960
961STATIC uint
962xfs_qm_export_qtype_flags( 808xfs_qm_export_qtype_flags(
963 uint flags) 809 uint flags)
964{ 810{
@@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags(
979} 825}
980 826
981STATIC uint 827STATIC uint
982xfs_qm_import_flags(
983 uint uflags)
984{
985 uint flags = 0;
986
987 if (uflags & XFS_QUOTA_UDQ_ACCT)
988 flags |= XFS_UQUOTA_ACCT;
989 if (uflags & XFS_QUOTA_PDQ_ACCT)
990 flags |= XFS_PQUOTA_ACCT;
991 if (uflags & XFS_QUOTA_GDQ_ACCT)
992 flags |= XFS_GQUOTA_ACCT;
993 if (uflags & XFS_QUOTA_UDQ_ENFD)
994 flags |= XFS_UQUOTA_ENFD;
995 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
996 flags |= XFS_OQUOTA_ENFD;
997 return (flags);
998}
999
1000
1001STATIC uint
1002xfs_qm_export_flags( 828xfs_qm_export_flags(
1003 uint flags) 829 uint flags)
1004{ 830{
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
1134xfs_dqhash_t *qmtest_gdqtab; 960xfs_dqhash_t *qmtest_gdqtab;
1135int qmtest_hashmask; 961int qmtest_hashmask;
1136int qmtest_nfails; 962int qmtest_nfails;
1137mutex_t qcheck_lock; 963struct mutex qcheck_lock;
1138 964
1139#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ 965#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
1140 (__psunsigned_t)(id)) & \ 966 (__psunsigned_t)(id)) & \
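The hunks above make the xfs_qm_scall_* entry points non-STATIC and delete their per-function capable(CAP_SYS_ADMIN) checks, which only adds up if one common caller now performs the permission check before dispatching. A minimal sketch of that caller-side shape, assuming a hypothetical dispatcher (its name, the command numbering, and the errno negation are illustrative; the scall signatures are taken from the hunks):

/*
 * Sketch only: the real dispatcher lives outside this diff. The point
 * is that CAP_SYS_ADMIN is now checked once, centrally, because the
 * checks were removed from the individual helpers above.
 */
static int
xfs_quota_dispatch(xfs_mount_t *mp, unsigned int cmd, uint flags)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;			/* single permission gate */

	switch (cmd) {
	case 0:					/* illustrative command ids */
		return -xfs_qm_scall_quotaon(mp, flags);
	case 1:
		return -xfs_qm_scall_trunc_qfiles(mp, flags);
	default:
		return -EINVAL;
	}
}

(The negation reflects XFS's internal positive-errno convention visible in XFS_ERROR(EPERM) above.)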
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index c4fcea600bc2..8286b2842b6b 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -42,34 +42,24 @@
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) 42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43 43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) 44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
46#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) 45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
47#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) 46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
48 47
 49#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock)))
 50#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
 51#ifdef DEBUG
 52struct xfs_dqhash;
 53static inline int XQMISLCKD(struct xfs_dqhash *h)
 54{
 55	if (mutex_trylock(&h->qh_lock)) {
 56		mutex_unlock(&h->qh_lock);
 57		return 0;
 58	}
 59	return 1;
 60}
 61#endif
 62
 63#define XFS_DQ_HASH_LOCK(h)		XQMLCK(h)
 64#define XFS_DQ_HASH_UNLOCK(h)		XQMUNLCK(h)
 65#define XFS_DQ_IS_HASH_LOCKED(h)	XQMISLCKD(h)
 66
 67#define xfs_qm_mplist_lock(mp)		XQMLCK(&(XFS_QI_MPL_LIST(mp)))
 68#define xfs_qm_mplist_unlock(mp)	XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
 69#define XFS_QM_IS_MPLIST_LOCKED(mp)	XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
 70
 71#define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
 72#define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
 48#define xfs_qm_mplist_lock(mp) \
 49	mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
 50#define xfs_qm_mplist_nowait(mp) \
 51	mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
 52#define xfs_qm_mplist_unlock(mp) \
 53	mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
 54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
 55	mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
 56
 57#define xfs_qm_freelist_lock(qm) \
 58	mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
 59#define xfs_qm_freelist_lock_nowait(qm) \
 60	mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
 61#define xfs_qm_freelist_unlock(qm) \
 62	mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
73 63
74/* 64/*
75 * Hash into a bucket in the dquot hash table, based on <mp, id>. 65 * Hash into a bucket in the dquot hash table, based on <mp, id>.
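With the XQMLCK()/XQMUNLCK()/XQMISLCKD() indirection gone, the quota list locks are ordinary struct mutex operations, and the new *_nowait variants expose mutex_trylock() directly. A small usage sketch under the new helpers (the caller is hypothetical; only the macros come from the hunk):

/*
 * Hypothetical caller: skip the mount's dquot list when contended
 * instead of sleeping, using the new trylock-based helper.
 */
STATIC int
xfs_qm_try_walk_mplist(xfs_mount_t *mp)
{
	if (!xfs_qm_mplist_nowait(mp))	/* mutex_trylock underneath */
		return EAGAIN;		/* busy; caller may retry */

	/* ... walk XFS_QI_MPLNEXT(mp) while the list is pinned ... */

	xfs_qm_mplist_unlock(mp);
	return 0;
}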
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 99611381e740..447173bcf96d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
624 xfs_qcnt_t *resbcountp; 624 xfs_qcnt_t *resbcountp;
625 xfs_quotainfo_t *q = mp->m_quotainfo; 625 xfs_quotainfo_t *q = mp->m_quotainfo;
626 626
627 if (! (flags & XFS_QMOPT_DQLOCK)) { 627
628 xfs_dqlock(dqp); 628 xfs_dqlock(dqp);
629 } 629
630 ASSERT(XFS_DQ_IS_LOCKED(dqp));
631 if (flags & XFS_TRANS_DQ_RES_BLKS) { 630 if (flags & XFS_TRANS_DQ_RES_BLKS) {
632 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); 631 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
633 if (!hardlimit) 632 if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
740 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 739 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
741 740
742error_return: 741error_return:
743 if (! (flags & XFS_QMOPT_DQLOCK)) { 742 xfs_dqunlock(dqp);
744 xfs_dqunlock(dqp); 743 return error;
745 }
746 return (error);
747} 744}
748 745
749 746
@@ -753,8 +750,7 @@ error_return:
753 * grp/prj quotas is important, because this follows a both-or-nothing 750 * grp/prj quotas is important, because this follows a both-or-nothing
754 * approach. 751 * approach.
755 * 752 *
756 * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. 753 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
757 * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
758 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. 754 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
759 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks 755 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
760 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks 756 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
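Dropping XFS_QMOPT_DQLOCK changes the locking contract of xfs_trans_dqresv(): it now unconditionally takes and releases the dquot lock itself, so callers must always pass the dquot unlocked. Sketched as a before/after comment (the argument list is assumed from the surrounding code; only the flag names appear in this diff):

/*
 * before:	xfs_dqlock(dqp);
 *		error = xfs_trans_dqresv(tp, mp, dqp, nblks, ninos,
 *					 flags | XFS_QMOPT_DQLOCK);
 *
 * after:	error = xfs_trans_dqresv(tp, mp, dqp, nblks, ninos, flags);
 *
 * The dquot must now be handed over unlocked; the reservation path
 * owns dq_lock for its entire hardlimit/softlimit check.
 */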
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index ae5482965424..3f3610a7ee05 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -24,6 +24,7 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h" 25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 26#include "xfs_mount.h"
27#include "xfs_error.h"
27 28
28static char message[1024]; /* keep it off the stack */ 29static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock); 30static DEFINE_SPINLOCK(xfs_err_lock);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 5830c040ea7e..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,10 +17,6 @@
17 */ 17 */
18#include <xfs.h> 18#include <xfs.h>
19 19
20static DEFINE_MUTEX(uuid_monitor);
21static int uuid_table_size;
22static uuid_t *uuid_table;
23
24/* IRIX interpretation of an uuid_t */ 20/* IRIX interpretation of an uuid_t */
25typedef struct { 21typedef struct {
26 __be32 uu_timelow; 22 __be32 uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
46 fsid[1] = be32_to_cpu(uup->uu_timelow); 42 fsid[1] = be32_to_cpu(uup->uu_timelow);
47} 43}
48 44
49void
50uuid_create_nil(uuid_t *uuid)
51{
52 memset(uuid, 0, sizeof(*uuid));
53}
54
55int 45int
56uuid_is_nil(uuid_t *uuid) 46uuid_is_nil(uuid_t *uuid)
57{ 47{
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
71{ 61{
72 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; 62 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
73} 63}
74
75/*
76 * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
77 * 64-bit words. NOTE: This function can not be changed EVER. Although
78 * brain-dead, some applications depend on this 64-bit value remaining
79 * persistent. Specifically, DMI vendors store the value as a persistent
80 * filehandle.
81 */
82__uint64_t
83uuid_hash64(uuid_t *uuid)
84{
85 __uint64_t *sp = (__uint64_t *)uuid;
86
87 return sp[0] + sp[1];
88}
89
90int
91uuid_table_insert(uuid_t *uuid)
92{
93 int i, hole;
94
95 mutex_lock(&uuid_monitor);
96 for (i = 0, hole = -1; i < uuid_table_size; i++) {
97 if (uuid_is_nil(&uuid_table[i])) {
98 hole = i;
99 continue;
100 }
101 if (uuid_equal(uuid, &uuid_table[i])) {
102 mutex_unlock(&uuid_monitor);
103 return 0;
104 }
105 }
106 if (hole < 0) {
107 uuid_table = kmem_realloc(uuid_table,
108 (uuid_table_size + 1) * sizeof(*uuid_table),
109 uuid_table_size * sizeof(*uuid_table),
110 KM_SLEEP);
111 hole = uuid_table_size++;
112 }
113 uuid_table[hole] = *uuid;
114 mutex_unlock(&uuid_monitor);
115 return 1;
116}
117
118void
119uuid_table_remove(uuid_t *uuid)
120{
121 int i;
122
123 mutex_lock(&uuid_monitor);
124 for (i = 0; i < uuid_table_size; i++) {
125 if (uuid_is_nil(&uuid_table[i]))
126 continue;
127 if (!uuid_equal(uuid, &uuid_table[i]))
128 continue;
129 uuid_create_nil(&uuid_table[i]);
130 break;
131 }
132 ASSERT(i < uuid_table_size);
133 mutex_unlock(&uuid_monitor);
134}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index cff5b607d445..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,12 +22,8 @@ typedef struct {
22 unsigned char __u_bits[16]; 22 unsigned char __u_bits[16];
23} uuid_t; 23} uuid_t;
24 24
25extern void uuid_create_nil(uuid_t *uuid);
26extern int uuid_is_nil(uuid_t *uuid); 25extern int uuid_is_nil(uuid_t *uuid);
27extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
28extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
29extern __uint64_t uuid_hash64(uuid_t *uuid);
30extern int uuid_table_insert(uuid_t *uuid);
31extern void uuid_table_remove(uuid_t *uuid);
32 28
33#endif /* __XFS_SUPPORT_UUID_H__ */ 29#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 143d63ecb20a..c8641f713caa 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,8 +223,8 @@ typedef struct xfs_perag
223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) 223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
224#define XFS_MIN_FREELIST_PAG(pag,mp) \ 224#define XFS_MIN_FREELIST_PAG(pag,mp) \
225 (XFS_MIN_FREELIST_RAW( \ 225 (XFS_MIN_FREELIST_RAW( \
226 (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ 226 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
227 (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) 227 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
228 228
229#define XFS_AGB_TO_FSB(mp,agno,agbno) \ 229#define XFS_AGB_TO_FSB(mp,agno,agbno) \
230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) 230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 028e44e58ea9..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels(
1872} 1872}
1873 1873
1874/* 1874/*
1875 * Find the length of the longest extent in an AG.
1876 */
1877xfs_extlen_t
1878xfs_alloc_longest_free_extent(
1879 struct xfs_mount *mp,
1880 struct xfs_perag *pag)
1881{
1882 xfs_extlen_t need, delta = 0;
1883
1884 need = XFS_MIN_FREELIST_PAG(pag, mp);
1885 if (need > pag->pagf_flcount)
1886 delta = need - pag->pagf_flcount;
1887
1888 if (pag->pagf_longest > delta)
1889 return pag->pagf_longest - delta;
1890 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1891}
1892
1893/*
1875 * Decide whether to use this allocation group for this allocation. 1894 * Decide whether to use this allocation group for this allocation.
1876 * If so, fix up the btree freelist's size. 1895 * If so, fix up the btree freelist's size.
1877 */ 1896 */
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
1923 } 1942 }
1924 1943
1925 if (!(flags & XFS_ALLOC_FLAG_FREEING)) { 1944 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1926 need = XFS_MIN_FREELIST_PAG(pag, mp);
1927 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1928 /* 1945 /*
1929 * If it looks like there isn't a long enough extent, or enough 1946 * If it looks like there isn't a long enough extent, or enough
1930 * total blocks, reject it. 1947 * total blocks, reject it.
1931 */ 1948 */
1932 longest = (pag->pagf_longest > delta) ? 1949 need = XFS_MIN_FREELIST_PAG(pag, mp);
1933 (pag->pagf_longest - delta) : 1950 longest = xfs_alloc_longest_free_extent(mp, pag);
1934 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
1935 if ((args->minlen + args->alignment + args->minalignslop - 1) > 1951 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1936 longest || 1952 longest ||
1937 ((int)(pag->pagf_freeblks + pag->pagf_flcount - 1953 ((int)(pag->pagf_freeblks + pag->pagf_flcount -
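The new xfs_alloc_longest_free_extent() helper centralizes a computation that was open-coded here and, as later hunks show, in xfs_bmap_btalloc() and _xfs_filestream_pick_ag(): the longest allocatable extent in an AG, discounted by however many blocks the freelist still needs to refill. A standalone demo of the arithmetic (plain user-space C, made-up field values):

#include <stdio.h>

typedef unsigned int xfs_extlen_t;

/* Mirrors the helper added above, with the per-AG fields as params. */
static xfs_extlen_t
longest_free_extent(xfs_extlen_t need, xfs_extlen_t flcount,
		    xfs_extlen_t longest)
{
	xfs_extlen_t delta = need > flcount ? need - flcount : 0;

	if (longest > delta)
		return longest - delta;
	/* degrade to a 0/1 "anything free at all" answer */
	return flcount > 0 || longest > 0;
}

int main(void)
{
	/* freelist needs 6 blocks, holds 4: delta 2, so 100 -> 98 */
	printf("%u\n", longest_free_extent(6, 4, 100));
	/* fully depleted AG: 0 */
	printf("%u\n", longest_free_extent(6, 0, 0));
	return 0;
}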
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 588172796f7b..e704caee10df 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ 100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ 101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
102 102
103/*
104 * Find the length of the longest extent in an AG.
105 */
106xfs_extlen_t
107xfs_alloc_longest_free_extent(struct xfs_mount *mp,
108 struct xfs_perag *pag);
103 109
104#ifdef __KERNEL__ 110#ifdef __KERNEL__
105 111
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 6c323f8a4cd1..afdc8911637d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
155 * minimum offset only needs to be the space required for 155 * minimum offset only needs to be the space required for
156 * the btree root. 156 * the btree root.
157 */ 157 */
158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) 158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
159 xfs_default_attroffset(dp))
159 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 160 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
160 break; 161 break;
161 162
@@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
298} 299}
299 300
300/* 301/*
302 * After the last attribute is removed revert to original inode format,
303 * making all literal area available to the data fork once more.
304 */
305STATIC void
306xfs_attr_fork_reset(
307 struct xfs_inode *ip,
308 struct xfs_trans *tp)
309{
310 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
311 ip->i_d.di_forkoff = 0;
312 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
313
314 ASSERT(ip->i_d.di_anextents == 0);
315 ASSERT(ip->i_afp == NULL);
316
317 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
318 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
319}
320
321/*
301 * Remove an attribute from the shortform attribute list structure. 322 * Remove an attribute from the shortform attribute list structure.
302 */ 323 */
303int 324int
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
344 */ 365 */
345 totsize -= size; 366 totsize -= size;
346 if (totsize == sizeof(xfs_attr_sf_hdr_t) && 367 if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
347 !(args->op_flags & XFS_DA_OP_ADDNAME) && 368 (mp->m_flags & XFS_MOUNT_ATTR2) &&
348 (mp->m_flags & XFS_MOUNT_ATTR2) && 369 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
349 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { 370 !(args->op_flags & XFS_DA_OP_ADDNAME)) {
350 /* 371 xfs_attr_fork_reset(dp, args->trans);
351 * Last attribute now removed, revert to original
352 * inode format making all literal area available
353 * to the data fork once more.
354 */
355 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
356 dp->i_d.di_forkoff = 0;
357 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
358 ASSERT(dp->i_d.di_anextents == 0);
359 ASSERT(dp->i_afp == NULL);
360 dp->i_df.if_ext_max =
361 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
362 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
363 } else { 372 } else {
364 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 373 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
365 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); 374 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
786 if (forkoff == -1) { 795 if (forkoff == -1) {
787 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); 796 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
788 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); 797 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
789 798 xfs_attr_fork_reset(dp, args->trans);
790 /*
791 * Last attribute was removed, revert to original
792 * inode format making all literal area available
793 * to the data fork once more.
794 */
795 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
796 dp->i_d.di_forkoff = 0;
797 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
798 ASSERT(dp->i_d.di_anextents == 0);
799 ASSERT(dp->i_afp == NULL);
800 dp->i_df.if_ext_max =
801 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
802 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
803 goto out; 799 goto out;
804 } 800 }
805 801
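xfs_attr_fork_reset() deduplicates the two identical "last attribute removed" blocks above. Its trigger in the shortform-remove path, reproduced from the hunk with annotations added (only the comments are new):

	totsize -= size;
	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&	/* only the header is left */
	    (mp->m_flags & XFS_MOUNT_ATTR2) &&		/* ATTR2 mount */
	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && /* data fork not btree */
	    !(args->op_flags & XFS_DA_OP_ADDNAME))	/* pure remove, not a replace */
		xfs_attr_fork_reset(dp, args->trans);	/* literal area back to data fork */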
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c852cd65aaea..3a6ed426327a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2480 /* 2480 /*
2481 * If allocating at eof, and there's a previous real block, 2481 * If allocating at eof, and there's a previous real block,
2482 * try to use it's last block as our starting point. 2482 * try to use its last block as our starting point.
2483 */ 2483 */
2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2485 !isnullstartblock(ap->prevp->br_startblock) && 2485 !isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
2712 xfs_agnumber_t startag; 2712 xfs_agnumber_t startag;
2713 xfs_alloc_arg_t args; 2713 xfs_alloc_arg_t args;
2714 xfs_extlen_t blen; 2714 xfs_extlen_t blen;
2715 xfs_extlen_t delta;
2716 xfs_extlen_t longest;
2717 xfs_extlen_t need;
2718 xfs_extlen_t nextminlen = 0; 2715 xfs_extlen_t nextminlen = 0;
2719 xfs_perag_t *pag; 2716 xfs_perag_t *pag;
2720 int nullfb; /* true if ap->firstblock isn't set */ 2717 int nullfb; /* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
2796 * See xfs_alloc_fix_freelist... 2793 * See xfs_alloc_fix_freelist...
2797 */ 2794 */
2798 if (pag->pagf_init) { 2795 if (pag->pagf_init) {
2799 need = XFS_MIN_FREELIST_PAG(pag, mp); 2796 xfs_extlen_t longest;
2800 delta = need > pag->pagf_flcount ? 2797 longest = xfs_alloc_longest_free_extent(mp, pag);
2801 need - pag->pagf_flcount : 0;
2802 longest = (pag->pagf_longest > delta) ?
2803 (pag->pagf_longest - delta) :
2804 (pag->pagf_flcount > 0 ||
2805 pag->pagf_longest > 0);
2806 if (blen < longest) 2798 if (blen < longest)
2807 blen = longest; 2799 blen = longest;
2808 } else 2800 } else
@@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree(
3577} 3569}
3578 3570
3579/* 3571/*
3572 * Calculate the default attribute fork offset for newly created inodes.
3573 */
3574uint
3575xfs_default_attroffset(
3576 struct xfs_inode *ip)
3577{
3578 struct xfs_mount *mp = ip->i_mount;
3579 uint offset;
3580
3581 if (mp->m_sb.sb_inodesize == 256) {
3582 offset = XFS_LITINO(mp) -
3583 XFS_BMDR_SPACE_CALC(MINABTPTRS);
3584 } else {
3585 offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
3586 }
3587
3588 ASSERT(offset < XFS_LITINO(mp));
3589 return offset;
3590}
3591
3592/*
3580 * Helper routine to reset inode di_forkoff field when switching 3593 * Helper routine to reset inode di_forkoff field when switching
3581 * attribute fork from local to extent format - we reset it where 3594 * attribute fork from local to extent format - we reset it where
3582 * possible to make space available for inline data fork extents. 3595 * possible to make space available for inline data fork extents.
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
3588 int whichfork) 3601 int whichfork)
3589{ 3602{
3590 if (whichfork == XFS_ATTR_FORK && 3603 if (whichfork == XFS_ATTR_FORK &&
3591	    (ip->i_d.di_format != XFS_DINODE_FMT_DEV) &&
3592	    (ip->i_d.di_format != XFS_DINODE_FMT_UUID) &&
3593	    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3594	    ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) {
3595		ip->i_d.di_forkoff = mp->m_attroffset >> 3;
3596		ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) /
3597					(uint)sizeof(xfs_bmbt_rec_t);
3598		ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) /
3599					(uint)sizeof(xfs_bmbt_rec_t);
3604	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
3605	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
3606	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3607		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3608
3609		if (dfl_forkoff > ip->i_d.di_forkoff) {
3610			ip->i_d.di_forkoff = dfl_forkoff;
3611			ip->i_df.if_ext_max =
3612				XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3613			ip->i_afp->if_ext_max =
3614				XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3615		}
3600 } 3616 }
3601} 3617}
3602 3618
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
4065 case XFS_DINODE_FMT_BTREE: 4081 case XFS_DINODE_FMT_BTREE:
4066 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); 4082 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
4067 if (!ip->i_d.di_forkoff) 4083 if (!ip->i_d.di_forkoff)
4068 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 4084 ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
4069 else if (mp->m_flags & XFS_MOUNT_ATTR2) 4085 else if (mp->m_flags & XFS_MOUNT_ATTR2)
4070 version = 2; 4086 version = 2;
4071 break; 4087 break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
4212 * (a signed 16-bit number, xfs_aextnum_t). 4228 * (a signed 16-bit number, xfs_aextnum_t).
4213 * 4229 *
4214 * Note that we can no longer assume that if we are in ATTR1 that 4230 * Note that we can no longer assume that if we are in ATTR1 that
4215 * the fork offset of all the inodes will be (m_attroffset >> 3) 4231 * the fork offset of all the inodes will be
4216 * because we could have mounted with ATTR2 and then mounted back 4232 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
4217 * with ATTR1, keeping the di_forkoff's fixed but probably at 4233 * with ATTR2 and then mounted back with ATTR1, keeping the
4218 * various positions. Therefore, for both ATTR1 and ATTR2 4234 * di_forkoff's fixed but probably at various positions. Therefore,
4219 * we have to assume the worst case scenario of a minimum size 4235 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
4220 * available. 4236 * of a minimum size available.
4221 */ 4237 */
4222 if (whichfork == XFS_DATA_FORK) { 4238 if (whichfork == XFS_DATA_FORK) {
4223 maxleafents = MAXEXTNUM; 4239 maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
4804 xfs_extlen_t minlen; /* min allocation size */ 4820 xfs_extlen_t minlen; /* min allocation size */
4805 xfs_mount_t *mp; /* xfs mount structure */ 4821 xfs_mount_t *mp; /* xfs mount structure */
4806 int n; /* current extent index */ 4822 int n; /* current extent index */
4807 int nallocs; /* number of extents alloc\'d */ 4823 int nallocs; /* number of extents alloc'd */
4808 xfs_extnum_t nextents; /* number of extents in file */ 4824 xfs_extnum_t nextents; /* number of extents in file */
4809 xfs_fileoff_t obno; /* old block number (offset) */ 4825 xfs_fileoff_t obno; /* old block number (offset) */
4810 xfs_bmbt_irec_t prev; /* previous file extent record */ 4826 xfs_bmbt_irec_t prev; /* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
6204 return(bp); 6220 return(bp);
6205} 6221}
6206 6222
6207void 6223STATIC void
6208xfs_check_block( 6224xfs_check_block(
6209 struct xfs_btree_block *block, 6225 struct xfs_btree_block *block,
6210 xfs_mount_t *mp, 6226 xfs_mount_t *mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
6494 block = XFS_BUF_TO_BLOCK(bp); 6510 block = XFS_BUF_TO_BLOCK(bp);
6495 6511
6496 if (--level) { 6512 if (--level) {
6497 /* Not at node above leafs, count this level of nodes */ 6513 /* Not at node above leaves, count this level of nodes */
6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6514 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6499 while (nextbno != NULLFSBLOCK) { 6515 while (nextbno != NULLFSBLOCK) {
6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6516 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
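Both call sites of xfs_default_attroffset() above shift the result right by 3 before storing it, because di_forkoff records the attribute fork offset in 8-byte units. A quick standalone illustration of that convention (the byte offset is made up):

#include <stdio.h>

int main(void)
{
	unsigned int offset_bytes = 160;		/* hypothetical attr offset */
	unsigned char di_forkoff = offset_bytes >> 3;	/* stored in 8-byte units */

	printf("byte offset %u -> di_forkoff %u (x8 = %u bytes)\n",
	       offset_bytes, di_forkoff, (unsigned int)di_forkoff << 3);
	return 0;
}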
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index be2979d88d32..1b8ff9256bd0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ 125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
126 xfs_extlen_t alen; /* i/o length asked/allocated */ 126 xfs_extlen_t alen; /* i/o length asked/allocated */
127 xfs_extlen_t total; /* total blocks needed for xaction */ 127 xfs_extlen_t total; /* total blocks needed for xaction */
128 xfs_extlen_t minlen; /* mininum allocation size (blocks) */ 128 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
129 xfs_extlen_t minleft; /* amount must be left after alloc */ 129 xfs_extlen_t minleft; /* amount must be left after alloc */
130 char eof; /* set if allocating past last extent */ 130 char eof; /* set if allocating past last extent */
131 char wasdel; /* replacing a delayed allocation */ 131 char wasdel; /* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
338 xfs_extnum_t idx, 338 xfs_extnum_t idx,
339 xfs_extnum_t num); 339 xfs_extnum_t num);
340 340
341uint
342xfs_default_attroffset(
343 struct xfs_inode *ip);
344
341#ifdef __KERNEL__ 345#ifdef __KERNEL__
342 346
343/* 347/*
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e73c332eb23f..e9df99574829 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
1883 1883
1884 /* 1884 /*
1885 * We add one entry to the left side and remove one for the right side. 1885 * We add one entry to the left side and remove one for the right side.
1886 * Accout for it here, the changes will be updated on disk and logged 1886 * Account for it here, the changes will be updated on disk and logged
1887 * later. 1887 * later.
1888 */ 1888 */
1889 lrecs++; 1889 lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
3535 XFS_BTREE_STATS_INC(cur, join); 3535 XFS_BTREE_STATS_INC(cur, join);
3536 3536
3537 /* 3537 /*
3538 * Fix up the the number of records and right block pointer in the 3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it. 3539 * surviving block, and log it.
3540 */ 3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs); 3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 789fffdf8b2f..4f852b735b96 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -41,7 +41,7 @@ extern kmem_zone_t *xfs_btree_cur_zone;
41/* 41/*
42 * Generic btree header. 42 * Generic btree header.
43 * 43 *
44 * This is a comination of the actual format used on disk for short and long 44 * This is a combination of the actual format used on disk for short and long
45 * format btrees. The first three fields are shared by both format, but 45 * format btrees. The first three fields are shared by both format, but
46 * the pointers are different and should be used with care. 46 * the pointers are different and should be used with care.
47 * 47 *
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c45f74ff1a5b..9ff6e57a5075 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1503 * This is implemented with some source-level loop unrolling. 1503 * This is implemented with some source-level loop unrolling.
1504 */ 1504 */
1505xfs_dahash_t 1505xfs_dahash_t
1506xfs_da_hashname(const uchar_t *name, int namelen) 1506xfs_da_hashname(const __uint8_t *name, int namelen)
1507{ 1507{
1508 xfs_dahash_t hash; 1508 xfs_dahash_t hash;
1509 1509
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 70b710c1792d..8c536167bf75 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -91,9 +91,9 @@ enum xfs_dacmp {
91 * Structure to ease passing around component names. 91 * Structure to ease passing around component names.
92 */ 92 */
93typedef struct xfs_da_args { 93typedef struct xfs_da_args {
94 const uchar_t *name; /* string (maybe not NULL terminated) */ 94 const __uint8_t *name; /* string (maybe not NULL terminated) */
95 int namelen; /* length of string (maybe no NULL) */ 95 int namelen; /* length of string (maybe no NULL) */
96 uchar_t *value; /* set of bytes (maybe contain NULLs) */ 96 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
97 int valuelen; /* length of value */ 97 int valuelen; /* length of value */
98 int flags; /* argument flags (eg: ATTR_NOCREATE) */ 98 int flags; /* argument flags (eg: ATTR_NOCREATE) */
99 xfs_dahash_t hashval; /* hash value of name */ 99 xfs_dahash_t hashval; /* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
185 unsigned char inleaf; /* insert into 1->lf, 0->splf */ 185 unsigned char inleaf; /* insert into 1->lf, 0->splf */
186 unsigned char extravalid; /* T/F: extrablk is in use */ 186 unsigned char extravalid; /* T/F: extrablk is in use */
187 unsigned char extraafter; /* T/F: extrablk is after new */ 187 unsigned char extraafter; /* T/F: extrablk is after new */
188 xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ 188 xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
189 /* for dirv2 extrablk is data */ 189 /* for dirv2 extrablk is data */
190} xfs_da_state_t; 190} xfs_da_state_t;
191 191
@@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
252 xfs_dabuf_t *dead_buf); 252 xfs_dabuf_t *dead_buf);
253 253
254uint xfs_da_hashname(const uchar_t *name_string, int name_length); 254uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
256 const char *name, int len); 256 const char *name, int len);
257 257
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
268 268
269extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
270extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
271extern const struct xfs_nameops xfs_default_nameops;
271 272
272#endif /* __XFS_DA_BTREE_H__ */ 273#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f8278cfcc1d3..e6d839bddbf0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -79,6 +79,12 @@ xfs_swapext(
79 goto out_put_target_file; 79 goto out_put_target_file;
80 } 80 }
81 81
82 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
83 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
84 error = XFS_ERROR(EINVAL);
85 goto out_put_target_file;
86 }
87
82 ip = XFS_I(file->f_path.dentry->d_inode); 88 ip = XFS_I(file->f_path.dentry->d_inode);
83 tip = XFS_I(target_file->f_path.dentry->d_inode); 89 tip = XFS_I(target_file->f_path.dentry->d_inode);
84 90
@@ -118,19 +124,17 @@ xfs_swap_extents(
118 xfs_bstat_t *sbp = &sxp->sx_stat; 124 xfs_bstat_t *sbp = &sxp->sx_stat;
119 xfs_ifork_t *tempifp, *ifp, *tifp; 125 xfs_ifork_t *tempifp, *ifp, *tifp;
120 int ilf_fields, tilf_fields; 126 int ilf_fields, tilf_fields;
121 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
122 int error = 0; 127 int error = 0;
123 int aforkblks = 0; 128 int aforkblks = 0;
124 int taforkblks = 0; 129 int taforkblks = 0;
125 __uint64_t tmp; 130 __uint64_t tmp;
126 char locked = 0;
127 131
128 mp = ip->i_mount; 132 mp = ip->i_mount;
129 133
130 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 134 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
131 if (!tempifp) { 135 if (!tempifp) {
132 error = XFS_ERROR(ENOMEM); 136 error = XFS_ERROR(ENOMEM);
133 goto error0; 137 goto out;
134 } 138 }
135 139
136 sbp = &sxp->sx_stat; 140 sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
143 */ 147 */
144 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 148 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
145 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 149 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
146 locked = 1;
147 150
148 /* Verify that both files have the same format */ 151 /* Verify that both files have the same format */
149 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 152 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
150 error = XFS_ERROR(EINVAL); 153 error = XFS_ERROR(EINVAL);
151 goto error0; 154 goto out_unlock;
152 } 155 }
153 156
154 /* Verify both files are either real-time or non-realtime */ 157 /* Verify both files are either real-time or non-realtime */
155 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { 158 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
156 error = XFS_ERROR(EINVAL); 159 error = XFS_ERROR(EINVAL);
157 goto error0; 160 goto out_unlock;
158 } 161 }
159 162
160 /* Should never get a local format */ 163 /* Should never get a local format */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || 164 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
162 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { 165 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
163 error = XFS_ERROR(EINVAL); 166 error = XFS_ERROR(EINVAL);
164 goto error0; 167 goto out_unlock;
165 } 168 }
166 169
167 if (VN_CACHED(VFS_I(tip)) != 0) { 170 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
169 error = xfs_flushinval_pages(tip, 0, -1, 172 error = xfs_flushinval_pages(tip, 0, -1,
170 FI_REMAPF_LOCKED); 173 FI_REMAPF_LOCKED);
171 if (error) 174 if (error)
172 goto error0; 175 goto out_unlock;
173 } 176 }
174 177
175 /* Verify O_DIRECT for ftmp */ 178 /* Verify O_DIRECT for ftmp */
176 if (VN_CACHED(VFS_I(tip)) != 0) { 179 if (VN_CACHED(VFS_I(tip)) != 0) {
177 error = XFS_ERROR(EINVAL); 180 error = XFS_ERROR(EINVAL);
178 goto error0; 181 goto out_unlock;
179 } 182 }
180 183
181 /* Verify all data are being swapped */ 184 /* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
183 sxp->sx_length != ip->i_d.di_size || 186 sxp->sx_length != ip->i_d.di_size ||
184 sxp->sx_length != tip->i_d.di_size) { 187 sxp->sx_length != tip->i_d.di_size) {
185 error = XFS_ERROR(EFAULT); 188 error = XFS_ERROR(EFAULT);
186 goto error0; 189 goto out_unlock;
187 } 190 }
188 191
189 /* 192 /*
@@ -193,7 +196,7 @@ xfs_swap_extents(
193 */ 196 */
194 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
195 error = XFS_ERROR(EINVAL); 198 error = XFS_ERROR(EINVAL);
196 goto error0; 199 goto out_unlock;
197 } 200 }
198 201
199 /* 202 /*
@@ -208,7 +211,7 @@ xfs_swap_extents(
208 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || 211 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
209 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { 212 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
210 error = XFS_ERROR(EBUSY); 213 error = XFS_ERROR(EBUSY);
211 goto error0; 214 goto out_unlock;
212 } 215 }
213 216
214 /* We need to fail if the file is memory mapped. Once we have tossed 217 /* We need to fail if the file is memory mapped. Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
219 */ 222 */
220 if (VN_MAPPED(VFS_I(ip))) { 223 if (VN_MAPPED(VFS_I(ip))) {
221 error = XFS_ERROR(EBUSY); 224 error = XFS_ERROR(EBUSY);
222 goto error0; 225 goto out_unlock;
223 } 226 }
224 227
225 xfs_iunlock(ip, XFS_ILOCK_EXCL); 228 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
242 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 245 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
243 xfs_iunlock(tip, XFS_IOLOCK_EXCL); 246 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
244 xfs_trans_cancel(tp, 0); 247 xfs_trans_cancel(tp, 0);
245 locked = 0; 248 goto out;
246 goto error0;
247 } 249 }
248 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 250 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
249 251
@@ -253,19 +255,15 @@ xfs_swap_extents(
253 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && 255 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
254 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 256 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
255 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); 257 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
256 if (error) { 258 if (error)
257 xfs_trans_cancel(tp, 0); 259 goto out_trans_cancel;
258 goto error0;
259 }
260 } 260 }
261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && 261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, 263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
264 &taforkblks); 264 &taforkblks);
265 if (error) { 265 if (error)
266 xfs_trans_cancel(tp, 0); 266 goto out_trans_cancel;
267 goto error0;
268 }
269 } 267 }
270 268
271 /* 269 /*
@@ -332,10 +330,10 @@ xfs_swap_extents(
332 330
333 331
334 IHOLD(ip); 332 IHOLD(ip);
335 xfs_trans_ijoin(tp, ip, lock_flags); 333 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
336 334
337 IHOLD(tip); 335 IHOLD(tip);
338 xfs_trans_ijoin(tp, tip, lock_flags); 336 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
339 337
340 xfs_trans_log_inode(tp, ip, ilf_fields); 338 xfs_trans_log_inode(tp, ip, ilf_fields);
341 xfs_trans_log_inode(tp, tip, tilf_fields); 339 xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
344 * If this is a synchronous mount, make sure that the 342 * If this is a synchronous mount, make sure that the
345 * transaction goes to disk before returning to the user. 343 * transaction goes to disk before returning to the user.
346 */ 344 */
347 if (mp->m_flags & XFS_MOUNT_WSYNC) { 345 if (mp->m_flags & XFS_MOUNT_WSYNC)
348 xfs_trans_set_sync(tp); 346 xfs_trans_set_sync(tp);
349 }
350 347
351 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
352 locked = 0;
353 349
 354 error0:
 355	if (locked) {
 356		xfs_iunlock(ip, lock_flags);
 357		xfs_iunlock(tip, lock_flags);
 358	}
 359	if (tempifp != NULL)
 360		kmem_free(tempifp);
 350out_unlock:
 351	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 352	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 353out:
 354	kmem_free(tempifp);
361 return error; 355 return error;
356
357out_trans_cancel:
358 xfs_trans_cancel(tp, 0);
359 goto out_unlock;
362} 360}
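The xfs_swap_extents() rewrite replaces the locked/lock_flags bookkeeping with a conventional goto ladder: each failure jumps to the label that unwinds exactly what has been acquired so far, and the transaction-cancel path jumps back into the common unlock path. The skeleton of the idiom, reduced to standalone C (resource names are generic, not from the diff):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int
example_ladder(int fail_validate, int fail_mid_trans)
{
	void *tmp;
	int error = 0;

	tmp = malloc(64);		/* like the tempifp allocation */
	if (!tmp) {
		error = ENOMEM;
		goto out;		/* nothing else to unwind yet */
	}

	/* take locks here, like xfs_lock_two_inodes() */
	if (fail_validate) {
		error = EINVAL;
		goto out_unlock;	/* drop locks, then the allocation */
	}

	/* start a transaction here */
	if (fail_mid_trans) {
		error = EIO;
		goto out_trans_cancel;	/* deepest label unwinds the most */
	}
	/* commit */

out_unlock:
	/* drop locks here */
out:
	free(tmp);			/* free(NULL) is a no-op, like kmem_free() */
	return error;

out_trans_cancel:
	/* cancel the transaction here */
	goto out_unlock;
}

int main(void)
{
	printf("%d %d %d\n", example_ladder(0, 0),
	       example_ladder(1, 0), example_ladder(0, 1));
	return 0;
}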
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 162e8726df5e..e5b153b2e6a3 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
103/* 103/*
104 * Inode size for given fs. 104 * Inode size for given fs.
105 */ 105 */
106#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) \
107 ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
108
107#define XFS_BROOT_SIZE_ADJ \ 109#define XFS_BROOT_SIZE_ADJ \
108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) 110 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
109 111
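XFS_LITINO() is now computed on demand from the superblock inode size instead of being cached in m_litino at mount time. The arithmetic as a standalone demo (the dinode header size is mocked; the real value comes from the on-disk struct xfs_dinode layout):

#include <stdio.h>

#define MOCK_DINODE_SIZE 100	/* stand-in for sizeof(struct xfs_dinode) */

/* literal (fork) area = inode size - fixed dinode header */
static int litino(int sb_inodesize)
{
	return sb_inodesize - MOCK_DINODE_SIZE;
}

int main(void)
{
	printf("256-byte inodes: %d bytes of literal area\n", litino(256));
	printf("512-byte inodes: %d bytes of literal area\n", litino(512));
	return 0;
}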
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1afb12278b8d..c657bec6d951 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,8 +46,6 @@
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = {"..", 2};
48 48
49extern const struct xfs_nameops xfs_default_nameops;
50
51/* 49/*
52 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
53 * used in IRIX. 51 * used in IRIX.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e1f0a06aaf04..ab52e9e1c1ee 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
448 xfs_mount_t *mp; /* filesystem mount point */ 448 xfs_mount_t *mp; /* filesystem mount point */
449 char *ptr; /* current data entry */ 449 char *ptr; /* current data entry */
450 int wantoff; /* starting block offset */ 450 int wantoff; /* starting block offset */
451 xfs_ino_t ino;
452 xfs_off_t cook; 451 xfs_off_t cook;
453 452
454 mp = dp->i_mount; 453 mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
509 508
510 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 509 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
511 (char *)dep - (char *)block); 510 (char *)dep - (char *)block);
512 ino = be64_to_cpu(dep->inumber);
513#if XFS_BIG_INUMS
514 ino += mp->m_inoadd;
515#endif
516 511
517 /* 512 /*
518 * If it didn't fit, set the final offset to here & return. 513 * If it didn't fit, set the final offset to here & return.
519 */ 514 */
520 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 515 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
521 ino, DT_UNKNOWN)) { 516 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
522 *offset = cook & 0x7fffffff; 517 *offset = cook & 0x7fffffff;
523 xfs_da_brelse(NULL, bp); 518 xfs_da_brelse(NULL, bp);
524 return 0; 519 return 0;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index b816e0252739..efbc290c7fec 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -38,7 +38,7 @@ struct xfs_trans;
38 38
39/* 39/*
40 * Directory address space divided into sections, 40 * Directory address space divided into sections,
41 * spaces separated by 32gb. 41 * spaces separated by 32GB.
42 */ 42 */
43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) 43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
44#define XFS_DIR2_DATA_SPACE 0 44#define XFS_DIR2_DATA_SPACE 0
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ef805a374eec..fa913e459442 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
549 * Check the internal consistency of a leaf1 block. 549 * Check the internal consistency of a leaf1 block.
550 * Pop an assert if something is wrong. 550 * Pop an assert if something is wrong.
551 */ 551 */
552void 552STATIC void
553xfs_dir2_leaf_check( 553xfs_dir2_leaf_check(
554 xfs_inode_t *dp, /* incore directory inode */ 554 xfs_inode_t *dp, /* incore directory inode */
555 xfs_dabuf_t *bp) /* leaf's buffer */ 555 xfs_dabuf_t *bp) /* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
780 int ra_index; /* *map index for read-ahead */ 780 int ra_index; /* *map index for read-ahead */
781 int ra_offset; /* map entry offset for ra */ 781 int ra_offset; /* map entry offset for ra */
782 int ra_want; /* readahead count wanted */ 782 int ra_want; /* readahead count wanted */
783 xfs_ino_t ino;
784 783
785 /* 784 /*
786 * If the offset is at or past the largest allowed value, 785 * If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
1076 continue; 1075 continue;
1077 } 1076 }
1078 1077
1079 /*
1080 * Copy the entry into the putargs, and try formatting it.
1081 */
1082 dep = (xfs_dir2_data_entry_t *)ptr; 1078 dep = (xfs_dir2_data_entry_t *)ptr;
1083
1084 length = xfs_dir2_data_entsize(dep->namelen); 1079 length = xfs_dir2_data_entsize(dep->namelen);
1085 1080
1086 ino = be64_to_cpu(dep->inumber);
1087#if XFS_BIG_INUMS
1088 ino += mp->m_inoadd;
1089#endif
1090
1091 /*
1092 * Won't fit. Return to caller.
1093 */
1094 if (filldir(dirent, dep->name, dep->namelen, 1081 if (filldir(dirent, dep->name, dep->namelen,
1095 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1082 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1096 ino, DT_UNKNOWN)) 1083 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1097 break; 1084 break;
1098 1085
1099 /* 1086 /*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index fa6c3a5ddbc6..5a81ccd1045b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
1104 } 1104 }
1105 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1106 /* 1106 /*
1107 * Return indication of whether this leaf block is emtpy enough 1107 * Return indication of whether this leaf block is empty enough
1108 * to justify trying to join it with a neighbor. 1108 * to justify trying to join it with a neighbor.
1109 */ 1109 */
1110 *rval = 1110 *rval =
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index a8a8a6efad5b..e89734e84646 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
748 * Put . entry unless we're starting past it. 748 * Put . entry unless we're starting past it.
749 */ 749 */
750 if (*offset <= dot_offset) { 750 if (*offset <= dot_offset) {
751 ino = dp->i_ino; 751 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
752#if XFS_BIG_INUMS
753 ino += mp->m_inoadd;
754#endif
755 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
756 *offset = dot_offset & 0x7fffffff; 752 *offset = dot_offset & 0x7fffffff;
757 return 0; 753 return 0;
758 } 754 }
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
763 */ 759 */
764 if (*offset <= dotdot_offset) { 760 if (*offset <= dotdot_offset) {
765 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); 761 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
766#if XFS_BIG_INUMS
767 ino += mp->m_inoadd;
768#endif
769 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 762 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
770 *offset = dotdot_offset & 0x7fffffff; 763 *offset = dotdot_offset & 0x7fffffff;
771 return 0; 764 return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
786 } 779 }
787 780
788 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 781 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
789#if XFS_BIG_INUMS
790 ino += mp->m_inoadd;
791#endif
792
793 if (filldir(dirent, sfep->name, sfep->namelen, 782 if (filldir(dirent, sfep->name, sfep->namelen,
794 off & 0x7fffffff, ino, DT_UNKNOWN)) { 783 off & 0x7fffffff, ino, DT_UNKNOWN)) {
795 *offset = off & 0x7fffffff; 784 *offset = off & 0x7fffffff;
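All three getdents paths in this diff (block, leaf, and the shortform one above) drop the same #if XFS_BIG_INUMS adjustment, so readdir now reports the raw on-disk inode number. What the deleted lines used to do, reduced to arithmetic (the size and purpose of the bias are assumptions; m_inoadd is gone after this series):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ondisk_ino = 128;
	uint64_t m_inoadd = 1ULL << 32;		/* hypothetical bias */

	/* old behaviour: biased number handed to filldir */
	printf("old: %llu\n", (unsigned long long)(ondisk_ino + m_inoadd));
	/* new behaviour: the raw on-disk number */
	printf("new: %llu\n", (unsigned long long)ondisk_ino);
	return 0;
}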
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2f049f63e85f..0d22c56fdf64 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
33 * conversion routine. 33 * conversion routine.
34 */ 34 */
35 35
36#ifndef HAVE_FORMAT32
37typedef struct xfs_extent_32 { 36typedef struct xfs_extent_32 {
38 __uint64_t ext_start; 37 __uint64_t ext_start;
39 __uint32_t ext_len; 38 __uint32_t ext_len;
40} __attribute__((packed)) xfs_extent_32_t; 39} __attribute__((packed)) xfs_extent_32_t;
41#endif
42 40
43typedef struct xfs_extent_64 { 41typedef struct xfs_extent_64 {
44 __uint64_t ext_start; 42 __uint64_t ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
59 xfs_extent_t efi_extents[1]; /* array of extents to free */ 57 xfs_extent_t efi_extents[1]; /* array of extents to free */
60} xfs_efi_log_format_t; 58} xfs_efi_log_format_t;
61 59
62#ifndef HAVE_FORMAT32
63typedef struct xfs_efi_log_format_32 { 60typedef struct xfs_efi_log_format_32 {
64 __uint16_t efi_type; /* efi log item type */ 61 __uint16_t efi_type; /* efi log item type */
65 __uint16_t efi_size; /* size of this item */ 62 __uint16_t efi_size; /* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
67 __uint64_t efi_id; /* efi identifier */ 64 __uint64_t efi_id; /* efi identifier */
68 xfs_extent_32_t efi_extents[1]; /* array of extents to free */ 65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
69} __attribute__((packed)) xfs_efi_log_format_32_t; 66} __attribute__((packed)) xfs_efi_log_format_32_t;
70#endif
71 67
72typedef struct xfs_efi_log_format_64 { 68typedef struct xfs_efi_log_format_64 {
73 __uint16_t efi_type; /* efi log item type */ 69 __uint16_t efi_type; /* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
90 xfs_extent_t efd_extents[1]; /* array of extents freed */ 86 xfs_extent_t efd_extents[1]; /* array of extents freed */
91} xfs_efd_log_format_t; 87} xfs_efd_log_format_t;
92 88
93#ifndef HAVE_FORMAT32
94typedef struct xfs_efd_log_format_32 { 89typedef struct xfs_efd_log_format_32 {
95 __uint16_t efd_type; /* efd log item type */ 90 __uint16_t efd_type; /* efd log item type */
96 __uint16_t efd_size; /* size of this item */ 91 __uint16_t efd_size; /* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
98 __uint64_t efd_efi_id; /* id of corresponding efi */ 93 __uint64_t efd_efi_id; /* id of corresponding efi */
99 xfs_extent_32_t efd_extents[1]; /* array of extents freed */ 94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
100} __attribute__((packed)) xfs_efd_log_format_32_t; 95} __attribute__((packed)) xfs_efd_log_format_32_t;
101#endif
102 96
103typedef struct xfs_efd_log_format_64 { 97typedef struct xfs_efd_log_format_64 {
104 __uint16_t efd_type; /* efd log item type */ 98 __uint16_t efd_type; /* efd log item type */
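Dropping the #ifndef HAVE_FORMAT32 guards makes the packed 32-bit log-format structs unconditionally visible, so log recovery can always parse items written by a kernel with the other word size. Why the _32 variants are packed, shown standalone (uintNN_t stands in for the kernel's __uintNN_t):

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t ext_start;
	uint32_t ext_len;
} __attribute__((packed)) extent_32_demo;	/* fixed 12-byte layout */

typedef struct {
	uint64_t ext_start;
	uint32_t ext_len;
} extent_unpacked_demo;				/* padded on 64-bit ABIs */

int main(void)
{
	printf("packed:   %zu bytes\n", sizeof(extent_32_demo));	/* 12 */
	printf("unpacked: %zu bytes\n", sizeof(extent_unpacked_demo));	/* usually 16 */
	return 0;
}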
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index f3bb75da384e..6c87c8f304ef 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
140 xfs_extlen_t minlen) 140 xfs_extlen_t minlen)
141{ 141{
142 int err, trylock, nscan; 142 int err, trylock, nscan;
143 xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; 143 xfs_extlen_t longest, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
145 struct xfs_perag *pag; 145 struct xfs_perag *pag;
146 146
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
186 goto next_ag; 186 goto next_ag;
187 } 187 }
188 188
189 need = XFS_MIN_FREELIST_PAG(pag, mp); 189 longest = xfs_alloc_longest_free_extent(mp, pag);
190 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
191 longest = (pag->pagf_longest > delta) ?
192 (pag->pagf_longest - delta) :
193 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
194
195 if (((minlen && longest >= minlen) || 190 if (((minlen && longest >= minlen) ||
196 (!minlen && pag->pagf_freeblks >= minfree)) && 191 (!minlen && pag->pagf_freeblks >= minfree)) &&
197 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || 192 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 680d0e0ec932..8379e3bca26c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -576,7 +576,7 @@ out:
576 if (fdblks_delta) { 576 if (fdblks_delta) {
577 /* 577 /*
578 * If we are putting blocks back here, m_resblks_avail is 578 * If we are putting blocks back here, m_resblks_avail is
579 * already at it's max so this will put it in the free pool. 579 * already at its max so this will put it in the free pool.
580 * 580 *
581 * If we need space, we'll either succeed in getting it 581 * If we need space, we'll either succeed in getting it
582 * from the free block count or we'll get an enospc. If 582 * from the free block count or we'll get an enospc. If
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab016e5ae7be..3120a3a5e20f 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; 230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
231 231
232 /* Allow space for the inode btree to split. */ 232 /* Allow space for the inode btree to split. */
233 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 233 args.minleft = args.mp->m_in_maxlevels - 1;
234 if ((error = xfs_alloc_vextent(&args))) 234 if ((error = xfs_alloc_vextent(&args)))
235 return error; 235 return error;
236 } else 236 } else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
270 /* 270 /*
271 * Allow space for the inode btree to split. 271 * Allow space for the inode btree to split.
272 */ 272 */
273 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 273 args.minleft = args.mp->m_in_maxlevels - 1;
274 if ((error = xfs_alloc_vextent(&args))) 274 if ((error = xfs_alloc_vextent(&args)))
275 return error; 275 return error;
276 } 276 }
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
349 * Initialize all inodes in this buffer and then log them. 349 * Initialize all inodes in this buffer and then log them.
350 * 350 *
351 * XXX: It would be much better if we had just one transaction to 351 * XXX: It would be much better if we had just one transaction to
352 * log a whole cluster of inodes instead of all the indivdual 352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic. 353 * transactions causing a lot of log traffic.
354 */ 354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
944 XFS_INODES_PER_CHUNK) == 0); 944 XFS_INODES_PER_CHUNK) == 0);
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 XFS_INOBT_CLR_FREE(&rec, offset); 946 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 947 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
949 rec.ir_free))) 949 rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
1105 */ 1105 */
1106 off = agino - rec.ir_startino; 1106 off = agino - rec.ir_startino;
1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); 1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1108 ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); 1108 ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1109 /* 1109 /*
1110 * Mark the inode free & increment the count. 1110 * Mark the inode free & increment the count.
1111 */ 1111 */
1112 XFS_INOBT_SET_FREE(&rec, off); 1112 rec.ir_free |= XFS_INOBT_MASK(off);
1113 rec.ir_freecount++; 1113 rec.ir_freecount++;
1114 1114
1115 /* 1115 /*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 99f2408e8d8e..c282a9af5393 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
164} 164}
165 165
166/* 166/*
167 * intial value of ptr for lookup 167 * initial value of ptr for lookup
168 */ 168 */
169STATIC void 169STATIC void
170xfs_inobt_init_ptr_from_cur( 170xfs_inobt_init_ptr_from_cur(
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 5580e255ff06..f782ad0c4769 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -32,14 +32,14 @@ struct xfs_mount;
32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ 32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
33 33
34typedef __uint64_t xfs_inofree_t; 34typedef __uint64_t xfs_inofree_t;
35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) 35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) 36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
38#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
38 39
39static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 40static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
40{ 41{
41 return (((n) >= XFS_INODES_PER_CHUNK ? \ 42 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
42 (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
43} 43}
44 44
45/* 45/*
@@ -69,20 +69,6 @@ typedef struct xfs_inobt_key {
69typedef __be32 xfs_inobt_ptr_t; 69typedef __be32 xfs_inobt_ptr_t;
70 70
71/* 71/*
72 * Bit manipulations for ir_free.
73 */
74#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
75#define XFS_INOBT_IS_FREE(rp,i) \
76 (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
77#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
78#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
79
80/*
81 * Maximum number of inode btree levels.
82 */
83#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
84
85/*
86 * block numbers in the AG. 72 * block numbers in the AG.
87 */ 73 */
88#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) 74#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
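The rewritten xfs_inobt_maskn() builds a run of n set bits starting at bit i. The ternary guards the n == XFS_INODES_PER_CHUNK case: shifting a 64-bit value by 64 is undefined, so the helper lets 0 - 1 wrap to the all-ones mask instead. A self-contained check of both paths (values chosen for illustration; i and n are assumed to lie in [0, 64]):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t xfs_inofree_t;
#define NBBY			8
#define XFS_INODES_PER_CHUNK	(NBBY * sizeof(xfs_inofree_t))	/* 64 */
#define XFS_INOBT_MASK(i)	((xfs_inofree_t)1 << (i))

static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
	/* n == 64: 0 - 1 wraps to all ones; otherwise (1 << n) - 1 */
	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
}

int main(void)
{
	/* 4 bits starting at bit 2: 0b111100 */
	printf("%llx\n", (unsigned long long)xfs_inobt_maskn(2, 4));	/* 3c */
	/* full-width run: every bit set */
	printf("%llx\n", (unsigned long long)xfs_inobt_maskn(0, 64));	/* ffffffffffffffff */
	return 0;
}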
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1f175fa34b22..f879c1bc4b96 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
122 122
123/* 123/*
124 * NOTE: This structure must be kept identical to struct xfs_dinode 124 * NOTE: This structure must be kept identical to struct xfs_dinode
125 * in xfs_dinode.h except for the endianess annotations. 125 * in xfs_dinode.h except for the endianness annotations.
126 */ 126 */
127typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
128 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 128 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9957d0602d54..a52ac125f055 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
40 __int32_t ilf_boffset; /* off of inode in buffer */ 40 __int32_t ilf_boffset; /* off of inode in buffer */
41} xfs_inode_log_format_t; 41} xfs_inode_log_format_t;
42 42
43#ifndef HAVE_FORMAT32
44typedef struct xfs_inode_log_format_32 { 43typedef struct xfs_inode_log_format_32 {
45 __uint16_t ilf_type; /* inode log item type */ 44 __uint16_t ilf_type; /* inode log item type */
46 __uint16_t ilf_size; /* size of this item */ 45 __uint16_t ilf_size; /* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
56 __int32_t ilf_len; /* len of inode buffer */ 55 __int32_t ilf_len; /* len of inode buffer */
57 __int32_t ilf_boffset; /* off of inode in buffer */ 56 __int32_t ilf_boffset; /* off of inode in buffer */
58} __attribute__((packed)) xfs_inode_log_format_32_t; 57} __attribute__((packed)) xfs_inode_log_format_32_t;
59#endif
60 58
61typedef struct xfs_inode_log_format_64 { 59typedef struct xfs_inode_log_format_64 {
62 __uint16_t ilf_type; /* inode log item type */ 60 __uint16_t ilf_type; /* inode log item type */
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c134cc2..a1cc1322fc0f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -63,7 +63,7 @@ typedef enum {
63 */ 63 */
64 64
65typedef struct xfs_iomap { 65typedef struct xfs_iomap {
66 xfs_daddr_t iomap_bn; /* first 512b blk of mapping */ 66 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
67 xfs_buftarg_t *iomap_target; 67 xfs_buftarg_t *iomap_target;
68 xfs_off_t iomap_offset; /* offset of mapping, bytes */ 68 xfs_off_t iomap_offset; /* offset of mapping, bytes */
69 xfs_off_t iomap_bsize; /* size of mapping, bytes */ 69 xfs_off_t iomap_bsize; /* size of mapping, bytes */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf98a805ec90..aeb2d2221c7d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
83 buf->bs_uid = dic->di_uid; 83 buf->bs_uid = dic->di_uid;
84 buf->bs_gid = dic->di_gid; 84 buf->bs_gid = dic->di_gid;
85 buf->bs_size = dic->di_size; 85 buf->bs_size = dic->di_size;
86 vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); 86 /*
87 * We are reading the atime from the Linux inode because the
88 * dinode might not be uptodate.
89 */
90 buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
91 buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
87 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 92 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
88 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 93 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
89 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 94 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
579 * first inode of the cluster. 584 * first inode of the cluster.
580 * 585 *
581 * Careful with clustidx. There can be 586 * Careful with clustidx. There can be
582 * multple clusters per chunk, a single 587 * multiple clusters per chunk, a single
583 * cluster per chunk or a cluster that has 588 * cluster per chunk or a cluster that has
584 * inodes represented from several different 589 * inodes represented from several different
585 * chunks (if blocksize is large). 590 * chunks (if blocksize is large).
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f4726f702a9e..f76c6d7cea21 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -574,7 +574,7 @@ xfs_log_mount(
574 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
575 if (error) { 575 if (error) {
576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
577 goto error; 577 goto out_free_log;
578 } 578 }
579 mp->m_log->l_ailp = mp->m_ail; 579 mp->m_log->l_ailp = mp->m_ail;
580 580
@@ -594,20 +594,22 @@ xfs_log_mount(
594 mp->m_flags |= XFS_MOUNT_RDONLY; 594 mp->m_flags |= XFS_MOUNT_RDONLY;
595 if (error) { 595 if (error) {
596 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 596 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
597 goto error; 597 goto out_destroy_ail;
598 } 598 }
599 } 599 }
600 600
601 /* Normal transactions can now occur */ 601 /* Normal transactions can now occur */
602 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 602 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
603 603
604 /* End mounting message in xfs_log_mount_finish */
605 return 0; 604 return 0;
606error: 605
607 xfs_log_unmount_dealloc(mp); 606out_destroy_ail:
607 xfs_trans_ail_destroy(mp);
608out_free_log:
609 xlog_dealloc_log(mp->m_log);
608out: 610out:
609 return error; 611 return error;
610} /* xfs_log_mount */ 612}
611 613
612/* 614/*
613 * Finish the recovery of the file system. This is separate from 615 * Finish the recovery of the file system. This is separate from
@@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
633} 635}
634 636
635/* 637/*
636 * Unmount processing for the log.
637 */
638int
639xfs_log_unmount(xfs_mount_t *mp)
640{
641 int error;
642
643 error = xfs_log_unmount_write(mp);
644 xfs_log_unmount_dealloc(mp);
645 return error;
646}
647
648/*
649 * Final log writes as part of unmount. 638 * Final log writes as part of unmount.
650 * 639 *
651 * Mark the filesystem clean as unmount happens. Note that during relocation 640 * Mark the filesystem clean as unmount happens. Note that during relocation
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
795 * and deallocate the log as the aild references the log. 784 * and deallocate the log as the aild references the log.
796 */ 785 */
797void 786void
798xfs_log_unmount_dealloc(xfs_mount_t *mp) 787xfs_log_unmount(xfs_mount_t *mp)
799{ 788{
800 xfs_trans_ail_destroy(mp); 789 xfs_trans_ail_destroy(mp);
801 xlog_dealloc_log(mp->m_log); 790 xlog_dealloc_log(mp->m_log);
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
1109/* 1098/*
1110 * Return size of each in-core log record buffer. 1099 * Return size of each in-core log record buffer.
1111 * 1100 *
1112 * All machines get 8 x 32KB buffers by default, unless tuned otherwise. 1101 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1113 * 1102 *
1114 * If the filesystem blocksize is too large, we may need to choose a 1103 * If the filesystem blocksize is too large, we may need to choose a
1115 * larger size since the directory code currently logs entire blocks. 1104 * larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1139 } 1128 }
1140 1129
1141 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1130 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1142 /* # headers = size / 32K 1131 /* # headers = size / 32k
1143 * one header holds cycles from 32K of data 1132 * one header holds cycles from 32k of data
1144 */ 1133 */
1145 1134
1146 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; 1135 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1156 goto done; 1145 goto done;
1157 } 1146 }
1158 1147
1159 /* All machines use 32KB buffers by default. */ 1148 /* All machines use 32kB buffers by default. */
1160 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; 1149 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1161 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; 1150 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1162 1151
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1164 log->l_iclog_hsize = BBSIZE; 1153 log->l_iclog_hsize = BBSIZE;
1165 log->l_iclog_heads = 1; 1154 log->l_iclog_heads = 1;
1166 1155
1167 /* 1156done:
1168 * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use 1157 /* are we being asked to make the sizes selected above visible? */
1169 * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
1170 */
1171 if (mp->m_sb.sb_blocksize >= 16*1024) {
1172 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1173 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1174 if (mp->m_logbufs <= 0) {
1175 switch (mp->m_sb.sb_blocksize) {
1176 case 16*1024: /* 16 KB */
1177 log->l_iclog_bufs = 3;
1178 break;
1179 case 32*1024: /* 32 KB */
1180 log->l_iclog_bufs = 4;
1181 break;
1182 case 64*1024: /* 64 KB */
1183 log->l_iclog_bufs = 8;
1184 break;
1185 default:
1186 xlog_panic("XFS: Invalid blocksize");
1187 break;
1188 }
1189 }
1190 }
1191
1192done: /* are we being asked to make the sizes selected above visible? */
1193 if (mp->m_logbufs == 0) 1158 if (mp->m_logbufs == 0)
1194 mp->m_logbufs = log->l_iclog_bufs; 1159 mp->m_logbufs = log->l_iclog_bufs;
1195 if (mp->m_logbsize == 0) 1160 if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3214 */ 3179 */
3215 3180
3216/* 3181/*
3217 * Free a used ticket when it's refcount falls to zero. 3182 * Free a used ticket when its refcount falls to zero.
3218 */ 3183 */
3219void 3184void
3220xfs_log_ticket_put( 3185xfs_log_ticket_put(
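The xfs_log_mount() hunks above swap a catch-all error label for the standard kernel unwind idiom: one label per completed setup step, falling through in reverse order of initialisation, so each failure site names only the most recent step to undo. A schematic of the pattern with hypothetical step names (the real labels unwind xfs_trans_ail_init() and the log allocation):

#include <stdio.h>

/* hypothetical setup/teardown pairs standing in for the patch's
 * xfs_trans_ail_destroy()/xlog_dealloc_log() cleanups */
static int  step_a_init(void)    { return 0; }
static void step_a_destroy(void) { puts("undo a"); }
static int  step_b_init(void)    { return 0; }
static void step_b_destroy(void) { puts("undo b"); }
static int  step_c_init(void)    { return -1; }	/* simulate a failure */

static int mount_like(void)
{
	int error;

	error = step_a_init();
	if (error)
		goto out;		/* nothing to unwind yet */
	error = step_b_init();
	if (error)
		goto out_destroy_a;	/* unwind step a only */
	error = step_c_init();
	if (error)
		goto out_destroy_b;	/* unwind b, then a, in reverse order */
	return 0;

out_destroy_b:
	step_b_destroy();
out_destroy_a:
	step_a_destroy();
out:
	return error;
}

int main(void) { return mount_like() ? 1 : 0; }

The fall-through is the point: jumping to out_destroy_b also runs step_a_destroy(), which is exactly the property the old single error label (unwinding everything regardless of progress) did not have.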
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 8a3e84e900a3..d0c9baa50b1a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp,
170 int nentries, 170 int nentries,
171 xfs_log_ticket_t ticket, 171 xfs_log_ticket_t ticket,
172 xfs_lsn_t *start_lsn); 172 xfs_lsn_t *start_lsn);
173int xfs_log_unmount(struct xfs_mount *mp);
174int xfs_log_unmount_write(struct xfs_mount *mp); 173int xfs_log_unmount_write(struct xfs_mount *mp);
175void xfs_log_unmount_dealloc(struct xfs_mount *mp); 174void xfs_log_unmount(struct xfs_mount *mp);
176int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 175int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
177int xfs_log_need_covered(struct xfs_mount *mp); 176int xfs_log_need_covered(struct xfs_mount *mp);
178 177
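This header change reflects an API split rather than a plain rename: writing the unmount record and freeing the log structures used to happen inside one xfs_log_unmount() call, and are now sequenced explicitly by the caller (the xfs_unmountfs() hunk further down shows the real sequence). A sketch of the resulting call order, assuming the usual mount pointer:

/* sketch: unmount sequencing after the split */
static void unmount_log_sketch(struct xfs_mount *mp)
{
	xfs_log_unmount_write(mp);	/* write the final "clean" record */
	xfs_log_unmount(mp);		/* then tear down AIL and iclogs
					 * (the renamed _dealloc variant) */
}

One plausible motivation for the split: a caller tearing down after a failure can skip the record write and still reclaim the log.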
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 654167be0efb..bcad5f4c1fd1 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
359 int ic_size; 359 int ic_size;
360 int ic_offset; 360 int ic_offset;
361 int ic_bwritecnt; 361 int ic_bwritecnt;
362 ushort_t ic_state; 362 unsigned short ic_state;
363 char *ic_datap; /* pointer to iclog data */ 363 char *ic_datap; /* pointer to iclog data */
364#ifdef XFS_LOG_TRACE 364#ifdef XFS_LOG_TRACE
365 struct ktrace *ic_trace; 365 struct ktrace *ic_trace;
@@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log);
455 455
456extern struct xfs_buf *xlog_get_bp(xlog_t *, int); 456extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
457extern void xlog_put_bp(struct xfs_buf *); 457extern void xlog_put_bp(struct xfs_buf *);
458extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
459 458
460extern kmem_zone_t *xfs_log_ticket_zone; 459extern kmem_zone_t *xfs_log_ticket_zone;
461 460
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 61af610d79b3..7ba450116d4f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -94,12 +94,30 @@ xlog_put_bp(
94 xfs_buf_free(bp); 94 xfs_buf_free(bp);
95} 95}
96 96
97STATIC xfs_caddr_t
98xlog_align(
99 xlog_t *log,
100 xfs_daddr_t blk_no,
101 int nbblks,
102 xfs_buf_t *bp)
103{
104 xfs_caddr_t ptr;
105
106 if (!log->l_sectbb_log)
107 return XFS_BUF_PTR(bp);
108
109 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
110 ASSERT(XFS_BUF_SIZE(bp) >=
111 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
112 return ptr;
113}
114
97 115
98/* 116/*
99 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 117 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
100 */ 118 */
101int 119STATIC int
102xlog_bread( 120xlog_bread_noalign(
103 xlog_t *log, 121 xlog_t *log,
104 xfs_daddr_t blk_no, 122 xfs_daddr_t blk_no,
105 int nbblks, 123 int nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
137 return error; 155 return error;
138} 156}
139 157
158STATIC int
159xlog_bread(
160 xlog_t *log,
161 xfs_daddr_t blk_no,
162 int nbblks,
163 xfs_buf_t *bp,
164 xfs_caddr_t *offset)
165{
166 int error;
167
168 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
169 if (error)
170 return error;
171
172 *offset = xlog_align(log, blk_no, nbblks, bp);
173 return 0;
174}
175
140/* 176/*
141 * Write out the buffer at the given block for the given number of blocks. 177 * Write out the buffer at the given block for the given number of blocks.
142 * The buffer is kept locked across the write and is returned locked. 178 * The buffer is kept locked across the write and is returned locked.
@@ -180,24 +216,6 @@ xlog_bwrite(
180 return error; 216 return error;
181} 217}
182 218
183STATIC xfs_caddr_t
184xlog_align(
185 xlog_t *log,
186 xfs_daddr_t blk_no,
187 int nbblks,
188 xfs_buf_t *bp)
189{
190 xfs_caddr_t ptr;
191
192 if (!log->l_sectbb_log)
193 return XFS_BUF_PTR(bp);
194
195 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
196 ASSERT(XFS_BUF_SIZE(bp) >=
197 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
198 return ptr;
199}
200
201#ifdef DEBUG 219#ifdef DEBUG
202/* 220/*
203 * dump debug superblock and log record information 221 * dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
211 229
212 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 230 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
213 for (b = 0; b < 16; b++) 231 for (b = 0; b < 16; b++)
214 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 232 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
215 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 233 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
216 cmn_err(CE_DEBUG, " log : uuid = "); 234 cmn_err(CE_DEBUG, " log : uuid = ");
217 for (b = 0; b < 16; b++) 235 for (b = 0; b < 16; b++)
218 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 236 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
219 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 237 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
220} 238}
221#else 239#else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
321 339
322 mid_blk = BLK_AVG(first_blk, *last_blk); 340 mid_blk = BLK_AVG(first_blk, *last_blk);
323 while (mid_blk != first_blk && mid_blk != *last_blk) { 341 while (mid_blk != first_blk && mid_blk != *last_blk) {
324 if ((error = xlog_bread(log, mid_blk, 1, bp))) 342 error = xlog_bread(log, mid_blk, 1, bp, &offset);
343 if (error)
325 return error; 344 return error;
326 offset = xlog_align(log, mid_blk, 1, bp);
327 mid_cycle = xlog_get_cycle(offset); 345 mid_cycle = xlog_get_cycle(offset);
328 if (mid_cycle == cycle) { 346 if (mid_cycle == cycle) {
329 *last_blk = mid_blk; 347 *last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
379 397
380 bcount = min(bufblks, (start_blk + nbblks - i)); 398 bcount = min(bufblks, (start_blk + nbblks - i));
381 399
382 if ((error = xlog_bread(log, i, bcount, bp))) 400 error = xlog_bread(log, i, bcount, bp, &buf);
401 if (error)
383 goto out; 402 goto out;
384 403
385 buf = xlog_align(log, i, bcount, bp);
386 for (j = 0; j < bcount; j++) { 404 for (j = 0; j < bcount; j++) {
387 cycle = xlog_get_cycle(buf); 405 cycle = xlog_get_cycle(buf);
388 if (cycle == stop_on_cycle_no) { 406 if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
436 return ENOMEM; 454 return ENOMEM;
437 smallmem = 1; 455 smallmem = 1;
438 } else { 456 } else {
439 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 457 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
458 if (error)
440 goto out; 459 goto out;
441 offset = xlog_align(log, start_blk, num_blks, bp);
442 offset += ((num_blks - 1) << BBSHIFT); 460 offset += ((num_blks - 1) << BBSHIFT);
443 } 461 }
444 462
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
453 } 471 }
454 472
455 if (smallmem) { 473 if (smallmem) {
456 if ((error = xlog_bread(log, i, 1, bp))) 474 error = xlog_bread(log, i, 1, bp, &offset);
475 if (error)
457 goto out; 476 goto out;
458 offset = xlog_align(log, i, 1, bp);
459 } 477 }
460 478
461 head = (xlog_rec_header_t *)offset; 479 head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
559 bp = xlog_get_bp(log, 1); 577 bp = xlog_get_bp(log, 1);
560 if (!bp) 578 if (!bp)
561 return ENOMEM; 579 return ENOMEM;
562 if ((error = xlog_bread(log, 0, 1, bp))) 580
581 error = xlog_bread(log, 0, 1, bp, &offset);
582 if (error)
563 goto bp_err; 583 goto bp_err;
564 offset = xlog_align(log, 0, 1, bp); 584
565 first_half_cycle = xlog_get_cycle(offset); 585 first_half_cycle = xlog_get_cycle(offset);
566 586
567 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 587 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
568 if ((error = xlog_bread(log, last_blk, 1, bp))) 588 error = xlog_bread(log, last_blk, 1, bp, &offset);
589 if (error)
569 goto bp_err; 590 goto bp_err;
570 offset = xlog_align(log, last_blk, 1, bp); 591
571 last_half_cycle = xlog_get_cycle(offset); 592 last_half_cycle = xlog_get_cycle(offset);
572 ASSERT(last_half_cycle != 0); 593 ASSERT(last_half_cycle != 0);
573 594
@@ -817,9 +838,10 @@ xlog_find_tail(
817 if (!bp) 838 if (!bp)
818 return ENOMEM; 839 return ENOMEM;
819 if (*head_blk == 0) { /* special case */ 840 if (*head_blk == 0) { /* special case */
820 if ((error = xlog_bread(log, 0, 1, bp))) 841 error = xlog_bread(log, 0, 1, bp, &offset);
842 if (error)
821 goto bread_err; 843 goto bread_err;
822 offset = xlog_align(log, 0, 1, bp); 844
823 if (xlog_get_cycle(offset) == 0) { 845 if (xlog_get_cycle(offset) == 0) {
824 *tail_blk = 0; 846 *tail_blk = 0;
825 /* leave all other log inited values alone */ 847 /* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
832 */ 854 */
833 ASSERT(*head_blk < INT_MAX); 855 ASSERT(*head_blk < INT_MAX);
834 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 856 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
835 if ((error = xlog_bread(log, i, 1, bp))) 857 error = xlog_bread(log, i, 1, bp, &offset);
858 if (error)
836 goto bread_err; 859 goto bread_err;
837 offset = xlog_align(log, i, 1, bp); 860
838 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 861 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
839 found = 1; 862 found = 1;
840 break; 863 break;
@@ -848,9 +871,10 @@ xlog_find_tail(
848 */ 871 */
849 if (!found) { 872 if (!found) {
850 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 873 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
851 if ((error = xlog_bread(log, i, 1, bp))) 874 error = xlog_bread(log, i, 1, bp, &offset);
875 if (error)
852 goto bread_err; 876 goto bread_err;
853 offset = xlog_align(log, i, 1, bp); 877
854 if (XLOG_HEADER_MAGIC_NUM == 878 if (XLOG_HEADER_MAGIC_NUM ==
855 be32_to_cpu(*(__be32 *)offset)) { 879 be32_to_cpu(*(__be32 *)offset)) {
856 found = 2; 880 found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
922 if (*head_blk == after_umount_blk && 946 if (*head_blk == after_umount_blk &&
923 be32_to_cpu(rhead->h_num_logops) == 1) { 947 be32_to_cpu(rhead->h_num_logops) == 1) {
924 umount_data_blk = (i + hblks) % log->l_logBBsize; 948 umount_data_blk = (i + hblks) % log->l_logBBsize;
925 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 949 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
950 if (error)
926 goto bread_err; 951 goto bread_err;
927 } 952
928 offset = xlog_align(log, umount_data_blk, 1, bp);
929 op_head = (xlog_op_header_t *)offset; 953 op_head = (xlog_op_header_t *)offset;
930 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 954 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
931 /* 955 /*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
1017 bp = xlog_get_bp(log, 1); 1041 bp = xlog_get_bp(log, 1);
1018 if (!bp) 1042 if (!bp)
1019 return ENOMEM; 1043 return ENOMEM;
1020 if ((error = xlog_bread(log, 0, 1, bp))) 1044 error = xlog_bread(log, 0, 1, bp, &offset);
1045 if (error)
1021 goto bp_err; 1046 goto bp_err;
1022 offset = xlog_align(log, 0, 1, bp); 1047
1023 first_cycle = xlog_get_cycle(offset); 1048 first_cycle = xlog_get_cycle(offset);
1024 if (first_cycle == 0) { /* completely zeroed log */ 1049 if (first_cycle == 0) { /* completely zeroed log */
1025 *blk_no = 0; 1050 *blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
1028 } 1053 }
1029 1054
1030 /* check partially zeroed log */ 1055 /* check partially zeroed log */
1031 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1056 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1057 if (error)
1032 goto bp_err; 1058 goto bp_err;
1033 offset = xlog_align(log, log_bbnum-1, 1, bp); 1059
1034 last_cycle = xlog_get_cycle(offset); 1060 last_cycle = xlog_get_cycle(offset);
1035 if (last_cycle != 0) { /* log completely written to */ 1061 if (last_cycle != 0) { /* log completely written to */
1036 xlog_put_bp(bp); 1062 xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
1152 */ 1178 */
1153 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1179 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1154 if (balign != start_block) { 1180 if (balign != start_block) {
1155 if ((error = xlog_bread(log, start_block, 1, bp))) { 1181 error = xlog_bread_noalign(log, start_block, 1, bp);
1156 xlog_put_bp(bp); 1182 if (error)
1157 return error; 1183 goto out_put_bp;
1158 } 1184
1159 j = start_block - balign; 1185 j = start_block - balign;
1160 } 1186 }
1161 1187
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
1175 balign = BBTOB(ealign - start_block); 1201 balign = BBTOB(ealign - start_block);
1176 error = XFS_BUF_SET_PTR(bp, offset + balign, 1202 error = XFS_BUF_SET_PTR(bp, offset + balign,
1177 BBTOB(sectbb)); 1203 BBTOB(sectbb));
1178 if (!error) 1204 if (error)
1179 error = xlog_bread(log, ealign, sectbb, bp); 1205 break;
1180 if (!error) 1206
1181 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1207 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1208 if (error)
1209 break;
1210
1211 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1182 if (error) 1212 if (error)
1183 break; 1213 break;
1184 } 1214 }
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
1195 start_block += endcount; 1225 start_block += endcount;
1196 j = 0; 1226 j = 0;
1197 } 1227 }
1228
1229 out_put_bp:
1198 xlog_put_bp(bp); 1230 xlog_put_bp(bp);
1199 return error; 1231 return error;
1200} 1232}
@@ -2511,16 +2543,10 @@ xlog_recover_do_inode_trans(
2511 } 2543 }
2512 2544
2513write_inode_buffer: 2545write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2546 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2515 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2547 bp->b_mount = mp;
2516 bp->b_mount = mp; 2548 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2517 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2549 xfs_bdwrite(mp, bp);
2518 xfs_bdwrite(mp, bp);
2519 } else {
2520 XFS_BUF_STALE(bp);
2521 error = xfs_bwrite(mp, bp);
2522 }
2523
2524error: 2550error:
2525 if (need_free) 2551 if (need_free)
2526 kmem_free(in_f); 2552 kmem_free(in_f);
@@ -2769,51 +2795,48 @@ xlog_recover_do_trans(
2769 int error = 0; 2795 int error = 0;
2770 xlog_recover_item_t *item, *first_item; 2796 xlog_recover_item_t *item, *first_item;
2771 2797
2772 if ((error = xlog_recover_reorder_trans(trans))) 2798 error = xlog_recover_reorder_trans(trans);
2799 if (error)
2773 return error; 2800 return error;
2801
2774 first_item = item = trans->r_itemq; 2802 first_item = item = trans->r_itemq;
2775 do { 2803 do {
2776 /* 2804 switch (ITEM_TYPE(item)) {
2777 * we don't need to worry about the block number being 2805 case XFS_LI_BUF:
2778 * truncated in > 1 TB buffers because in user-land, 2806 error = xlog_recover_do_buffer_trans(log, item, pass);
2779 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so 2807 break;
2780 * the blknos will get through the user-mode buffer 2808 case XFS_LI_INODE:
2781 * cache properly. The only bad case is o32 kernels 2809 error = xlog_recover_do_inode_trans(log, item, pass);
2782 * where xfs_daddr_t is 32-bits but mount will warn us 2810 break;
2783 * off a > 1 TB filesystem before we get here. 2811 case XFS_LI_EFI:
2784 */ 2812 error = xlog_recover_do_efi_trans(log, item,
2785 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2813 trans->r_lsn, pass);
2786 if ((error = xlog_recover_do_buffer_trans(log, item, 2814 break;
2787 pass))) 2815 case XFS_LI_EFD:
2788 break;
2789 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2790 if ((error = xlog_recover_do_inode_trans(log, item,
2791 pass)))
2792 break;
2793 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2794 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2795 pass)))
2796 break;
2797 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2798 xlog_recover_do_efd_trans(log, item, pass); 2816 xlog_recover_do_efd_trans(log, item, pass);
2799 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2817 error = 0;
2800 if ((error = xlog_recover_do_dquot_trans(log, item, 2818 break;
2801 pass))) 2819 case XFS_LI_DQUOT:
2802 break; 2820 error = xlog_recover_do_dquot_trans(log, item, pass);
2803 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2821 break;
2804 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2822 case XFS_LI_QUOTAOFF:
2805 pass))) 2823 error = xlog_recover_do_quotaoff_trans(log, item,
2806 break; 2824 pass);
2807 } else { 2825 break;
2808 xlog_warn("XFS: xlog_recover_do_trans"); 2826 default:
2827 xlog_warn(
2828 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2809 ASSERT(0); 2829 ASSERT(0);
2810 error = XFS_ERROR(EIO); 2830 error = XFS_ERROR(EIO);
2811 break; 2831 break;
2812 } 2832 }
2833
2834 if (error)
2835 return error;
2813 item = item->ri_next; 2836 item = item->ri_next;
2814 } while (first_item != item); 2837 } while (first_item != item);
2815 2838
2816 return error; 2839 return 0;
2817} 2840}
2818 2841
2819/* 2842/*
@@ -3490,9 +3513,11 @@ xlog_do_recovery_pass(
3490 hbp = xlog_get_bp(log, 1); 3513 hbp = xlog_get_bp(log, 1);
3491 if (!hbp) 3514 if (!hbp)
3492 return ENOMEM; 3515 return ENOMEM;
3493 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3516
3517 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3518 if (error)
3494 goto bread_err1; 3519 goto bread_err1;
3495 offset = xlog_align(log, tail_blk, 1, hbp); 3520
3496 rhead = (xlog_rec_header_t *)offset; 3521 rhead = (xlog_rec_header_t *)offset;
3497 error = xlog_valid_rec_header(log, rhead, tail_blk); 3522 error = xlog_valid_rec_header(log, rhead, tail_blk);
3498 if (error) 3523 if (error)
@@ -3526,9 +3551,10 @@ xlog_do_recovery_pass(
3526 memset(rhash, 0, sizeof(rhash)); 3551 memset(rhash, 0, sizeof(rhash));
3527 if (tail_blk <= head_blk) { 3552 if (tail_blk <= head_blk) {
3528 for (blk_no = tail_blk; blk_no < head_blk; ) { 3553 for (blk_no = tail_blk; blk_no < head_blk; ) {
3529 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3554 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3555 if (error)
3530 goto bread_err2; 3556 goto bread_err2;
3531 offset = xlog_align(log, blk_no, hblks, hbp); 3557
3532 rhead = (xlog_rec_header_t *)offset; 3558 rhead = (xlog_rec_header_t *)offset;
3533 error = xlog_valid_rec_header(log, rhead, blk_no); 3559 error = xlog_valid_rec_header(log, rhead, blk_no);
3534 if (error) 3560 if (error)
@@ -3536,10 +3562,11 @@ xlog_do_recovery_pass(
3536 3562
3537 /* blocks in data section */ 3563 /* blocks in data section */
3538 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3564 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3539 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3565 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3566 &offset);
3540 if (error) 3567 if (error)
3541 goto bread_err2; 3568 goto bread_err2;
3542 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3569
3543 xlog_unpack_data(rhead, offset, log); 3570 xlog_unpack_data(rhead, offset, log);
3544 if ((error = xlog_recover_process_data(log, 3571 if ((error = xlog_recover_process_data(log,
3545 rhash, rhead, offset, pass))) 3572 rhash, rhead, offset, pass)))
@@ -3562,10 +3589,10 @@ xlog_do_recovery_pass(
3562 wrapped_hblks = 0; 3589 wrapped_hblks = 0;
3563 if (blk_no + hblks <= log->l_logBBsize) { 3590 if (blk_no + hblks <= log->l_logBBsize) {
3564 /* Read header in one read */ 3591 /* Read header in one read */
3565 error = xlog_bread(log, blk_no, hblks, hbp); 3592 error = xlog_bread(log, blk_no, hblks, hbp,
3593 &offset);
3566 if (error) 3594 if (error)
3567 goto bread_err2; 3595 goto bread_err2;
3568 offset = xlog_align(log, blk_no, hblks, hbp);
3569 } else { 3596 } else {
3570 /* This LR is split across physical log end */ 3597 /* This LR is split across physical log end */
3571 if (blk_no != log->l_logBBsize) { 3598 if (blk_no != log->l_logBBsize) {
@@ -3573,12 +3600,13 @@ xlog_do_recovery_pass(
3573 ASSERT(blk_no <= INT_MAX); 3600 ASSERT(blk_no <= INT_MAX);
3574 split_hblks = log->l_logBBsize - (int)blk_no; 3601 split_hblks = log->l_logBBsize - (int)blk_no;
3575 ASSERT(split_hblks > 0); 3602 ASSERT(split_hblks > 0);
3576 if ((error = xlog_bread(log, blk_no, 3603 error = xlog_bread(log, blk_no,
3577 split_hblks, hbp))) 3604 split_hblks, hbp,
3605 &offset);
3606 if (error)
3578 goto bread_err2; 3607 goto bread_err2;
3579 offset = xlog_align(log, blk_no,
3580 split_hblks, hbp);
3581 } 3608 }
3609
3582 /* 3610 /*
3583 * Note: this black magic still works with 3611 * Note: this black magic still works with
3584 * large sector sizes (non-512) only because: 3612 * large sector sizes (non-512) only because:
@@ -3596,14 +3624,19 @@ xlog_do_recovery_pass(
3596 error = XFS_BUF_SET_PTR(hbp, 3624 error = XFS_BUF_SET_PTR(hbp,
3597 bufaddr + BBTOB(split_hblks), 3625 bufaddr + BBTOB(split_hblks),
3598 BBTOB(hblks - split_hblks)); 3626 BBTOB(hblks - split_hblks));
3599 if (!error) 3627 if (error)
3600 error = xlog_bread(log, 0, 3628 goto bread_err2;
3601 wrapped_hblks, hbp); 3629
3602 if (!error) 3630 error = xlog_bread_noalign(log, 0,
3603 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3631 wrapped_hblks, hbp);
3632 if (error)
3633 goto bread_err2;
3634
3635 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3604 BBTOB(hblks)); 3636 BBTOB(hblks));
3605 if (error) 3637 if (error)
3606 goto bread_err2; 3638 goto bread_err2;
3639
3607 if (!offset) 3640 if (!offset)
3608 offset = xlog_align(log, 0, 3641 offset = xlog_align(log, 0,
3609 wrapped_hblks, hbp); 3642 wrapped_hblks, hbp);
@@ -3619,10 +3652,10 @@ xlog_do_recovery_pass(
3619 3652
3620 /* Read in data for log record */ 3653 /* Read in data for log record */
3621 if (blk_no + bblks <= log->l_logBBsize) { 3654 if (blk_no + bblks <= log->l_logBBsize) {
3622 error = xlog_bread(log, blk_no, bblks, dbp); 3655 error = xlog_bread(log, blk_no, bblks, dbp,
3656 &offset);
3623 if (error) 3657 if (error)
3624 goto bread_err2; 3658 goto bread_err2;
3625 offset = xlog_align(log, blk_no, bblks, dbp);
3626 } else { 3659 } else {
3627 /* This log record is split across the 3660 /* This log record is split across the
3628 * physical end of log */ 3661 * physical end of log */
@@ -3636,12 +3669,13 @@ xlog_do_recovery_pass(
3636 split_bblks = 3669 split_bblks =
3637 log->l_logBBsize - (int)blk_no; 3670 log->l_logBBsize - (int)blk_no;
3638 ASSERT(split_bblks > 0); 3671 ASSERT(split_bblks > 0);
3639 if ((error = xlog_bread(log, blk_no, 3672 error = xlog_bread(log, blk_no,
3640 split_bblks, dbp))) 3673 split_bblks, dbp,
3674 &offset);
3675 if (error)
3641 goto bread_err2; 3676 goto bread_err2;
3642 offset = xlog_align(log, blk_no,
3643 split_bblks, dbp);
3644 } 3677 }
3678
3645 /* 3679 /*
3646 * Note: this black magic still works with 3680 * Note: this black magic still works with
3647 * large sector sizes (non-512) only because: 3681 * large sector sizes (non-512) only because:
@@ -3658,15 +3692,19 @@ xlog_do_recovery_pass(
3658 error = XFS_BUF_SET_PTR(dbp, 3692 error = XFS_BUF_SET_PTR(dbp,
3659 bufaddr + BBTOB(split_bblks), 3693 bufaddr + BBTOB(split_bblks),
3660 BBTOB(bblks - split_bblks)); 3694 BBTOB(bblks - split_bblks));
3661 if (!error)
3662 error = xlog_bread(log, wrapped_hblks,
3663 bblks - split_bblks,
3664 dbp);
3665 if (!error)
3666 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3667 h_size);
3668 if (error) 3695 if (error)
3669 goto bread_err2; 3696 goto bread_err2;
3697
3698 error = xlog_bread_noalign(log, wrapped_hblks,
3699 bblks - split_bblks,
3700 dbp);
3701 if (error)
3702 goto bread_err2;
3703
3704 error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3705 if (error)
3706 goto bread_err2;
3707
3670 if (!offset) 3708 if (!offset)
3671 offset = xlog_align(log, wrapped_hblks, 3709 offset = xlog_align(log, wrapped_hblks,
3672 bblks - split_bblks, dbp); 3710 bblks - split_bblks, dbp);
@@ -3683,17 +3721,21 @@ xlog_do_recovery_pass(
3683 3721
3684 /* read first part of physical log */ 3722 /* read first part of physical log */
3685 while (blk_no < head_blk) { 3723 while (blk_no < head_blk) {
3686 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3724 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3725 if (error)
3687 goto bread_err2; 3726 goto bread_err2;
3688 offset = xlog_align(log, blk_no, hblks, hbp); 3727
3689 rhead = (xlog_rec_header_t *)offset; 3728 rhead = (xlog_rec_header_t *)offset;
3690 error = xlog_valid_rec_header(log, rhead, blk_no); 3729 error = xlog_valid_rec_header(log, rhead, blk_no);
3691 if (error) 3730 if (error)
3692 goto bread_err2; 3731 goto bread_err2;
3732
3693 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3733 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3694 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3734 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3735 &offset);
3736 if (error)
3695 goto bread_err2; 3737 goto bread_err2;
3696 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3738
3697 xlog_unpack_data(rhead, offset, log); 3739 xlog_unpack_data(rhead, offset, log);
3698 if ((error = xlog_recover_process_data(log, rhash, 3740 if ((error = xlog_recover_process_data(log, rhash,
3699 rhead, offset, pass))) 3741 rhead, offset, pass)))
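Most of this file's churn is mechanical: every xlog_bread() + xlog_align() pair collapses into the new xlog_bread(), which hands back the sector-aligned data pointer through an out parameter, while the XFS_BUF_SET_PTR buffer-splicing paths keep the raw xlog_bread_noalign(). A generic userspace sketch of the wrapper shape, with hypothetical types and stubbed I/O so it compiles on its own:

#include <stddef.h>

/* hypothetical stand-ins for xlog_t / xfs_buf_t and the two helpers */
struct log { int sect_log; };
struct buf { char data[512]; };

static int read_blocks(struct log *lg, long blk, int nblks, struct buf *bp)
{
	(void)lg; (void)blk; (void)nblks; (void)bp;
	return 0;			/* pretend the I/O succeeded */
}

static char *align_ptr(struct log *lg, long blk, int nblks, struct buf *bp)
{
	/* the real helper offsets into the buffer for sub-sector reads;
	 * the stub just returns the start of the data */
	(void)lg; (void)blk; (void)nblks;
	return bp->data;
}

/* the combined helper mirroring the new xlog_bread(): read, then hand
 * back the aligned pointer, so callers cannot forget the align step */
static int read_aligned(struct log *lg, long blk, int nblks,
			struct buf *bp, char **offset)
{
	int error = read_blocks(lg, blk, nblks, bp);
	if (error)
		return error;		/* *offset untouched on failure */
	*offset = align_ptr(lg, blk, nblks, bp);
	return 0;
}

int main(void)
{
	struct log lg = { 0 };
	struct buf bp;
	char *offset = NULL;

	if (read_aligned(&lg, 0, 1, &bp, &offset))
		return 1;
	return offset == bp.data ? 0 : 1;
}

Returning the pointer through an out parameter keeps the error return conventional while making it impossible to use the buffer contents before the read has succeeded.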
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 35300250e86d..b101990df027 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h" 46#include "xfs_utils.h"
47 47
48STATIC int xfs_uuid_mount(xfs_mount_t *);
49STATIC void xfs_unmountfs_wait(xfs_mount_t *); 48STATIC void xfs_unmountfs_wait(xfs_mount_t *);
50 49
51 50
@@ -121,6 +120,84 @@ static const struct {
121 { sizeof(xfs_sb_t), 0 } 120 { sizeof(xfs_sb_t), 0 }
122}; 121};
123 122
123static DEFINE_MUTEX(xfs_uuid_table_mutex);
124static int xfs_uuid_table_size;
125static uuid_t *xfs_uuid_table;
126
127/*
128 * See if the UUID is unique among mounted XFS filesystems.
129 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
130 */
131STATIC int
132xfs_uuid_mount(
133 struct xfs_mount *mp)
134{
135 uuid_t *uuid = &mp->m_sb.sb_uuid;
136 int hole, i;
137
138 if (mp->m_flags & XFS_MOUNT_NOUUID)
139 return 0;
140
141 if (uuid_is_nil(uuid)) {
142 cmn_err(CE_WARN,
143 "XFS: Filesystem %s has nil UUID - can't mount",
144 mp->m_fsname);
145 return XFS_ERROR(EINVAL);
146 }
147
148 mutex_lock(&xfs_uuid_table_mutex);
149 for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
150 if (uuid_is_nil(&xfs_uuid_table[i])) {
151 hole = i;
152 continue;
153 }
154 if (uuid_equal(uuid, &xfs_uuid_table[i]))
155 goto out_duplicate;
156 }
157
158 if (hole < 0) {
159 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
160 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
161 xfs_uuid_table_size * sizeof(*xfs_uuid_table),
162 KM_SLEEP);
163 hole = xfs_uuid_table_size++;
164 }
165 xfs_uuid_table[hole] = *uuid;
166 mutex_unlock(&xfs_uuid_table_mutex);
167
168 return 0;
169
170 out_duplicate:
171 mutex_unlock(&xfs_uuid_table_mutex);
172 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
173 mp->m_fsname);
174 return XFS_ERROR(EINVAL);
175}
176
177STATIC void
178xfs_uuid_unmount(
179 struct xfs_mount *mp)
180{
181 uuid_t *uuid = &mp->m_sb.sb_uuid;
182 int i;
183
184 if (mp->m_flags & XFS_MOUNT_NOUUID)
185 return;
186
187 mutex_lock(&xfs_uuid_table_mutex);
188 for (i = 0; i < xfs_uuid_table_size; i++) {
189 if (uuid_is_nil(&xfs_uuid_table[i]))
190 continue;
191 if (!uuid_equal(uuid, &xfs_uuid_table[i]))
192 continue;
193 memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
194 break;
195 }
196 ASSERT(i < xfs_uuid_table_size);
197 mutex_unlock(&xfs_uuid_table_mutex);
198}
199
200
124/* 201/*
125 * Free up the resources associated with a mount structure. Assume that 202 * Free up the resources associated with a mount structure. Assume that
126 * the structure was initially zeroed, so we can tell which fields got 203 * the structure was initially zeroed, so we can tell which fields got
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
256 return XFS_ERROR(ENOSYS); 333 return XFS_ERROR(ENOSYS);
257 } 334 }
258 335
336 /*
337 * Currently only very few inode sizes are supported.
338 */
339 switch (sbp->sb_inodesize) {
340 case 256:
341 case 512:
342 case 1024:
343 case 2048:
344 break;
345 default:
346 xfs_fs_mount_cmn_err(flags,
347 "inode size of %d bytes not supported",
348 sbp->sb_inodesize);
349 return XFS_ERROR(ENOSYS);
350 }
351
259 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 352 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
260 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 353 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
261 xfs_fs_mount_cmn_err(flags, 354 xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
574 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 667 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
575 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 668 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
576 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 669 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
577 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
578 mp->m_blockmask = sbp->sb_blocksize - 1; 670 mp->m_blockmask = sbp->sb_blocksize - 1;
579 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 671 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
580 mp->m_blockwmask = mp->m_blockwsize - 1; 672 mp->m_blockwmask = mp->m_blockwsize - 1;
581 673
582 /*
583 * Setup for attributes, in case they get created.
584 * This value is for inodes getting attributes for the first time,
585 * the per-inode value is for old attribute values.
586 */
587 ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
588 switch (sbp->sb_inodesize) {
589 case 256:
590 mp->m_attroffset = XFS_LITINO(mp) -
591 XFS_BMDR_SPACE_CALC(MINABTPTRS);
592 break;
593 case 512:
594 case 1024:
595 case 2048:
596 mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
597 break;
598 default:
599 ASSERT(0);
600 }
601 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
602
603 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); 674 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
604 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); 675 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
605 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; 676 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
645 for (index = 0; index < agcount; index++) { 716 for (index = 0; index < agcount; index++) {
646 /* 717 /*
647 * read the agf, then the agi. This gets us 718 * read the agf, then the agi. This gets us
648 * all the inforamtion we need and populates the 719 * all the information we need and populates the
649 * per-ag structures for us. 720 * per-ag structures for us.
650 */ 721 */
651 error = xfs_alloc_pagf_init(mp, NULL, index, 0); 722 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
886} 957}
887 958
888/* 959/*
889 * xfs_mountfs
890 *
891 * This function does the following on an initial mount of a file system: 960 * This function does the following on an initial mount of a file system:
892 * - reads the superblock from disk and init the mount struct 961 * - reads the superblock from disk and init the mount struct
893 * - if we're a 32-bit kernel, do a size check on the superblock 962 * - if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
905 xfs_inode_t *rip; 974 xfs_inode_t *rip;
906 __uint64_t resblks; 975 __uint64_t resblks;
907 uint quotamount, quotaflags; 976 uint quotamount, quotaflags;
908 int uuid_mounted = 0;
909 int error = 0; 977 int error = 0;
910 978
911 xfs_mount_common(mp, sbp); 979 xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@ xfs_mountfs(
960 */ 1028 */
961 error = xfs_update_alignment(mp); 1029 error = xfs_update_alignment(mp);
962 if (error) 1030 if (error)
963 goto error1; 1031 goto out;
964 1032
965 xfs_alloc_compute_maxlevels(mp); 1033 xfs_alloc_compute_maxlevels(mp);
966 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); 1034 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@ xfs_mountfs(
971 1039
972 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); 1040 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
973 1041
974 /* 1042 error = xfs_uuid_mount(mp);
975 * XFS uses the uuid from the superblock as the unique 1043 if (error)
976 * identifier for fsid. We can not use the uuid from the volume 1044 goto out;
977 * since a single partition filesystem is identical to a single
978 * partition volume/filesystem.
979 */
980 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
981 if (xfs_uuid_mount(mp)) {
982 error = XFS_ERROR(EINVAL);
983 goto error1;
984 }
985 uuid_mounted=1;
986 }
987 1045
988 /* 1046 /*
989 * Set the minimum read and write sizes 1047 * Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@ xfs_mountfs(
1007 */ 1065 */
1008 error = xfs_check_sizes(mp); 1066 error = xfs_check_sizes(mp);
1009 if (error) 1067 if (error)
1010 goto error1; 1068 goto out_remove_uuid;
1011 1069
1012 /* 1070 /*
1013 * Initialize realtime fields in the mount structure 1071 * Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@ xfs_mountfs(
1015 error = xfs_rtmount_init(mp); 1073 error = xfs_rtmount_init(mp);
1016 if (error) { 1074 if (error) {
1017 cmn_err(CE_WARN, "XFS: RT mount failed"); 1075 cmn_err(CE_WARN, "XFS: RT mount failed");
1018 goto error1; 1076 goto out_remove_uuid;
1019 } 1077 }
1020 1078
1021 /* 1079 /*
@@ -1045,26 +1103,26 @@ xfs_mountfs(
1045 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), 1103 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
1046 KM_MAYFAIL); 1104 KM_MAYFAIL);
1047 if (!mp->m_perag) 1105 if (!mp->m_perag)
1048 goto error1; 1106 goto out_remove_uuid;
1049 1107
1050 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); 1108 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1051 1109
1110 if (!sbp->sb_logblocks) {
1111 cmn_err(CE_WARN, "XFS: no log defined");
1112 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1113 error = XFS_ERROR(EFSCORRUPTED);
1114 goto out_free_perag;
1115 }
1116
1052 /* 1117 /*
1053 * log's mount-time initialization. Perform 1st part recovery if needed 1118 * log's mount-time initialization. Perform 1st part recovery if needed
1054 */ 1119 */
1055 if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ 1120 error = xfs_log_mount(mp, mp->m_logdev_targp,
1056 error = xfs_log_mount(mp, mp->m_logdev_targp, 1121 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1057 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1122 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1058 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1123 if (error) {
1059 if (error) { 1124 cmn_err(CE_WARN, "XFS: log mount failed");
1060 cmn_err(CE_WARN, "XFS: log mount failed"); 1125 goto out_free_perag;
1061 goto error2;
1062 }
1063 } else { /* No log has been defined */
1064 cmn_err(CE_WARN, "XFS: no log defined");
1065 XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
1066 error = XFS_ERROR(EFSCORRUPTED);
1067 goto error2;
1068 } 1126 }
1069 1127
1070 /* 1128 /*
@@ -1086,15 +1144,14 @@ xfs_mountfs(
1086 * If we are currently making the filesystem, the initialisation will 1144 * If we are currently making the filesystem, the initialisation will
1087 * fail as the perag data is in an undefined state. 1145 * fail as the perag data is in an undefined state.
1088 */ 1146 */
1089
1090 if (xfs_sb_version_haslazysbcount(&mp->m_sb) && 1147 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1091 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && 1148 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1092 !mp->m_sb.sb_inprogress) { 1149 !mp->m_sb.sb_inprogress) {
1093 error = xfs_initialize_perag_data(mp, sbp->sb_agcount); 1150 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1094 if (error) { 1151 if (error)
1095 goto error2; 1152 goto out_free_perag;
1096 }
1097 } 1153 }
1154
1098 /* 1155 /*
1099 * Get and sanity-check the root inode. 1156 * Get and sanity-check the root inode.
1100 * Save the pointer to it in the mount structure. 1157 * Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@ xfs_mountfs(
1102 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); 1159 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
1103 if (error) { 1160 if (error) {
1104 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1161 cmn_err(CE_WARN, "XFS: failed to read root inode");
1105 goto error3; 1162 goto out_log_dealloc;
1106 } 1163 }
1107 1164
1108 ASSERT(rip != NULL); 1165 ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@ xfs_mountfs(
1116 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1173 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
1117 mp); 1174 mp);
1118 error = XFS_ERROR(EFSCORRUPTED); 1175 error = XFS_ERROR(EFSCORRUPTED);
1119 goto error4; 1176 goto out_rele_rip;
1120 } 1177 }
1121 mp->m_rootip = rip; /* save it */ 1178 mp->m_rootip = rip; /* save it */
1122 1179
@@ -1131,7 +1188,7 @@ xfs_mountfs(
1131 * Free up the root inode. 1188 * Free up the root inode.
1132 */ 1189 */
1133 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1190 cmn_err(CE_WARN, "XFS: failed to read RT inodes");
1134 goto error4; 1191 goto out_rele_rip;
1135 } 1192 }
1136 1193
1137 /* 1194 /*
@@ -1143,7 +1200,7 @@ xfs_mountfs(
1143 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1200 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1144 if (error) { 1201 if (error) {
1145 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1202 cmn_err(CE_WARN, "XFS: failed to write sb changes");
1146 goto error4; 1203 goto out_rtunmount;
1147 } 1204 }
1148 } 1205 }
1149 1206
@@ -1152,7 +1209,7 @@ xfs_mountfs(
1152 */ 1209 */
1153 error = XFS_QM_INIT(mp, &quotamount, &quotaflags); 1210 error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
1154 if (error) 1211 if (error)
1155 goto error4; 1212 goto out_rtunmount;
1156 1213
1157 /* 1214 /*
1158 * Finish recovering the file system. This part needed to be 1215 * Finish recovering the file system. This part needed to be
@@ -1162,7 +1219,7 @@ xfs_mountfs(
1162 error = xfs_log_mount_finish(mp); 1219 error = xfs_log_mount_finish(mp);
1163 if (error) { 1220 if (error) {
1164 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1221 cmn_err(CE_WARN, "XFS: log mount finish failed");
1165 goto error4; 1222 goto out_rtunmount;
1166 } 1223 }
1167 1224
1168 /* 1225 /*
@@ -1170,7 +1227,7 @@ xfs_mountfs(
1170 */ 1227 */
1171 error = XFS_QM_MOUNT(mp, quotamount, quotaflags); 1228 error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
1172 if (error) 1229 if (error)
1173 goto error4; 1230 goto out_rtunmount;
1174 1231
1175 /* 1232 /*
1176 * Now we are mounted, reserve a small amount of unused space for 1233 * Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@ xfs_mountfs(
1194 1251
1195 return 0; 1252 return 0;
1196 1253
1197 error4: 1254 out_rtunmount:
1198 /* 1255 xfs_rtunmount_inodes(mp);
1199 * Free up the root inode. 1256 out_rele_rip:
1200 */
1201 IRELE(rip); 1257 IRELE(rip);
1202 error3: 1258 out_log_dealloc:
1203 xfs_log_unmount_dealloc(mp); 1259 xfs_log_unmount(mp);
1204 error2: 1260 out_free_perag:
1205 xfs_free_perag(mp); 1261 xfs_free_perag(mp);
1206 error1: 1262 out_remove_uuid:
1207 if (uuid_mounted) 1263 xfs_uuid_unmount(mp);
1208 uuid_table_remove(&mp->m_sb.sb_uuid); 1264 out:
1209 return error; 1265 return error;
1210} 1266}
1211 1267
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
1226 */ 1282 */
1227 XFS_QM_UNMOUNT(mp); 1283 XFS_QM_UNMOUNT(mp);
1228 1284
1229 if (mp->m_rbmip) 1285 xfs_rtunmount_inodes(mp);
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1233 IRELE(mp->m_rootip); 1286 IRELE(mp->m_rootip);
1234 1287
1235 /* 1288 /*
1236 * We can potentially deadlock here if we have an inode cluster 1289 * We can potentially deadlock here if we have an inode cluster
1237 * that has been freed has it's buffer still pinned in memory because 1290 * that has been freed has its buffer still pinned in memory because
1238 * the transaction is still sitting in a iclog. The stale inodes 1291 * the transaction is still sitting in a iclog. The stale inodes
1239 * on that buffer will have their flush locks held until the 1292 * on that buffer will have their flush locks held until the
1240 * transaction hits the disk and the callbacks run. the inode 1293 * transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@ xfs_unmountfs(
1266 * Unreserve any blocks we have so that when we unmount we don't account 1319 * Unreserve any blocks we have so that when we unmount we don't account
1267 * the reserved free space as used. This is really only necessary for 1320 * the reserved free space as used. This is really only necessary for
1268 * lazy superblock counting because it trusts the incore superblock 1321 * lazy superblock counting because it trusts the incore superblock
1269 * counters to be aboslutely correct on clean unmount. 1322 * counters to be absolutely correct on clean unmount.
1270 * 1323 *
1271 * We don't bother correcting this elsewhere for lazy superblock 1324 * We don't bother correcting this elsewhere for lazy superblock
1272 * counting because on mount of an unclean filesystem we reconstruct the 1325 * counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@ xfs_unmountfs(
1288 "Freespace may not be correct on next mount."); 1341 "Freespace may not be correct on next mount.");
1289 xfs_unmountfs_writesb(mp); 1342 xfs_unmountfs_writesb(mp);
1290 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1343 xfs_unmountfs_wait(mp); /* wait for async bufs */
1291 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1344 xfs_log_unmount_write(mp);
1292 1345 xfs_log_unmount(mp);
1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1346 xfs_uuid_unmount(mp);
1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1295 1347
1296#if defined(DEBUG) 1348#if defined(DEBUG)
1297 xfs_errortag_clearall(mp, 0); 1349 xfs_errortag_clearall(mp, 0);
@@ -1793,29 +1845,6 @@ xfs_freesb(
1793} 1845}
1794 1846
1795/* 1847/*
1796 * See if the UUID is unique among mounted XFS filesystems.
1797 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
1798 */
1799STATIC int
1800xfs_uuid_mount(
1801 xfs_mount_t *mp)
1802{
1803 if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
1804 cmn_err(CE_WARN,
1805 "XFS: Filesystem %s has nil UUID - can't mount",
1806 mp->m_fsname);
1807 return -1;
1808 }
1809 if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
1810 cmn_err(CE_WARN,
1811 "XFS: Filesystem %s has duplicate UUID - can't mount",
1812 mp->m_fsname);
1813 return -1;
1814 }
1815 return 0;
1816}
1817
1818/*
1819 * Used to log changes to the superblock unit and width fields which could 1848 * Used to log changes to the superblock unit and width fields which could
1820 * be altered by the mount options, as well as any potential sb_features2 1849 * be altered by the mount options, as well as any potential sb_features2
1821 * fixup. Only the first superblock is updated. 1850 * fixup. Only the first superblock is updated.
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
1868 * we disable the per-cpu counter and go through the slow path. 1897 * we disable the per-cpu counter and go through the slow path.
1869 * 1898 *
1870 * The slow path is the current xfs_mod_incore_sb() function. This means that 1899 * The slow path is the current xfs_mod_incore_sb() function. This means that
1871 * when we disable a per-cpu counter, we need to drain it's resources back to 1900 * when we disable a per-cpu counter, we need to drain its resources back to
1872 * the global superblock. We do this after disabling the counter to prevent 1901 * the global superblock. We do this after disabling the counter to prevent
1873 * more threads from queueing up on the counter. 1902 * more threads from queueing up on the counter.
1874 * 1903 *
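The new UUID tracking above replaces the old uuid_table_insert()/uuid_table_remove() helpers with an open-coded, mutex-protected table: nil entries mark reusable holes, lookups are linear, and the array grows by one slot only when no hole is free. A reduced userspace sketch of the same hole-reuse scheme, with pthread_mutex and memcmp standing in for the kernel primitives (illustrative only):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

typedef struct { unsigned char b[16]; } my_uuid_t;

static pthread_mutex_t	table_mutex = PTHREAD_MUTEX_INITIALIZER;
static int		table_size;
static my_uuid_t	*table;

static int is_nil(const my_uuid_t *u)
{
	static const my_uuid_t nil;	/* all zeroes */
	return memcmp(u, &nil, sizeof(*u)) == 0;
}

/* returns 0 on success, -1 for a duplicate (caller rejects the mount) */
static int uuid_register(const my_uuid_t *uuid)
{
	int i, hole = -1;

	pthread_mutex_lock(&table_mutex);
	for (i = 0; i < table_size; i++) {
		if (is_nil(&table[i])) {	/* remember a reusable hole */
			hole = i;
			continue;
		}
		if (memcmp(uuid, &table[i], sizeof(*uuid)) == 0) {
			pthread_mutex_unlock(&table_mutex);
			return -1;		/* already registered */
		}
	}
	if (hole < 0) {				/* no hole: grow by one slot */
		/* the kernel uses kmem_realloc(..., KM_SLEEP), which cannot
		 * fail; userspace realloc can, so check it here */
		my_uuid_t *tmp = realloc(table,
				(table_size + 1) * sizeof(*table));
		if (!tmp) {
			pthread_mutex_unlock(&table_mutex);
			return -1;
		}
		table = tmp;
		hole = table_size++;
	}
	table[hole] = *uuid;
	pthread_mutex_unlock(&table_mutex);
	return 0;
}

/* unregister: zero the slot so it becomes a hole for the next mount */
static void uuid_unregister(const my_uuid_t *uuid)
{
	int i;

	pthread_mutex_lock(&table_mutex);
	for (i = 0; i < table_size; i++) {
		if (is_nil(&table[i]))
			continue;
		if (memcmp(uuid, &table[i], sizeof(*uuid)) == 0) {
			memset(&table[i], 0, sizeof(table[i]));
			break;
		}
	}
	pthread_mutex_unlock(&table_mutex);
}

Reusing holes means repeated mount/unmount cycles never grow the table beyond the peak number of concurrently mounted filesystems.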
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f5e9937f9bdb..7af44adffc8f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
136 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
140 139
141typedef struct xfs_qmops { 140typedef struct xfs_qmops {
142 xfs_qminit_t xfs_qminit; 141 xfs_qminit_t xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
154 xfs_dqvopchownresv_t xfs_dqvopchownresv; 153 xfs_dqvopchownresv_t xfs_dqvopchownresv;
155 xfs_dqstatvfs_t xfs_dqstatvfs; 154 xfs_dqstatvfs_t xfs_dqstatvfs;
156 xfs_dqsync_t xfs_dqsync; 155 xfs_dqsync_t xfs_dqsync;
157 xfs_quotactl_t xfs_quotactl;
158 struct xfs_dqtrxops *xfs_dqtrxops; 156 struct xfs_dqtrxops *xfs_dqtrxops;
159} xfs_qmops_t; 157} xfs_qmops_t;
160 158
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
188 (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp) 186 (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
189#define XFS_QM_DQSYNC(mp, flags) \ 187#define XFS_QM_DQSYNC(mp, flags) \
190 (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags) 188 (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
191#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
192 (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
193 189
194#ifdef HAVE_PERCPU_SB 190#ifdef HAVE_PERCPU_SB
195 191
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
273 uint m_inobt_mnr[2]; /* min inobt btree records */ 269 uint m_inobt_mnr[2]; /* min inobt btree records */
274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 270 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 271 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 272 uint m_in_maxlevels; /* max inobt btree levels. */
277 struct xfs_perag *m_perag; /* per-ag accounting info */ 273 struct xfs_perag *m_perag; /* per-ag accounting info */
278 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ 274 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */
279 struct mutex m_growlock; /* growfs mutex */ 275 struct mutex m_growlock; /* growfs mutex */
280 int m_fixedfsid[2]; /* unchanged for life of FS */ 276 int m_fixedfsid[2]; /* unchanged for life of FS */
281 uint m_dmevmask; /* DMI events for this FS */ 277 uint m_dmevmask; /* DMI events for this FS */
282 __uint64_t m_flags; /* global mount flags */ 278 __uint64_t m_flags; /* global mount flags */
283 uint m_attroffset; /* inode attribute offset */
284 uint m_dir_node_ents; /* #entries in a dir danode */ 279 uint m_dir_node_ents; /* #entries in a dir danode */
285 uint m_attr_node_ents; /* #entries in attr danode */ 280 uint m_attr_node_ents; /* #entries in attr danode */
286 int m_ialloc_inos; /* inodes in inode allocation */ 281 int m_ialloc_inos; /* inodes in inode allocation */
287 int m_ialloc_blks; /* blocks in inode allocation */ 282 int m_ialloc_blks; /* blocks in inode allocation */
288 int m_litino; /* size of inode union area */
289 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 283 int m_inoalign_mask;/* mask sb_inoalignmt if used */
290 uint m_qflags; /* quota status flags */ 284 uint m_qflags; /* quota status flags */
291 xfs_trans_reservations_t m_reservations;/* precomputed res values */ 285 xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
293 __uint64_t m_maxioffset; /* maximum inode offset */ 287 __uint64_t m_maxioffset; /* maximum inode offset */
294 __uint64_t m_resblks; /* total reserved blocks */ 288 __uint64_t m_resblks; /* total reserved blocks */
295 __uint64_t m_resblks_avail;/* available reserved blocks */ 289 __uint64_t m_resblks_avail;/* available reserved blocks */
296#if XFS_BIG_INUMS
297 xfs_ino_t m_inoadd; /* add value for ino64_offset */
298#endif
299 int m_dalign; /* stripe unit */ 290 int m_dalign; /* stripe unit */
300 int m_swidth; /* stripe width */ 291 int m_swidth; /* stripe width */
301 int m_sinoalign; /* stripe unit inode alignment */ 292 int m_sinoalign; /* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
337#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 328#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
338 must be synchronous except 329 must be synchronous except
339 for space allocations */ 330 for space allocations */
340#define XFS_MOUNT_INO64 (1ULL << 1)
341#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 331#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
342#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 332#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
343#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 333#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
389 * Synchronous read and write sizes. This should be 379 * Synchronous read and write sizes. This should be
390 * better for NFSv2 wsync filesystems. 380 * better for NFSv2 wsync filesystems.
391 */ 381 */
392#define XFS_WSYNC_READIO_LOG 15 /* 32K */ 382#define XFS_WSYNC_READIO_LOG 15 /* 32k */
393#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */ 383#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
394 384
395/* 385/*
396 * Allow large block sizes to be reported to userspace programs if the 386 * Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
500 int64_t msb_delta; /* Change to make to specified field */ 490 int64_t msb_delta; /* Change to make to specified field */
501} xfs_mod_sb_t; 491} xfs_mod_sb_t;
502 492
503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
505
506extern int xfs_log_sbcount(xfs_mount_t *, uint); 493extern int xfs_log_sbcount(xfs_mount_t *, uint);
507extern int xfs_mountfs(xfs_mount_t *mp); 494extern int xfs_mountfs(xfs_mount_t *mp);
508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 495extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
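The XFS_WSYNC_READIO_LOG/XFS_WSYNC_WRITEIO_LOG values touched above are base-2 exponents rather than byte counts; a hypothetical one-line helper makes the encoding explicit:

	/* The _LOG macros store log2 of the I/O size in bytes. */
	static inline unsigned int log2_size_to_bytes(unsigned int log)
	{
		return 1U << log;	/* 15 -> 32768 (32k), 14 -> 16384 (16k) */
	}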
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 27f80581520a..e101790ea8e7 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
126 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, 126 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr,
127 .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval, 127 .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval,
128 .xfs_dqsync = (xfs_dqsync_t) fs_noerr, 128 .xfs_dqsync = (xfs_dqsync_t) fs_noerr,
129 .xfs_quotactl = (xfs_quotactl_t) fs_nosys,
130}; 129};
131 130
132int 131int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 48965ecaa155..f5d1202dde25 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_QUOTA_H__ 18#ifndef __XFS_QUOTA_H__
19#define __XFS_QUOTA_H__ 19#define __XFS_QUOTA_H__
20 20
21struct xfs_trans;
22
21/* 23/*
22 * The ondisk form of a dquot structure. 24 * The ondisk form of a dquot structure.
23 */ 25 */
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
185 * to a single function. None of these XFS_QMOPT_* flags are meant to have 187 * to a single function. None of these XFS_QMOPT_* flags are meant to have
186 * persistent values (ie. their values can and will change between versions) 188 * persistent values (ie. their values can and will change between versions)
187 */ 189 */
188#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */
189#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ 190#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
190#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ 191#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
191#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ 192#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c5bb86f3ec05..385f6dceba5d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
2288 return 0; 2288 return 0;
2289} 2289}
2290 2290
2291void
2292xfs_rtunmount_inodes(
2293 struct xfs_mount *mp)
2294{
2295 if (mp->m_rbmip)
2296 IRELE(mp->m_rbmip);
2297 if (mp->m_rsumip)
2298 IRELE(mp->m_rsumip);
2299}
2300
2291/* 2301/*
2292 * Pick an extent for allocation at the start of a new realtime file. 2302 * Pick an extent for allocation at the start of a new realtime file.
2293 * Use the sequence number stored in the atime field of the bitmap inode. 2303 * Use the sequence number stored in the atime field of the bitmap inode.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8d8dcd215716..b2d67adb6a08 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,8 +23,8 @@ struct xfs_trans;
23 23
24/* Min and max rt extent sizes, specified in bytes */ 24/* Min and max rt extent sizes, specified in bytes */
25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ 25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ 26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */ 27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
28 28
29/* 29/*
30 * Constants for bit manipulations. 30 * Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
108int /* error */ 108int /* error */
109xfs_rtmount_init( 109xfs_rtmount_init(
110 struct xfs_mount *mp); /* file system mount structure */ 110 struct xfs_mount *mp); /* file system mount structure */
111void
112xfs_rtunmount_inodes(
113 struct xfs_mount *mp);
111 114
112/* 115/*
113 * Get the bitmap and summary inodes into the mount structure 116 * Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
146# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
147# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
148# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m)
149#endif /* CONFIG_XFS_RT */ 153#endif /* CONFIG_XFS_RT */
150 154
151#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
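Note the new !CONFIG_XFS_RT stub: defining xfs_rtunmount_inodes(m) to expand to nothing lets callers stay free of #ifdef CONFIG_XFS_RT. A hypothetical caller, compiling either way:

	/* Hypothetical caller, for illustration only. */
	static void example_unmount(struct xfs_mount *mp)
	{
		/* Real call with CONFIG_XFS_RT=y; expands to nothing otherwise. */
		xfs_rtunmount_inodes(mp);
	}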
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d6fe4a88d79f..775249a54f6f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
292 * In a write transaction we can allocate a maximum of 2 292 * In a write transaction we can allocate a maximum of 2
293 * extents. This gives: 293 * extents. This gives:
294 * the inode getting the new extents: inode size 294 * the inode getting the new extents: inode size
295 * the inode\'s bmap btree: max depth * block size 295 * the inode's bmap btree: max depth * block size
296 * the agfs of the ags from which the extents are allocated: 2 * sector 296 * the agfs of the ags from which the extents are allocated: 2 * sector
297 * the superblock free block counter: sector size 297 * the superblock free block counter: sector size
298 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size 298 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
321/* 321/*
322 * In truncating a file we free up to two extents at once. We can modify: 322 * In truncating a file we free up to two extents at once. We can modify:
323 * the inode being truncated: inode size 323 * the inode being truncated: inode size
324 * the inode\'s bmap btree: (max depth + 1) * block size 324 * the inode's bmap btree: (max depth + 1) * block size
325 * And the bmap_finish transaction can free the blocks and bmap blocks: 325 * And the bmap_finish transaction can free the blocks and bmap blocks:
326 * the agf for each of the ags: 4 * sector size 326 * the agf for each of the ags: 4 * sector size
327 * the agfl for each of the ags: 4 * sector size 327 * the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
343 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ 343 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
344 (128 * 5) + \ 344 (128 * 5) + \
345 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 345 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
346 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 346 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
347 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 347 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
348 348
349#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 349#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
431 * the new inode: inode size 431 * the new inode: inode size
432 * the inode btree entry: 1 block 432 * the inode btree entry: 1 block
433 * the directory btree: (max depth + v2) * dir block size 433 * the directory btree: (max depth + v2) * dir block size
434 * the directory inode\'s bmap btree: (max depth + v2) * block size 434 * the directory inode's bmap btree: (max depth + v2) * block size
435 * the blocks for the symlink: 1 KB 435 * the blocks for the symlink: 1 kB
436 * Or in the first xact we allocate some inodes giving: 436 * Or in the first xact we allocate some inodes giving:
437 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 437 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
438 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 438 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
449 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ 449 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
450 (2 * (mp)->m_sb.sb_sectsize + \ 450 (2 * (mp)->m_sb.sb_sectsize + \
451 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ 451 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
452 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ 452 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
453 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 453 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
454 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 454 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
455 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 455 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
456 456
457#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 457#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
463 * the inode btree entry: block size 463 * the inode btree entry: block size
464 * the superblock for the nlink flag: sector size 464 * the superblock for the nlink flag: sector size
465 * the directory btree: (max depth + v2) * dir block size 465 * the directory btree: (max depth + v2) * dir block size
466 * the directory inode\'s bmap btree: (max depth + v2) * block size 466 * the directory inode's bmap btree: (max depth + v2) * block size
467 * Or in the first xact we allocate some inodes giving: 467 * Or in the first xact we allocate some inodes giving:
468 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 468 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
469 * the superblock for the nlink flag: sector size 469 * the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
481 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ 481 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
482 (3 * (mp)->m_sb.sb_sectsize + \ 482 (3 * (mp)->m_sb.sb_sectsize + \
483 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ 483 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
484 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ 484 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
485 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 485 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
486 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 486 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
487 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 487 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
488 488
489#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) 489#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
513 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ 513 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
514 (128 * 5) + \ 514 (128 * 5) + \
515 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 515 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
516 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 516 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
517 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) 517 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
518 518
519 519
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
637/* 637/*
638 * Removing the attribute fork of a file 638 * Removing the attribute fork of a file
639 * the inode being truncated: inode size 639 * the inode being truncated: inode size
640 * the inode\'s bmap btree: max depth * block size 640 * the inode's bmap btree: max depth * block size
641 * And the bmap_finish transaction can free the blocks and bmap blocks: 641 * And the bmap_finish transaction can free the blocks and bmap blocks:
642 * the agf for each of the ags: 4 * sector size 642 * the agf for each of the ags: 4 * sector size
643 * the agfl for each of the ags: 4 * sector size 643 * the agfl for each of the ags: 4 * sector size
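To make the reservation arithmetic above concrete, here is a worked sketch of the inode-allocation leg under assumed geometry (4k blocks, 512-byte sectors, XFS_IALLOC_BLOCKS == 16, m_in_maxlevels == 3). The numbers are illustrative, and the XFS_ALLOCFREE_LOG_RES/COUNT terms are omitted for brevity:

	/* Worked example, assumed geometry -- not derived from this patch. */
	static unsigned int example_ialloc_log_leg(void)
	{
		unsigned int sectsize = 512, blocksize = 4096;
		unsigned int ialloc_blocks = 16, in_maxlevels = 3;

		return 2 * sectsize			/* agi + agf:        1024 */
		     + ialloc_blocks * blocksize	/* inode chunk:     65536 */
		     + in_maxlevels * blocksize		/* inobt split path: 12288 */
		     + 128 * (2 + ialloc_blocks + in_maxlevels);
							/* log item headers:  2688 */
	}					/* total: 81536 bytes, ~80k */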
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2d47f10f8bed..f31271c30de9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
79 * the push is run asynchronously in a separate thread, so we return the tail 79 * the push is run asynchronously in a separate thread, so we return the tail
80 * of the log right now instead of the tail after the push. This means we will 80 * of the log right now instead of the tail after the push. This means we will
81 * either continue right away, or we will sleep waiting on the async thread to 81 * either continue right away, or we will sleep waiting on the async thread to
82 * do it's work. 82 * do its work.
83 * 83 *
84 * We do this unlocked - we only need to know whether there is anything in the 84 * We do this unlocked - we only need to know whether there is anything in the
85 * AIL at the time we are called. We don't need to access the contents of 85 * AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
160/* 160/*
161 * Now that the traversal is complete, we need to remove the cursor 161 * Now that the traversal is complete, we need to remove the cursor
162 * from the list of traversing cursors. Avoid removing the embedded 162 * from the list of traversing cursors. Avoid removing the embedded
163 * push cursor, but use the fact it is alway present to make the 163 * push cursor, but use the fact it is always present to make the
164 * list deletion simple. 164 * list deletion simple.
165 */ 165 */
166void 166void
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index e110bf57d7f4..eb3fc57f9eef 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,7 +22,7 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has it's own ailp */ 25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h" 26#include "xfs_bit.h"
27#include "xfs_buf_item.h" 27#include "xfs_buf_item.h"
28#include "xfs_sb.h" 28#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 4ea2e5074bdd..7d2c920dfb9c 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
47#define XFS_DIRREMOVE_SPACE_RES(mp) \ 47#define XFS_DIRREMOVE_SPACE_RES(mp) \
48 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) 48 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
49#define XFS_IALLOC_SPACE_RES(mp) \ 49#define XFS_IALLOC_SPACE_RES(mp) \
50 (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1) 50 (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
51 51
52/* 52/*
53 * Space reservation values for various transactions. 53 * Space reservation values for various transactions.
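The space-side formula mirrors the log-side one (see the sketch after the xfs_trans.h hunk above): blocks for the inode chunk itself plus worst-case inobt growth. With the same assumed numbers:

	/* Assumed: XFS_IALLOC_BLOCKS == 16, m_in_maxlevels == 3 (illustrative). */
	unsigned int ialloc_space_res = 16 + 3 - 1;	/* == 18 blocks */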
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b2f724502f1b..d725428c9df6 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -21,14 +21,6 @@
21#ifdef __KERNEL__ 21#ifdef __KERNEL__
22 22
23/* 23/*
24 * POSIX Extensions
25 */
26typedef unsigned char uchar_t;
27typedef unsigned short ushort_t;
28typedef unsigned int uint_t;
29typedef unsigned long ulong_t;
30
31/*
32 * Additional type declarations for XFS 24 * Additional type declarations for XFS
33 */ 25 */
34typedef signed char __int8_t; 26typedef signed char __int8_t;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fcc2285d03ed..79b9e5ea5359 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -374,7 +374,7 @@ xfs_truncate_file(
374 374
375 /* 375 /*
376 * Follow the normal truncate locking protocol. Since we 376 * Follow the normal truncate locking protocol. Since we
377 * hold the inode in the transaction, we know that it's number 377 * hold the inode in the transaction, we know that its number
378 * of references will stay constant. 378 * of references will stay constant.
379 */ 379 */
380 xfs_ilock(ip, XFS_ILOCK_EXCL); 380 xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0e55c5d7db5f..7394c7af5de5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1136,7 +1136,7 @@ xfs_inactive(
1136 * If the inode is already free, then there can be nothing 1136 * If the inode is already free, then there can be nothing
1137 * to clean up here. 1137 * to clean up here.
1138 */ 1138 */
1139 if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { 1139 if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
1140 ASSERT(ip->i_df.if_real_bytes == 0); 1140 ASSERT(ip->i_df.if_real_bytes == 0);
1141 ASSERT(ip->i_df.if_broot_bytes == 0); 1141 ASSERT(ip->i_df.if_broot_bytes == 0);
1142 return VN_INACTIVE_CACHE; 1142 return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
1387 xfs_inode_t **ipp, 1387 xfs_inode_t **ipp,
1388 cred_t *credp) 1388 cred_t *credp)
1389{ 1389{
1390 xfs_mount_t *mp = dp->i_mount; 1390 int is_dir = S_ISDIR(mode);
1391 xfs_inode_t *ip; 1391 struct xfs_mount *mp = dp->i_mount;
1392 xfs_trans_t *tp; 1392 struct xfs_inode *ip = NULL;
1393 struct xfs_trans *tp = NULL;
1393 int error; 1394 int error;
1394 xfs_bmap_free_t free_list; 1395 xfs_bmap_free_t free_list;
1395 xfs_fsblock_t first_block; 1396 xfs_fsblock_t first_block;
1396 boolean_t unlock_dp_on_error = B_FALSE; 1397 boolean_t unlock_dp_on_error = B_FALSE;
1397 int dm_event_sent = 0;
1398 uint cancel_flags; 1398 uint cancel_flags;
1399 int committed; 1399 int committed;
1400 xfs_prid_t prid; 1400 xfs_prid_t prid;
1401 struct xfs_dquot *udqp, *gdqp; 1401 struct xfs_dquot *udqp = NULL;
1402 struct xfs_dquot *gdqp = NULL;
1402 uint resblks; 1403 uint resblks;
1404 uint log_res;
1405 uint log_count;
1403 1406
1404 ASSERT(!*ipp);
1405 xfs_itrace_entry(dp); 1407 xfs_itrace_entry(dp);
1406 1408
1409 if (XFS_FORCED_SHUTDOWN(mp))
1410 return XFS_ERROR(EIO);
1411
1407 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { 1412 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1408 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, 1413 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1409 dp, DM_RIGHT_NULL, NULL, 1414 dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
1412 1417
1413 if (error) 1418 if (error)
1414 return error; 1419 return error;
1415 dm_event_sent = 1;
1416 } 1420 }
1417 1421
1418 if (XFS_FORCED_SHUTDOWN(mp))
1419 return XFS_ERROR(EIO);
1420
1421 /* Return through std_return after this point. */
1422
1423 udqp = gdqp = NULL;
1424 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1422 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1425 prid = dp->i_d.di_projid; 1423 prid = dp->i_d.di_projid;
1426 else 1424 else
1427 prid = (xfs_prid_t)dfltprid; 1425 prid = dfltprid;
1428 1426
1429 /* 1427 /*
1430 * Make sure that we have allocated dquot(s) on disk. 1428 * Make sure that we have allocated dquot(s) on disk.
1431 */ 1429 */
1432 error = XFS_QM_DQVOPALLOC(mp, dp, 1430 error = XFS_QM_DQVOPALLOC(mp, dp,
1433 current_fsuid(), current_fsgid(), prid, 1431 current_fsuid(), current_fsgid(), prid,
1434 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); 1432 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1435 if (error) 1433 if (error)
1436 goto std_return; 1434 goto std_return;
1437 1435
1438 ip = NULL; 1436 if (is_dir) {
1437 rdev = 0;
1438 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1439 log_res = XFS_MKDIR_LOG_RES(mp);
1440 log_count = XFS_MKDIR_LOG_COUNT;
1441 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
1442 } else {
1443 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1444 log_res = XFS_CREATE_LOG_RES(mp);
1445 log_count = XFS_CREATE_LOG_COUNT;
1446 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1447 }
1439 1448
1440 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1441 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 1449 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1442 resblks = XFS_CREATE_SPACE_RES(mp, name->len); 1450
1443 /* 1451 /*
1444 * Initially assume that the file does not exist and 1452 * Initially assume that the file does not exist and
1445 * reserve the resources for that case. If that is not 1453 * reserve the resources for that case. If that is not
1446 * the case we'll drop the one we have and get a more 1454 * the case we'll drop the one we have and get a more
1447 * appropriate transaction later. 1455 * appropriate transaction later.
1448 */ 1456 */
1449 error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, 1457 error = xfs_trans_reserve(tp, resblks, log_res, 0,
1450 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); 1458 XFS_TRANS_PERM_LOG_RES, log_count);
1451 if (error == ENOSPC) { 1459 if (error == ENOSPC) {
1452 resblks = 0; 1460 resblks = 0;
1453 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0, 1461 error = xfs_trans_reserve(tp, 0, log_res, 0,
1454 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); 1462 XFS_TRANS_PERM_LOG_RES, log_count);
1455 } 1463 }
1456 if (error) { 1464 if (error) {
1457 cancel_flags = 0; 1465 cancel_flags = 0;
1458 goto error_return; 1466 goto out_trans_cancel;
1459 } 1467 }
1460 1468
1461 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 1469 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1462 unlock_dp_on_error = B_TRUE; 1470 unlock_dp_on_error = B_TRUE;
1463 1471
1464 xfs_bmap_init(&free_list, &first_block); 1472 /*
1473 * Check for directory link count overflow.
1474 */
1475 if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
1476 error = XFS_ERROR(EMLINK);
1477 goto out_trans_cancel;
1478 }
1465 1479
1466 ASSERT(ip == NULL); 1480 xfs_bmap_init(&free_list, &first_block);
1467 1481
1468 /* 1482 /*
1469 * Reserve disk quota and the inode. 1483 * Reserve disk quota and the inode.
1470 */ 1484 */
1471 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); 1485 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1472 if (error) 1486 if (error)
1473 goto error_return; 1487 goto out_trans_cancel;
1474 1488
1475 error = xfs_dir_canenter(tp, dp, name, resblks); 1489 error = xfs_dir_canenter(tp, dp, name, resblks);
1476 if (error) 1490 if (error)
1477 goto error_return; 1491 goto out_trans_cancel;
1478 error = xfs_dir_ialloc(&tp, dp, mode, 1, 1492
1479 rdev, credp, prid, resblks > 0, 1493 /*
1480 &ip, &committed); 1494 * A newly created regular or special file just has one directory
 1495 * entry pointing to it, but a directory also has the "." entry
1496 * pointing to itself.
1497 */
1498 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
1499 prid, resblks > 0, &ip, &committed);
1481 if (error) { 1500 if (error) {
1482 if (error == ENOSPC) 1501 if (error == ENOSPC)
1483 goto error_return; 1502 goto out_trans_cancel;
1484 goto abort_return; 1503 goto out_trans_abort;
1485 } 1504 }
1486 xfs_itrace_ref(ip);
1487 1505
1488 /* 1506 /*
1489 * At this point, we've gotten a newly allocated inode. 1507 * At this point, we've gotten a newly allocated inode.
1490 * It is locked (and joined to the transaction). 1508 * It is locked (and joined to the transaction).
1491 */ 1509 */
1492 1510 xfs_itrace_ref(ip);
1493 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1511 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1494 1512
1495 /* 1513 /*
@@ -1508,19 +1526,28 @@ xfs_create(
1508 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 1526 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1509 if (error) { 1527 if (error) {
1510 ASSERT(error != ENOSPC); 1528 ASSERT(error != ENOSPC);
1511 goto abort_return; 1529 goto out_trans_abort;
1512 } 1530 }
1513 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1531 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1514 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1532 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1515 1533
1534 if (is_dir) {
1535 error = xfs_dir_init(tp, ip, dp);
1536 if (error)
1537 goto out_bmap_cancel;
1538
1539 error = xfs_bumplink(tp, dp);
1540 if (error)
1541 goto out_bmap_cancel;
1542 }
1543
1516 /* 1544 /*
1517 * If this is a synchronous mount, make sure that the 1545 * If this is a synchronous mount, make sure that the
1518 * create transaction goes to disk before returning to 1546 * create transaction goes to disk before returning to
1519 * the user. 1547 * the user.
1520 */ 1548 */
1521 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 1549 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1522 xfs_trans_set_sync(tp); 1550 xfs_trans_set_sync(tp);
1523 }
1524 1551
1525 /* 1552 /*
1526 * Attach the dquot(s) to the inodes and modify them incore. 1553 * Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
1537 IHOLD(ip); 1564 IHOLD(ip);
1538 1565
1539 error = xfs_bmap_finish(&tp, &free_list, &committed); 1566 error = xfs_bmap_finish(&tp, &free_list, &committed);
1540 if (error) { 1567 if (error)
1541 xfs_bmap_cancel(&free_list); 1568 goto out_abort_rele;
1542 goto abort_rele;
1543 }
1544 1569
1545 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1570 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1546 if (error) { 1571 if (error) {
1547 IRELE(ip); 1572 IRELE(ip);
1548 tp = NULL; 1573 goto out_dqrele;
1549 goto error_return;
1550 } 1574 }
1551 1575
1552 XFS_QM_DQRELE(mp, udqp); 1576 XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
1555 *ipp = ip; 1579 *ipp = ip;
1556 1580
1557 /* Fallthrough to std_return with error = 0 */ 1581 /* Fallthrough to std_return with error = 0 */
1558 1582 std_return:
1559std_return: 1583 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1560 if ((*ipp || (error != 0 && dm_event_sent != 0)) && 1584 XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
1561 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { 1585 ip, DM_RIGHT_NULL, name->name, NULL, mode,
1562 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, 1586 error, 0);
1563 dp, DM_RIGHT_NULL,
1564 *ipp ? ip : NULL,
1565 DM_RIGHT_NULL, name->name, NULL,
1566 mode, error, 0);
1567 } 1587 }
1588
1568 return error; 1589 return error;
1569 1590
1570 abort_return: 1591 out_bmap_cancel:
1592 xfs_bmap_cancel(&free_list);
1593 out_trans_abort:
1571 cancel_flags |= XFS_TRANS_ABORT; 1594 cancel_flags |= XFS_TRANS_ABORT;
1572 /* FALLTHROUGH */ 1595 out_trans_cancel:
1573 1596 xfs_trans_cancel(tp, cancel_flags);
1574 error_return: 1597 out_dqrele:
1575 if (tp != NULL)
1576 xfs_trans_cancel(tp, cancel_flags);
1577
1578 XFS_QM_DQRELE(mp, udqp); 1598 XFS_QM_DQRELE(mp, udqp);
1579 XFS_QM_DQRELE(mp, gdqp); 1599 XFS_QM_DQRELE(mp, gdqp);
1580 1600
@@ -1583,20 +1603,18 @@ std_return:
1583 1603
1584 goto std_return; 1604 goto std_return;
1585 1605
1586 abort_rele: 1606 out_abort_rele:
1587 /* 1607 /*
1588 * Wait until after the current transaction is aborted to 1608 * Wait until after the current transaction is aborted to
1589 * release the inode. This prevents recursive transactions 1609 * release the inode. This prevents recursive transactions
1590 * and deadlocks from xfs_inactive. 1610 * and deadlocks from xfs_inactive.
1591 */ 1611 */
1612 xfs_bmap_cancel(&free_list);
1592 cancel_flags |= XFS_TRANS_ABORT; 1613 cancel_flags |= XFS_TRANS_ABORT;
1593 xfs_trans_cancel(tp, cancel_flags); 1614 xfs_trans_cancel(tp, cancel_flags);
1594 IRELE(ip); 1615 IRELE(ip);
1595 1616 unlock_dp_on_error = B_FALSE;
1596 XFS_QM_DQRELE(mp, udqp); 1617 goto out_dqrele;
1597 XFS_QM_DQRELE(mp, gdqp);
1598
1599 goto std_return;
1600} 1618}
1601 1619
1602#ifdef DEBUG 1620#ifdef DEBUG
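The rewritten function is easier to follow as a condensed sketch. This paraphrases the hunk above; declarations, DMAPI events and the error-label unwinding are elided, so it is not a drop-in replacement:

	int is_dir = S_ISDIR(mode);

	/* 1. Pick the reservation profile up front instead of per function. */
	if (is_dir) {
		resblks   = XFS_MKDIR_SPACE_RES(mp, name->len);
		log_res   = XFS_MKDIR_LOG_RES(mp);
		log_count = XFS_MKDIR_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
	} else {
		resblks   = XFS_CREATE_SPACE_RES(mp, name->len);
		log_res   = XFS_CREATE_LOG_RES(mp);
		log_count = XFS_CREATE_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	}

	/* 2. A directory is born with nlink 2: "." plus the parent's entry. */
	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
			       prid, resblks > 0, &ip, &committed);

	/* 3. Only directories need "." / ".." setup and a parent nlink bump. */
	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (!error)
			error = xfs_bumplink(tp, dp);
	}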
@@ -2004,8 +2022,10 @@ xfs_link(
2004 /* Return through std_return after this point. */ 2022 /* Return through std_return after this point. */
2005 2023
2006 error = XFS_QM_DQATTACH(mp, sip, 0); 2024 error = XFS_QM_DQATTACH(mp, sip, 0);
2007 if (!error && sip != tdp) 2025 if (error)
2008 error = XFS_QM_DQATTACH(mp, tdp, 0); 2026 goto std_return;
2027
2028 error = XFS_QM_DQATTACH(mp, tdp, 0);
2009 if (error) 2029 if (error)
2010 goto std_return; 2030 goto std_return;
2011 2031
@@ -2110,209 +2130,6 @@ std_return:
2110 goto std_return; 2130 goto std_return;
2111} 2131}
2112 2132
2113
2114int
2115xfs_mkdir(
2116 xfs_inode_t *dp,
2117 struct xfs_name *dir_name,
2118 mode_t mode,
2119 xfs_inode_t **ipp,
2120 cred_t *credp)
2121{
2122 xfs_mount_t *mp = dp->i_mount;
2123 xfs_inode_t *cdp; /* inode of created dir */
2124 xfs_trans_t *tp;
2125 int cancel_flags;
2126 int error;
2127 int committed;
2128 xfs_bmap_free_t free_list;
2129 xfs_fsblock_t first_block;
2130 boolean_t unlock_dp_on_error = B_FALSE;
2131 boolean_t created = B_FALSE;
2132 int dm_event_sent = 0;
2133 xfs_prid_t prid;
2134 struct xfs_dquot *udqp, *gdqp;
2135 uint resblks;
2136
2137 if (XFS_FORCED_SHUTDOWN(mp))
2138 return XFS_ERROR(EIO);
2139
2140 tp = NULL;
2141
2142 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2143 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2144 dp, DM_RIGHT_NULL, NULL,
2145 DM_RIGHT_NULL, dir_name->name, NULL,
2146 mode, 0, 0);
2147 if (error)
2148 return error;
2149 dm_event_sent = 1;
2150 }
2151
2152 /* Return through std_return after this point. */
2153
2154 xfs_itrace_entry(dp);
2155
2156 mp = dp->i_mount;
2157 udqp = gdqp = NULL;
2158 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2159 prid = dp->i_d.di_projid;
2160 else
2161 prid = (xfs_prid_t)dfltprid;
2162
2163 /*
2164 * Make sure that we have allocated dquot(s) on disk.
2165 */
2166 error = XFS_QM_DQVOPALLOC(mp, dp,
2167 current_fsuid(), current_fsgid(), prid,
2168 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2169 if (error)
2170 goto std_return;
2171
2172 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2173 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2174 resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2175 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2176 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2177 if (error == ENOSPC) {
2178 resblks = 0;
2179 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2180 XFS_TRANS_PERM_LOG_RES,
2181 XFS_MKDIR_LOG_COUNT);
2182 }
2183 if (error) {
2184 cancel_flags = 0;
2185 goto error_return;
2186 }
2187
2188 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2189 unlock_dp_on_error = B_TRUE;
2190
2191 /*
2192 * Check for directory link count overflow.
2193 */
2194 if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2195 error = XFS_ERROR(EMLINK);
2196 goto error_return;
2197 }
2198
2199 /*
2200 * Reserve disk quota and the inode.
2201 */
2202 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2203 if (error)
2204 goto error_return;
2205
2206 error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2207 if (error)
2208 goto error_return;
2209 /*
2210 * create the directory inode.
2211 */
2212 error = xfs_dir_ialloc(&tp, dp, mode, 2,
2213 0, credp, prid, resblks > 0,
2214 &cdp, NULL);
2215 if (error) {
2216 if (error == ENOSPC)
2217 goto error_return;
2218 goto abort_return;
2219 }
2220 xfs_itrace_ref(cdp);
2221
2222 /*
2223 * Now we add the directory inode to the transaction.
2224 * We waited until now since xfs_dir_ialloc might start
2225 * a new transaction. Had we joined the transaction
2226 * earlier, the locks might have gotten released. An error
2227 * from here on will result in the transaction cancel
2228 * unlocking dp so don't do it explicitly in the error path.
2229 */
2230 IHOLD(dp);
2231 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2232 unlock_dp_on_error = B_FALSE;
2233
2234 xfs_bmap_init(&free_list, &first_block);
2235
2236 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2237 &first_block, &free_list, resblks ?
2238 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2239 if (error) {
2240 ASSERT(error != ENOSPC);
2241 goto error1;
2242 }
2243 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2244
2245 error = xfs_dir_init(tp, cdp, dp);
2246 if (error)
2247 goto error2;
2248
2249 error = xfs_bumplink(tp, dp);
2250 if (error)
2251 goto error2;
2252
2253 created = B_TRUE;
2254
2255 *ipp = cdp;
2256 IHOLD(cdp);
2257
2258 /*
2259 * Attach the dquots to the new inode and modify the icount incore.
2260 */
2261 XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2262
2263 /*
2264 * If this is a synchronous mount, make sure that the
2265 * mkdir transaction goes to disk before returning to
2266 * the user.
2267 */
2268 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2269 xfs_trans_set_sync(tp);
2270 }
2271
2272 error = xfs_bmap_finish(&tp, &free_list, &committed);
2273 if (error) {
2274 IRELE(cdp);
2275 goto error2;
2276 }
2277
2278 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2279 XFS_QM_DQRELE(mp, udqp);
2280 XFS_QM_DQRELE(mp, gdqp);
2281 if (error) {
2282 IRELE(cdp);
2283 }
2284
2285 /* Fall through to std_return with error = 0 or errno from
2286 * xfs_trans_commit. */
2287
2288std_return:
2289 if ((created || (error != 0 && dm_event_sent != 0)) &&
2290 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2291 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2292 dp, DM_RIGHT_NULL,
2293 created ? cdp : NULL,
2294 DM_RIGHT_NULL,
2295 dir_name->name, NULL,
2296 mode, error, 0);
2297 }
2298 return error;
2299
2300 error2:
2301 error1:
2302 xfs_bmap_cancel(&free_list);
2303 abort_return:
2304 cancel_flags |= XFS_TRANS_ABORT;
2305 error_return:
2306 xfs_trans_cancel(tp, cancel_flags);
2307 XFS_QM_DQRELE(mp, udqp);
2308 XFS_QM_DQRELE(mp, gdqp);
2309
2310 if (unlock_dp_on_error)
2311 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2312
2313 goto std_return;
2314}
2315
2316int 2133int
2317xfs_symlink( 2134xfs_symlink(
2318 xfs_inode_t *dp, 2135 xfs_inode_t *dp,
@@ -2587,51 +2404,6 @@ std_return:
2587} 2404}
2588 2405
2589int 2406int
2590xfs_inode_flush(
2591 xfs_inode_t *ip,
2592 int flags)
2593{
2594 xfs_mount_t *mp = ip->i_mount;
2595 int error = 0;
2596
2597 if (XFS_FORCED_SHUTDOWN(mp))
2598 return XFS_ERROR(EIO);
2599
2600 /*
2601 * Bypass inodes which have already been cleaned by
2602 * the inode flush clustering code inside xfs_iflush
2603 */
2604 if (xfs_inode_clean(ip))
2605 return 0;
2606
2607 /*
2608 * We make this non-blocking if the inode is contended,
2609 * return EAGAIN to indicate to the caller that they
2610 * did not succeed. This prevents the flush path from
2611 * blocking on inodes inside another operation right
2612 * now, they get caught later by xfs_sync.
2613 */
2614 if (flags & FLUSH_SYNC) {
2615 xfs_ilock(ip, XFS_ILOCK_SHARED);
2616 xfs_iflock(ip);
2617 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
2618 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
2619 xfs_iunlock(ip, XFS_ILOCK_SHARED);
2620 return EAGAIN;
2621 }
2622 } else {
2623 return EAGAIN;
2624 }
2625
2626 error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
2627 : XFS_IFLUSH_ASYNC_NOBLOCK);
2628 xfs_iunlock(ip, XFS_ILOCK_SHARED);
2629
2630 return error;
2631}
2632
2633
2634int
2635xfs_set_dmattrs( 2407xfs_set_dmattrs(
2636 xfs_inode_t *ip, 2408 xfs_inode_t *ip,
2637 u_int evmask, 2409 u_int evmask,
@@ -2676,7 +2448,7 @@ xfs_reclaim(
2676 ASSERT(!VN_MAPPED(VFS_I(ip))); 2448 ASSERT(!VN_MAPPED(VFS_I(ip)));
2677 2449
2678 /* bad inode, get out here ASAP */ 2450 /* bad inode, get out here ASAP */
2679 if (VN_BAD(VFS_I(ip))) { 2451 if (is_bad_inode(VFS_I(ip))) {
2680 xfs_ireclaim(ip); 2452 xfs_ireclaim(ip);
2681 return 0; 2453 return 0;
2682 } 2454 }
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
3090 2862
3091 /* 2863 /*
3092 * Need to zero the stuff we're not freeing, on disk. 2864 * Need to zero the stuff we're not freeing, on disk.
3093 * If its a realtime file & can't use unwritten extents then we 2865 * If it's a realtime file & can't use unwritten extents then we
3094 * actually need to zero the extent edges. Otherwise xfs_bunmapi 2866 * actually need to zero the extent edges. Otherwise xfs_bunmapi
3095 * will take care of it for us. 2867 * will take care of it for us.
3096 */ 2868 */
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 76df328c61b4..04373c6c61ff 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31 struct xfs_inode *ip); 31 struct xfs_inode *ip);
32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
33 struct xfs_name *target_name); 33 struct xfs_name *target_name);
34int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
35 mode_t mode, struct xfs_inode **ipp, cred_t *credp);
36int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 34int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
37 xfs_off_t *offset, filldir_t filldir); 35 xfs_off_t *offset, filldir_t filldir);
38int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 36int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
39 const char *target_path, mode_t mode, struct xfs_inode **ipp, 37 const char *target_path, mode_t mode, struct xfs_inode **ipp,
40 cred_t *credp); 38 cred_t *credp);
41int xfs_inode_flush(struct xfs_inode *ip, int flags);
42int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 39int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
43int xfs_reclaim(struct xfs_inode *ip); 40int xfs_reclaim(struct xfs_inode *ip);
44int xfs_change_file_space(struct xfs_inode *ip, int cmd, 41int xfs_change_file_space(struct xfs_inode *ip, int cmd,