Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig  9
-rw-r--r--  fs/Makefile  5
-rw-r--r--  fs/adfs/super.c  16
-rw-r--r--  fs/affs/super.c  4
-rw-r--r--  fs/afs/Kconfig  8
-rw-r--r--  fs/afs/Makefile  3
-rw-r--r--  fs/afs/cache.c  503
-rw-r--r--  fs/afs/cache.h  15
-rw-r--r--  fs/afs/cell.c  16
-rw-r--r--  fs/afs/file.c  220
-rw-r--r--  fs/afs/inode.c  31
-rw-r--r--  fs/afs/internal.h  53
-rw-r--r--  fs/afs/main.c  27
-rw-r--r--  fs/afs/mntpt.c  4
-rw-r--r--  fs/afs/vlocation.c  25
-rw-r--r--  fs/afs/volume.c  14
-rw-r--r--  fs/afs/write.c  21
-rw-r--r--  fs/befs/linuxvfs.c  3
-rw-r--r--  fs/binfmt_elf.c  22
-rw-r--r--  fs/binfmt_elf_fdpic.c  25
-rw-r--r--  fs/binfmt_som.c  7
-rw-r--r--  fs/bio.c  3
-rw-r--r--  fs/block_dev.c  1
-rw-r--r--  fs/btrfs/acl.c  2
-rw-r--r--  fs/btrfs/async-thread.c  7
-rw-r--r--  fs/btrfs/ctree.c  312
-rw-r--r--  fs/btrfs/ctree.h  84
-rw-r--r--  fs/btrfs/delayed-ref.c  1
-rw-r--r--  fs/btrfs/disk-io.c  8
-rw-r--r--  fs/btrfs/extent-tree.c  398
-rw-r--r--  fs/btrfs/extent_io.c  16
-rw-r--r--  fs/btrfs/extent_map.c  1
-rw-r--r--  fs/btrfs/free-space-cache.c  530
-rw-r--r--  fs/btrfs/free-space-cache.h  44
-rw-r--r--  fs/btrfs/inode.c  5
-rw-r--r--  fs/btrfs/ioctl.c  2
-rw-r--r--  fs/btrfs/locking.c  4
-rw-r--r--  fs/btrfs/super.c  54
-rw-r--r--  fs/btrfs/transaction.c  7
-rw-r--r--  fs/btrfs/tree-log.c  12
-rw-r--r--  fs/btrfs/volumes.c  41
-rw-r--r--  fs/btrfs/volumes.h  2
-rw-r--r--  fs/buffer.c  12
-rw-r--r--  fs/cachefiles/Kconfig  39
-rw-r--r--  fs/cachefiles/Makefile  18
-rw-r--r--  fs/cachefiles/bind.c  286
-rw-r--r--  fs/cachefiles/daemon.c  755
-rw-r--r--  fs/cachefiles/interface.c  449
-rw-r--r--  fs/cachefiles/internal.h  360
-rw-r--r--  fs/cachefiles/key.c  159
-rw-r--r--  fs/cachefiles/main.c  106
-rw-r--r--  fs/cachefiles/namei.c  771
-rw-r--r--  fs/cachefiles/proc.c  134
-rw-r--r--  fs/cachefiles/rdwr.c  879
-rw-r--r--  fs/cachefiles/security.c  116
-rw-r--r--  fs/cachefiles/xattr.c  291
-rw-r--r--  fs/cifs/dir.c  4
-rw-r--r--  fs/cifs/inode.c  4
-rw-r--r--  fs/compat.c  105
-rw-r--r--  fs/compat_ioctl.c  2
-rw-r--r--  fs/cramfs/inode.c  39
-rw-r--r--  fs/cramfs/uncompress.c  2
-rw-r--r--  fs/dcache.c  2
-rw-r--r--  fs/drop_caches.c  2
-rw-r--r--  fs/efs/super.c  20
-rw-r--r--  fs/exec.c  35
-rw-r--r--  fs/exofs/BUGS  3
-rw-r--r--  fs/exofs/Kbuild  16
-rw-r--r--  fs/exofs/Kconfig  13
-rw-r--r--  fs/exofs/common.h  184
-rw-r--r--  fs/exofs/dir.c  672
-rw-r--r--  fs/exofs/exofs.h  180
-rw-r--r--  fs/exofs/file.c  87
-rw-r--r--  fs/exofs/inode.c  1303
-rw-r--r--  fs/exofs/namei.c  342
-rw-r--r--  fs/exofs/osd.c  153
-rw-r--r--  fs/exofs/super.c  584
-rw-r--r--  fs/exofs/symlink.c  57
-rw-r--r--  fs/ext2/acl.c  2
-rw-r--r--  fs/ext3/acl.c  2
-rw-r--r--  fs/ext3/dir.c  2
-rw-r--r--  fs/ext3/file.c  6
-rw-r--r--  fs/ext3/inode.c  142
-rw-r--r--  fs/ext3/ioctl.c  59
-rw-r--r--  fs/ext3/namei.c  35
-rw-r--r--  fs/ext4/Kconfig  2
-rw-r--r--  fs/ext4/acl.c  2
-rw-r--r--  fs/fat/inode.c  8
-rw-r--r--  fs/file_table.c  1
-rw-r--r--  fs/fs-writeback.c  31
-rw-r--r--  fs/fs_struct.c  177
-rw-r--r--  fs/fscache/Kconfig  56
-rw-r--r--  fs/fscache/Makefile  19
-rw-r--r--  fs/fscache/cache.c  415
-rw-r--r--  fs/fscache/cookie.c  500
-rw-r--r--  fs/fscache/fsdef.c  144
-rw-r--r--  fs/fscache/histogram.c  109
-rw-r--r--  fs/fscache/internal.h  380
-rw-r--r--  fs/fscache/main.c  124
-rw-r--r--  fs/fscache/netfs.c  103
-rw-r--r--  fs/fscache/object.c  810
-rw-r--r--  fs/fscache/operation.c  459
-rw-r--r--  fs/fscache/page.c  816
-rw-r--r--  fs/fscache/proc.c  68
-rw-r--r--  fs/fscache/stats.c  212
-rw-r--r--  fs/fuse/dir.c  1
-rw-r--r--  fs/fuse/file.c  54
-rw-r--r--  fs/generic_acl.c  2
-rw-r--r--  fs/gfs2/acl.c  2
-rw-r--r--  fs/hfs/super.c  3
-rw-r--r--  fs/hfsplus/options.c  2
-rw-r--r--  fs/hfsplus/super.c  3
-rw-r--r--  fs/hpfs/super.c  5
-rw-r--r--  fs/hppfs/hppfs.c  7
-rw-r--r--  fs/internal.h  8
-rw-r--r--  fs/isofs/inode.c  3
-rw-r--r--  fs/jbd/commit.c  23
-rw-r--r--  fs/jbd/journal.c  34
-rw-r--r--  fs/jbd/transaction.c  2
-rw-r--r--  fs/jffs2/acl.c  2
-rw-r--r--  fs/jfs/acl.c  2
-rw-r--r--  fs/minix/inode.c  11
-rw-r--r--  fs/mpage.c  13
-rw-r--r--  fs/namei.c  14
-rw-r--r--  fs/namespace.c  61
-rw-r--r--  fs/nfs/Kconfig  8
-rw-r--r--  fs/nfs/Makefile  1
-rw-r--r--  fs/nfs/client.c  14
-rw-r--r--  fs/nfs/file.c  38
-rw-r--r--  fs/nfs/fscache-index.c  337
-rw-r--r--  fs/nfs/fscache.c  523
-rw-r--r--  fs/nfs/fscache.h  220
-rw-r--r--  fs/nfs/inode.c  14
-rw-r--r--  fs/nfs/internal.h  4
-rw-r--r--  fs/nfs/iostat.h  18
-rw-r--r--  fs/nfs/nfs3proc.c  6
-rw-r--r--  fs/nfs/nfs4proc.c  2
-rw-r--r--  fs/nfs/read.c  27
-rw-r--r--  fs/nfs/super.c  45
-rw-r--r--  fs/nfsd/nfssvc.c  7
-rw-r--r--  fs/ocfs2/acl.c  2
-rw-r--r--  fs/ocfs2/alloc.c  57
-rw-r--r--  fs/ocfs2/alloc.h  3
-rw-r--r--  fs/ocfs2/aops.c  23
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c  96
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h  3
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c  9
-rw-r--r--  fs/ocfs2/dir.c  2806
-rw-r--r--  fs/ocfs2/dir.h  57
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h  58
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c  87
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c  29
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c  387
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c  20
-rw-r--r--  fs/ocfs2/dlmglue.c  46
-rw-r--r--  fs/ocfs2/dlmglue.h  2
-rw-r--r--  fs/ocfs2/export.c  84
-rw-r--r--  fs/ocfs2/inode.c  48
-rw-r--r--  fs/ocfs2/inode.h  5
-rw-r--r--  fs/ocfs2/journal.c  173
-rw-r--r--  fs/ocfs2/journal.h  77
-rw-r--r--  fs/ocfs2/localalloc.c  86
-rw-r--r--  fs/ocfs2/namei.c  250
-rw-r--r--  fs/ocfs2/ocfs2.h  76
-rw-r--r--  fs/ocfs2/ocfs2_fs.h  136
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h  4
-rw-r--r--  fs/ocfs2/suballoc.c  254
-rw-r--r--  fs/ocfs2/suballoc.h  4
-rw-r--r--  fs/ocfs2/super.c  188
-rw-r--r--  fs/ocfs2/xattr.c  8
-rw-r--r--  fs/ocfs2/xattr.h  2
-rw-r--r--  fs/omfs/inode.c  7
-rw-r--r--  fs/open.c  1
-rw-r--r--  fs/proc/base.c  1
-rw-r--r--  fs/proc/meminfo.c  2
-rw-r--r--  fs/proc/nommu.c  2
-rw-r--r--  fs/proc/task_nommu.c  7
-rw-r--r--  fs/qnx4/inode.c  3
-rw-r--r--  fs/quota/dquot.c  2
-rw-r--r--  fs/read_write.c  56
-rw-r--r--  fs/reiserfs/Kconfig  1
-rw-r--r--  fs/reiserfs/super.c  5
-rw-r--r--  fs/reiserfs/xattr_acl.c  2
-rw-r--r--  fs/splice.c  3
-rw-r--r--  fs/squashfs/super.c  3
-rw-r--r--  fs/super.c  1
-rw-r--r--  fs/sysv/inode.c  3
-rw-r--r--  fs/ubifs/Kconfig  4
-rw-r--r--  fs/udf/balloc.c  150
-rw-r--r--  fs/udf/dir.c  14
-rw-r--r--  fs/udf/directory.c  38
-rw-r--r--  fs/udf/ecma_167.h  416
-rw-r--r--  fs/udf/ialloc.c  9
-rw-r--r--  fs/udf/inode.c  213
-rw-r--r--  fs/udf/misc.c  29
-rw-r--r--  fs/udf/namei.c  86
-rw-r--r--  fs/udf/osta_udf.h  22
-rw-r--r--  fs/udf/partition.c  2
-rw-r--r--  fs/udf/super.c  605
-rw-r--r--  fs/udf/truncate.c  44
-rw-r--r--  fs/udf/udf_i.h  6
-rw-r--r--  fs/udf/udf_sb.h  9
-rw-r--r--  fs/udf/udfdecl.h  57
-rw-r--r--  fs/udf/udfend.h  28
-rw-r--r--  fs/udf/udftime.c  6
-rw-r--r--  fs/udf/unicode.c  62
-rw-r--r--  fs/ufs/super.c  3
-rw-r--r--  fs/xfs/Makefile  1
-rw-r--r--  fs/xfs/linux-2.6/mutex.h  25
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  107
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  37
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h  13
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c  157
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  137
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h  32
-rw-r--r--  fs/xfs/quota/xfs_dquot.c  28
-rw-r--r--  fs/xfs/quota/xfs_dquot.h  18
-rw-r--r--  fs/xfs/quota/xfs_qm.c  212
-rw-r--r--  fs/xfs/quota/xfs_qm.h  26
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c  1
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c  190
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h  40
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c  16
-rw-r--r--  fs/xfs/support/debug.c  1
-rw-r--r--  fs/xfs/support/uuid.c  71
-rw-r--r--  fs/xfs/support/uuid.h  4
-rw-r--r--  fs/xfs/xfs_ag.h  4
-rw-r--r--  fs/xfs/xfs_alloc.c  26
-rw-r--r--  fs/xfs/xfs_alloc.h  6
-rw-r--r--  fs/xfs/xfs_attr_leaf.c  58
-rw-r--r--  fs/xfs/xfs_bmap.c  76
-rw-r--r--  fs/xfs/xfs_bmap.h  6
-rw-r--r--  fs/xfs/xfs_btree.c  4
-rw-r--r--  fs/xfs/xfs_btree.h  2
-rw-r--r--  fs/xfs/xfs_da_btree.c  2
-rw-r--r--  fs/xfs/xfs_da_btree.h  9
-rw-r--r--  fs/xfs/xfs_dfrag.c  68
-rw-r--r--  fs/xfs/xfs_dinode.h  4
-rw-r--r--  fs/xfs/xfs_dir2.c  2
-rw-r--r--  fs/xfs/xfs_dir2_block.c  7
-rw-r--r--  fs/xfs/xfs_dir2_data.h  2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c  17
-rw-r--r--  fs/xfs/xfs_dir2_node.c  2
-rw-r--r--  fs/xfs/xfs_dir2_sf.c  13
-rw-r--r--  fs/xfs/xfs_extfree_item.h  6
-rw-r--r--  fs/xfs/xfs_filestream.c  9
-rw-r--r--  fs/xfs/xfs_fsops.c  2
-rw-r--r--  fs/xfs/xfs_ialloc.c  12
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c  2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h  22
-rw-r--r--  fs/xfs/xfs_inode.h  2
-rw-r--r--  fs/xfs/xfs_inode_item.h  2
-rw-r--r--  fs/xfs/xfs_iomap.h  2
-rw-r--r--  fs/xfs/xfs_itable.c  9
-rw-r--r--  fs/xfs/xfs_log.c  67
-rw-r--r--  fs/xfs/xfs_log.h  3
-rw-r--r--  fs/xfs/xfs_log_priv.h  3
-rw-r--r--  fs/xfs/xfs_log_recover.c  308
-rw-r--r--  fs/xfs/xfs_mount.c  253
-rw-r--r--  fs/xfs/xfs_mount.h  19
-rw-r--r--  fs/xfs/xfs_qmops.c  1
-rw-r--r--  fs/xfs/xfs_quota.h  3
-rw-r--r--  fs/xfs/xfs_rtalloc.c  10
-rw-r--r--  fs/xfs/xfs_rtalloc.h  8
-rw-r--r--  fs/xfs/xfs_trans.h  24
-rw-r--r--  fs/xfs/xfs_trans_ail.c  4
-rw-r--r--  fs/xfs/xfs_trans_item.c  2
-rw-r--r--  fs/xfs/xfs_trans_space.h  2
-rw-r--r--  fs/xfs/xfs_types.h  8
-rw-r--r--  fs/xfs/xfs_utils.c  2
-rw-r--r--  fs/xfs/xfs_vnodeops.c  408
-rw-r--r--  fs/xfs/xfs_vnodeops.h  3
275 files changed, 22122 insertions, 4432 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa3..86b203fc3c56 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -66,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -169,6 +176,8 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 
+source "fs/exofs/Kconfig"
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..70b2aed87133 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
-	stack.o
+	stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM) += dlm/
 
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE) += fscache/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS) += ext2/
@@ -116,7 +117,9 @@ obj-$(CONFIG_AFS_FS) += afs/
 obj-$(CONFIG_BEFS_FS) += befs/
 obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS) += hppfs/
+obj-$(CONFIG_CACHEFILES) += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_EXOFS_FS) += exofs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize = dentry->d_sb->s_blocksize;
-	buf->f_blocks = asb->s_size;
-	buf->f_files = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = sbi->s_size;
+	buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail =
-	buf->f_bfree = adfs_map_free(dentry->d_sb);
+	buf->f_bfree = adfs_map_free(sb);
 	buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
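
[Annotation: this hunk, and the matching affs hunk below, fill in the previously unset f_fsid field by splitting the 64-bit value from huge_encode_dev() across the two u32 halves of f_fsid.val. The sketch below is a hypothetical userspace illustration of that packing, not kernel code; encode_fsid() and struct fsid are invented for the example.]

	#include <stdint.h>
	#include <stdio.h>

	struct fsid { uint32_t val[2]; };

	/* pack a 64-bit device ID into two 32-bit halves, low word first */
	static void encode_fsid(struct fsid *f, uint64_t id)
	{
		f->val[0] = (uint32_t)id;
		f->val[1] = (uint32_t)(id >> 32);
	}

	int main(void)
	{
		struct fsid f;
		encode_fsid(&f, 0x0123456789abcdefULL);
		printf("%08x:%08x\n", f.val[1], f.val[0]); /* prints 01234567:89abcdef */
		return 0;
	}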
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..5ce695e707fe 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int free;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 		 AFFS_SB(sb)->s_reserved);
@@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree = free;
 	buf->f_bavail = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name = "cell_ix",
-	.data_size = sizeof(struct afs_cache_cell),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_cell_cache_match,
-	.update = afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name = "afs",
+	.version = 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name = "AFS.cell",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_cell_cache_get_key,
+	.get_aux = afs_cell_cache_get_aux,
+	.check_aux = afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name = "AFS.vldb",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_vlocation_cache_get_key,
+	.get_aux = afs_vlocation_cache_get_aux,
+	.check_aux = afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name = "AFS.volume",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name = "AFS.vnode",
+	.type = FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key = afs_vnode_cache_get_key,
+	.get_attr = afs_vnode_cache_get_attr,
+	.get_aux = afs_vnode_cache_get_aux,
+	.check_aux = afs_vnode_cache_check_aux,
+	.now_uncached = afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name = "vldb",
-	.data_size = sizeof(struct afs_cache_vlocation),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_vlocation_cache_match,
-	.update = afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name = "volume",
-	.data_size = sizeof(struct afs_cache_vhash),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match = afs_volume_cache_match,
-	.update = afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
 {
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name = "vnode",
-	.data_size = sizeof(struct afs_cache_vnode),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match = afs_vnode_cache_match,
-	.update = afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
+	_leave("");
 }
-#endif
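
[Annotation: the cookie definitions above replace the old cachefs index definitions one for one. The sketch below condenses how they chain together, assembled only from calls that appear elsewhere in this patch (fs/afs/main.c, cell.c, inode.c); error paths and the volume-level cookie, acquired in fs/afs/volume.c and not shown here, are omitted, so this is illustrative, not a standalone module:]

	/* once, at module init (fs/afs/main.c) */
	ret = fscache_register_netfs(&afs_cache_netfs);

	/* per cell, hung off the netfs primary index (fs/afs/cell.c) */
	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
					     &afs_cell_cache_index_def, cell);

	/* per inode, hung off the volume cookie (fs/afs/inode.c) */
	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
					      &afs_vnode_cache_index_def, vnode);

	/* teardown runs in reverse (fs/afs/inode.c, fs/afs/main.c) */
	fscache_relinquish_cookie(vnode->cache, 0);
	fscache_unregister_netfs(&afs_cache_netfs);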
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
 * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..7a1d942ef68d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open = afs_open,
 	.release = afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage = afs_readpage,
+	.readpages = afs_readpages,
 	.set_page_dirty = afs_set_page_dirty,
 	.launder_page = afs_launder_page,
 	.releasepage = afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
 }
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
-}
-#endif
 
 /*
  * AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-		/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 		/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-		/* no page available in cache */
-	case -ENOBUFS:
+		/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+		/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			ret = -ESTALE;
 		}
-#ifdef AFS_CACHING_SUPPORT
-		cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+		fscache_uncache_page(vnode->cache, page);
 #endif
+		BUG_ON(PageFsCache(page));
 		goto error;
 	}
 
 	SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-	if (cachefs_write_page(vnode->cache,
-			       page,
-			       afs_file_readpage_write_complete,
-			       NULL,
-			       GFP_KERNEL) != 0
-	    ) {
-		cachefs_uncache_page(vnode->cache, page);
-		unlock_page(page);
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
 	}
-#else
-	unlock_page(page);
 #endif
+	unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +216,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+		/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+		/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+		/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
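
[Annotation: both read paths above pivot on the same three-way result from the FS-Cache read call. The cases below are condensed from afs_readpages() in this patch and are not standalone code:]

	ret = fscache_read_or_alloc_pages(vnode->cache, mapping, pages,
					  &nr_pages,
					  afs_file_readpage_read_complete,
					  NULL, mapping_gfp_mask(mapping));
	switch (ret) {
	case 0:		/* the cache is reading every page itself */
		return 0;
	case -ENODATA:	/* the cache can store the pages but has no data */
	case -ENOBUFS:	/* the cache will not store these pages */
		break;	/* fall back to fetching from the server */
	default:
		return ret;
	}
	/* anything left on the list is fetched from the network and, once up
	 * to date, offered back to the cache with fscache_write_page() */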
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink = vnode->status.nlink;
 	inode->i_uid = vnode->status.owner;
 	inode->i_gid = 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key *anonymous_key;	/* anonymous user key for this cell */
 	struct list_head proc_link;	/* /proc cell list link */
 	struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie *cache;	/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie *cache;	/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head grave;		/* link in master graveyard list */
 	struct list_head update;	/* link in master update list */
 	struct afs_cell *cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie *cache;	/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie *cache;	/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb; /* volume information DB record */
 	struct afs_volume *vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t usage;
 	struct afs_cell *cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation *vlocation; /* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie *cache;	/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie *cache;	/* caching cookie */
 #endif
 	afs_volid_t vid;		/* volume ID */
 	afs_voltype_t type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server *server;	/* server currently supplying this file */
 	struct afs_fid fid;		/* the file identifier for this inode */
 	struct afs_file_status status;	/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie *cache;	/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
	struct fscache_cookie *cache;	/* caching cookie */
 #endif
 	struct afs_permits *permits;	/* cache of permits so far obtained */
 	struct mutex permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
 */
 extern struct rw_semaphore afs_proc_cells_sem;
 extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
 * main.c
 */
 extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
 
 /*
 * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
 /*
 * vlclient.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
				    const char *, struct afs_cache_vlocation *,
				    const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
 /*
 * vnode.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
 static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
 {
	return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 /*
 * volume.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
 extern void afs_put_volume(struct afs_volume *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
1/* AFS client file system 1/* AFS client file system
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
29module_param(rootcell, charp, 0); 29module_param(rootcell, charp, 0);
30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
31 31
32#ifdef AFS_CACHING_SUPPORT
33static struct cachefs_netfs_operations afs_cache_ops = {
34 .get_page_cookie = afs_cache_get_page_cookie,
35};
36
37struct cachefs_netfs afs_cache_netfs = {
38 .name = "afs",
39 .version = 0,
40 .ops = &afs_cache_ops,
41};
42#endif
43
44struct afs_uuid afs_uuid; 32struct afs_uuid afs_uuid;
45 33
46/* 34/*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
104 if (ret < 0) 92 if (ret < 0)
105 return ret; 93 return ret;
106 94
107#ifdef AFS_CACHING_SUPPORT 95#ifdef CONFIG_AFS_FSCACHE
108 /* we want to be able to cache */ 96 /* we want to be able to cache */
109 ret = cachefs_register_netfs(&afs_cache_netfs, 97 ret = fscache_register_netfs(&afs_cache_netfs);
110 &afs_cache_cell_index_def);
111 if (ret < 0) 98 if (ret < 0)
112 goto error_cache; 99 goto error_cache;
113#endif 100#endif
@@ -142,8 +129,8 @@ error_fs:
142error_open_socket: 129error_open_socket:
143error_vl_update_init: 130error_vl_update_init:
144error_cell_init: 131error_cell_init:
145#ifdef AFS_CACHING_SUPPORT 132#ifdef CONFIG_AFS_FSCACHE
146 cachefs_unregister_netfs(&afs_cache_netfs); 133 fscache_unregister_netfs(&afs_cache_netfs);
147error_cache: 134error_cache:
148#endif 135#endif
149 afs_callback_update_kill(); 136 afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
175 afs_vlocation_purge(); 162 afs_vlocation_purge();
176 flush_scheduled_work(); 163 flush_scheduled_work();
177 afs_cell_purge(); 164 afs_cell_purge();
178#ifdef AFS_CACHING_SUPPORT 165#ifdef CONFIG_AFS_FSCACHE
179 cachefs_unregister_netfs(&afs_cache_netfs); 166 fscache_unregister_netfs(&afs_cache_netfs);
180#endif 167#endif
181 afs_proc_cleanup(); 168 afs_proc_cleanup();
182 rcu_barrier(); 169 rcu_barrier();
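
The removed cachefs_netfs definition above reappears elsewhere in the AFS code in FS-Cache form. A minimal sketch of the new-style definition, assuming the 2009 FS-Cache API in which a netfs supplies only a name and an interface version:

	struct fscache_netfs afs_cache_netfs = {
		.name		= "afs",
		.version	= 0,
	};

fscache_register_netfs() presumably allocates the primary index cookie on the netfs's behalf, which would explain why afs_init() above no longer passes &afs_cache_cell_index_def to the registration call.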
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
173 if (PageError(page)) 173 if (PageError(page))
174 goto error; 174 goto error;
175 175
176 buf = kmap(page); 176 buf = kmap_atomic(page, KM_USER0);
177 memcpy(devname, buf, size); 177 memcpy(devname, buf, size);
178 kunmap(page); 178 kunmap_atomic(buf, KM_USER0);
179 page_cache_release(page); 179 page_cache_release(page);
180 page = NULL; 180 page = NULL;
181 181
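
The hunk above trades a sleeping kmap() for kmap_atomic(): the mapping is held only across a short memcpy(), so the cheaper per-CPU atomic mapping suffices. A minimal sketch of the pattern, using the KM_USER0-era kmap_atomic() signature seen here; copy_from_page() is a hypothetical helper name:

	static void copy_from_page(struct page *page, void *dst, size_t len)
	{
		void *buf;

		buf = kmap_atomic(page, KM_USER0);	/* disables preemption */
		memcpy(dst, buf, len);			/* must not sleep here */
		kunmap_atomic(buf, KM_USER0);
	}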
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
281 281
282 vl->vldb = *vldb; 282 vl->vldb = *vldb;
283 283
284#ifdef AFS_CACHING_SUPPORT 284#ifdef CONFIG_AFS_FSCACHE
285 /* update volume entry in local cache */ 285 fscache_update_cookie(vl->cache);
286 cachefs_update_cookie(vl->cache);
287#endif 286#endif
288} 287}
289 288
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
304 memset(&vldb, 0, sizeof(vldb)); 303 memset(&vldb, 0, sizeof(vldb));
305 304
306 /* see if we have an in-cache copy (will set vl->valid if there is) */ 305 /* see if we have an in-cache copy (will set vl->valid if there is) */
307#ifdef AFS_CACHING_SUPPORT 306#ifdef CONFIG_AFS_FSCACHE
308 cachefs_acquire_cookie(cell->cache, 307 vl->cache = fscache_acquire_cookie(vl->cell->cache,
309 &afs_volume_cache_index_def, 308 &afs_vlocation_cache_index_def, vl);
310 vlocation,
311 &vl->cache);
312#endif 309#endif
313 310
314 if (vl->valid) { 311 if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
420 spin_unlock(&vl->lock); 417 spin_unlock(&vl->lock);
421 wake_up(&vl->waitq); 418 wake_up(&vl->waitq);
422 419
420 /* update volume entry in local cache */
421#ifdef CONFIG_AFS_FSCACHE
422 fscache_update_cookie(vl->cache);
423#endif
424
423 /* schedule for regular updates */ 425 /* schedule for regular updates */
424 afs_vlocation_queue_for_updates(vl); 426 afs_vlocation_queue_for_updates(vl);
425 goto success; 427 goto success;
@@ -465,7 +467,7 @@ found_in_memory:
465 spin_unlock(&vl->lock); 467 spin_unlock(&vl->lock);
466 468
467success: 469success:
468 _leave(" = %p",vl); 470 _leave(" = %p", vl);
469 return vl; 471 return vl;
470 472
471error_abandon: 473error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
523{ 525{
524 _enter("%p", vl); 526 _enter("%p", vl);
525 527
526#ifdef AFS_CACHING_SUPPORT 528#ifdef CONFIG_AFS_FSCACHE
527 cachefs_relinquish_cookie(vl->cache, 0); 529 fscache_relinquish_cookie(vl->cache, 0);
528#endif 530#endif
529
530 afs_put_cell(vl->cell); 531 afs_put_cell(vl->cell);
531 kfree(vl); 532 kfree(vl);
532} 533}
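
Taken together, the vlocation hunks follow the standard FS-Cache cookie lifecycle. Condensed to just the calls that appear above (error handling omitted):

	#ifdef CONFIG_AFS_FSCACHE
		vl->cache = fscache_acquire_cookie(vl->cell->cache,	/* parent index */
						   &afs_vlocation_cache_index_def,
						   vl);			/* netfs private data */
		/* ... after the record changes ... */
		fscache_update_cookie(vl->cache);	/* push updated index keys */
		/* ... on teardown ... */
		fscache_relinquish_cookie(vl->cache, 0); /* 0: retain the cached data */
	#endif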
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
124 } 124 }
125 125
126 /* attach the cache and volume location */ 126 /* attach the cache and volume location */
127#ifdef AFS_CACHING_SUPPORT 127#ifdef CONFIG_AFS_FSCACHE
128 cachefs_acquire_cookie(vlocation->cache, 128 volume->cache = fscache_acquire_cookie(vlocation->cache,
129 &afs_vnode_cache_index_def, 129 &afs_volume_cache_index_def,
130 volume, 130 volume);
131 &volume->cache);
132#endif 131#endif
133
134 afs_get_vlocation(vlocation); 132 afs_get_vlocation(vlocation);
135 volume->vlocation = vlocation; 133 volume->vlocation = vlocation;
136 134
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
194 up_write(&vlocation->cell->vl_sem); 192 up_write(&vlocation->cell->vl_sem);
195 193
196 /* finish cleaning up the volume */ 194 /* finish cleaning up the volume */
197#ifdef AFS_CACHING_SUPPORT 195#ifdef CONFIG_AFS_FSCACHE
198 cachefs_relinquish_cookie(volume->cache, 0); 196 fscache_relinquish_cookie(volume->cache, 0);
199#endif 197#endif
200 afs_put_vlocation(vlocation); 198 afs_put_vlocation(vlocation);
201 199
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
780 _leave(" = %d", ret); 780 _leave(" = %d", ret);
781 return ret; 781 return ret;
782} 782}
783
784/*
785 * notification that a previously read-only page is about to become writable
786 * - if it returns an error, the caller will deliver a bus error signal
787 */
788int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
789{
790 struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
791
792 _enter("{{%x:%u}},{%lx}",
793 vnode->fid.vid, vnode->fid.vnode, page->index);
794
795 /* wait for the page to be written to the cache before we allow it to
796 * be modified */
797#ifdef CONFIG_AFS_FSCACHE
798 fscache_wait_on_page_write(vnode->cache, page);
799#endif
800
801 _leave(" = 0");
802 return 0;
803}
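
afs_page_mkwrite() only takes effect once it is hooked into the mapping's vm_operations_struct; that wiring is not part of this hunk. A hypothetical sketch matching the (vma, page) signature used above (afs_file_vm_ops is an assumed name):

	static struct vm_operations_struct afs_file_vm_ops = {
		.fault		= filemap_fault,
		.page_mkwrite	= afs_page_mkwrite,
	};

	/* in the filesystem's ->mmap() handler: */
	vma->vm_ops = &afs_file_vm_ops;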
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
900befs_statfs(struct dentry *dentry, struct kstatfs *buf) 900befs_statfs(struct dentry *dentry, struct kstatfs *buf)
901{ 901{
902 struct super_block *sb = dentry->d_sb; 902 struct super_block *sb = dentry->d_sb;
903 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
903 904
904 befs_debug(sb, "---> befs_statfs()"); 905 befs_debug(sb, "---> befs_statfs()");
905 906
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
910 buf->f_bavail = buf->f_bfree; 911 buf->f_bavail = buf->f_bfree;
911 buf->f_files = 0; /* UNKNOWN */ 912 buf->f_files = 0; /* UNKNOWN */
912 buf->f_ffree = 0; /* UNKNOWN */ 913 buf->f_ffree = 0; /* UNKNOWN */
914 buf->f_fsid.val[0] = (u32)id;
915 buf->f_fsid.val[1] = (u32)(id >> 32);
913 buf->f_namelen = BEFS_NAME_LEN; 916 buf->f_namelen = BEFS_NAME_LEN;
914 917
915 befs_debug(sb, "<--- befs_statfs()"); 918 befs_debug(sb, "<--- befs_statfs()");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/stat.h>
16#include <linux/time.h>
17#include <linux/mm.h> 15#include <linux/mm.h>
18#include <linux/mman.h> 16#include <linux/mman.h>
19#include <linux/errno.h> 17#include <linux/errno.h>
@@ -21,20 +19,15 @@
21#include <linux/binfmts.h> 19#include <linux/binfmts.h>
22#include <linux/string.h> 20#include <linux/string.h>
23#include <linux/file.h> 21#include <linux/file.h>
24#include <linux/fcntl.h>
25#include <linux/ptrace.h>
26#include <linux/slab.h> 22#include <linux/slab.h>
27#include <linux/shm.h>
28#include <linux/personality.h> 23#include <linux/personality.h>
29#include <linux/elfcore.h> 24#include <linux/elfcore.h>
30#include <linux/init.h> 25#include <linux/init.h>
31#include <linux/highuid.h> 26#include <linux/highuid.h>
32#include <linux/smp.h>
33#include <linux/compiler.h> 27#include <linux/compiler.h>
34#include <linux/highmem.h> 28#include <linux/highmem.h>
35#include <linux/pagemap.h> 29#include <linux/pagemap.h>
36#include <linux/security.h> 30#include <linux/security.h>
37#include <linux/syscalls.h>
38#include <linux/random.h> 31#include <linux/random.h>
39#include <linux/elf.h> 32#include <linux/elf.h>
40#include <linux/utsname.h> 33#include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
576 unsigned long error; 569 unsigned long error;
577 struct elf_phdr *elf_ppnt, *elf_phdata; 570 struct elf_phdr *elf_ppnt, *elf_phdata;
578 unsigned long elf_bss, elf_brk; 571 unsigned long elf_bss, elf_brk;
579 int elf_exec_fileno;
580 int retval, i; 572 int retval, i;
581 unsigned int size; 573 unsigned int size;
582 unsigned long elf_entry; 574 unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
631 goto out_free_ph; 623 goto out_free_ph;
632 } 624 }
633 625
634 retval = get_unused_fd();
635 if (retval < 0)
636 goto out_free_ph;
637 get_file(bprm->file);
638 fd_install(elf_exec_fileno = retval, bprm->file);
639
640 elf_ppnt = elf_phdata; 626 elf_ppnt = elf_phdata;
641 elf_bss = 0; 627 elf_bss = 0;
642 elf_brk = 0; 628 elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
655 retval = -ENOEXEC; 641 retval = -ENOEXEC;
656 if (elf_ppnt->p_filesz > PATH_MAX || 642 if (elf_ppnt->p_filesz > PATH_MAX ||
657 elf_ppnt->p_filesz < 2) 643 elf_ppnt->p_filesz < 2)
658 goto out_free_file; 644 goto out_free_ph;
659 645
660 retval = -ENOMEM; 646 retval = -ENOMEM;
661 elf_interpreter = kmalloc(elf_ppnt->p_filesz, 647 elf_interpreter = kmalloc(elf_ppnt->p_filesz,
662 GFP_KERNEL); 648 GFP_KERNEL);
663 if (!elf_interpreter) 649 if (!elf_interpreter)
664 goto out_free_file; 650 goto out_free_ph;
665 651
666 retval = kernel_read(bprm->file, elf_ppnt->p_offset, 652 retval = kernel_read(bprm->file, elf_ppnt->p_offset,
667 elf_interpreter, 653 elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
956 942
957 kfree(elf_phdata); 943 kfree(elf_phdata);
958 944
959 sys_close(elf_exec_fileno);
960
961 set_binfmt(&elf_format); 945 set_binfmt(&elf_format);
962 946
963#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES 947#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
1028 fput(interpreter); 1012 fput(interpreter);
1029out_free_interp: 1013out_free_interp:
1030 kfree(elf_interpreter); 1014 kfree(elf_interpreter);
1031out_free_file:
1032 sys_close(elf_exec_fileno);
1033out_free_ph: 1015out_free_ph:
1034 kfree(elf_phdata); 1016 kfree(elf_phdata);
1035 goto out; 1017 goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..70cfc4b84ae0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
972 params->elfhdr_addr = seg->addr; 972 params->elfhdr_addr = seg->addr;
973 973
974 /* clear any space allocated but not loaded */ 974 /* clear any space allocated but not loaded */
975 if (phdr->p_filesz < phdr->p_memsz) 975 if (phdr->p_filesz < phdr->p_memsz) {
976 clear_user((void *) (seg->addr + phdr->p_filesz), 976 ret = clear_user((void *) (seg->addr + phdr->p_filesz),
977 phdr->p_memsz - phdr->p_filesz); 977 phdr->p_memsz - phdr->p_filesz);
978 if (ret)
979 return ret;
980 }
978 981
979 if (mm) { 982 if (mm) {
980 if (phdr->p_flags & PF_X) { 983 if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1014 struct elf32_fdpic_loadseg *seg; 1017 struct elf32_fdpic_loadseg *seg;
1015 struct elf32_phdr *phdr; 1018 struct elf32_phdr *phdr;
1016 unsigned long load_addr, delta_vaddr; 1019 unsigned long load_addr, delta_vaddr;
1017 int loop, dvset; 1020 int loop, dvset, ret;
1018 1021
1019 load_addr = params->load_addr; 1022 load_addr = params->load_addr;
1020 delta_vaddr = 0; 1023 delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1114 * PT_LOAD */ 1117 * PT_LOAD */
1115 if (prot & PROT_WRITE && disp > 0) { 1118 if (prot & PROT_WRITE && disp > 0) {
1116 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); 1119 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
1117 clear_user((void __user *) maddr, disp); 1120 ret = clear_user((void __user *) maddr, disp);
1121 if (ret)
1122 return ret;
1118 maddr += disp; 1123 maddr += disp;
1119 } 1124 }
1120 1125
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1149 if (prot & PROT_WRITE && excess1 > 0) { 1154 if (prot & PROT_WRITE && excess1 > 0) {
1150 kdebug("clear[%d] ad=%lx sz=%lx", 1155 kdebug("clear[%d] ad=%lx sz=%lx",
1151 loop, maddr + phdr->p_filesz, excess1); 1156 loop, maddr + phdr->p_filesz, excess1);
1152 clear_user((void __user *) maddr + phdr->p_filesz, 1157 ret = clear_user((void __user *) maddr + phdr->p_filesz,
1153 excess1); 1158 excess1);
1159 if (ret)
1160 return ret;
1154 } 1161 }
1155 1162
1156#else 1163#else
1157 if (excess > 0) { 1164 if (excess > 0) {
1158 kdebug("clear[%d] ad=%lx sz=%lx", 1165 kdebug("clear[%d] ad=%lx sz=%lx",
1159 loop, maddr + phdr->p_filesz, excess); 1166 loop, maddr + phdr->p_filesz, excess);
1160 clear_user((void *) maddr + phdr->p_filesz, excess); 1167 ret = clear_user((void *) maddr + phdr->p_filesz, excess);
1168 if (ret)
1169 return ret;
1161 } 1170 }
1162#endif 1171#endif
1163 1172
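
Strictly, clear_user() returns the number of bytes it could not clear, so the checks added above propagate a positive byte count on fault. A common alternative at other call sites maps any nonzero return to -EFAULT:

	if (clear_user((void __user *) addr, len))
		return -EFAULT;	/* fault while zeroing the unloaded tail */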
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
188static int 188static int
189load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) 189load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
190{ 190{
191 int som_exec_fileno;
192 int retval; 191 int retval;
193 unsigned int size; 192 unsigned int size;
194 unsigned long som_entry; 193 unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
220 goto out_free; 219 goto out_free;
221 } 220 }
222 221
223 retval = get_unused_fd();
224 if (retval < 0)
225 goto out_free;
226 get_file(bprm->file);
227 fd_install(som_exec_fileno = retval, bprm->file);
228
229 /* Flush all traces of the currently running executable */ 222 /* Flush all traces of the currently running executable */
230 retval = flush_old_exec(bprm); 223 retval = flush_old_exec(bprm);
231 if (retval) 224 if (retval)
diff --git a/fs/bio.c b/fs/bio.c
index a040cde7f6fd..e0c9e545bbfa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1420,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
1420} 1420}
1421 1421
1422/* 1422/*
1423 * split a bio - only worry about a bio with a single page 1423 * split a bio - only worry about a bio with a single page in its iovec
1424 * in it's iovec
1425 */ 1424 */
1426struct bio_pair *bio_split(struct bio *bi, int first_sectors) 1425struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1427{ 1426{
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c3c6899ccf3..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev)
204 } 204 }
205 return sync_blockdev(bdev); 205 return sync_blockdev(bdev);
206} 206}
207EXPORT_SYMBOL(fsync_bdev);
207 208
208/** 209/**
209 * freeze_bdev -- lock a filesystem and force it into a consistent state 210 * freeze_bdev -- lock a filesystem and force it into a consistent state
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
256 } 256 }
257 257
258 if (!acl) 258 if (!acl)
259 inode->i_mode &= ~current->fs->umask; 259 inode->i_mode &= ~current_umask();
260 } 260 }
261 261
262 if (IS_POSIXACL(dir) && acl) { 262 if (IS_POSIXACL(dir) && acl) {
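
current_umask() is a new accessor introduced alongside the fs_struct rework; judging by the one-for-one substitution above, it is presumably a trivial wrapper (sketch, not the verified definition):

	static inline int current_umask(void)
	{
		return current->fs->umask;
	}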
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 23#include "async-thread.h"
25 24
26#define WORK_QUEUED_BIT 0 25#define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
195 if (!list_empty(&worker->pending)) 194 if (!list_empty(&worker->pending))
196 continue; 195 continue;
197 196
197 if (kthread_should_stop())
198 break;
199
198 /* still no more work?, sleep for real */ 200 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock); 201 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE); 202 set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
208 worker->working = 0; 210 worker->working = 0;
209 spin_unlock_irq(&worker->lock); 211 spin_unlock_irq(&worker->lock);
210 212
211 schedule(); 213 if (!kthread_should_stop())
214 schedule();
212 } 215 }
213 __set_current_state(TASK_RUNNING); 216 __set_current_state(TASK_RUNNING);
214 } 217 }
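
The two additions close a shutdown race: if kthread_stop() fires between the empty-queue check and schedule(), the wakeup arrives while the worker is still running, is effectively lost, and the worker then sleeps forever while kthread_stop() blocks. The canonical safe form sets the task state first and re-checks the stop flag before scheduling:

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop())	/* re-check with state set */
			schedule();
		__set_current_state(TASK_RUNNING);
	}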
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dbb724124633..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1244 * readahead one full node of leaves, finding things that are close 1244 * readahead one full node of leaves, finding things that are close
1245 * to the block in 'slot', and triggering ra on them. 1245 * to the block in 'slot', and triggering ra on them.
1246 */ 1246 */
1247static noinline void reada_for_search(struct btrfs_root *root, 1247static void reada_for_search(struct btrfs_root *root,
1248 struct btrfs_path *path, 1248 struct btrfs_path *path,
1249 int level, int slot, u64 objectid) 1249 int level, int slot, u64 objectid)
1250{ 1250{
1251 struct extent_buffer *node; 1251 struct extent_buffer *node;
1252 struct btrfs_disk_key disk_key; 1252 struct btrfs_disk_key disk_key;
@@ -1447,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1447} 1447}
1448 1448
1449/* 1449/*
1450 * helper function for btrfs_search_slot. The goal is to find a block
1451 * in cache without setting the path to blocking. If we find the block
1452 * we return zero and the path is unchanged.
1453 *
1454 * If we can't find the block, we set the path blocking and do some
1455 * reada. -EAGAIN is returned and the search must be repeated.
1456 */
1457static int
1458read_block_for_search(struct btrfs_trans_handle *trans,
1459 struct btrfs_root *root, struct btrfs_path *p,
1460 struct extent_buffer **eb_ret, int level, int slot,
1461 struct btrfs_key *key)
1462{
1463 u64 blocknr;
1464 u64 gen;
1465 u32 blocksize;
1466 struct extent_buffer *b = *eb_ret;
1467 struct extent_buffer *tmp;
1468
1469 blocknr = btrfs_node_blockptr(b, slot);
1470 gen = btrfs_node_ptr_generation(b, slot);
1471 blocksize = btrfs_level_size(root, level - 1);
1472
1473 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1474 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1475 *eb_ret = tmp;
1476 return 0;
1477 }
1478
1479 /*
1480 * reduce lock contention at high levels
1481 * of the btree by dropping locks before
1482 * we read.
1483 */
1484 btrfs_release_path(NULL, p);
1485 if (tmp)
1486 free_extent_buffer(tmp);
1487 if (p->reada)
1488 reada_for_search(root, p, level, slot, key->objectid);
1489
1490 tmp = read_tree_block(root, blocknr, blocksize, gen);
1491 if (tmp)
1492 free_extent_buffer(tmp);
1493 return -EAGAIN;
1494}
1495
1496/*
1497 * helper function for btrfs_search_slot. This does all of the checks
1498 * for node-level blocks and does any balancing required based on
1499 * the ins_len.
1500 *
1501 * If no extra work was required, zero is returned. If we had to
1502 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1503 * start over.
1504 */
1505static int
1506setup_nodes_for_search(struct btrfs_trans_handle *trans,
1507 struct btrfs_root *root, struct btrfs_path *p,
1508 struct extent_buffer *b, int level, int ins_len)
1509{
1510 int ret;
1511 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1512 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1513 int sret;
1514
1515 sret = reada_for_balance(root, p, level);
1516 if (sret)
1517 goto again;
1518
1519 btrfs_set_path_blocking(p);
1520 sret = split_node(trans, root, p, level);
1521 btrfs_clear_path_blocking(p, NULL);
1522
1523 BUG_ON(sret > 0);
1524 if (sret) {
1525 ret = sret;
1526 goto done;
1527 }
1528 b = p->nodes[level];
1529 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1530 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1531 int sret;
1532
1533 sret = reada_for_balance(root, p, level);
1534 if (sret)
1535 goto again;
1536
1537 btrfs_set_path_blocking(p);
1538 sret = balance_level(trans, root, p, level);
1539 btrfs_clear_path_blocking(p, NULL);
1540
1541 if (sret) {
1542 ret = sret;
1543 goto done;
1544 }
1545 b = p->nodes[level];
1546 if (!b) {
1547 btrfs_release_path(NULL, p);
1548 goto again;
1549 }
1550 BUG_ON(btrfs_header_nritems(b) == 1);
1551 }
1552 return 0;
1553
1554again:
1555 ret = -EAGAIN;
1556done:
1557 return ret;
1558}
1559
1560/*
1450 * look for key in the tree. path is filled in with nodes along the way 1561 * look for key in the tree. path is filled in with nodes along the way
1451 * if key is found, we return zero and you can find the item in the leaf 1562 * if key is found, we return zero and you can find the item in the leaf
1452 * level of the path (level 0) 1563 * level of the path (level 0)
@@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1464 ins_len, int cow) 1575 ins_len, int cow)
1465{ 1576{
1466 struct extent_buffer *b; 1577 struct extent_buffer *b;
1467 struct extent_buffer *tmp;
1468 int slot; 1578 int slot;
1469 int ret; 1579 int ret;
1470 int level; 1580 int level;
1471 int should_reada = p->reada;
1472 int lowest_unlock = 1; 1581 int lowest_unlock = 1;
1473 int blocksize;
1474 u8 lowest_level = 0; 1582 u8 lowest_level = 0;
1475 u64 blocknr;
1476 u64 gen;
1477 1583
1478 lowest_level = p->lowest_level; 1584 lowest_level = p->lowest_level;
1479 WARN_ON(lowest_level && ins_len > 0); 1585 WARN_ON(lowest_level && ins_len > 0);
@@ -1502,7 +1608,11 @@ again:
1502 if (cow) { 1608 if (cow) {
1503 int wret; 1609 int wret;
1504 1610
1505 /* is a cow on this block not required */ 1611 /*
1612 * if we don't really need to cow this block
1613 * then we don't want to set the path blocking,
1614 * so we test it here
1615 */
1506 if (btrfs_header_generation(b) == trans->transid && 1616 if (btrfs_header_generation(b) == trans->transid &&
1507 btrfs_header_owner(b) == root->root_key.objectid && 1617 btrfs_header_owner(b) == root->root_key.objectid &&
1508 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1618 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -1557,51 +1667,15 @@ cow_done:
1557 if (ret && slot > 0) 1667 if (ret && slot > 0)
1558 slot -= 1; 1668 slot -= 1;
1559 p->slots[level] = slot; 1669 p->slots[level] = slot;
1560 if ((p->search_for_split || ins_len > 0) && 1670 ret = setup_nodes_for_search(trans, root, p, b, level,
1561 btrfs_header_nritems(b) >= 1671 ins_len);
1562 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1672 if (ret == -EAGAIN)
1563 int sret; 1673 goto again;
1564 1674 else if (ret)
1565 sret = reada_for_balance(root, p, level); 1675 goto done;
1566 if (sret) 1676 b = p->nodes[level];
1567 goto again; 1677 slot = p->slots[level];
1568
1569 btrfs_set_path_blocking(p);
1570 sret = split_node(trans, root, p, level);
1571 btrfs_clear_path_blocking(p, NULL);
1572
1573 BUG_ON(sret > 0);
1574 if (sret) {
1575 ret = sret;
1576 goto done;
1577 }
1578 b = p->nodes[level];
1579 slot = p->slots[level];
1580 } else if (ins_len < 0 &&
1581 btrfs_header_nritems(b) <
1582 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1583 int sret;
1584
1585 sret = reada_for_balance(root, p, level);
1586 if (sret)
1587 goto again;
1588
1589 btrfs_set_path_blocking(p);
1590 sret = balance_level(trans, root, p, level);
1591 btrfs_clear_path_blocking(p, NULL);
1592 1678
1593 if (sret) {
1594 ret = sret;
1595 goto done;
1596 }
1597 b = p->nodes[level];
1598 if (!b) {
1599 btrfs_release_path(NULL, p);
1600 goto again;
1601 }
1602 slot = p->slots[level];
1603 BUG_ON(btrfs_header_nritems(b) == 1);
1604 }
1605 unlock_up(p, level, lowest_unlock); 1679 unlock_up(p, level, lowest_unlock);
1606 1680
1607 /* this is only true while dropping a snapshot */ 1681 /* this is only true while dropping a snapshot */
@@ -1610,44 +1684,11 @@ cow_done:
1610 goto done; 1684 goto done;
1611 } 1685 }
1612 1686
1613 blocknr = btrfs_node_blockptr(b, slot); 1687 ret = read_block_for_search(trans, root, p,
1614 gen = btrfs_node_ptr_generation(b, slot); 1688 &b, level, slot, key);
1615 blocksize = btrfs_level_size(root, level - 1); 1689 if (ret == -EAGAIN)
1690 goto again;
1616 1691
1617 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1618 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1619 b = tmp;
1620 } else {
1621 /*
1622 * reduce lock contention at high levels
1623 * of the btree by dropping locks before
1624 * we read.
1625 */
1626 if (level > 0) {
1627 btrfs_release_path(NULL, p);
1628 if (tmp)
1629 free_extent_buffer(tmp);
1630 if (should_reada)
1631 reada_for_search(root, p,
1632 level, slot,
1633 key->objectid);
1634
1635 tmp = read_tree_block(root, blocknr,
1636 blocksize, gen);
1637 if (tmp)
1638 free_extent_buffer(tmp);
1639 goto again;
1640 } else {
1641 btrfs_set_path_blocking(p);
1642 if (tmp)
1643 free_extent_buffer(tmp);
1644 if (should_reada)
1645 reada_for_search(root, p,
1646 level, slot,
1647 key->objectid);
1648 b = read_node_slot(root, b, slot);
1649 }
1650 }
1651 if (!p->skip_locking) { 1692 if (!p->skip_locking) {
1652 int lret; 1693 int lret;
1653 1694
@@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2116 BUG_ON(!path->nodes[level]); 2157 BUG_ON(!path->nodes[level]);
2117 lower = path->nodes[level]; 2158 lower = path->nodes[level];
2118 nritems = btrfs_header_nritems(lower); 2159 nritems = btrfs_header_nritems(lower);
2119 if (slot > nritems) 2160 BUG_ON(slot > nritems);
2120 BUG();
2121 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2161 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2122 BUG(); 2162 BUG();
2123 if (slot != nritems) { 2163 if (slot != nritems) {
@@ -4086,28 +4126,44 @@ next:
4086int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4126int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4087{ 4127{
4088 int slot; 4128 int slot;
4089 int level = 1; 4129 int level;
4090 struct extent_buffer *c; 4130 struct extent_buffer *c;
4091 struct extent_buffer *next = NULL; 4131 struct extent_buffer *next;
4092 struct btrfs_key key; 4132 struct btrfs_key key;
4093 u32 nritems; 4133 u32 nritems;
4094 int ret; 4134 int ret;
4135 int old_spinning = path->leave_spinning;
4136 int force_blocking = 0;
4095 4137
4096 nritems = btrfs_header_nritems(path->nodes[0]); 4138 nritems = btrfs_header_nritems(path->nodes[0]);
4097 if (nritems == 0) 4139 if (nritems == 0)
4098 return 1; 4140 return 1;
4099 4141
4100 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4142 /*
4143 * we take the blocks in an order that upsets lockdep. Using
4144 * blocking mode is the only way around it.
4145 */
4146#ifdef CONFIG_DEBUG_LOCK_ALLOC
4147 force_blocking = 1;
4148#endif
4101 4149
4150 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4151again:
4152 level = 1;
4153 next = NULL;
4102 btrfs_release_path(root, path); 4154 btrfs_release_path(root, path);
4155
4103 path->keep_locks = 1; 4156 path->keep_locks = 1;
4157
4158 if (!force_blocking)
4159 path->leave_spinning = 1;
4160
4104 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4161 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4105 path->keep_locks = 0; 4162 path->keep_locks = 0;
4106 4163
4107 if (ret < 0) 4164 if (ret < 0)
4108 return ret; 4165 return ret;
4109 4166
4110 btrfs_set_path_blocking(path);
4111 nritems = btrfs_header_nritems(path->nodes[0]); 4167 nritems = btrfs_header_nritems(path->nodes[0]);
4112 /* 4168 /*
4113 * by releasing the path above we dropped all our locks. A balance 4169 * by releasing the path above we dropped all our locks. A balance
@@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4117 */ 4173 */
4118 if (nritems > 0 && path->slots[0] < nritems - 1) { 4174 if (nritems > 0 && path->slots[0] < nritems - 1) {
4119 path->slots[0]++; 4175 path->slots[0]++;
4176 ret = 0;
4120 goto done; 4177 goto done;
4121 } 4178 }
4122 4179
4123 while (level < BTRFS_MAX_LEVEL) { 4180 while (level < BTRFS_MAX_LEVEL) {
4124 if (!path->nodes[level]) 4181 if (!path->nodes[level]) {
4125 return 1; 4182 ret = 1;
4183 goto done;
4184 }
4126 4185
4127 slot = path->slots[level] + 1; 4186 slot = path->slots[level] + 1;
4128 c = path->nodes[level]; 4187 c = path->nodes[level];
4129 if (slot >= btrfs_header_nritems(c)) { 4188 if (slot >= btrfs_header_nritems(c)) {
4130 level++; 4189 level++;
4131 if (level == BTRFS_MAX_LEVEL) 4190 if (level == BTRFS_MAX_LEVEL) {
4132 return 1; 4191 ret = 1;
4192 goto done;
4193 }
4133 continue; 4194 continue;
4134 } 4195 }
4135 4196
@@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4138 free_extent_buffer(next); 4199 free_extent_buffer(next);
4139 } 4200 }
4140 4201
4141 /* the path was set to blocking above */ 4202 next = c;
4142 if (level == 1 && (path->locks[1] || path->skip_locking) && 4203 ret = read_block_for_search(NULL, root, path, &next, level,
4143 path->reada) 4204 slot, &key);
4144 reada_for_search(root, path, level, slot, 0); 4205 if (ret == -EAGAIN)
4206 goto again;
4145 4207
4146 next = read_node_slot(root, c, slot);
4147 if (!path->skip_locking) { 4208 if (!path->skip_locking) {
4148 btrfs_assert_tree_locked(c); 4209 ret = btrfs_try_spin_lock(next);
4149 btrfs_tree_lock(next); 4210 if (!ret) {
4150 btrfs_set_lock_blocking(next); 4211 btrfs_set_path_blocking(path);
4212 btrfs_tree_lock(next);
4213 if (!force_blocking)
4214 btrfs_clear_path_blocking(path, next);
4215 }
4216 if (force_blocking)
4217 btrfs_set_lock_blocking(next);
4151 } 4218 }
4152 break; 4219 break;
4153 } 4220 }
@@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4157 c = path->nodes[level]; 4224 c = path->nodes[level];
4158 if (path->locks[level]) 4225 if (path->locks[level])
4159 btrfs_tree_unlock(c); 4226 btrfs_tree_unlock(c);
4227
4160 free_extent_buffer(c); 4228 free_extent_buffer(c);
4161 path->nodes[level] = next; 4229 path->nodes[level] = next;
4162 path->slots[level] = 0; 4230 path->slots[level] = 0;
4163 if (!path->skip_locking) 4231 if (!path->skip_locking)
4164 path->locks[level] = 1; 4232 path->locks[level] = 1;
4233
4165 if (!level) 4234 if (!level)
4166 break; 4235 break;
4167 4236
4168 btrfs_set_path_blocking(path); 4237 ret = read_block_for_search(NULL, root, path, &next, level,
4169 if (level == 1 && path->locks[1] && path->reada) 4238 0, &key);
4170 reada_for_search(root, path, level, slot, 0); 4239 if (ret == -EAGAIN)
4171 next = read_node_slot(root, next, 0); 4240 goto again;
4241
4172 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4173 btrfs_assert_tree_locked(path->nodes[level]); 4243 btrfs_assert_tree_locked(path->nodes[level]);
4174 btrfs_tree_lock(next); 4244 ret = btrfs_try_spin_lock(next);
4175 btrfs_set_lock_blocking(next); 4245 if (!ret) {
4246 btrfs_set_path_blocking(path);
4247 btrfs_tree_lock(next);
4248 if (!force_blocking)
4249 btrfs_clear_path_blocking(path, next);
4250 }
4251 if (force_blocking)
4252 btrfs_set_lock_blocking(next);
4176 } 4253 }
4177 } 4254 }
4255 ret = 0;
4178done: 4256done:
4179 unlock_up(path, 0, 1); 4257 unlock_up(path, 0, 1);
4180 return 0; 4258 path->leave_spinning = old_spinning;
4259 if (!old_spinning)
4260 btrfs_set_path_blocking(path);
4261
4262 return ret;
4181} 4263}
4182 4264
4183/* 4265/*
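
Both refactored call sites in this file treat read_block_for_search() the same way: on -EAGAIN the path has already been released and readahead issued, so the caller restarts the whole descent rather than retrying the read in place. Stripped to its shape:

	again:
		/* ... walk down to (b, level, slot) ... */
		ret = read_block_for_search(trans, root, p, &b, level, slot, key);
		if (ret == -EAGAIN)
			goto again;	/* locks were dropped; redo the walk */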
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9417713542a2..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
143#define BTRFS_FT_MAX 9 143#define BTRFS_FT_MAX 9
144 144
145/* 145/*
146 * the key defines the order in the tree, and so it also defines (optimal) 146 * The key defines the order in the tree, and so it also defines (optimal)
147 * block layout. objectid corresonds to the inode number. The flags 147 * block layout.
148 * tells us things about the object, and is a kind of stream selector. 148 *
149 * so for a given inode, keys with flags of 1 might refer to the inode 149 * objectid corresponds to the inode number.
150 * data, flags of 2 may point to file data in the btree and flags == 3 150 *
151 * may point to extents. 151 * type tells us things about the object, and is a kind of stream selector.
152 * so for a given inode, keys with type of 1 might refer to the inode data,
153 * type of 2 may point to file data in the btree and type == 3 may point to
154 * extents.
152 * 155 *
153 * offset is the starting byte offset for this key in the stream. 156 * offset is the starting byte offset for this key in the stream.
154 * 157 *
@@ -200,7 +203,7 @@ struct btrfs_dev_item {
200 203
201 /* 204 /*
202 * starting byte of this partition on the device, 205 * starting byte of this partition on the device,
203 * to allowr for stripe alignment in the future 206 * to allow for stripe alignment in the future
204 */ 207 */
205 __le64 start_offset; 208 __le64 start_offset;
206 209
@@ -633,18 +636,35 @@ struct btrfs_space_info {
633 struct rw_semaphore groups_sem; 636 struct rw_semaphore groups_sem;
634}; 637};
635 638
636struct btrfs_free_space { 639/*
637 struct rb_node bytes_index; 640 * free clusters are used to claim free space in relatively large chunks,
638 struct rb_node offset_index; 641 * allowing us to do less seeky writes. They are used for all metadata
639 u64 offset; 642 * allocations and data allocations in ssd mode.
640 u64 bytes; 643 */
644struct btrfs_free_cluster {
645 spinlock_t lock;
646 spinlock_t refill_lock;
647 struct rb_root root;
648
649 /* largest extent in this cluster */
650 u64 max_size;
651
652 /* first extent starting offset */
653 u64 window_start;
654
655 struct btrfs_block_group_cache *block_group;
656 /*
657 * when a cluster is allocated from a block group, we put the
658 * cluster onto a list in the block group so that it can
659 * be freed before the block group is freed.
660 */
661 struct list_head block_group_list;
641}; 662};
642 663
643struct btrfs_block_group_cache { 664struct btrfs_block_group_cache {
644 struct btrfs_key key; 665 struct btrfs_key key;
645 struct btrfs_block_group_item item; 666 struct btrfs_block_group_item item;
646 spinlock_t lock; 667 spinlock_t lock;
647 struct mutex alloc_mutex;
648 struct mutex cache_mutex; 668 struct mutex cache_mutex;
649 u64 pinned; 669 u64 pinned;
650 u64 reserved; 670 u64 reserved;
@@ -656,6 +676,7 @@ struct btrfs_block_group_cache {
656 struct btrfs_space_info *space_info; 676 struct btrfs_space_info *space_info;
657 677
658 /* free space cache stuff */ 678 /* free space cache stuff */
679 spinlock_t tree_lock;
659 struct rb_root free_space_bytes; 680 struct rb_root free_space_bytes;
660 struct rb_root free_space_offset; 681 struct rb_root free_space_offset;
661 682
@@ -667,6 +688,11 @@ struct btrfs_block_group_cache {
667 688
668 /* usage count */ 689 /* usage count */
669 atomic_t count; 690 atomic_t count;
691
692 /* List of struct btrfs_free_cluster entries for this block group.
693 * Today it will only have one thing on it, but that may change
694 */
695 struct list_head cluster_list;
670}; 696};
671 697
672struct btrfs_leaf_ref_tree { 698struct btrfs_leaf_ref_tree {
@@ -728,7 +754,6 @@ struct btrfs_fs_info {
728 struct mutex tree_log_mutex; 754 struct mutex tree_log_mutex;
729 struct mutex transaction_kthread_mutex; 755 struct mutex transaction_kthread_mutex;
730 struct mutex cleaner_mutex; 756 struct mutex cleaner_mutex;
731 struct mutex pinned_mutex;
732 struct mutex chunk_mutex; 757 struct mutex chunk_mutex;
733 struct mutex drop_mutex; 758 struct mutex drop_mutex;
734 struct mutex volume_mutex; 759 struct mutex volume_mutex;
@@ -839,8 +864,12 @@ struct btrfs_fs_info {
839 spinlock_t delalloc_lock; 864 spinlock_t delalloc_lock;
840 spinlock_t new_trans_lock; 865 spinlock_t new_trans_lock;
841 u64 delalloc_bytes; 866 u64 delalloc_bytes;
842 u64 last_alloc; 867
843 u64 last_data_alloc; 868 /* data_alloc_cluster is only used in ssd mode */
869 struct btrfs_free_cluster data_alloc_cluster;
870
871 /* all metadata allocations go through this cluster */
872 struct btrfs_free_cluster meta_alloc_cluster;
844 873
845 spinlock_t ref_cache_lock; 874 spinlock_t ref_cache_lock;
846 u64 total_ref_cache_size; 875 u64 total_ref_cache_size;
@@ -932,7 +961,6 @@ struct btrfs_root {
932}; 961};
933 962
934/* 963/*
935
936 * inode items have the data typically returned from stat and store other 964 * inode items have the data typically returned from stat and store other
937 * info about object characteristics. There is one for every file and dir in 965 * info about object characteristics. There is one for every file and dir in
938 * the FS 966 * the FS
@@ -963,7 +991,7 @@ struct btrfs_root {
963#define BTRFS_EXTENT_CSUM_KEY 128 991#define BTRFS_EXTENT_CSUM_KEY 128
964 992
965/* 993/*
966 * root items point to tree roots. There are typically in the root 994 * root items point to tree roots. They are typically in the root
967 * tree used by the super block to find all the other trees 995 * tree used by the super block to find all the other trees
968 */ 996 */
969#define BTRFS_ROOT_ITEM_KEY 132 997#define BTRFS_ROOT_ITEM_KEY 132
@@ -1010,6 +1038,8 @@ struct btrfs_root {
1010#define BTRFS_MOUNT_SSD (1 << 3) 1038#define BTRFS_MOUNT_SSD (1 << 3)
1011#define BTRFS_MOUNT_DEGRADED (1 << 4) 1039#define BTRFS_MOUNT_DEGRADED (1 << 4)
1012#define BTRFS_MOUNT_COMPRESS (1 << 5) 1040#define BTRFS_MOUNT_COMPRESS (1 << 5)
1041#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1042#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
1013 1043
1014#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1044#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1015#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1045#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file)
1748} 1778}
1749 1779
1750/* extent-tree.c */ 1780/* extent-tree.c */
1781void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1751int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1782int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1752 struct btrfs_root *root, unsigned long count); 1783 struct btrfs_root *root, unsigned long count);
1753int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1784int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
2174int btrfs_init_acl(struct inode *inode, struct inode *dir); 2205int btrfs_init_acl(struct inode *inode, struct inode *dir);
2175int btrfs_acl_chmod(struct inode *inode); 2206int btrfs_acl_chmod(struct inode *inode);
2176 2207
2177/* free-space-cache.c */
2178int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2179 u64 bytenr, u64 size);
2180int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2181 u64 offset, u64 bytes);
2182int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2183 u64 bytenr, u64 size);
2184int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2185 u64 offset, u64 bytes);
2186void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2187 *block_group);
2188struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2189 *block_group, u64 offset,
2190 u64 bytes);
2191void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2192 u64 bytes);
2193u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2194#endif 2208#endif
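
The btrfs_free_cluster fields above imply a two-level locking scheme: refill_lock serializes whole-cluster refills while lock guards the rbtree and window fields. The allocator hunk in extent-tree.c below uses it roughly as follows (condensed from that hunk):

	spin_lock(&last_ptr->refill_lock);
	offset = btrfs_alloc_from_cluster(block_group, last_ptr,
					  num_bytes, search_start);
	if (!offset) {
		/* cluster exhausted: claim a fresh window, then retry */
		ret = btrfs_find_space_cluster(trans, block_group, last_ptr,
					       search_start, num_bytes,
					       empty_cluster + empty_size);
		if (ret == 0)
			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
							  num_bytes, search_start);
	}
	spin_unlock(&last_ptr->refill_lock);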
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cbf7dc8ae3ec..d6c01c096a40 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -18,7 +18,6 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h> 20#include <linux/sort.h>
21#include <linux/ftrace.h>
22#include "ctree.h" 21#include "ctree.h"
23#include "delayed-ref.h" 22#include "delayed-ref.h"
24#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92d73929d381..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h" 39#include "ref-cache.h"
40#include "tree-log.h" 40#include "tree-log.h"
41#include "free-space-cache.h"
41 42
42static struct extent_io_ops btree_extent_io_ops; 43static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
@@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
1412 1413
1413 ret = extent_range_uptodate(io_tree, start + length, 1414 ret = extent_range_uptodate(io_tree, start + length,
1414 start + buf_len - 1); 1415 start + buf_len - 1);
1415 if (ret == 1)
1416 return ret;
1417 return ret; 1416 return ret;
1418} 1417}
1419 1418
@@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1647 mutex_init(&fs_info->ordered_operations_mutex); 1646 mutex_init(&fs_info->ordered_operations_mutex);
1648 mutex_init(&fs_info->tree_log_mutex); 1647 mutex_init(&fs_info->tree_log_mutex);
1649 mutex_init(&fs_info->drop_mutex); 1648 mutex_init(&fs_info->drop_mutex);
1650 mutex_init(&fs_info->pinned_mutex);
1651 mutex_init(&fs_info->chunk_mutex); 1649 mutex_init(&fs_info->chunk_mutex);
1652 mutex_init(&fs_info->transaction_kthread_mutex); 1650 mutex_init(&fs_info->transaction_kthread_mutex);
1653 mutex_init(&fs_info->cleaner_mutex); 1651 mutex_init(&fs_info->cleaner_mutex);
1654 mutex_init(&fs_info->volume_mutex); 1652 mutex_init(&fs_info->volume_mutex);
1655 mutex_init(&fs_info->tree_reloc_mutex); 1653 mutex_init(&fs_info->tree_reloc_mutex);
1654
1655 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1656 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1657
1656 init_waitqueue_head(&fs_info->transaction_throttle); 1658 init_waitqueue_head(&fs_info->transaction_throttle);
1657 init_waitqueue_head(&fs_info->transaction_wait); 1659 init_waitqueue_head(&fs_info->transaction_wait);
1658 init_waitqueue_head(&fs_info->async_submit_wait); 1660 init_waitqueue_head(&fs_info->async_submit_wait);
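
btrfs_init_free_cluster() itself is not shown in this diff; inferred from the struct fields it must initialize, a plausible body would be (hypothetical sketch, real function name):

	void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
	{
		spin_lock_init(&cluster->lock);
		spin_lock_init(&cluster->refill_lock);
		cluster->root.rb_node = NULL;
		cluster->max_size = 0;
		cluster->window_start = 0;
		cluster->block_group = NULL;
		INIT_LIST_HEAD(&cluster->block_group_list);
	}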
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5e7cae63d80..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
34 35
35#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
166 u64 extent_start, extent_end, size; 167 u64 extent_start, extent_end, size;
167 int ret; 168 int ret;
168 169
169 mutex_lock(&info->pinned_mutex);
170 while (start < end) { 170 while (start < end) {
171 ret = find_first_extent_bit(&info->pinned_extents, start, 171 ret = find_first_extent_bit(&info->pinned_extents, start,
172 &extent_start, &extent_end, 172 &extent_start, &extent_end,
@@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
192 ret = btrfs_add_free_space(block_group, start, size); 192 ret = btrfs_add_free_space(block_group, start, size);
193 BUG_ON(ret); 193 BUG_ON(ret);
194 } 194 }
195 mutex_unlock(&info->pinned_mutex);
196 195
197 return 0; 196 return 0;
198} 197}
@@ -291,8 +290,8 @@ next:
291 block_group->key.objectid + 290 block_group->key.objectid +
292 block_group->key.offset); 291 block_group->key.offset);
293 292
294 remove_sb_from_cache(root, block_group);
295 block_group->cached = 1; 293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
296 ret = 0; 295 ret = 0;
297err: 296err:
298 btrfs_free_path(path); 297 btrfs_free_path(path);
@@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
326 return cache; 325 return cache;
327} 326}
328 327
329static inline void put_block_group(struct btrfs_block_group_cache *cache) 328void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
330{ 329{
331 if (atomic_dec_and_test(&cache->count)) 330 if (atomic_dec_and_test(&cache->count))
332 kfree(cache); 331 kfree(cache);
@@ -399,12 +398,12 @@ again:
399 div_factor(cache->key.offset, factor)) { 398 div_factor(cache->key.offset, factor)) {
400 group_start = cache->key.objectid; 399 group_start = cache->key.objectid;
401 spin_unlock(&cache->lock); 400 spin_unlock(&cache->lock);
402 put_block_group(cache); 401 btrfs_put_block_group(cache);
403 goto found; 402 goto found;
404 } 403 }
405 } 404 }
406 spin_unlock(&cache->lock); 405 spin_unlock(&cache->lock);
407 put_block_group(cache); 406 btrfs_put_block_group(cache);
408 cond_resched(); 407 cond_resched();
409 } 408 }
410 if (!wrapped) { 409 if (!wrapped) {
@@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1594 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1595 readonly = 1; 1594 readonly = 1;
1596 if (block_group) 1595 if (block_group)
1597 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1598 return readonly; 1597 return readonly;
1599} 1598}
1600 1599
@@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2018 WARN_ON(ret); 2017 WARN_ON(ret);
2019 } 2018 }
2020 } 2019 }
2021 put_block_group(cache); 2020 btrfs_put_block_group(cache);
2022 total -= num_bytes; 2021 total -= num_bytes;
2023 bytenr += num_bytes; 2022 bytenr += num_bytes;
2024 } 2023 }
@@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2035 return 0; 2034 return 0;
2036 2035
2037 bytenr = cache->key.objectid; 2036 bytenr = cache->key.objectid;
2038 put_block_group(cache); 2037 btrfs_put_block_group(cache);
2039 2038
2040 return bytenr; 2039 return bytenr;
2041} 2040}
@@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2047 struct btrfs_block_group_cache *cache; 2046 struct btrfs_block_group_cache *cache;
2048 struct btrfs_fs_info *fs_info = root->fs_info; 2047 struct btrfs_fs_info *fs_info = root->fs_info;
2049 2048
2050 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2051 if (pin) { 2049 if (pin) {
2052 set_extent_dirty(&fs_info->pinned_extents, 2050 set_extent_dirty(&fs_info->pinned_extents,
2053 bytenr, bytenr + num - 1, GFP_NOFS); 2051 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2055 clear_extent_dirty(&fs_info->pinned_extents, 2053 clear_extent_dirty(&fs_info->pinned_extents,
2056 bytenr, bytenr + num - 1, GFP_NOFS); 2054 bytenr, bytenr + num - 1, GFP_NOFS);
2057 } 2055 }
2058 mutex_unlock(&root->fs_info->pinned_mutex);
2059 2056
2060 while (num > 0) { 2057 while (num > 0) {
2061 cache = btrfs_lookup_block_group(fs_info, bytenr); 2058 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2081 if (cache->cached) 2078 if (cache->cached)
2082 btrfs_add_free_space(cache, bytenr, len); 2079 btrfs_add_free_space(cache, bytenr, len);
2083 } 2080 }
2084 put_block_group(cache); 2081 btrfs_put_block_group(cache);
2085 bytenr += len; 2082 bytenr += len;
2086 num -= len; 2083 num -= len;
2087 } 2084 }
@@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2112 } 2109 }
2113 spin_unlock(&cache->lock); 2110 spin_unlock(&cache->lock);
2114 spin_unlock(&cache->space_info->lock); 2111 spin_unlock(&cache->space_info->lock);
2115 put_block_group(cache); 2112 btrfs_put_block_group(cache);
2116 bytenr += len; 2113 bytenr += len;
2117 num -= len; 2114 num -= len;
2118 } 2115 }
@@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2127 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2124 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2128 int ret; 2125 int ret;
2129 2126
2130 mutex_lock(&root->fs_info->pinned_mutex);
2131 while (1) { 2127 while (1) {
2132 ret = find_first_extent_bit(pinned_extents, last, 2128 ret = find_first_extent_bit(pinned_extents, last,
2133 &start, &end, EXTENT_DIRTY); 2129 &start, &end, EXTENT_DIRTY);
@@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2136 set_extent_dirty(copy, start, end, GFP_NOFS); 2132 set_extent_dirty(copy, start, end, GFP_NOFS);
2137 last = end + 1; 2133 last = end + 1;
2138 } 2134 }
2139 mutex_unlock(&root->fs_info->pinned_mutex);
2140 return 0; 2135 return 0;
2141} 2136}
2142 2137
@@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2149 int ret; 2144 int ret;
2150 2145
2151 while (1) { 2146 while (1) {
2152 mutex_lock(&root->fs_info->pinned_mutex);
2153 ret = find_first_extent_bit(unpin, 0, &start, &end, 2147 ret = find_first_extent_bit(unpin, 0, &start, &end,
2154 EXTENT_DIRTY); 2148 EXTENT_DIRTY);
2155 if (ret) 2149 if (ret)
@@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2163 2157
2164 cond_resched(); 2158 cond_resched();
2165 } 2159 }
2166 mutex_unlock(&root->fs_info->pinned_mutex);
2167 return ret; 2160 return ret;
2168} 2161}
2169 2162
@@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2205 free_extent_buffer(buf); 2198 free_extent_buffer(buf);
2206pinit: 2199pinit:
2207 btrfs_set_path_blocking(path); 2200 btrfs_set_path_blocking(path);
2208 mutex_lock(&root->fs_info->pinned_mutex);
2209 /* unlocks the pinned mutex */ 2201 /* unlocks the pinned mutex */
2210 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2202 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2211 2203
@@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2511 */ 2503 */
2512 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && 2504 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2513 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 2505 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2514 mutex_lock(&root->fs_info->pinned_mutex);
2515
2516 /* unlocks the pinned mutex */ 2506 /* unlocks the pinned mutex */
2517 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2507 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2518 update_reserved_extents(root, bytenr, num_bytes, 0); 2508 update_reserved_extents(root, bytenr, num_bytes, 0);
@@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2554{ 2544{
2555 int ret = 0; 2545 int ret = 0;
2556 struct btrfs_root *root = orig_root->fs_info->extent_root; 2546 struct btrfs_root *root = orig_root->fs_info->extent_root;
2557 u64 total_needed = num_bytes; 2547 struct btrfs_free_cluster *last_ptr = NULL;
2558 u64 *last_ptr = NULL;
2559 u64 last_wanted = 0;
2560 struct btrfs_block_group_cache *block_group = NULL; 2548 struct btrfs_block_group_cache *block_group = NULL;
2561 int chunk_alloc_done = 0;
2562 int empty_cluster = 2 * 1024 * 1024; 2549 int empty_cluster = 2 * 1024 * 1024;
2563 int allowed_chunk_alloc = 0; 2550 int allowed_chunk_alloc = 0;
2564 struct list_head *head = NULL, *cur = NULL;
2565 int loop = 0;
2566 int extra_loop = 0;
2567 struct btrfs_space_info *space_info; 2551 struct btrfs_space_info *space_info;
2552 int last_ptr_loop = 0;
2553 int loop = 0;
2568 2554
2569 WARN_ON(num_bytes < root->sectorsize); 2555 WARN_ON(num_bytes < root->sectorsize);
2570 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2556 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2571 ins->objectid = 0; 2557 ins->objectid = 0;
2572 ins->offset = 0; 2558 ins->offset = 0;
2573 2559
2560 space_info = __find_space_info(root->fs_info, data);
2561
2574 if (orig_root->ref_cows || empty_size) 2562 if (orig_root->ref_cows || empty_size)
2575 allowed_chunk_alloc = 1; 2563 allowed_chunk_alloc = 1;
2576 2564
2577 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2565 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2578 last_ptr = &root->fs_info->last_alloc; 2566 last_ptr = &root->fs_info->meta_alloc_cluster;
2579 if (!btrfs_test_opt(root, SSD)) 2567 if (!btrfs_test_opt(root, SSD))
2580 empty_cluster = 64 * 1024; 2568 empty_cluster = 64 * 1024;
2581 } 2569 }
2582 2570
2583 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2571 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
2584 last_ptr = &root->fs_info->last_data_alloc; 2572 last_ptr = &root->fs_info->data_alloc_cluster;
2573 }
2585 2574
2586 if (last_ptr) { 2575 if (last_ptr) {
2587 if (*last_ptr) { 2576 spin_lock(&last_ptr->lock);
2588 hint_byte = *last_ptr; 2577 if (last_ptr->block_group)
2589 last_wanted = *last_ptr; 2578 hint_byte = last_ptr->window_start;
2590 } else 2579 spin_unlock(&last_ptr->lock);
2591 empty_size += empty_cluster;
2592 } else {
2593 empty_cluster = 0;
2594 } 2580 }
2581
2595 search_start = max(search_start, first_logical_byte(root, 0)); 2582 search_start = max(search_start, first_logical_byte(root, 0));
2596 search_start = max(search_start, hint_byte); 2583 search_start = max(search_start, hint_byte);
2597 2584
2598 if (last_wanted && search_start != last_wanted) { 2585 if (!last_ptr) {
2599 last_wanted = 0; 2586 empty_cluster = 0;
2600 empty_size += empty_cluster; 2587 loop = 1;
2601 } 2588 }
2602 2589
2603 total_needed += empty_size; 2590 if (search_start == hint_byte) {
2604 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2591 block_group = btrfs_lookup_block_group(root->fs_info,
2605 if (!block_group) 2592 search_start);
2606 block_group = btrfs_lookup_first_block_group(root->fs_info, 2593 if (block_group && block_group_bits(block_group, data)) {
2607 search_start); 2594 down_read(&space_info->groups_sem);
2608 space_info = __find_space_info(root->fs_info, data); 2595 goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
2609 2600
2601search:
2610 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
2611 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
2612 struct btrfs_free_space *free_space; 2604 u64 offset;
2613 /*
2614 * the only way this happens if our hint points to a block
2615 * group thats not of the proper type, while looping this
2616 * should never happen
2617 */
2618 if (empty_size)
2619 extra_loop = 1;
2620 2605
2621 if (!block_group) 2606 atomic_inc(&block_group->count);
2622 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
2623 2608
2609have_block_group:
2624 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
2625 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
2626 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
2627 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
2628 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
2629 break; 2616 break;
2617 }
2630 } 2618 }
2631 2619
2632 mutex_lock(&block_group->alloc_mutex);
2633 if (unlikely(!block_group_bits(block_group, data)))
2634 goto new_group;
2635
2636 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
2637 goto new_group; 2621 goto loop;
2638 2622
2639 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
2640 total_needed); 2624 /*
2641 if (free_space) { 2625 * the refill lock keeps out other
2642 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
2643 u64 end = block_group->key.objectid + 2627 */
2644 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
2645 2636
2646 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group it does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
2647 2657
2648 /* move on to the next group */ 2658 /*
2649 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
2650 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2651 2663
2652 /* move on to the next group */ 2664 last_ptr_loop = 0;
2653 if (search_start + num_bytes > end)
2654 goto new_group;
2655 2665
2656 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
2657 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
2658 empty_size += empty_cluster; 2668 block_group, last_ptr,
2659 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
2660 /* 2672 /*
2661 * if search_start is still in this block group 2673 * now pull our allocation out of this
2662 * then we just re-search this block group 2674 * cluster
2663 */ 2675 */
2664 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
2665 search_start < end) { 2677 last_ptr, num_bytes,
2666 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
2667 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2668 } 2683 }
2669
2670 /* else we go to the next block group */
2671 goto new_group;
2672 } 2684 }
2673 2685 /*
2674 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
2675 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
2676 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
2677 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
2678 /* 2690 */
2679 * if search_start is still in this block group 2691 if (loop < 2) {
2680 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
2681 */ 2693 last_ptr);
2682 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
2683 search_start < end) { 2695 goto loop;
2684 mutex_unlock(&block_group->alloc_mutex);
2685 last_wanted = 0;
2686 continue;
2687 }
2688
2689 /* else we go to the next block group */
2690 goto new_group;
2691 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
2692 2699
2693 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
2694 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
2706
2707 /* move on to the next group */
2708 if (search_start + num_bytes >= search_end) {
2709 btrfs_add_free_space(block_group, offset, num_bytes);
2710 goto loop;
2711 }
2695 2712
2696 btrfs_remove_free_space_lock(block_group, search_start, 2713 /* move on to the next group */
2697 num_bytes); 2714 if (search_start + num_bytes >
2698 /* we are all good, lets return */ 2715 block_group->key.objectid + block_group->key.offset) {
2699 mutex_unlock(&block_group->alloc_mutex); 2716 btrfs_add_free_space(block_group, offset, num_bytes);
2700 break; 2717 goto loop;
2701 } 2718 }
2702new_group:
2703 mutex_unlock(&block_group->alloc_mutex);
2704 put_block_group(block_group);
2705 block_group = NULL;
2706new_group_no_lock:
2707 /* don't try to compare new allocations against the
2708 * last allocation any more
2709 */
2710 last_wanted = 0;
2711 2719
2712 /* 2720 if (exclude_nr > 0 &&
2713 * Here's how this works. 2721 (search_start + num_bytes > exclude_start &&
2714 * loop == 0: we were searching a block group via a hint 2722 search_start < exclude_start + exclude_nr)) {
2715 * and didn't find anything, so we start at 2723 search_start = exclude_start + exclude_nr;
2716 * the head of the block groups and keep searching 2724
2717 * loop == 1: we're searching through all of the block groups 2725 btrfs_add_free_space(block_group, offset, num_bytes);
2718 * if we hit the head again we have searched 2726 /*
2719 * all of the block groups for this space and we 2727 * if search_start is still in this block group
2720 * need to try and allocate; if we can't, error out. 2728 * then we just re-search this block group
2721 * loop == 2: we allocated more space and are looping through
2722 * all of the block groups again.
2723 */
2724 if (loop == 0) {
2725 head = &space_info->block_groups;
2726 cur = head->next;
2727 loop++;
2728 } else if (loop == 1 && cur == head) {
2729 int keep_going;
2730
2731 /* at this point we give up on the empty_size
2732 * allocations and just try to allocate the min
2733 * space.
2734 *
2735 * The extra_loop field was set if an empty_size
2736 * allocation was attempted above, and if this
2737 * is set we need to try the loop again without
2738 * the additional empty_size.
2739 */ 2729 */
2740 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
2741 empty_size = 0; 2731 search_start < (block_group->key.objectid +
2742 keep_going = extra_loop; 2732 block_group->key.offset))
2743 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
2744 2736
2745 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
2746 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
2747 ret = do_chunk_alloc(trans, root, num_bytes + 2739
2748 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
2749 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
2750 if (ret < 0) 2742 search_start - offset);
2751 goto loop_check; 2743 BUG_ON(offset > search_start);
2752 head = &space_info->block_groups; 2744
2753 /* 2745 /* we are all good, lets return */
2754 * we've allocated a new chunk, keep 2746 break;
2755 * trying 2747loop:
2756 */ 2748 btrfs_put_block_group(block_group);
2757 keep_going = 1; 2749 }
2758 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
2759 } else if (!allowed_chunk_alloc) { 2751
2760 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
2761 } 2753 * loop == 1, try again after forcing a chunk allocation
2762loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
2763 if (keep_going) { 2755 */
2764 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
2765 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
2766 } else { 2758 if (loop >= 2) {
2767 break; 2759 empty_size = 0;
2768 } 2760 empty_cluster = 0;
2769 } else if (cur == head) {
2770 break;
2771 } 2761 }
2772 2762
2773 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
2774 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
2775 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
2776 2770
2777 search_start = block_group->key.objectid; 2771 if (loop < 3) {
2778 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
2779 } 2778 }
2780 2779
2781 /* we found what we needed */ 2780 /* we found what we needed */
@@ -2783,21 +2782,10 @@ loop_check:
2783 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2784 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
2785 2784
2786 if (last_ptr) 2785 btrfs_put_block_group(block_group);
2787 *last_ptr = ins->objectid + ins->offset;
2788 ret = 0; 2786 ret = 0;
2789 } else if (!ret) {
2790 printk(KERN_ERR "btrfs searching for %llu bytes, "
2791 "num_bytes %llu, loop %d, allowed_alloc %d\n",
2792 (unsigned long long)total_needed,
2793 (unsigned long long)num_bytes,
2794 loop, allowed_chunk_alloc);
2795 ret = -ENOSPC;
2796 } 2787 }
2797 if (block_group)
2798 put_block_group(block_group);
2799 2788
2800 up_read(&space_info->groups_sem);
2801 return ret; 2789 return ret;
2802} 2790}
2803 2791
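The deleted loop_check machinery above gives way to the three-stage escalation the new comment spells out: a clustered search first, then a retry after forcing a chunk allocation, then a last pass with the empty_size/empty_cluster padding dropped. A minimal userspace C sketch of that retry policy follows; every identifier here is illustrative, not the kernel's API.

    #include <stdio.h>
    #include <stdbool.h>

    /* Illustrative allocator state; none of these are kernel types. */
    struct request {
        unsigned long long bytes;
        unsigned long long empty_size;     /* extra slack we would like */
        unsigned long long empty_cluster;  /* preferred cluster padding */
        bool chunk_alloc_allowed;          /* may we grow the pool once? */
    };

    /* Stand-in for the walk over the space_info block-group list; returns
     * a nonzero offset on success, 0 to force the caller to escalate. */
    static unsigned long long search_block_groups(const struct request *req)
    {
        (void)req;
        return 0;
    }

    static long long find_free_extent(struct request *req)
    {
        int loop = 0;

        for (;;) {
            unsigned long long found = search_block_groups(req);
            if (found)
                return (long long)found;

            /* loop == 0: clustered search in every block group
             * loop == 1: retry after forcing a chunk allocation
             * loop == 2: drop the padding and try once more */
            if (loop >= 3 || (!req->empty_size && !req->empty_cluster &&
                              !req->chunk_alloc_allowed))
                return -1;                 /* -ENOSPC in the kernel */

            if (loop >= 2) {
                req->empty_size = 0;
                req->empty_cluster = 0;
            }
            if (req->chunk_alloc_allowed)
                req->chunk_alloc_allowed = false; /* pool grows only once */
            loop++;
        }
    }

    int main(void)
    {
        struct request req = { 4096, 64 * 1024, 64 * 1024, true };
        printf("result: %lld\n", find_free_extent(&req));
        return 0;
    }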
@@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2902 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
2903 2891
2904 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
2905 put_block_group(cache); 2893 btrfs_put_block_group(cache);
2906 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
2907 2895
2908 return ret; 2896 return ret;
@@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3040 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
3041 ins->offset); 3029 ins->offset);
3042 BUG_ON(ret); 3030 BUG_ON(ret);
3043 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
3044 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3045 ref_generation, owner, ins, 1); 3033 ref_generation, owner, ins, 1);
3046 return ret; 3034 return ret;
@@ -5729,7 +5717,7 @@ next:
5729 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
5730 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5731 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
5732 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
5733 ret = 0; 5721 ret = 0;
5734out: 5722out:
5735 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5856 5844
5857 atomic_set(&cache->count, 1); 5845 atomic_set(&cache->count, 1);
5858 spin_lock_init(&cache->lock); 5846 spin_lock_init(&cache->lock);
5859 mutex_init(&cache->alloc_mutex); 5847 spin_lock_init(&cache->tree_lock);
5860 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
5861 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
5862 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
5863 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
5864 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5912 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5901 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5913 atomic_set(&cache->count, 1); 5902 atomic_set(&cache->count, 1);
5914 spin_lock_init(&cache->lock); 5903 spin_lock_init(&cache->lock);
5915 mutex_init(&cache->alloc_mutex); 5904 spin_lock_init(&cache->tree_lock);
5916 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
5917 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
5918 5908
5919 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
5920 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5974 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
5975 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
5976 5966
5977 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
5978 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
5979 5969
5980 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5981 if (ret > 0) 5971 if (ret > 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 08085af089e2..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2884 disko = 0;
2885 flags = 0; 2885 flags = 0;
2886 2886
2887 switch (em->block_start) { 2887 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1; 2888 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST; 2889 flags |= FIEMAP_EXTENT_LAST;
2891 break; 2890 } else if (em->block_start == EXTENT_MAP_HOLE) {
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN; 2891 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break; 2892 } else if (em->block_start == EXTENT_MAP_INLINE) {
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED); 2894 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break; 2895 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC | 2896 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN); 2897 FIEMAP_EXTENT_UNKNOWN);
2902 break; 2898 } else {
2903 default:
2904 disko = em->block_start; 2899 disko = em->block_start;
2905 break;
2906 } 2900 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2902 flags |= FIEMAP_EXTENT_ENCODED;
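The fiemap hunk above replaces a switch on em->block_start with an if/else ladder: the special values are u64 sentinels, and only the final branch describes a real on-disk extent. A compact standalone sketch of the same classification, using made-up sentinel and flag values in place of the EXTENT_MAP_* and FIEMAP_EXTENT_* constants from the kernel headers:

    #include <stdint.h>

    /* Made-up sentinels, roughly in the shape of the kernel's; the real
     * values come from the btrfs and fiemap headers. */
    #define MAP_LAST_BYTE ((uint64_t)-4)
    #define MAP_HOLE      ((uint64_t)-3)
    #define MAP_INLINE    ((uint64_t)-2)
    #define MAP_DELALLOC  ((uint64_t)-1)

    /* Made-up flag bits standing in for FIEMAP_EXTENT_*. */
    enum { F_LAST = 1, F_UNWRITTEN = 2, F_INLINE = 4, F_DELALLOC = 8 };

    static uint32_t classify_extent(uint64_t block_start, uint64_t *disko,
                                    int *last)
    {
        uint32_t flags = 0;

        if (block_start == MAP_LAST_BYTE) {
            *last = 1;                /* no more extents after this one */
            flags |= F_LAST;
        } else if (block_start == MAP_HOLE) {
            flags |= F_UNWRITTEN;
        } else if (block_start == MAP_INLINE) {
            flags |= F_INLINE;
        } else if (block_start == MAP_DELALLOC) {
            flags |= F_DELALLOC;
        } else {
            *disko = block_start;     /* ordinary extent: real disk offset */
        }
        return flags;
    }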
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space, so we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset and is at least bytes in size, and if it's not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
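To make the fuzzy/exact distinction introduced above concrete, here is a hedged userspace sketch of the lookup policy the new comment describes, with a sorted array and linear scan standing in for the kernel's rbtree walk (names and layout are illustrative):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Simplified free-space entry; the kernel keeps these in an rbtree. */
    struct span { uint64_t offset, bytes; };

    static struct span *search_offset(struct span *v, size_t n,
                                      uint64_t offset, uint64_t bytes,
                                      int fuzzy)
    {
        struct span *best = NULL;

        for (size_t i = 0; i < n; i++) {
            struct span *e = &v[i];

            if (e->offset == offset) {
                if (e->bytes >= bytes)
                    return e;          /* exact hit, large enough */
                if (!fuzzy)
                    return NULL;       /* exact mode: no fudging allowed */
                continue;
            }
            if (!fuzzy)
                continue;
            if (e->offset < offset) {
                /* an entry that contains the hint is always acceptable */
                if (e->offset + e->bytes - 1 >= offset && e->bytes >= bytes)
                    return e;
            } else if (e->bytes >= bytes &&
                       (!best || e->offset < best->offset)) {
                best = e;              /* closest big-enough entry past hint */
            }
        }
        return best;
    }

    int main(void)
    {
        struct span v[] = { { 0, 4096 }, { 8192, 1024 }, { 65536, 131072 } };
        struct span *s = search_offset(v, 3, 8192 + 512, 2048, 1);
        printf("%s\n", s ? "found" : "not found");
        return 0;
    }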
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
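The rewritten btrfs_add_free_space() above always allocates one candidate entry up front, then absorbs any neighbour that begins where the new range ends or ends where it begins, so the tree never holds two adjacent spans. A self-contained sketch of that coalescing step, with a fixed array standing in for the free-space tree and all names invented for the example:

    #include <stdint.h>
    #include <stdio.h>

    struct span { uint64_t offset, bytes; int used; };

    #define MAX_SPANS 16
    static struct span tree[MAX_SPANS];

    static struct span *find(uint64_t offset)        /* exact-start lookup */
    {
        for (int i = 0; i < MAX_SPANS; i++)
            if (tree[i].used && tree[i].offset == offset)
                return &tree[i];
        return NULL;
    }

    static struct span *find_ending_at(uint64_t offset)
    {
        for (int i = 0; i < MAX_SPANS; i++)
            if (tree[i].used && tree[i].offset + tree[i].bytes == offset)
                return &tree[i];
        return NULL;
    }

    static int add_free_space(uint64_t offset, uint64_t bytes)
    {
        struct span *right = find(offset + bytes);   /* starts where we end */
        struct span *left  = find_ending_at(offset); /* ends where we start */

        if (right) {                 /* grow forward over the right neighbour */
            bytes += right->bytes;
            right->used = 0;
        }
        if (left) {                  /* grow backward over the left neighbour */
            offset = left->offset;
            bytes += left->bytes;
            left->used = 0;
        }
        for (int i = 0; i < MAX_SPANS; i++) {
            if (!tree[i].used) {
                tree[i] = (struct span){ offset, bytes, 1 };
                return 0;
            }
        }
        return -1;                   /* -ENOMEM in the kernel */
    }

    int main(void)
    {
        add_free_space(0, 4096);
        add_free_space(8192, 4096);
        add_free_space(4096, 4096);  /* bridges the two spans above */
        for (int i = 0; i < MAX_SPANS; i++)
            if (tree[i].used)
                printf("span: offset=%llu bytes=%llu\n",
                       (unsigned long long)tree[i].offset,
                       (unsigned long long)tree[i].bytes);
        return 0;
    }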
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while(node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
498 /* someone else has already freed it don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while(1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
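btrfs_alloc_from_cluster() below walks the cluster in offset order and carves the request out of the first entry that is big enough and lies at or past min_start. The same idea in a short userspace sketch, with an array instead of the cluster's rbtree and all names illustrative:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    struct span { uint64_t offset, bytes; };

    /* Returns 0 when nothing fits; by assumption 0 is never a valid
     * allocation offset in this toy model. */
    static uint64_t alloc_from_cluster(struct span *v, size_t n,
                                       uint64_t bytes, uint64_t min_start)
    {
        for (size_t i = 0; i < n; i++) {
            struct span *e = &v[i];
            if (e->bytes < bytes || e->offset < min_start)
                continue;
            uint64_t ret = e->offset;
            e->offset += bytes;      /* shrink the entry from the front */
            e->bytes  -= bytes;
            return ret;
        }
        return 0;
    }

    int main(void)
    {
        struct span cluster[] = { { 4096, 2048 }, { 16384, 65536 } };
        printf("allocated at %llu\n", (unsigned long long)
               alloc_from_cluster(cluster, 2, 4096, 8192)); /* 16384 */
        return 0;
    }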
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least bytes free and up to empty_size + bytes free.
571 * We might not find them all in one contiguous area.
572 *
573 * returns zero and sets up cluster if things worked out, otherwise
574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
592 /* for metadata, allow allocations with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
596 * flushing out the delayed refs; it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while(1) {
628 /* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while(1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
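The sliding-window scan in btrfs_find_space_cluster() above is the heart of the new clustering code: grow a window over consecutive free entries until it holds bytes + empty_size, and restart the window whenever the next entry is too far away to be worth bridging. A simplified standalone sketch of that scan, under the assumption that entries are already sorted by offset (the kernel's retry and min_bytes escalation are omitted):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    struct span { uint64_t offset, bytes; };

    /* Returns the index of the first entry in the winning window,
     * or -1 for "no cluster found" (ENOSPC in the kernel). */
    static long find_cluster(const struct span *v, size_t n,
                             uint64_t bytes, uint64_t empty_size)
    {
        uint64_t want = bytes + empty_size;
        size_t start = 0;
        uint64_t window_free = 0;

        for (size_t i = 0; i < n; i++) {
            if (window_free == 0) {
                start = i;                       /* open the first window */
                window_free = v[i].bytes;
            } else if (v[i].offset - v[start].offset > want * 2) {
                start = i;                       /* too sparse: restart */
                window_free = v[i].bytes;
            } else {
                window_free += v[i].bytes;       /* extend the window */
            }
            if (window_free >= want)
                return (long)start;
        }
        return -1;
    }

    int main(void)
    {
        const struct span v[] = {
            { 0, 1024 }, { 2048, 1024 }, { 409600, 8192 }
        };
        printf("window starts at index %ld\n",
               find_cluster(v, 3, 4096, 4096)); /* expect 2 */
        return 0;
    }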
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06d8db5afb08..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3481 3481
3482 if (dir) { 3482 if (dir) {
3483 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3484 if (ret) 3484 if (ret) {
3485 iput(inode);
3485 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3486 } 3488 }
3487 /* 3489 /*
3488 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3565,6 +3567,7 @@ fail:
3565 if (dir) 3567 if (dir)
3566 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3567 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3568 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3569} 3572}
3570 3573
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a5310c0f41e2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 396 return ret;
386} 397}
387 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",notreelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flushoncommit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
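The new btrfs_show_options() above follows the usual show_options pattern: print only the options whose values differ from their defaults, so /proc/mounts stays short and the output round-trips back through the mount option parser. A toy illustration of the same pattern, with the option set and defaults invented for the example:

    #include <stdio.h>
    #include <stdbool.h>

    struct opts {
        bool nodatasum;
        bool ssd;
        unsigned long long max_inline;
    };

    #define DEFAULT_MAX_INLINE (8192ULL * 1024)

    /* Emit ",name" or ",name=value" only for non-default settings. */
    static void show_options(FILE *seq, const struct opts *o)
    {
        if (o->nodatasum)
            fputs(",nodatasum", seq);
        if (o->ssd)
            fputs(",ssd", seq);
        if (o->max_inline != DEFAULT_MAX_INLINE)
            fprintf(seq, ",max_inline=%llu", o->max_inline);
    }

    int main(void)
    {
        struct opts o = { true, false, 4096 };
        show_options(stdout, &o);   /* prints ,nodatasum,max_inline=4096 */
        putchar('\n');
        return 0;
    }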
388static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
389{ 435{
390 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 664782c6a2df..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -974,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
974 int ret; 972 int ret;
975 int should_grow = 0; 973 int should_grow = 0;
976 unsigned long now = get_seconds(); 974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
977 976
978 btrfs_run_ordered_operations(root, 0); 977 btrfs_run_ordered_operations(root, 0);
979 978
@@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1053 1052
1054 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
1055 1054
1056 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
1057 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
1058 BUG_ON(ret); 1059 BUG_ON(ret);
1059 } 1060 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc9b87a7975b..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
262 struct extent_buffer *eb, 262 struct extent_buffer *eb,
263 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
264{ 264{
265 if (wc->pin) { 265 if (wc->pin)
266 mutex_lock(&log->fs_info->pinned_mutex);
267 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
268 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
269 }
270 268
271 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
272 if (wc->write) 270 if (wc->write)
@@ -1224,8 +1222,7 @@ insert:
1224 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1225 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1226 1224
1227 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1228 BUG();
1229 goto out; 1226 goto out;
1230} 1227}
1231 1228
@@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2900 2897
2901 sb = inode->i_sb; 2898 sb = inode->i_sb;
2902 2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2903 if (root->fs_info->last_trans_log_full_commit > 2905 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) { 2906 root->fs_info->last_trans_committed) {
2905 ret = 1; 2907 ret = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
221 * This code does two great things: it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
268 * This will help anyone who is waiting on the IO; they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
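The io_context test added above keeps the submission loop running only while the process is still inside a recently refreshed request batch, so the elevator can keep merging instead of blocking. A hedged userspace sketch of just that timing check, with plain integers standing in for jiffies, HZ, and the kernel's io_context:

    #include <stdbool.h>
    #include <stdio.h>

    struct io_ctx_view {
        int nr_batch_requests;           /* requests left in our batch */
        unsigned long last_waited_stamp; /* when we last slept for a request */
    };

    static bool keep_submitting(const struct io_ctx_view *ioc,
                                unsigned long now, unsigned long hz,
                                unsigned long *seen_waited)
    {
        if (!ioc || ioc->nr_batch_requests <= 0)
            return false;
        /* the batch must have been refreshed within the last ~20ms ... */
        if (now - ioc->last_waited_stamp >= hz / 50)
            return false;
        /* ... and must be the same batch we observed last time around */
        if (*seen_waited && ioc->last_waited_stamp != *seen_waited)
            return false;
        *seen_waited = ioc->last_waited_stamp;
        return true;
    }

    int main(void)
    {
        struct io_ctx_view ioc = { 3, 1000 };
        unsigned long seen = 0;
        printf("%d\n", keep_submitting(&ioc, 1005, 1000, &seen)); /* 1 */
        printf("%d\n", keep_submitting(&ioc, 1100, 1000, &seen)); /* 0 */
        return 0;
    }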
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index f5f8b15a6e40..5d55a896ff78 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
199 head = page_buffers(page); 199 head = page_buffers(page);
200 bh = head; 200 bh = head;
201 do { 201 do {
202 if (bh->b_blocknr == block) { 202 if (!buffer_mapped(bh))
203 all_mapped = 0;
204 else if (bh->b_blocknr == block) {
203 ret = bh; 205 ret = bh;
204 get_bh(bh); 206 get_bh(bh);
205 goto out_unlock; 207 goto out_unlock;
206 } 208 }
207 if (!buffer_mapped(bh))
208 all_mapped = 0;
209 bh = bh->b_this_page; 209 bh = bh->b_this_page;
210 } while (bh != head); 210 } while (bh != head);
211 211
@@ -1595,6 +1595,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1595 struct buffer_head *bh, *head; 1595 struct buffer_head *bh, *head;
1596 const unsigned blocksize = 1 << inode->i_blkbits; 1596 const unsigned blocksize = 1 << inode->i_blkbits;
1597 int nr_underway = 0; 1597 int nr_underway = 0;
1598 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
1598 1599
1599 BUG_ON(!PageLocked(page)); 1600 BUG_ON(!PageLocked(page));
1600 1601
@@ -1686,7 +1687,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1686 do { 1687 do {
1687 struct buffer_head *next = bh->b_this_page; 1688 struct buffer_head *next = bh->b_this_page;
1688 if (buffer_async_write(bh)) { 1689 if (buffer_async_write(bh)) {
1689 submit_bh(WRITE, bh); 1690 submit_bh(write_op, bh);
1690 nr_underway++; 1691 nr_underway++;
1691 } 1692 }
1692 bh = next; 1693 bh = next;
@@ -1740,7 +1741,7 @@ recover:
1740 struct buffer_head *next = bh->b_this_page; 1741 struct buffer_head *next = bh->b_this_page;
1741 if (buffer_async_write(bh)) { 1742 if (buffer_async_write(bh)) {
1742 clear_buffer_dirty(bh); 1743 clear_buffer_dirty(bh);
1743 submit_bh(WRITE, bh); 1744 submit_bh(write_op, bh);
1744 nr_underway++; 1745 nr_underway++;
1745 } 1746 }
1746 bh = next; 1747 bh = next;
@@ -3315,7 +3316,6 @@ EXPORT_SYMBOL(cont_write_begin);
3315EXPORT_SYMBOL(end_buffer_read_sync); 3316EXPORT_SYMBOL(end_buffer_read_sync);
3316EXPORT_SYMBOL(end_buffer_write_sync); 3317EXPORT_SYMBOL(end_buffer_write_sync);
3317EXPORT_SYMBOL(file_fsync); 3318EXPORT_SYMBOL(file_fsync);
3318EXPORT_SYMBOL(fsync_bdev);
3319EXPORT_SYMBOL(generic_block_bmap); 3319EXPORT_SYMBOL(generic_block_bmap);
3320EXPORT_SYMBOL(generic_cont_expand_simple); 3320EXPORT_SYMBOL(generic_cont_expand_simple);
3321EXPORT_SYMBOL(init_buffer); 3321EXPORT_SYMBOL(init_buffer);
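For context on the __block_write_full_page() change above, here is a compilable sketch of the write_op selection, with the flag and mode values reduced to stand-in enums: a WB_SYNC_ALL writeback (a data-integrity sync such as fsync) submits its buffers with WRITE_SYNC so the block layer can treat them preferentially, while background writeback keeps plain WRITE.

/*
 * Minimal userspace sketch of the write_op selection added above.
 * The enum values are stand-ins for the kernel's request flags and
 * writeback sync modes.
 */
#include <stdio.h>

enum { WRITE = 1, WRITE_SYNC = 2 };          /* stand-in request flags */
enum { WB_SYNC_NONE, WB_SYNC_ALL };          /* stand-in sync modes */

struct writeback_control { int sync_mode; }; /* minimal stand-in */

static int pick_write_op(const struct writeback_control *wbc)
{
	return wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE;
}

int main(void)
{
	struct writeback_control bg = { WB_SYNC_NONE };
	struct writeback_control sync = { WB_SYNC_ALL };

	printf("background -> %s\n",
	       pick_write_op(&bg) == WRITE_SYNC ? "WRITE_SYNC" : "WRITE");
	printf("integrity  -> %s\n",
	       pick_write_op(&sync) == WRITE_SYNC ? "WRITE_SYNC" : "WRITE");
	return 0;
}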
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
1
2config CACHEFILES
3 tristate "Filesystem caching on files"
4 depends on FSCACHE && BLOCK
5 help
6 This permits use of a mounted filesystem as a cache for other
7 filesystems - primarily networking filesystems - thus allowing fast
8 local disk to enhance the speed of slower devices.
9
10 See Documentation/filesystems/caching/cachefiles.txt for more
11 information.
12
13config CACHEFILES_DEBUG
14 bool "Debug CacheFiles"
15 depends on CACHEFILES
16 help
17 This permits debugging to be dynamically enabled in the filesystem
18 caching on files module. If this is set, the debugging output may be
19 enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
20 by including a debugging specifier in /etc/cachefilesd.conf.
21
22config CACHEFILES_HISTOGRAM
23 bool "Gather latency information on CacheFiles"
24 depends on CACHEFILES && PROC_FS
25 help
26
27 This option causes latency information to be gathered on CacheFiles
28 operations and exported through the file:
29
30 /proc/fs/cachefiles/histogram
31
32 The generation of this histogram adds a certain amount of overhead to
33 execution as there are a number of points at which data is gathered,
34 and on a multi-CPU system these may be on cachelines that keep
35 bouncing between CPUs. On the other hand, the histogram may be
36 useful for debugging purposes. Saying 'N' here is recommended.
37
38 See Documentation/filesystems/caching/cachefiles.txt for more
39 information.
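The debug mask mentioned under CACHEFILES_DEBUG maps to the bit definitions that appear later in cachefiles/internal.h. A small userspace sketch, with the mask value chosen arbitrarily for the demo, shows how the three bits select entry, exit and general debug lines:

/*
 * Userspace sketch of how the debug mask bits gate trace output when
 * CACHEFILES_DEBUG is set: bit 0 enables function-entry lines, bit 1
 * exit lines, bit 2 general debug lines, so a mask of 5 prints
 * entries and debug lines but not exits.
 */
#include <stdio.h>

#define CACHEFILES_DEBUG_KENTER 1
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4

static unsigned cachefiles_debug = 5;   /* assumed mask for the demo */

int main(void)
{
	if (cachefiles_debug & CACHEFILES_DEBUG_KENTER)
		printf("==> some_function()\n");
	if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE)
		printf("<== some_function()\n");
	if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG)
		printf("some debug line\n");
	return 0;
}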
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
1#
2# Makefile for caching in a mounted filesystem
3#
4
5cachefiles-y := \
6 bind.o \
7 daemon.o \
8 interface.o \
9 key.o \
10 main.o \
11 namei.o \
12 rdwr.o \
13 security.o \
14 xattr.o
15
16cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
17
18obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
1/* Bind and unbind a cache from the filesystem backing it
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/mount.h>
21#include <linux/statfs.h>
22#include <linux/ctype.h>
23#include "internal.h"
24
25static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache);
26
27/*
28 * bind a directory as a cache
29 */
30int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
31{
32 _enter("{%u,%u,%u,%u,%u,%u},%s",
33 cache->frun_percent,
34 cache->fcull_percent,
35 cache->fstop_percent,
36 cache->brun_percent,
37 cache->bcull_percent,
38 cache->bstop_percent,
39 args);
40
41 /* start by checking things over */
42 ASSERT(cache->fstop_percent >= 0 &&
43 cache->fstop_percent < cache->fcull_percent &&
44 cache->fcull_percent < cache->frun_percent &&
45 cache->frun_percent < 100);
46
47 ASSERT(cache->bstop_percent >= 0 &&
48 cache->bstop_percent < cache->bcull_percent &&
49 cache->bcull_percent < cache->brun_percent &&
50 cache->brun_percent < 100);
51
52 if (*args) {
53 kerror("'bind' command doesn't take an argument");
54 return -EINVAL;
55 }
56
57 if (!cache->rootdirname) {
58 kerror("No cache directory specified");
59 return -EINVAL;
60 }
61
62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 kerror("Cache already bound");
65 return -EBUSY;
66 }
67
68 /* make sure we have copies of the tag and dirname strings */
69 if (!cache->tag) {
70 /* the tag string is released by the fops->release()
71 * function, so we don't release it on error here */
72 cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
73 if (!cache->tag)
74 return -ENOMEM;
75 }
76
77 /* add the cache */
78 return cachefiles_daemon_add_cache(cache);
79}
80
81/*
82 * add a cache
83 */
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{
86 struct cachefiles_object *fsdef;
87 struct nameidata nd;
88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred;
91 int ret;
92
93 _enter("");
94
95 /* we want to work under the module's security ID */
96 ret = cachefiles_get_security_ID(cache);
97 if (ret < 0)
98 return ret;
99
100 cachefiles_begin_secure(cache, &saved_cred);
101
102 /* allocate the root index object */
103 ret = -ENOMEM;
104
105 fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
106 if (!fsdef)
107 goto error_root_object;
108
109 ASSERTCMP(fsdef->backer, ==, NULL);
110
111 atomic_set(&fsdef->usage, 1);
112 fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
113
114 _debug("- fsdef %p", fsdef);
115
116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd));
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0)
121 goto error_open_root;
122
123 cache->mnt = mntget(nd.path.mnt);
124 root = dget(nd.path.dentry);
125 path_put(&nd.path);
126
127 /* check parameters */
128 ret = -EOPNOTSUPP;
129 if (!root->d_inode ||
130 !root->d_inode->i_op ||
131 !root->d_inode->i_op->lookup ||
132 !root->d_inode->i_op->mkdir ||
133 !root->d_inode->i_op->setxattr ||
134 !root->d_inode->i_op->getxattr ||
135 !root->d_sb ||
136 !root->d_sb->s_op ||
137 !root->d_sb->s_op->statfs ||
138 !root->d_sb->s_op->sync_fs)
139 goto error_unsupported;
140
141 ret = -EROFS;
142 if (root->d_sb->s_flags & MS_RDONLY)
143 goto error_unsupported;
144
145 /* determine the security of the on-disk cache as this governs
146 * security ID of files we create */
147 ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
148 if (ret < 0)
149 goto error_unsupported;
150
151 /* get the cache size and blocksize */
152 ret = vfs_statfs(root, &stats);
153 if (ret < 0)
154 goto error_unsupported;
155
156 ret = -ERANGE;
157 if (stats.f_bsize <= 0)
158 goto error_unsupported;
159
160 ret = -EOPNOTSUPP;
161 if (stats.f_bsize > PAGE_SIZE)
162 goto error_unsupported;
163
164 cache->bsize = stats.f_bsize;
165 cache->bshift = 0;
166 if (stats.f_bsize < PAGE_SIZE)
167 cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
168
169 _debug("blksize %u (shift %u)",
170 cache->bsize, cache->bshift);
171
172 _debug("size %llu, avail %llu",
173 (unsigned long long) stats.f_blocks,
174 (unsigned long long) stats.f_bavail);
175
176 /* set up caching limits */
177 do_div(stats.f_files, 100);
178 cache->fstop = stats.f_files * cache->fstop_percent;
179 cache->fcull = stats.f_files * cache->fcull_percent;
180 cache->frun = stats.f_files * cache->frun_percent;
181
182 _debug("limits {%llu,%llu,%llu} files",
183 (unsigned long long) cache->frun,
184 (unsigned long long) cache->fcull,
185 (unsigned long long) cache->fstop);
186
187 stats.f_blocks >>= cache->bshift;
188 do_div(stats.f_blocks, 100);
189 cache->bstop = stats.f_blocks * cache->bstop_percent;
190 cache->bcull = stats.f_blocks * cache->bcull_percent;
191 cache->brun = stats.f_blocks * cache->brun_percent;
192
193 _debug("limits {%llu,%llu,%llu} blocks",
194 (unsigned long long) cache->brun,
195 (unsigned long long) cache->bcull,
196 (unsigned long long) cache->bstop);
197
198 /* get the cache directory and check its type */
199 cachedir = cachefiles_get_directory(cache, root, "cache");
200 if (IS_ERR(cachedir)) {
201 ret = PTR_ERR(cachedir);
202 goto error_unsupported;
203 }
204
205 fsdef->dentry = cachedir;
206 fsdef->fscache.cookie = NULL;
207
208 ret = cachefiles_check_object_type(fsdef);
209 if (ret < 0)
210 goto error_unsupported;
211
212 /* get the graveyard directory */
213 graveyard = cachefiles_get_directory(cache, root, "graveyard");
214 if (IS_ERR(graveyard)) {
215 ret = PTR_ERR(graveyard);
216 goto error_unsupported;
217 }
218
219 cache->graveyard = graveyard;
220
221 /* publish the cache */
222 fscache_init_cache(&cache->cache,
223 &cachefiles_cache_ops,
224 "%s",
225 fsdef->dentry->d_sb->s_id);
226
227 fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
228
229 ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
230 if (ret < 0)
231 goto error_add_cache;
232
233 /* done */
234 set_bit(CACHEFILES_READY, &cache->flags);
235 dput(root);
236
237 printk(KERN_INFO "CacheFiles:"
238 " File cache on %s registered\n",
239 cache->cache.identifier);
240
241 /* check how much space the cache has */
242 cachefiles_has_space(cache, 0, 0);
243 cachefiles_end_secure(cache, saved_cred);
244 return 0;
245
246error_add_cache:
247 dput(cache->graveyard);
248 cache->graveyard = NULL;
249error_unsupported:
250 mntput(cache->mnt);
251 cache->mnt = NULL;
252 dput(fsdef->dentry);
253 fsdef->dentry = NULL;
254 dput(root);
255error_open_root:
256 kmem_cache_free(cachefiles_object_jar, fsdef);
257error_root_object:
258 cachefiles_end_secure(cache, saved_cred);
259 kerror("Failed to register: %d", ret);
260 return ret;
261}
262
263/*
264 * unbind a cache on fd release
265 */
266void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
267{
268 _enter("");
269
270 if (test_bit(CACHEFILES_READY, &cache->flags)) {
271 printk(KERN_INFO "CacheFiles:"
272 " File cache on %s unregistering\n",
273 cache->cache.identifier);
274
275 fscache_withdraw_cache(&cache->cache);
276 }
277
278 dput(cache->graveyard);
279 mntput(cache->mnt);
280
281 kfree(cache->rootdirname);
282 kfree(cache->secctx);
283 kfree(cache->tag);
284
285 _leave("");
286}
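The threshold arithmetic in cachefiles_daemon_add_cache() divides the statfs totals by 100 once and then multiplies by each configured percentage; block counts are first shifted by bshift to convert them into page units. A standalone sketch of the same arithmetic, with invented sample statfs numbers:

/*
 * Userspace sketch of the limit computation above. The sample totals
 * and bshift are assumptions for the demonstration; the percent
 * defaults match the ones set in cachefiles_daemon_open().
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t f_files = 655360;      /* total inodes (assumed sample) */
	uint64_t f_blocks = 2621440;    /* total blocks (assumed sample) */
	unsigned bshift = 2;            /* e.g. 1K blocks on 4K pages */
	unsigned fstop = 1, fcull = 5, frun = 7;   /* default percents */

	f_files /= 100;                 /* one percent's worth of files */
	printf("files: run=%llu cull=%llu stop=%llu\n",
	       (unsigned long long)(f_files * frun),
	       (unsigned long long)(f_files * fcull),
	       (unsigned long long)(f_files * fstop));

	f_blocks >>= bshift;            /* convert blocks to pages */
	f_blocks /= 100;                /* one percent's worth of pages */
	printf("pages: run=%llu cull=%llu stop=%llu\n",
	       (unsigned long long)(f_blocks * frun),
	       (unsigned long long)(f_blocks * fcull),
	       (unsigned long long)(f_blocks * fstop));
	return 0;
}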
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
1/* Daemon interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/poll.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/ctype.h>
24#include <linux/fs_struct.h>
25#include "internal.h"
26
27static int cachefiles_daemon_open(struct inode *, struct file *);
28static int cachefiles_daemon_release(struct inode *, struct file *);
29static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
30 loff_t *);
31static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
32 size_t, loff_t *);
33static unsigned int cachefiles_daemon_poll(struct file *,
34 struct poll_table_struct *);
35static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
36static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
37static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
38static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
39static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
40static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
41static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
42static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
43static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
44static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
45static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
46static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
47
48static unsigned long cachefiles_open;
49
50const struct file_operations cachefiles_daemon_fops = {
51 .owner = THIS_MODULE,
52 .open = cachefiles_daemon_open,
53 .release = cachefiles_daemon_release,
54 .read = cachefiles_daemon_read,
55 .write = cachefiles_daemon_write,
56 .poll = cachefiles_daemon_poll,
57};
58
59struct cachefiles_daemon_cmd {
60 char name[8];
61 int (*handler)(struct cachefiles_cache *cache, char *args);
62};
63
64static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
65 { "bind", cachefiles_daemon_bind },
66 { "brun", cachefiles_daemon_brun },
67 { "bcull", cachefiles_daemon_bcull },
68 { "bstop", cachefiles_daemon_bstop },
69 { "cull", cachefiles_daemon_cull },
70 { "debug", cachefiles_daemon_debug },
71 { "dir", cachefiles_daemon_dir },
72 { "frun", cachefiles_daemon_frun },
73 { "fcull", cachefiles_daemon_fcull },
74 { "fstop", cachefiles_daemon_fstop },
75 { "inuse", cachefiles_daemon_inuse },
76 { "secctx", cachefiles_daemon_secctx },
77 { "tag", cachefiles_daemon_tag },
78 { "", NULL }
79};
80
81
82/*
83 * do various checks
84 */
85static int cachefiles_daemon_open(struct inode *inode, struct file *file)
86{
87 struct cachefiles_cache *cache;
88
89 _enter("");
90
91 /* only the superuser may do this */
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94
95 /* the cachefiles device may only be open once at a time */
96 if (xchg(&cachefiles_open, 1) == 1)
97 return -EBUSY;
98
99 /* allocate a cache record */
100 cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
101 if (!cache) {
102 cachefiles_open = 0;
103 return -ENOMEM;
104 }
105
106 mutex_init(&cache->daemon_mutex);
107 cache->active_nodes = RB_ROOT;
108 rwlock_init(&cache->active_lock);
109 init_waitqueue_head(&cache->daemon_pollwq);
110
111 /* set default caching limits
112 * - limit at 1% free space and/or free files
113 * - cull below 5% free space and/or free files
114 * - cease culling above 7% free space and/or free files
115 */
116 cache->frun_percent = 7;
117 cache->fcull_percent = 5;
118 cache->fstop_percent = 1;
119 cache->brun_percent = 7;
120 cache->bcull_percent = 5;
121 cache->bstop_percent = 1;
122
123 file->private_data = cache;
124 cache->cachefilesd = file;
125 return 0;
126}
127
128/*
129 * release a cache
130 */
131static int cachefiles_daemon_release(struct inode *inode, struct file *file)
132{
133 struct cachefiles_cache *cache = file->private_data;
134
135 _enter("");
136
137 ASSERT(cache);
138
139 set_bit(CACHEFILES_DEAD, &cache->flags);
140
141 cachefiles_daemon_unbind(cache);
142
143 ASSERT(!cache->active_nodes.rb_node);
144
145 /* clean up the control file interface */
146 cache->cachefilesd = NULL;
147 file->private_data = NULL;
148 cachefiles_open = 0;
149
150 kfree(cache);
151
152 _leave("");
153 return 0;
154}
155
156/*
157 * read the cache state
158 */
159static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
160 size_t buflen, loff_t *pos)
161{
162 struct cachefiles_cache *cache = file->private_data;
163 char buffer[256];
164 int n;
165
166 //_enter(",,%zu,", buflen);
167
168 if (!test_bit(CACHEFILES_READY, &cache->flags))
169 return 0;
170
171 /* check how much space the cache has */
172 cachefiles_has_space(cache, 0, 0);
173
174 /* summarise */
175 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
176
177 n = snprintf(buffer, sizeof(buffer),
178 "cull=%c"
179 " frun=%llx"
180 " fcull=%llx"
181 " fstop=%llx"
182 " brun=%llx"
183 " bcull=%llx"
184 " bstop=%llx",
185 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
186 (unsigned long long) cache->frun,
187 (unsigned long long) cache->fcull,
188 (unsigned long long) cache->fstop,
189 (unsigned long long) cache->brun,
190 (unsigned long long) cache->bcull,
191 (unsigned long long) cache->bstop
192 );
193
194 if (n > buflen)
195 return -EMSGSIZE;
196
197 if (copy_to_user(_buffer, buffer, n) != 0)
198 return -EFAULT;
199
200 return n;
201}
202
203/*
204 * command the cache
205 */
206static ssize_t cachefiles_daemon_write(struct file *file,
207 const char __user *_data,
208 size_t datalen,
209 loff_t *pos)
210{
211 const struct cachefiles_daemon_cmd *cmd;
212 struct cachefiles_cache *cache = file->private_data;
213 ssize_t ret;
214 char *data, *args, *cp;
215
216 //_enter(",,%zu,", datalen);
217
218 ASSERT(cache);
219
220 if (test_bit(CACHEFILES_DEAD, &cache->flags))
221 return -EIO;
222
223 if (datalen < 0 || datalen > PAGE_SIZE - 1)
224 return -EOPNOTSUPP;
225
226 /* drag the command string into the kernel so we can parse it */
227 data = kmalloc(datalen + 1, GFP_KERNEL);
228 if (!data)
229 return -ENOMEM;
230
231 ret = -EFAULT;
232 if (copy_from_user(data, _data, datalen) != 0)
233 goto error;
234
235 data[datalen] = '\0';
236
237 ret = -EINVAL;
238 if (memchr(data, '\0', datalen))
239 goto error;
240
241 /* strip any newline */
242 cp = memchr(data, '\n', datalen);
243 if (cp) {
244 if (cp == data)
245 goto error;
246
247 *cp = '\0';
248 }
249
250 /* parse the command */
251 ret = -EOPNOTSUPP;
252
253 for (args = data; *args; args++)
254 if (isspace(*args))
255 break;
256 if (*args) {
257 if (args == data)
258 goto error;
259 *args = '\0';
260 for (args++; isspace(*args); args++)
261 continue;
262 }
263
264 /* run the appropriate command handler */
265 for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
266 if (strcmp(cmd->name, data) == 0)
267 goto found_command;
268
269error:
270 kfree(data);
271 //_leave(" = %zd", ret);
272 return ret;
273
274found_command:
275 mutex_lock(&cache->daemon_mutex);
276
277 ret = -EIO;
278 if (!test_bit(CACHEFILES_DEAD, &cache->flags))
279 ret = cmd->handler(cache, args);
280
281 mutex_unlock(&cache->daemon_mutex);
282
283 if (ret == 0)
284 ret = datalen;
285 goto error;
286}
287
288/*
289 * poll for culling state
290 * - use POLLOUT to indicate culling state
291 */
292static unsigned int cachefiles_daemon_poll(struct file *file,
293 struct poll_table_struct *poll)
294{
295 struct cachefiles_cache *cache = file->private_data;
296 unsigned int mask;
297
298 poll_wait(file, &cache->daemon_pollwq, poll);
299 mask = 0;
300
301 if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
302 mask |= POLLIN;
303
304 if (test_bit(CACHEFILES_CULLING, &cache->flags))
305 mask |= POLLOUT;
306
307 return mask;
308}
309
310/*
311 * give a range error for cache space constraints
312 * - can be tail-called
313 */
314static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
315 char *args)
316{
317 kerror("Free space limits must be in range"
318 " 0%%<=stop<cull<run<100%%");
319
320 return -EINVAL;
321}
322
323/*
324 * set the percentage of files at which to stop culling
325 * - command: "frun <N>%"
326 */
327static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
328{
329 unsigned long frun;
330
331 _enter(",%s", args);
332
333 if (!*args)
334 return -EINVAL;
335
336 frun = simple_strtoul(args, &args, 10);
337 if (args[0] != '%' || args[1] != '\0')
338 return -EINVAL;
339
340 if (frun <= cache->fcull_percent || frun >= 100)
341 return cachefiles_daemon_range_error(cache, args);
342
343 cache->frun_percent = frun;
344 return 0;
345}
346
347/*
348 * set the percentage of files at which to start culling
349 * - command: "fcull <N>%"
350 */
351static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
352{
353 unsigned long fcull;
354
355 _enter(",%s", args);
356
357 if (!*args)
358 return -EINVAL;
359
360 fcull = simple_strtoul(args, &args, 10);
361 if (args[0] != '%' || args[1] != '\0')
362 return -EINVAL;
363
364 if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
365 return cachefiles_daemon_range_error(cache, args);
366
367 cache->fcull_percent = fcull;
368 return 0;
369}
370
371/*
372 * set the percentage of files at which to stop allocating
373 * - command: "fstop <N>%"
374 */
375static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
376{
377 unsigned long fstop;
378
379 _enter(",%s", args);
380
381 if (!*args)
382 return -EINVAL;
383
384 fstop = simple_strtoul(args, &args, 10);
385 if (args[0] != '%' || args[1] != '\0')
386 return -EINVAL;
387
388 if (fstop < 0 || fstop >= cache->fcull_percent)
389 return cachefiles_daemon_range_error(cache, args);
390
391 cache->fstop_percent = fstop;
392 return 0;
393}
394
395/*
396 * set the percentage of blocks at which to stop culling
397 * - command: "brun <N>%"
398 */
399static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
400{
401 unsigned long brun;
402
403 _enter(",%s", args);
404
405 if (!*args)
406 return -EINVAL;
407
408 brun = simple_strtoul(args, &args, 10);
409 if (args[0] != '%' || args[1] != '\0')
410 return -EINVAL;
411
412 if (brun <= cache->bcull_percent || brun >= 100)
413 return cachefiles_daemon_range_error(cache, args);
414
415 cache->brun_percent = brun;
416 return 0;
417}
418
419/*
420 * set the percentage of blocks at which to start culling
421 * - command: "bcull <N>%"
422 */
423static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
424{
425 unsigned long bcull;
426
427 _enter(",%s", args);
428
429 if (!*args)
430 return -EINVAL;
431
432 bcull = simple_strtoul(args, &args, 10);
433 if (args[0] != '%' || args[1] != '\0')
434 return -EINVAL;
435
436 if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
437 return cachefiles_daemon_range_error(cache, args);
438
439 cache->bcull_percent = bcull;
440 return 0;
441}
442
443/*
444 * set the percentage of blocks at which to stop allocating
445 * - command: "bstop <N>%"
446 */
447static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
448{
449 unsigned long bstop;
450
451 _enter(",%s", args);
452
453 if (!*args)
454 return -EINVAL;
455
456 bstop = simple_strtoul(args, &args, 10);
457 if (args[0] != '%' || args[1] != '\0')
458 return -EINVAL;
459
460 if (bstop < 0 || bstop >= cache->bcull_percent)
461 return cachefiles_daemon_range_error(cache, args);
462
463 cache->bstop_percent = bstop;
464 return 0;
465}
466
467/*
468 * set the cache directory
469 * - command: "dir <name>"
470 */
471static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
472{
473 char *dir;
474
475 _enter(",%s", args);
476
477 if (!*args) {
478 kerror("Empty directory specified");
479 return -EINVAL;
480 }
481
482 if (cache->rootdirname) {
483 kerror("Second cache directory specified");
484 return -EEXIST;
485 }
486
487 dir = kstrdup(args, GFP_KERNEL);
488 if (!dir)
489 return -ENOMEM;
490
491 cache->rootdirname = dir;
492 return 0;
493}
494
495/*
496 * set the cache security context
497 * - command: "secctx <ctx>"
498 */
499static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
500{
501 char *secctx;
502
503 _enter(",%s", args);
504
505 if (!*args) {
506 kerror("Empty security context specified");
507 return -EINVAL;
508 }
509
510 if (cache->secctx) {
511 kerror("Second security context specified");
512 return -EINVAL;
513 }
514
515 secctx = kstrdup(args, GFP_KERNEL);
516 if (!secctx)
517 return -ENOMEM;
518
519 cache->secctx = secctx;
520 return 0;
521}
522
523/*
524 * set the cache tag
525 * - command: "tag <name>"
526 */
527static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
528{
529 char *tag;
530
531 _enter(",%s", args);
532
533 if (!*args) {
534 kerror("Empty tag specified");
535 return -EINVAL;
536 }
537
538 if (cache->tag)
539 return -EEXIST;
540
541 tag = kstrdup(args, GFP_KERNEL);
542 if (!tag)
543 return -ENOMEM;
544
545 cache->tag = tag;
546 return 0;
547}
548
549/*
550 * request a node in the cache be culled from the current working directory
551 * - command: "cull <name>"
552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{
555 struct fs_struct *fs;
556 struct dentry *dir;
557 const struct cred *saved_cred;
558 int ret;
559
560 _enter(",%s", args);
561
562 if (strchr(args, '/'))
563 goto inval;
564
565 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
566 kerror("cull applied to unready cache");
567 return -EIO;
568 }
569
570 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
571 kerror("cull applied to dead cache");
572 return -EIO;
573 }
574
575 /* extract the directory dentry from the cwd */
576 fs = current->fs;
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580
581 if (!S_ISDIR(dir->d_inode->i_mode))
582 goto notdir;
583
584 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args);
586 cachefiles_end_secure(cache, saved_cred);
587
588 dput(dir);
589 _leave(" = %d", ret);
590 return ret;
591
592notdir:
593 dput(dir);
594 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR;
596
597inval:
598 kerror("cull command requires dirfd and filename");
599 return -EINVAL;
600}
601
602/*
603 * set debugging mode
604 * - command: "debug <mask>"
605 */
606static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
607{
608 unsigned long mask;
609
610 _enter(",%s", args);
611
612 mask = simple_strtoul(args, &args, 0);
613 if (args[0] != '\0')
614 goto inval;
615
616 cachefiles_debug = mask;
617 _leave(" = 0");
618 return 0;
619
620inval:
621 kerror("debug command requires mask");
622 return -EINVAL;
623}
624
625/*
626 * find out whether an object in the current working directory is in use or not
627 * - command: "inuse <name>"
628 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{
631 struct fs_struct *fs;
632 struct dentry *dir;
633 const struct cred *saved_cred;
634 int ret;
635
636 //_enter(",%s", args);
637
638 if (strchr(args, '/'))
639 goto inval;
640
641 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
642 kerror("inuse applied to unready cache");
643 return -EIO;
644 }
645
646 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
647 kerror("inuse applied to dead cache");
648 return -EIO;
649 }
650
651 /* extract the directory dentry from the cwd */
652 fs = current->fs;
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656
657 if (!S_ISDIR(dir->d_inode->i_mode))
658 goto notdir;
659
660 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args);
662 cachefiles_end_secure(cache, saved_cred);
663
664 dput(dir);
665 //_leave(" = %d", ret);
666 return ret;
667
668notdir:
669 dput(dir);
670 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR;
672
673inval:
674 kerror("inuse command requires dirfd and filename");
675 return -EINVAL;
676}
677
678/*
679 * see if we have space for a number of pages and/or a number of files in the
680 * cache
681 */
682int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr)
684{
685 struct kstatfs stats;
686 int ret;
687
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
689 // (unsigned long long) cache->frun,
690 // (unsigned long long) cache->fcull,
691 // (unsigned long long) cache->fstop,
692 // (unsigned long long) cache->brun,
693 // (unsigned long long) cache->bcull,
694 // (unsigned long long) cache->bstop,
695 // fnr, bnr);
696
697 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats));
699
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats);
701 if (ret < 0) {
702 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed");
704 _leave(" = %d", ret);
705 return ret;
706 }
707
708 stats.f_bavail >>= cache->bshift;
709
710 //_debug("avail %llu,%llu",
711 // (unsigned long long) stats.f_ffree,
712 // (unsigned long long) stats.f_bavail);
713
714 /* see if there is sufficient space */
715 if (stats.f_ffree > fnr)
716 stats.f_ffree -= fnr;
717 else
718 stats.f_ffree = 0;
719
720 if (stats.f_bavail > bnr)
721 stats.f_bavail -= bnr;
722 else
723 stats.f_bavail = 0;
724
725 ret = -ENOBUFS;
726 if (stats.f_ffree < cache->fstop ||
727 stats.f_bavail < cache->bstop)
728 goto begin_cull;
729
730 ret = 0;
731 if (stats.f_ffree < cache->fcull ||
732 stats.f_bavail < cache->bcull)
733 goto begin_cull;
734
735 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
736 stats.f_ffree >= cache->frun &&
737 stats.f_bavail >= cache->brun &&
738 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
739 ) {
740 _debug("cease culling");
741 cachefiles_state_changed(cache);
742 }
743
744 //_leave(" = 0");
745 return 0;
746
747begin_cull:
748 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
749 _debug("### CULL CACHE ###");
750 cachefiles_state_changed(cache);
751 }
752
753 _leave(" = %d", ret);
754 return ret;
755}
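cachefiles_has_space() above implements a three-band policy: below the stop threshold allocation fails with -ENOBUFS and culling starts, between stop and cull allocation succeeds but culling starts, and once free space climbs back past the run threshold culling ceases. A minimal userspace sketch of that decision, with assumed thresholds:

/*
 * Userspace sketch of the three-band decision in
 * cachefiles_has_space(). The thresholds and sample availability
 * figures are invented for the illustration.
 */
#include <stdio.h>

enum verdict { OK, OK_BEGIN_CULL, ENOBUFS_BEGIN_CULL, OK_CEASE_CULL };

static enum verdict check_space(unsigned long long avail,
				unsigned long long stop,
				unsigned long long cull,
				unsigned long long run,
				int culling)
{
	if (avail < stop)
		return ENOBUFS_BEGIN_CULL;  /* refuse and cull hard */
	if (avail < cull)
		return OK_BEGIN_CULL;       /* allow but start culling */
	if (culling && avail >= run)
		return OK_CEASE_CULL;       /* enough headroom again */
	return OK;
}

int main(void)
{
	static const char *names[] = {
		"ok", "ok, begin cull", "ENOBUFS, begin cull", "ok, cease cull"
	};
	unsigned long long samples[] = { 50, 300, 900 };

	for (int i = 0; i < 3; i++)     /* stop=100 cull=500 run=700 */
		printf("avail=%llu -> %s\n", samples[i],
		       names[check_space(samples[i], 100, 500, 700, 1)]);
	return 0;
}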
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
1/* FS-Cache interface to CacheFiles
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/buffer_head.h>
14#include "internal.h"
15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */
21};
22
23static int cachefiles_attr_changed(struct fscache_object *_object);
24
25/*
26 * allocate an object record for a cookie lookup and prepare the lookup data
27 */
28static struct fscache_object *cachefiles_alloc_object(
29 struct fscache_cache *_cache,
30 struct fscache_cookie *cookie)
31{
32 struct cachefiles_lookup_data *lookup_data;
33 struct cachefiles_object *object;
34 struct cachefiles_cache *cache;
35 struct cachefiles_xattr *auxdata;
36 unsigned keylen, auxlen;
37 void *buffer;
38 char *key;
39
40 cache = container_of(_cache, struct cachefiles_cache, cache);
41
42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
45 if (!lookup_data)
46 goto nomem_lookup_data;
47
48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
50 if (!object)
51 goto nomem_object;
52
53 ASSERTCMP(object->backer, ==, NULL);
54
55 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
56 atomic_set(&object->usage, 1);
57
58 fscache_object_init(&object->fscache, cookie, &cache->cache);
59
60 object->type = cookie->def->type;
61
62 /* get hold of the raw key
63 * - stick the length on the front and leave space on the back for the
64 * encoder
65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
67 if (!buffer)
68 goto nomem_buffer;
69
70 keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
71 ASSERTCMP(keylen, <, 512);
72
73 *(uint16_t *)buffer = keylen;
74 ((char *)buffer)[keylen + 2] = 0;
75 ((char *)buffer)[keylen + 3] = 0;
76 ((char *)buffer)[keylen + 4] = 0;
77
78	/* turn the raw key into something we can work with as a filename */
79 key = cachefiles_cook_key(buffer, keylen + 2, object->type);
80 if (!key)
81 goto nomem_key;
82
83 /* get hold of the auxiliary data and prepend the object type */
84 auxdata = buffer;
85 auxlen = 0;
86 if (cookie->def->get_aux) {
87 auxlen = cookie->def->get_aux(cookie->netfs_data,
88 auxdata->data, 511);
89 ASSERTCMP(auxlen, <, 511);
90 }
91
92 auxdata->len = auxlen + 1;
93 auxdata->type = cookie->def->type;
94
95 lookup_data->auxdata = auxdata;
96 lookup_data->key = key;
97 object->lookup_data = lookup_data;
98
99 _leave(" = %p [%p]", &object->fscache, lookup_data);
100 return &object->fscache;
101
102nomem_key:
103 kfree(buffer);
104nomem_buffer:
105 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
106 kmem_cache_free(cachefiles_object_jar, object);
107 fscache_object_destroyed(&cache->cache);
108nomem_object:
109 kfree(lookup_data);
110nomem_lookup_data:
111 _leave(" = -ENOMEM");
112 return ERR_PTR(-ENOMEM);
113}
114
115/*
116 * attempt to look up the nominated node in this cache
117 */
118static void cachefiles_lookup_object(struct fscache_object *_object)
119{
120 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object;
122 struct cachefiles_cache *cache;
123 const struct cred *saved_cred;
124 int ret;
125
126 _enter("{OBJ%x}", _object->debug_id);
127
128 cache = container_of(_object->cache, struct cachefiles_cache, cache);
129 parent = container_of(_object->parent,
130 struct cachefiles_object, fscache);
131 object = container_of(_object, struct cachefiles_object, fscache);
132 lookup_data = object->lookup_data;
133
134 ASSERTCMP(lookup_data, !=, NULL);
135
136 /* look up the key, creating any missing bits */
137 cachefiles_begin_secure(cache, &saved_cred);
138 ret = cachefiles_walk_to_object(parent, object,
139 lookup_data->key,
140 lookup_data->auxdata);
141 cachefiles_end_secure(cache, saved_cred);
142
143 /* polish off by setting the attributes of non-index files */
144 if (ret == 0 &&
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache);
147
148 if (ret < 0) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
150 ret);
151 fscache_object_lookup_error(&object->fscache);
152 }
153
154 _leave(" [%d]", ret);
155}
156
157/*
158 * indication of lookup completion
159 */
160static void cachefiles_lookup_complete(struct fscache_object *_object)
161{
162 struct cachefiles_object *object;
163
164 object = container_of(_object, struct cachefiles_object, fscache);
165
166 _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
167
168 if (object->lookup_data) {
169 kfree(object->lookup_data->key);
170 kfree(object->lookup_data->auxdata);
171 kfree(object->lookup_data);
172 object->lookup_data = NULL;
173 }
174}
175
176/*
177 * increment the usage count on an inode object (may fail if unmounting)
178 */
179static
180struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
181{
182 struct cachefiles_object *object =
183 container_of(_object, struct cachefiles_object, fscache);
184
185 _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
186
187#ifdef CACHEFILES_DEBUG_SLAB
188 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
189#endif
190
191 atomic_inc(&object->usage);
192 return &object->fscache;
193}
194
195/*
196 * update the auxiliary data for an object on disk
197 */
198static void cachefiles_update_object(struct fscache_object *_object)
199{
200 struct cachefiles_object *object;
201 struct cachefiles_xattr *auxdata;
202 struct cachefiles_cache *cache;
203 struct fscache_cookie *cookie;
204 const struct cred *saved_cred;
205 unsigned auxlen;
206
207 _enter("{OBJ%x}", _object->debug_id);
208
209 object = container_of(_object, struct cachefiles_object, fscache);
210 cache = container_of(object->fscache.cache, struct cachefiles_cache,
211 cache);
212 cookie = object->fscache.cookie;
213
214 if (!cookie->def->get_aux) {
215 _leave(" [no aux]");
216 return;
217 }
218
219 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
220 if (!auxdata) {
221 _leave(" [nomem]");
222 return;
223 }
224
225 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
226 ASSERTCMP(auxlen, <, 511);
227
228 auxdata->len = auxlen + 1;
229 auxdata->type = cookie->def->type;
230
231 cachefiles_begin_secure(cache, &saved_cred);
232 cachefiles_update_object_xattr(object, auxdata);
233 cachefiles_end_secure(cache, saved_cred);
234 kfree(auxdata);
235 _leave("");
236}
237
238/*
239 * discard the resources pinned by an object and effect retirement if
240 * requested
241 */
242static void cachefiles_drop_object(struct fscache_object *_object)
243{
244 struct cachefiles_object *object;
245 struct cachefiles_cache *cache;
246 const struct cred *saved_cred;
247
248 ASSERT(_object);
249
250 object = container_of(_object, struct cachefiles_object, fscache);
251
252 _enter("{OBJ%x,%d}",
253 object->fscache.debug_id, atomic_read(&object->usage));
254
255 cache = container_of(object->fscache.cache,
256 struct cachefiles_cache, cache);
257
258#ifdef CACHEFILES_DEBUG_SLAB
259 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
260#endif
261
262 /* delete retired objects */
263 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
264 _object != cache->cache.fsdef
265 ) {
266 _debug("- retire object OBJ%x", object->fscache.debug_id);
267 cachefiles_begin_secure(cache, &saved_cred);
268 cachefiles_delete_object(cache, object);
269 cachefiles_end_secure(cache, saved_cred);
270 }
271
272 /* close the filesystem stuff attached to the object */
273 if (object->backer != object->dentry)
274 dput(object->backer);
275 object->backer = NULL;
276
277 /* note that the object is now inactive */
278 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
279 write_lock(&cache->active_lock);
280 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
281 &object->flags))
282 BUG();
283 rb_erase(&object->active_node, &cache->active_nodes);
284 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
285 write_unlock(&cache->active_lock);
286 }
287
288 dput(object->dentry);
289 object->dentry = NULL;
290
291 _leave("");
292}
293
294/*
295 * dispose of a reference to an object
296 */
297static void cachefiles_put_object(struct fscache_object *_object)
298{
299 struct cachefiles_object *object;
300 struct fscache_cache *cache;
301
302 ASSERT(_object);
303
304 object = container_of(_object, struct cachefiles_object, fscache);
305
306 _enter("{OBJ%x,%d}",
307 object->fscache.debug_id, atomic_read(&object->usage));
308
309#ifdef CACHEFILES_DEBUG_SLAB
310 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
311#endif
312
313 ASSERTIFCMP(object->fscache.parent,
314 object->fscache.parent->n_children, >, 0);
315
316 if (atomic_dec_and_test(&object->usage)) {
317 _debug("- kill object OBJ%x", object->fscache.debug_id);
318
319 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
320 ASSERTCMP(object->fscache.parent, ==, NULL);
321 ASSERTCMP(object->backer, ==, NULL);
322 ASSERTCMP(object->dentry, ==, NULL);
323 ASSERTCMP(object->fscache.n_ops, ==, 0);
324 ASSERTCMP(object->fscache.n_children, ==, 0);
325
326 if (object->lookup_data) {
327 kfree(object->lookup_data->key);
328 kfree(object->lookup_data->auxdata);
329 kfree(object->lookup_data);
330 object->lookup_data = NULL;
331 }
332
333 cache = object->fscache.cache;
334 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache);
336 }
337
338 _leave("");
339}
340
341/*
342 * sync a cache
343 */
344static void cachefiles_sync_cache(struct fscache_cache *_cache)
345{
346 struct cachefiles_cache *cache;
347 const struct cred *saved_cred;
348 int ret;
349
350 _enter("%p", _cache);
351
352 cache = container_of(_cache, struct cachefiles_cache, cache);
353
354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb);
358 cachefiles_end_secure(cache, saved_cred);
359
360 if (ret == -EIO)
361 cachefiles_io_error(cache,
362 "Attempt to sync backing fs superblock"
363 " returned error %d",
364 ret);
365}
366
367/*
368 * notification the attributes on an object have changed
369 * - called with reads/writes excluded by FS-Cache
370 */
371static int cachefiles_attr_changed(struct fscache_object *_object)
372{
373 struct cachefiles_object *object;
374 struct cachefiles_cache *cache;
375 const struct cred *saved_cred;
376 struct iattr newattrs;
377 uint64_t ni_size;
378 loff_t oi_size;
379 int ret;
380
381 _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
382
383 _enter("{OBJ%x},[%llu]",
384 _object->debug_id, (unsigned long long) ni_size);
385
386 object = container_of(_object, struct cachefiles_object, fscache);
387 cache = container_of(object->fscache.cache,
388 struct cachefiles_cache, cache);
389
390 if (ni_size == object->i_size)
391 return 0;
392
393 if (!object->backer)
394 return -ENOBUFS;
395
396 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
397
398 fscache_set_store_limit(&object->fscache, ni_size);
399
400 oi_size = i_size_read(object->backer->d_inode);
401 if (oi_size == ni_size)
402 return 0;
403
404 newattrs.ia_size = ni_size;
405 newattrs.ia_valid = ATTR_SIZE;
406
407 cachefiles_begin_secure(cache, &saved_cred);
408 mutex_lock(&object->backer->d_inode->i_mutex);
409 ret = notify_change(object->backer, &newattrs);
410 mutex_unlock(&object->backer->d_inode->i_mutex);
411 cachefiles_end_secure(cache, saved_cred);
412
413 if (ret == -EIO) {
414 fscache_set_store_limit(&object->fscache, 0);
415 cachefiles_io_error_obj(object, "Size set failed");
416 ret = -ENOBUFS;
417 }
418
419 _leave(" = %d", ret);
420 return ret;
421}
422
423/*
424 * dissociate a cache from all the pages it was backing
425 */
426static void cachefiles_dissociate_pages(struct fscache_cache *cache)
427{
428 _enter("");
429}
430
431const struct fscache_cache_ops cachefiles_cache_ops = {
432 .name = "cachefiles",
433 .alloc_object = cachefiles_alloc_object,
434 .lookup_object = cachefiles_lookup_object,
435 .lookup_complete = cachefiles_lookup_complete,
436 .grab_object = cachefiles_grab_object,
437 .update_object = cachefiles_update_object,
438 .drop_object = cachefiles_drop_object,
439 .put_object = cachefiles_put_object,
440 .sync_cache = cachefiles_sync_cache,
441 .attr_changed = cachefiles_attr_changed,
442 .read_or_alloc_page = cachefiles_read_or_alloc_page,
443 .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
444 .allocate_page = cachefiles_allocate_page,
445 .allocate_pages = cachefiles_allocate_pages,
446 .write_page = cachefiles_write_page,
447 .uncache_page = cachefiles_uncache_page,
448 .dissociate_pages = cachefiles_dissociate_pages,
449};
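The lookup-key buffer built in cachefiles_alloc_object() has a fixed layout: a 16-bit length word on the front, up to 512 bytes of raw key behind it, and three trailing NULs so the base64 encoder in cachefiles_cook_key() can safely read complete 3-byte groups past the end. A standalone sketch of that layout, using an invented sample key:

/*
 * Userspace sketch of the lookup-key buffer layout above. The sample
 * key string is an assumption; the sizes match the hunk.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

int main(void)
{
	const char sample[] = "afs:example.cell";      /* assumed raw key */
	uint16_t keylen = (uint16_t)strlen(sample);
	unsigned char *buffer = malloc((2 + 512) + 3); /* as in the hunk */
	uint16_t readback;

	if (!buffer)
		return 1;

	memcpy(buffer, &keylen, 2);                /* length word in front */
	memcpy(buffer + 2, sample, keylen);        /* raw key body */
	buffer[keylen + 2] = 0;                    /* encoder overrun pad */
	buffer[keylen + 3] = 0;
	buffer[keylen + 4] = 0;

	memcpy(&readback, buffer, 2);
	printf("len=%d first=%c pad bytes at %d..%d\n",
	       (int)readback, buffer[2], (int)keylen + 2, (int)keylen + 4);
	free(buffer);
	return 0;
}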
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..19218e1463d6
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
1/* General netfs cache on cache files internal defs
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fscache-cache.h>
13#include <linux/timer.h>
14#include <linux/wait.h>
15#include <linux/workqueue.h>
16#include <linux/security.h>
17
18struct cachefiles_cache;
19struct cachefiles_object;
20
21extern unsigned cachefiles_debug;
22#define CACHEFILES_DEBUG_KENTER 1
23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4
25
26/*
27 * node records
28 */
29struct cachefiles_object {
30 struct fscache_object fscache; /* fscache handle */
31 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
32 struct dentry *dentry; /* the file/dir representing this object */
33 struct dentry *backer; /* backing file */
34 loff_t i_size; /* object size */
35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */
40 spinlock_t work_lock;
41 struct rb_node active_node; /* link in active tree (dentry is key) */
42};
43
44extern struct kmem_cache *cachefiles_object_jar;
45
46/*
47 * Cache files cache definition
48 */
49struct cachefiles_cache {
50 struct fscache_cache cache; /* FS-Cache record */
51 struct vfsmount *mnt; /* mountpoint holding the cache */
52 struct dentry *graveyard; /* directory into which dead objects go */
53 struct file *cachefilesd; /* manager daemon handle */
54 const struct cred *cache_cred; /* security override for accessing cache */
55 struct mutex daemon_mutex; /* command serialisation mutex */
56 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
57 struct rb_root active_nodes; /* active nodes (can't be culled) */
58 rwlock_t active_lock; /* lock for active_nodes */
59 atomic_t gravecounter; /* graveyard uniquifier */
60 unsigned frun_percent; /* when to stop culling (% files) */
61 unsigned fcull_percent; /* when to start culling (% files) */
62 unsigned fstop_percent; /* when to stop allocating (% files) */
63 unsigned brun_percent; /* when to stop culling (% blocks) */
64 unsigned bcull_percent; /* when to start culling (% blocks) */
65 unsigned bstop_percent; /* when to stop allocating (% blocks) */
66 unsigned bsize; /* cache's block size */
67	unsigned bshift;		/* ilog2(PAGE_SIZE / bsize) */
68 uint64_t frun; /* when to stop culling */
69 uint64_t fcull; /* when to start culling */
70 uint64_t fstop; /* when to stop allocating */
71 sector_t brun; /* when to stop culling */
72 sector_t bcull; /* when to start culling */
73 sector_t bstop; /* when to stop allocating */
74 unsigned long flags;
75#define CACHEFILES_READY 0 /* T if cache prepared */
76#define CACHEFILES_DEAD 1 /* T if cache dead */
77#define CACHEFILES_CULLING 2 /* T if cull engaged */
78#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
79 char *rootdirname; /* name of cache root directory */
80 char *secctx; /* LSM security context */
81 char *tag; /* cache binding tag */
82};
83
84/*
85 * backing file read tracking
86 */
87struct cachefiles_one_read {
88 wait_queue_t monitor; /* link into monitored waitqueue */
89 struct page *back_page; /* backing file page we're waiting for */
90 struct page *netfs_page; /* netfs page we're going to fill */
91 struct fscache_retrieval *op; /* retrieval op covering this */
92 struct list_head op_link; /* link in op's todo list */
93};
94
95/*
96 * backing file write tracking
97 */
98struct cachefiles_one_write {
99 struct page *netfs_page; /* netfs page to copy */
100 struct cachefiles_object *object;
101 struct list_head obj_link; /* link in object's lists */
102 fscache_rw_complete_t end_io_func;
103 void *context;
104};
105
106/*
107 * auxiliary data xattr buffer
108 */
109struct cachefiles_xattr {
110 uint16_t len;
111 uint8_t type;
112 uint8_t data[];
113};
114
115/*
116 * note change of state for daemon
117 */
118static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
119{
120 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
121 wake_up_all(&cache->daemon_pollwq);
122}
123
124/*
125 * cf-bind.c
126 */
127extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
128extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
129
130/*
131 * cf-daemon.c
132 */
133extern const struct file_operations cachefiles_daemon_fops;
134
135extern int cachefiles_has_space(struct cachefiles_cache *cache,
136 unsigned fnr, unsigned bnr);
137
138/*
139 * cf-interface.c
140 */
141extern const struct fscache_cache_ops cachefiles_cache_ops;
142
143/*
144 * cf-key.c
145 */
146extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
147
148/*
149 * cf-namei.c
150 */
151extern int cachefiles_delete_object(struct cachefiles_cache *cache,
152 struct cachefiles_object *object);
153extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
154 struct cachefiles_object *object,
155 const char *key,
156 struct cachefiles_xattr *auxdata);
157extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
158 struct dentry *dir,
159 const char *name);
160
161extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
162 char *filename);
163
164extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
165 struct dentry *dir, char *filename);
166
167/*
168 * cf-proc.c
169 */
170#ifdef CONFIG_CACHEFILES_HISTOGRAM
171extern atomic_t cachefiles_lookup_histogram[HZ];
172extern atomic_t cachefiles_mkdir_histogram[HZ];
173extern atomic_t cachefiles_create_histogram[HZ];
174
175extern int __init cachefiles_proc_init(void);
176extern void cachefiles_proc_cleanup(void);
177static inline
178void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
179{
180 unsigned long jif = jiffies - start_jif;
181 if (jif >= HZ)
182 jif = HZ - 1;
183 atomic_inc(&histogram[jif]);
184}
185
186#else
187#define cachefiles_proc_init() (0)
188#define cachefiles_proc_cleanup() do {} while (0)
189#define cachefiles_hist(hist, start_jif) do {} while (0)
190#endif
191
192/*
193 * cf-rdwr.c
194 */
195extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
196 struct page *, gfp_t);
197extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
198 struct list_head *, unsigned *,
199 gfp_t);
200extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
201 gfp_t);
202extern int cachefiles_allocate_pages(struct fscache_retrieval *,
203 struct list_head *, unsigned *, gfp_t);
204extern int cachefiles_write_page(struct fscache_storage *, struct page *);
205extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
206
207/*
208 * cf-security.c
209 */
210extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
211extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
212 struct dentry *root,
213 const struct cred **_saved_cred);
214
215static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
216 const struct cred **_saved_cred)
217{
218 *_saved_cred = override_creds(cache->cache_cred);
219}
220
221static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
222 const struct cred *saved_cred)
223{
224 revert_creds(saved_cred);
225}
226
227/*
228 * cf-xattr.c
229 */
230extern int cachefiles_check_object_type(struct cachefiles_object *object);
231extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
232 struct cachefiles_xattr *auxdata);
233extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
234 struct cachefiles_xattr *auxdata);
235extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
236 struct cachefiles_xattr *auxdata);
237extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
238 struct dentry *dentry);
239
240
241/*
242 * error handling
243 */
244#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
245
246#define cachefiles_io_error(___cache, FMT, ...) \
247do { \
248 kerror("I/O Error: " FMT, ##__VA_ARGS__); \
249 fscache_io_error(&(___cache)->cache); \
250 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
251} while (0)
252
253#define cachefiles_io_error_obj(object, FMT, ...) \
254do { \
255 struct cachefiles_cache *___cache; \
256 \
257 ___cache = container_of((object)->fscache.cache, \
258 struct cachefiles_cache, cache); \
259 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
260} while (0)
261
262
263/*
264 * debug tracing
265 */
266#define dbgprintk(FMT, ...) \
267 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
268
269/* make sure we maintain the format strings, even when debugging is disabled */
270static inline void _dbprintk(const char *fmt, ...)
271 __attribute__((format(printf, 1, 2)));
272static inline void _dbprintk(const char *fmt, ...)
273{
274}
275
276#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
277#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
278#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
279
280
281#if defined(__KDEBUG)
282#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
283#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
284#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
285
286#elif defined(CONFIG_CACHEFILES_DEBUG)
287#define _enter(FMT, ...) \
288do { \
289 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
290 kenter(FMT, ##__VA_ARGS__); \
291} while (0)
292
293#define _leave(FMT, ...) \
294do { \
295 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
296 kleave(FMT, ##__VA_ARGS__); \
297} while (0)
298
299#define _debug(FMT, ...) \
300do { \
301 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
302 kdebug(FMT, ##__VA_ARGS__); \
303} while (0)
304
305#else
306#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
307#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
308#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
309#endif
310
311#if 1 /* defined(__KDEBUGALL) */
312
313#define ASSERT(X) \
314do { \
315 if (unlikely(!(X))) { \
316 printk(KERN_ERR "\n"); \
317 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
318 BUG(); \
319 } \
320} while (0)
321
322#define ASSERTCMP(X, OP, Y) \
323do { \
324 if (unlikely(!((X) OP (Y)))) { \
325 printk(KERN_ERR "\n"); \
326 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
327 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
328 (unsigned long)(X), (unsigned long)(Y)); \
329 BUG(); \
330 } \
331} while (0)
332
333#define ASSERTIF(C, X) \
334do { \
335 if (unlikely((C) && !(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTIFCMP(C, X, OP, Y) \
343do { \
344 if (unlikely((C) && !((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#else
354
355#define ASSERT(X) do {} while (0)
356#define ASSERTCMP(X, OP, Y) do {} while (0)
357#define ASSERTIF(C, X) do {} while (0)
358#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
359
360#endif
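The cachefiles_hist() helper above buckets an operation's elapsed jiffies into one of HZ slots, clamping anything of a second or longer into the final slot. A userspace sketch of the same bucketing, with plain ints standing in for atomic_t and an assumed tick rate:

/*
 * Userspace sketch of the histogram bucketing in cachefiles_hist().
 * HZ and the sample timestamps are assumptions for the demo.
 */
#include <stdio.h>

#define HZ 250                         /* assumed tick rate */

static int histogram[HZ];

static void hist(unsigned long start_jif, unsigned long now_jif)
{
	unsigned long jif = now_jif - start_jif;

	if (jif >= HZ)                 /* clamp slow ops into last bucket */
		jif = HZ - 1;
	histogram[jif]++;
}

int main(void)
{
	hist(100, 103);                /* 3-jiffy lookup */
	hist(100, 103);
	hist(100, 1000);               /* pathologically slow, clamped */

	printf("bucket 3 = %d, bucket %d = %d\n",
	       histogram[3], HZ - 1, histogram[HZ - 1]);
	return 0;
}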
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
1/* Key to pathname encoder
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include "internal.h"
14
15static const char cachefiles_charmap[64] =
16 "0123456789" /* 0 - 9 */
17 "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
18 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
19 "_-" /* 62 - 63 */
20 ;
21
22static const char cachefiles_filecharmap[256] = {
23 /* we skip space and tab and control chars */
24 [33 ... 46] = 1, /* '!' -> '.' */
25 /* we skip '/' as it's significant to pathwalk */
26 [48 ... 127] = 1, /* '0' -> '~' */
27};
28
29/*
30 * turn the raw key into something cooked
31 * - the raw key should include the length in the two bytes at the front
32 * - the key may be up to 514 bytes in length (including the length word)
33 * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
34 * cooked
35 * - need to cut the cooked key into 252 char lengths (189 raw bytes)
36 */
37char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
38{
39 unsigned char csum, ch;
40 unsigned int acc;
41 char *key;
42 int loop, len, max, seg, mark, print;
43
44 _enter(",%d", keylen);
45
46 BUG_ON(keylen < 2 || keylen > 514);
47
48 csum = raw[0] + raw[1];
49 print = 1;
50 for (loop = 2; loop < keylen; loop++) {
51 ch = raw[loop];
52 csum += ch;
53 print &= cachefiles_filecharmap[ch];
54 }
55
56 if (print) {
57 /* if the path is usable ASCII, then we render it directly */
58 max = keylen - 2;
59 max += 2; /* two base64'd length chars on the front */
60 max += 5; /* @checksum/M */
61 max += 3 * 2; /* maximum number of segment dividers (".../M")
62 * is ((514 + 251) / 252) = 3
63 */
64 max += 1; /* NUL on end */
65 } else {
66 /* calculate the maximum length of the cooked key */
67 keylen = (keylen + 2) / 3;
68
69 max = keylen * 4;
70 max += 5; /* @checksum/M */
71 max += 3 * 2; /* maximum number of segment dividers (".../M")
72 * is ((514 + 188) / 189) = 3
73 */
74 max += 1; /* NUL on end */
75 }
76
77 max += 1; /* 2nd NUL on end */
78
79 _debug("max: %d", max);
80
81 key = kmalloc(max, GFP_KERNEL);
82 if (!key)
83 return NULL;
84
85 len = 0;
86
87 /* build the cooked key */
88 sprintf(key, "@%02x%c+", (unsigned) csum, 0);
89 len = 5;
90 mark = len - 1;
91
92 if (print) {
93 acc = *(uint16_t *) raw;
94 raw += 2;
95
96 key[len + 1] = cachefiles_charmap[acc & 63];
97 acc >>= 6;
98 key[len] = cachefiles_charmap[acc & 63];
99 len += 2;
100
101 seg = 250;
102 for (loop = keylen; loop > 0; loop--) {
103 if (seg <= 0) {
104 key[len++] = '\0';
105 mark = len;
106 key[len++] = '+';
107 seg = 252;
108 }
109
110 key[len++] = *raw++;
111 ASSERT(len < max);
112 }
113
114 switch (type) {
115 case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
116 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
117 default: type = 'S'; break;
118 }
119 } else {
120 seg = 252;
121 for (loop = keylen; loop > 0; loop--) {
122 if (seg <= 0) {
123 key[len++] = '\0';
124 mark = len;
125 key[len++] = '+';
126 seg = 252;
127 }
128
129 acc = *raw++;
130 acc |= *raw++ << 8;
131 acc |= *raw++ << 16;
132
133 _debug("acc: %06x", acc);
134
135 key[len++] = cachefiles_charmap[acc & 63];
136 acc >>= 6;
137 key[len++] = cachefiles_charmap[acc & 63];
138 acc >>= 6;
139 key[len++] = cachefiles_charmap[acc & 63];
140 acc >>= 6;
141 key[len++] = cachefiles_charmap[acc & 63];
142
143 ASSERT(len < max);
144 }
145
146 switch (type) {
147 case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
148 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
149 default: type = 'T'; break;
150 }
151 }
152
153 key[mark] = type;
154 key[len++] = 0;
155 key[len] = 0;
156
157 _leave(" = %p %d", key, len);
158 return key;
159}
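
The non-printable branch above packs each 3 raw bytes little-endian into an accumulator and emits 4 characters from the 64-entry charmap. A minimal userspace sketch of that packing, with hypothetical names (charmap, cook) rather than code from the patch; it assumes the caller zero-pads the raw buffer so the final group's over-read is harmless:

#include <stdio.h>

static const char charmap[64] =
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"_-";

/* pack len raw bytes into 4 chars per 3-byte group; out needs
 * 4 * ((len + 2) / 3) + 1 bytes */
static void cook(const unsigned char *raw, int len, char *out)
{
	unsigned int acc;
	int i, groups = (len + 2) / 3;

	for (i = 0; i < groups; i++) {
		/* little-endian accumulation, as in the kernel loop; may
		 * read up to two padding bytes past len */
		acc  = raw[i * 3];
		acc |= raw[i * 3 + 1] << 8;
		acc |= raw[i * 3 + 2] << 16;
		*out++ = charmap[acc & 63]; acc >>= 6;
		*out++ = charmap[acc & 63]; acc >>= 6;
		*out++ = charmap[acc & 63]; acc >>= 6;
		*out++ = charmap[acc & 63];
	}
	*out = '\0';
}

int main(void)
{
	unsigned char raw[8] = "key";	/* zero padding absorbs over-read */
	char out[16];

	cook(raw, 3, out);
	printf("cooked: %s\n", out);
	return 0;
}
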
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
1/* Network filesystem caching backend to use cache files on a premounted
2 * filesystem
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/namei.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/sysctl.h>
24#include <linux/miscdevice.h>
25#include "internal.h"
26
27unsigned cachefiles_debug;
28module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(debug, "CacheFiles debugging mask");
30
31MODULE_DESCRIPTION("Mounted-filesystem based cache");
32MODULE_AUTHOR("Red Hat, Inc.");
33MODULE_LICENSE("GPL");
34
35struct kmem_cache *cachefiles_object_jar;
36
37static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops,
41};
42
43static void cachefiles_object_init_once(void *_object)
44{
45 struct cachefiles_object *object = _object;
46
47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock);
49}
50
51/*
52 * initialise the fs caching module
53 */
54static int __init cachefiles_init(void)
55{
56 int ret;
57
58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0)
60 goto error_dev;
61
62 /* create an object jar */
63 ret = -ENOMEM;
64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object),
67 0,
68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) {
71 printk(KERN_NOTICE
72 "CacheFiles: Failed to allocate an object jar\n");
73 goto error_object_jar;
74 }
75
76 ret = cachefiles_proc_init();
77 if (ret < 0)
78 goto error_proc;
79
80 printk(KERN_INFO "CacheFiles: Loaded\n");
81 return 0;
82
83error_proc:
84 kmem_cache_destroy(cachefiles_object_jar);
85error_object_jar:
86 misc_deregister(&cachefiles_dev);
87error_dev:
88 kerror("failed to register: %d", ret);
89 return ret;
90}
91
92fs_initcall(cachefiles_init);
93
94/*
95 * clean up on module removal
96 */
97static void __exit cachefiles_exit(void)
98{
99 printk(KERN_INFO "CacheFiles: Unloading\n");
100
101 cachefiles_proc_cleanup();
102 kmem_cache_destroy(cachefiles_object_jar);
103 misc_deregister(&cachefiles_dev);
104}
105
106module_exit(cachefiles_exit);
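
cachefiles_init() above uses the usual layered-unwind idiom: each facility that registers successfully gets a matching error label, and a later failure jumps past its own label so only the facilities already up are torn down, in reverse order. A stripped-down userspace sketch of the same shape, with hypothetical register_a()/register_b() standing in for misc_register(), the object jar and the proc entries:

#include <stdio.h>

/* hypothetical facilities standing in for the real registrations */
static int register_a(void)    { puts("A up");     return 0; }
static void unregister_a(void) { puts("A down"); }
static int register_b(void)    { puts("B failed"); return -1; }
static void unregister_b(void) { puts("B down"); }

static int example_init(void)
{
	int ret;

	ret = register_a();
	if (ret < 0)
		goto error_a;

	ret = register_b();
	if (ret < 0)
		goto error_b;

	return 0;

	/* unwind in strict reverse order of construction */
error_b:
	unregister_a();
error_a:
	return ret;
}

static void example_exit(void)
{
	unregister_b();
	unregister_a();
}

int main(void)
{
	if (example_init() == 0)
		example_exit();
	else
		puts("init failed; everything already unwound");
	return 0;
}
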
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
1/* CacheFiles path walking and related routines
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include <linux/mount.h>
20#include <linux/namei.h>
21#include <linux/security.h>
22#include "internal.h"
23
24static int cachefiles_wait_bit(void *flags)
25{
26 schedule();
27 return 0;
28}
29
30/*
31 * record the fact that an object is now active
32 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object)
35{
36 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL;
38 struct dentry *dentry;
39
40 _enter(",%p", object);
41
42try_again:
43 write_lock(&cache->active_lock);
44
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
46 BUG();
47
48 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node;
50 while (*_p) {
51 _parent = *_p;
52 xobject = rb_entry(_parent,
53 struct cachefiles_object, active_node);
54
55 ASSERT(xobject != object);
56
57 if (xobject->dentry > dentry)
58 _p = &(*_p)->rb_left;
59 else if (xobject->dentry < dentry)
60 _p = &(*_p)->rb_right;
61 else
62 goto wait_for_old_object;
63 }
64
65 rb_link_node(&object->active_node, _parent, _p);
66 rb_insert_color(&object->active_node, &cache->active_nodes);
67
68 write_unlock(&cache->active_lock);
69 _leave("");
70 return;
71
72 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */
74wait_for_old_object:
75 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
76 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n",
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG();
106 }
107 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock);
109
110 _debug(">>> wait");
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
113 _debug("<<< waited");
114
115 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again;
117}
118
119/*
120 * delete an object representation from the cache
121 * - file-backed objects are unlinked
122 * - directory-backed objects are stuffed into the graveyard for userspace to
123 * delete
124 * - unlocks the directory mutex
125 */
126static int cachefiles_bury_object(struct cachefiles_cache *cache,
127 struct dentry *dir,
128 struct dentry *rep)
129{
130 struct dentry *grave, *trap;
131 char nbuffer[8 + 8 + 1];
132 int ret;
133
134 _enter(",'%*.*s','%*.*s'",
135 dir->d_name.len, dir->d_name.len, dir->d_name.name,
136 rep->d_name.len, rep->d_name.len, rep->d_name.name);
137
138 /* non-directories can just be unlinked */
139 if (!S_ISDIR(rep->d_inode->i_mode)) {
140 _debug("unlink stale object");
141 ret = vfs_unlink(dir->d_inode, rep);
142
143 mutex_unlock(&dir->d_inode->i_mutex);
144
145 if (ret == -EIO)
146 cachefiles_io_error(cache, "Unlink failed");
147
148 _leave(" = %d", ret);
149 return ret;
150 }
151
152 /* directories have to be moved to the graveyard */
153 _debug("move stale object to graveyard");
154 mutex_unlock(&dir->d_inode->i_mutex);
155
156try_again:
157 /* first step is to make up a grave dentry in the graveyard */
158 sprintf(nbuffer, "%08x%08x",
159 (uint32_t) get_seconds(),
160 (uint32_t) atomic_inc_return(&cache->gravecounter));
161
162 /* do the multiway lock magic */
163 trap = lock_rename(cache->graveyard, dir);
164
165 /* do some checks before getting the grave dentry */
166 if (rep->d_parent != dir) {
167 /* the entry was probably culled when we dropped the parent dir
168 * lock */
169 unlock_rename(cache->graveyard, dir);
170 _leave(" = 0 [culled?]");
171 return 0;
172 }
173
174 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
175 unlock_rename(cache->graveyard, dir);
176 cachefiles_io_error(cache, "Graveyard no longer a directory");
177 return -EIO;
178 }
179
180 if (trap == rep) {
181 unlock_rename(cache->graveyard, dir);
182 cachefiles_io_error(cache, "May not make directory loop");
183 return -EIO;
184 }
185
186 if (d_mountpoint(rep)) {
187 unlock_rename(cache->graveyard, dir);
188 cachefiles_io_error(cache, "Mountpoint in cache");
189 return -EIO;
190 }
191
192 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
193 if (IS_ERR(grave)) {
194 unlock_rename(cache->graveyard, dir);
195
196 if (PTR_ERR(grave) == -ENOMEM) {
197 _leave(" = -ENOMEM");
198 return -ENOMEM;
199 }
200
201 cachefiles_io_error(cache, "Lookup error %ld",
202 PTR_ERR(grave));
203 return -EIO;
204 }
205
206 if (grave->d_inode) {
207 unlock_rename(cache->graveyard, dir);
208 dput(grave);
209 grave = NULL;
210 cond_resched();
211 goto try_again;
212 }
213
214 if (d_mountpoint(grave)) {
215 unlock_rename(cache->graveyard, dir);
216 dput(grave);
217 cachefiles_io_error(cache, "Mountpoint in graveyard");
218 return -EIO;
219 }
220
221 /* target should not be an ancestor of source */
222 if (trap == grave) {
223 unlock_rename(cache->graveyard, dir);
224 dput(grave);
225 cachefiles_io_error(cache, "May not make directory loop");
226 return -EIO;
227 }
228
229 /* attempt the rename */
230 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
231 if (ret != 0 && ret != -ENOMEM)
232 cachefiles_io_error(cache, "Rename failed with error %d", ret);
233
234 unlock_rename(cache->graveyard, dir);
235 dput(grave);
236 _leave(" = 0");
237 return 0;
238}
239
240/*
241 * delete an object representation from the cache
242 */
243int cachefiles_delete_object(struct cachefiles_cache *cache,
244 struct cachefiles_object *object)
245{
246 struct dentry *dir;
247 int ret;
248
249 _enter(",{%p}", object->dentry);
250
251 ASSERT(object->dentry);
252 ASSERT(object->dentry->d_inode);
253 ASSERT(object->dentry->d_parent);
254
255 dir = dget_parent(object->dentry);
256
257 mutex_lock(&dir->d_inode->i_mutex);
258 ret = cachefiles_bury_object(cache, dir, object->dentry);
259
260 dput(dir);
261 _leave(" = %d", ret);
262 return ret;
263}
264
265/*
266 * walk from the parent object to the child object through the backing
267 * filesystem, creating directories as we go
268 */
269int cachefiles_walk_to_object(struct cachefiles_object *parent,
270 struct cachefiles_object *object,
271 const char *key,
272 struct cachefiles_xattr *auxdata)
273{
274 struct cachefiles_cache *cache;
275 struct dentry *dir, *next = NULL;
276 unsigned long start;
277 const char *name;
278 int ret, nlen;
279
280 _enter("{%p},,%s,", parent->dentry, key);
281
282 cache = container_of(parent->fscache.cache,
283 struct cachefiles_cache, cache);
284
285 ASSERT(parent->dentry);
286 ASSERT(parent->dentry->d_inode);
287
288 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
289 // TODO: convert file to dir
290		_leave("looking up in non-directory");
291 return -ENOBUFS;
292 }
293
294 dir = dget(parent->dentry);
295
296advance:
297 /* attempt to transit the first directory component */
298 name = key;
299 nlen = strlen(key);
300
301 /* key ends in a double NUL */
302 key = key + nlen + 1;
303 if (!*key)
304 key = NULL;
305
306lookup_again:
307 /* search the current directory for the element name */
308 _debug("lookup '%s'", name);
309
310 mutex_lock(&dir->d_inode->i_mutex);
311
312 start = jiffies;
313 next = lookup_one_len(name, dir, nlen);
314 cachefiles_hist(cachefiles_lookup_histogram, start);
315 if (IS_ERR(next))
316 goto lookup_error;
317
318 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
319
320 if (!key)
321 object->new = !next->d_inode;
322
323 /* if this element of the path doesn't exist, then the lookup phase
324 * failed, and we can release any readers in the certain knowledge that
325 * there's nothing for them to actually read */
326 if (!next->d_inode)
327 fscache_object_lookup_negative(&object->fscache);
328
329 /* we need to create the object if it's negative */
330 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
331 /* index objects and intervening tree levels must be subdirs */
332 if (!next->d_inode) {
333 ret = cachefiles_has_space(cache, 1, 0);
334 if (ret < 0)
335 goto create_error;
336
337 start = jiffies;
338 ret = vfs_mkdir(dir->d_inode, next, 0);
339 cachefiles_hist(cachefiles_mkdir_histogram, start);
340 if (ret < 0)
341 goto create_error;
342
343 ASSERT(next->d_inode);
344
345 _debug("mkdir -> %p{%p{ino=%lu}}",
346 next, next->d_inode, next->d_inode->i_ino);
347
348 } else if (!S_ISDIR(next->d_inode->i_mode)) {
349 kerror("inode %lu is not a directory",
350 next->d_inode->i_ino);
351 ret = -ENOBUFS;
352 goto error;
353 }
354
355 } else {
356 /* non-index objects start out life as files */
357 if (!next->d_inode) {
358 ret = cachefiles_has_space(cache, 1, 0);
359 if (ret < 0)
360 goto create_error;
361
362 start = jiffies;
363 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
364 cachefiles_hist(cachefiles_create_histogram, start);
365 if (ret < 0)
366 goto create_error;
367
368 ASSERT(next->d_inode);
369
370 _debug("create -> %p{%p{ino=%lu}}",
371 next, next->d_inode, next->d_inode->i_ino);
372
373 } else if (!S_ISDIR(next->d_inode->i_mode) &&
374 !S_ISREG(next->d_inode->i_mode)
375 ) {
376 kerror("inode %lu is not a file or directory",
377 next->d_inode->i_ino);
378 ret = -ENOBUFS;
379 goto error;
380 }
381 }
382
383 /* process the next component */
384 if (key) {
385 _debug("advance");
386 mutex_unlock(&dir->d_inode->i_mutex);
387 dput(dir);
388 dir = next;
389 next = NULL;
390 goto advance;
391 }
392
393 /* we've found the object we were looking for */
394 object->dentry = next;
395
396 /* if we've found that the terminal object exists, then we need to
397 * check its attributes and delete it if it's out of date */
398 if (!object->new) {
399 _debug("validate '%*.*s'",
400 next->d_name.len, next->d_name.len, next->d_name.name);
401
402 ret = cachefiles_check_object_xattr(object, auxdata);
403 if (ret == -ESTALE) {
404 /* delete the object (the deleter drops the directory
405 * mutex) */
406 object->dentry = NULL;
407
408 ret = cachefiles_bury_object(cache, dir, next);
409 dput(next);
410 next = NULL;
411
412 if (ret < 0)
413 goto delete_error;
414
415 _debug("redo lookup");
416 goto lookup_again;
417 }
418 }
419
420 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object);
422
423 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir);
425 dir = NULL;
426
427 _debug("=== OBTAINED_OBJECT ===");
428
429 if (object->new) {
430 /* attach data to a newly constructed terminal object */
431 ret = cachefiles_set_object_xattr(object, auxdata);
432 if (ret < 0)
433 goto check_error;
434 } else {
435 /* always update the atime on an object we've just looked up
436 * (this is used to keep track of culling, and atimes are only
437 * updated by read, write and readdir but not lookup or
438 * open) */
439 touch_atime(cache->mnt, next);
440 }
441
442 /* open a file interface onto a data file */
443 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
444 if (S_ISREG(object->dentry->d_inode->i_mode)) {
445 const struct address_space_operations *aops;
446
447 ret = -EPERM;
448 aops = object->dentry->d_inode->i_mapping->a_ops;
449 if (!aops->bmap)
450 goto check_error;
451
452 object->backer = object->dentry;
453 } else {
454 BUG(); // TODO: open file in data-class subdir
455 }
456 }
457
458 object->new = 0;
459 fscache_obtained_object(&object->fscache);
460
461 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
462 return 0;
463
464create_error:
465 _debug("create error %d", ret);
466 if (ret == -EIO)
467 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error;
469
470check_error:
471 _debug("check error %d", ret);
472 write_lock(&cache->active_lock);
473 rb_erase(&object->active_node, &cache->active_nodes);
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock);
477
478 dput(object->dentry);
479 object->dentry = NULL;
480 goto error_out;
481
482delete_error:
483 _debug("delete error %d", ret);
484 goto error_out2;
485
486lookup_error:
487 _debug("lookup error %ld", PTR_ERR(next));
488 ret = PTR_ERR(next);
489 if (ret == -EIO)
490 cachefiles_io_error(cache, "Lookup failed");
491 next = NULL;
492error:
493 mutex_unlock(&dir->d_inode->i_mutex);
494 dput(next);
495error_out2:
496 dput(dir);
497error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret);
502 return ret;
503}
504
505/*
506 * get a subdirectory
507 */
508struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
509 struct dentry *dir,
510 const char *dirname)
511{
512 struct dentry *subdir;
513 unsigned long start;
514 int ret;
515
516 _enter(",,%s", dirname);
517
518 /* search the current directory for the element name */
519 mutex_lock(&dir->d_inode->i_mutex);
520
521 start = jiffies;
522 subdir = lookup_one_len(dirname, dir, strlen(dirname));
523 cachefiles_hist(cachefiles_lookup_histogram, start);
524 if (IS_ERR(subdir)) {
525 if (PTR_ERR(subdir) == -ENOMEM)
526 goto nomem_d_alloc;
527 goto lookup_error;
528 }
529
530 _debug("subdir -> %p %s",
531 subdir, subdir->d_inode ? "positive" : "negative");
532
533 /* we need to create the subdir if it doesn't exist yet */
534 if (!subdir->d_inode) {
535 ret = cachefiles_has_space(cache, 1, 0);
536 if (ret < 0)
537 goto mkdir_error;
538
539 _debug("attempt mkdir");
540
541 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
542 if (ret < 0)
543 goto mkdir_error;
544
545 ASSERT(subdir->d_inode);
546
547 _debug("mkdir -> %p{%p{ino=%lu}}",
548 subdir,
549 subdir->d_inode,
550 subdir->d_inode->i_ino);
551 }
552
553 mutex_unlock(&dir->d_inode->i_mutex);
554
555 /* we need to make sure the subdir is a directory */
556 ASSERT(subdir->d_inode);
557
558 if (!S_ISDIR(subdir->d_inode->i_mode)) {
559 kerror("%s is not a directory", dirname);
560 ret = -EIO;
561 goto check_error;
562 }
563
564 ret = -EPERM;
565 if (!subdir->d_inode->i_op ||
566 !subdir->d_inode->i_op->setxattr ||
567 !subdir->d_inode->i_op->getxattr ||
568 !subdir->d_inode->i_op->lookup ||
569 !subdir->d_inode->i_op->mkdir ||
570 !subdir->d_inode->i_op->create ||
571 !subdir->d_inode->i_op->rename ||
572 !subdir->d_inode->i_op->rmdir ||
573 !subdir->d_inode->i_op->unlink)
574 goto check_error;
575
576 _leave(" = [%lu]", subdir->d_inode->i_ino);
577 return subdir;
578
579check_error:
580 dput(subdir);
581 _leave(" = %d [check]", ret);
582 return ERR_PTR(ret);
583
584mkdir_error:
585 mutex_unlock(&dir->d_inode->i_mutex);
586 dput(subdir);
587 kerror("mkdir %s failed with error %d", dirname, ret);
588 return ERR_PTR(ret);
589
590lookup_error:
591 mutex_unlock(&dir->d_inode->i_mutex);
592 ret = PTR_ERR(subdir);
593 kerror("Lookup %s failed with error %d", dirname, ret);
594 return ERR_PTR(ret);
595
596nomem_d_alloc:
597 mutex_unlock(&dir->d_inode->i_mutex);
598 _leave(" = -ENOMEM");
599 return ERR_PTR(-ENOMEM);
600}
601
602/*
603 * find out if an object is in use or not
604 * - if it finds the object and it's not in use:
605 * - returns a pointer to the object and a reference on it
606 * - returns with the directory locked
607 */
608static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
609 struct dentry *dir,
610 char *filename)
611{
612 struct cachefiles_object *object;
613 struct rb_node *_n;
614 struct dentry *victim;
615 unsigned long start;
616 int ret;
617
618 //_enter(",%*.*s/,%s",
619 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
620
621 /* look up the victim */
622 mutex_lock_nested(&dir->d_inode->i_mutex, 1);
623
624 start = jiffies;
625 victim = lookup_one_len(filename, dir, strlen(filename));
626 cachefiles_hist(cachefiles_lookup_histogram, start);
627 if (IS_ERR(victim))
628 goto lookup_error;
629
630 //_debug("victim -> %p %s",
631 // victim, victim->d_inode ? "positive" : "negative");
632
633 /* if the object is no longer there then we probably retired the object
634 * at the netfs's request whilst the cull was in progress
635 */
636 if (!victim->d_inode) {
637 mutex_unlock(&dir->d_inode->i_mutex);
638 dput(victim);
639 _leave(" = -ENOENT [absent]");
640 return ERR_PTR(-ENOENT);
641 }
642
643 /* check to see if we're using this object */
644 read_lock(&cache->active_lock);
645
646 _n = cache->active_nodes.rb_node;
647
648 while (_n) {
649 object = rb_entry(_n, struct cachefiles_object, active_node);
650
651 if (object->dentry > victim)
652 _n = _n->rb_left;
653 else if (object->dentry < victim)
654 _n = _n->rb_right;
655 else
656 goto object_in_use;
657 }
658
659 read_unlock(&cache->active_lock);
660
661 //_leave(" = %p", victim);
662 return victim;
663
664object_in_use:
665 read_unlock(&cache->active_lock);
666 mutex_unlock(&dir->d_inode->i_mutex);
667 dput(victim);
668 //_leave(" = -EBUSY [in use]");
669 return ERR_PTR(-EBUSY);
670
671lookup_error:
672 mutex_unlock(&dir->d_inode->i_mutex);
673 ret = PTR_ERR(victim);
674 if (ret == -ENOENT) {
675 /* file or dir now absent - probably retired by netfs */
676 _leave(" = -ESTALE [absent]");
677 return ERR_PTR(-ESTALE);
678 }
679
680 if (ret == -EIO) {
681 cachefiles_io_error(cache, "Lookup failed");
682 } else if (ret != -ENOMEM) {
683 kerror("Internal error: %d", ret);
684 ret = -EIO;
685 }
686
687 _leave(" = %d", ret);
688 return ERR_PTR(ret);
689}
690
691/*
692 * cull an object if it's not in use
693 * - called only by cache manager daemon
694 */
695int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
696 char *filename)
697{
698 struct dentry *victim;
699 int ret;
700
701 _enter(",%*.*s/,%s",
702 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
703
704 victim = cachefiles_check_active(cache, dir, filename);
705 if (IS_ERR(victim))
706 return PTR_ERR(victim);
707
708 _debug("victim -> %p %s",
709 victim, victim->d_inode ? "positive" : "negative");
710
711 /* okay... the victim is not being used so we can cull it
712 * - start by marking it as stale
713 */
714 _debug("victim is cullable");
715
716 ret = cachefiles_remove_object_xattr(cache, victim);
717 if (ret < 0)
718 goto error_unlock;
719
720 /* actually remove the victim (drops the dir mutex) */
721 _debug("bury");
722
723 ret = cachefiles_bury_object(cache, dir, victim);
724 if (ret < 0)
725 goto error;
726
727 dput(victim);
728 _leave(" = 0");
729 return 0;
730
731error_unlock:
732 mutex_unlock(&dir->d_inode->i_mutex);
733error:
734 dput(victim);
735 if (ret == -ENOENT) {
736 /* file or dir now absent - probably retired by netfs */
737 _leave(" = -ESTALE [absent]");
738 return -ESTALE;
739 }
740
741 if (ret != -ENOMEM) {
742 kerror("Internal error: %d", ret);
743 ret = -EIO;
744 }
745
746 _leave(" = %d", ret);
747 return ret;
748}
749
750/*
751 * find out if an object is in use or not
752 * - called only by cache manager daemon
753 * - returns -EBUSY or 0 to indicate whether an object is in use or not
754 */
755int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
756 char *filename)
757{
758 struct dentry *victim;
759
760 //_enter(",%*.*s/,%s",
761 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
762
763 victim = cachefiles_check_active(cache, dir, filename);
764 if (IS_ERR(victim))
765 return PTR_ERR(victim);
766
767 mutex_unlock(&dir->d_inode->i_mutex);
768 dput(victim);
769 //_leave(" = 0");
770 return 0;
771}
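
The active-object index used by cachefiles_mark_object_active() and cachefiles_check_active() above is an rbtree keyed purely on the dentry pointer value: both walk the tree comparing raw pointers and treat an exact match as a collision (insert) or a hit (cull lookup). A userspace sketch of that descent over a plain unbalanced binary tree - hypothetical types; the kernel additionally rebalances with rb_link_node()/rb_insert_color():

#include <stdio.h>

struct node {
	const void *key;		/* plays the role of object->dentry */
	struct node *left, *right;
};

/* mirror of the lookup loop: descend on raw pointer comparison */
static struct node *tree_find(struct node *n, const void *key)
{
	while (n) {
		if (n->key > key)
			n = n->left;
		else if (n->key < key)
			n = n->right;
		else
			return n;	/* "object in use" in the cull path */
	}
	return NULL;
}

/* insert; return the colliding node if the key is already present */
static struct node *tree_insert(struct node **pp, struct node *new)
{
	while (*pp) {
		if ((*pp)->key > new->key)
			pp = &(*pp)->left;
		else if ((*pp)->key < new->key)
			pp = &(*pp)->right;
		else
			return *pp;	/* wait_for_old_object case */
	}
	new->left = new->right = NULL;
	*pp = new;
	return NULL;
}

int main(void)
{
	struct node a = { .key = (void *)0x1000 };
	struct node b = { .key = (void *)0x2000 };
	struct node *root = NULL;

	tree_insert(&root, &a);
	tree_insert(&root, &b);
	printf("found: %p\n", (void *)tree_find(root, (void *)0x2000));
	return 0;
}
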
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
1/* CacheFiles statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "internal.h"
16
17atomic_t cachefiles_lookup_histogram[HZ];
18atomic_t cachefiles_mkdir_histogram[HZ];
19atomic_t cachefiles_create_histogram[HZ];
20
21/*
22 * display the latency histogram
23 */
24static int cachefiles_histogram_show(struct seq_file *m, void *v)
25{
26 unsigned long index;
27 unsigned x, y, z, t;
28
29 switch ((unsigned long) v) {
30 case 1:
31 seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
32 return 0;
33 case 2:
34 seq_puts(m, "===== ===== ========= ========= =========\n");
35 return 0;
36 default:
37 index = (unsigned long) v - 3;
38 x = atomic_read(&cachefiles_lookup_histogram[index]);
39 y = atomic_read(&cachefiles_mkdir_histogram[index]);
40 z = atomic_read(&cachefiles_create_histogram[index]);
41 if (x == 0 && y == 0 && z == 0)
42 return 0;
43
44 t = (index * 1000) / HZ;
45
46 seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
47 return 0;
48 }
49}
50
51/*
52 * set up the iterator to start reading from the first line
53 */
54static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
55{
56	if ((unsigned long long)*_pos > HZ + 2)
57 return NULL;
58 if (*_pos == 0)
59 *_pos = 1;
60 return (void *)(unsigned long) *_pos;
61}
62
63/*
64 * move to the next line
65 */
66static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
67{
68 (*pos)++;
69 return (unsigned long long)*pos > HZ + 2 ?
70 NULL : (void *)(unsigned long) *pos;
71}
72
73/*
74 * clean up after reading
75 */
76static void cachefiles_histogram_stop(struct seq_file *m, void *v)
77{
78}
79
80static const struct seq_operations cachefiles_histogram_ops = {
81 .start = cachefiles_histogram_start,
82 .stop = cachefiles_histogram_stop,
83 .next = cachefiles_histogram_next,
84 .show = cachefiles_histogram_show,
85};
86
87/*
88 * open "/proc/fs/cachefiles/XXX" which provides statistics summaries
89 */
90static int cachefiles_histogram_open(struct inode *inode, struct file *file)
91{
92 return seq_open(file, &cachefiles_histogram_ops);
93}
94
95static const struct file_operations cachefiles_histogram_fops = {
96 .owner = THIS_MODULE,
97 .open = cachefiles_histogram_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = seq_release,
101};
102
103/*
104 * initialise the /proc/fs/cachefiles/ directory
105 */
106int __init cachefiles_proc_init(void)
107{
108 _enter("");
109
110 if (!proc_mkdir("fs/cachefiles", NULL))
111 goto error_dir;
112
113 if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
114 &cachefiles_histogram_fops))
115 goto error_histogram;
116
117 _leave(" = 0");
118 return 0;
119
120error_histogram:
121 remove_proc_entry("fs/cachefiles", NULL);
122error_dir:
123 _leave(" = -ENOMEM");
124 return -ENOMEM;
125}
126
127/*
128 * clean up the /proc/fs/cachefiles/ directory
129 */
130void cachefiles_proc_cleanup(void)
131{
132 remove_proc_entry("fs/cachefiles/histogram", NULL);
133 remove_proc_entry("fs/cachefiles", NULL);
134}
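
The iterator above folds the two header rows into the seq_file position space: positions 1 and 2 emit the column headings, and any position N >= 3 maps to histogram bucket N - 3, with all-zero buckets skipped. A userspace sketch of the same position mapping (hypothetical names; NBUCKETS stands in for HZ):

#include <stdio.h>

#define NBUCKETS 10			/* stands in for HZ */

static const unsigned buckets[NBUCKETS] = { 0, 4, 9, 1, 0, 0, 2, 0, 0, 1 };

static void show(unsigned long pos)
{
	if (pos == 1) {
		puts("JIFS COUNT");
		return;
	}
	if (pos == 2) {
		puts("==== =====");
		return;
	}
	/* data rows: position 3 maps to bucket 0, and so on; all-zero
	 * buckets produce no output, as in the kernel version */
	if (pos - 3 < NBUCKETS && buckets[pos - 3])
		printf("%4lu %5u\n", pos - 3, buckets[pos - 3]);
}

int main(void)
{
	unsigned long pos;

	/* iterate exactly as seq_read would: 1 .. NBUCKETS + 2 */
	for (pos = 1; pos <= NBUCKETS + 2; pos++)
		show(pos);
	return 0;
}
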
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
1/* Storage object read/write
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/file.h>
14#include "internal.h"
15
16/*
17 * detect wake-up events generated by the unlocking of pages in which we're
18 * interested
19 * - we use this to detect read completion of backing pages
20 * - the caller holds the waitqueue lock
21 */
22static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
23 int sync, void *_key)
24{
25 struct cachefiles_one_read *monitor =
26 container_of(wait, struct cachefiles_one_read, monitor);
27 struct cachefiles_object *object;
28 struct wait_bit_key *key = _key;
29 struct page *page = wait->private;
30
31 ASSERT(key);
32
33 _enter("{%lu},%u,%d,{%p,%u}",
34 monitor->netfs_page->index, mode, sync,
35 key->flags, key->bit_nr);
36
37 if (key->flags != &page->flags ||
38 key->bit_nr != PG_locked)
39 return 0;
40
41 _debug("--- monitor %p %lx ---", page, page->flags);
42
43 if (!PageUptodate(page) && !PageError(page))
44 dump_stack();
45
46 /* remove from the waitqueue */
47 list_del(&wait->task_list);
48
49 /* move onto the action list and queue for FS-Cache thread pool */
50 ASSERT(monitor->op);
51
52 object = container_of(monitor->op->op.object,
53 struct cachefiles_object, fscache);
54
55 spin_lock(&object->work_lock);
56 list_add_tail(&monitor->op_link, &monitor->op->to_do);
57 spin_unlock(&object->work_lock);
58
59 fscache_enqueue_retrieval(monitor->op);
60 return 0;
61}
62
63/*
64 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool
66 */
67static void cachefiles_read_copier(struct fscache_operation *_op)
68{
69 struct cachefiles_one_read *monitor;
70 struct cachefiles_object *object;
71 struct fscache_retrieval *op;
72 struct pagevec pagevec;
73 int error, max;
74
75 op = container_of(_op, struct fscache_retrieval, op);
76 object = container_of(op->op.object,
77 struct cachefiles_object, fscache);
78
79 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
80
81 pagevec_init(&pagevec, 0);
82
83 max = 8;
84 spin_lock_irq(&object->work_lock);
85
86 while (!list_empty(&op->to_do)) {
87 monitor = list_entry(op->to_do.next,
88 struct cachefiles_one_read, op_link);
89 list_del(&monitor->op_link);
90
91 spin_unlock_irq(&object->work_lock);
92
93 _debug("- copy {%lu}", monitor->back_page->index);
94
95 error = -EIO;
96 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page);
98
99 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0;
102 }
103
104 if (error)
105 cachefiles_io_error_obj(
106 object,
107 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags);
109
110 page_cache_release(monitor->back_page);
111
112 fscache_end_io(op, monitor->netfs_page, error);
113 page_cache_release(monitor->netfs_page);
114 fscache_put_retrieval(op);
115 kfree(monitor);
116
117 /* let the thread pool have some air occasionally */
118 max--;
119 if (max < 0 || need_resched()) {
120 if (!list_empty(&op->to_do))
121 fscache_enqueue_retrieval(op);
122 _leave(" [maxed out]");
123 return;
124 }
125
126 spin_lock_irq(&object->work_lock);
127 }
128
129 spin_unlock_irq(&object->work_lock);
130 _leave("");
131}
132
133/*
134 * read the backing-file page corresponding to the given netfs page
135 * - an uncertain page is simply discarded, to be tried again another time
136 */
137static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
138 struct fscache_retrieval *op,
139 struct page *netpage,
140 struct pagevec *pagevec)
141{
142 struct cachefiles_one_read *monitor;
143 struct address_space *bmapping;
144 struct page *newpage, *backpage;
145 int ret;
146
147 _enter("");
148
149 pagevec_reinit(pagevec);
150
151 _debug("read back %p{%lu,%d}",
152 netpage, netpage->index, page_count(netpage));
153
154 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
155 if (!monitor)
156 goto nomem;
157
158 monitor->netfs_page = netpage;
159 monitor->op = fscache_get_retrieval(op);
160
161 init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
162
163 /* attempt to get hold of the backing page */
164 bmapping = object->backer->d_inode->i_mapping;
165 newpage = NULL;
166
167 for (;;) {
168 backpage = find_get_page(bmapping, netpage->index);
169 if (backpage)
170 goto backing_page_already_present;
171
172 if (!newpage) {
173 newpage = page_cache_alloc_cold(bmapping);
174 if (!newpage)
175 goto nomem_monitor;
176 }
177
178 ret = add_to_page_cache(newpage, bmapping,
179 netpage->index, GFP_KERNEL);
180 if (ret == 0)
181 goto installed_new_backing_page;
182 if (ret != -EEXIST)
183 goto nomem_page;
184 }
185
186 /* we've installed a new backing page, so now we need to add it
187 * to the LRU list and start it reading */
188installed_new_backing_page:
189 _debug("- new %p", newpage);
190
191 backpage = newpage;
192 newpage = NULL;
193
194 page_cache_get(backpage);
195 pagevec_add(pagevec, backpage);
196 __pagevec_lru_add_file(pagevec);
197
198read_backing_page:
199 ret = bmapping->a_ops->readpage(NULL, backpage);
200 if (ret < 0)
201 goto read_error;
202
203 /* set the monitor to transfer the data across */
204monitor_backing_page:
205 _debug("- monitor add");
206
207 /* install the monitor */
208 page_cache_get(monitor->netfs_page);
209 page_cache_get(backpage);
210 monitor->back_page = backpage;
211 monitor->monitor.private = backpage;
212 add_page_wait_queue(backpage, &monitor->monitor);
213 monitor = NULL;
214
215 /* but the page may have been read before the monitor was installed, so
216 * the monitor may miss the event - so we have to ensure that we do get
217 * one in such a case */
218 if (trylock_page(backpage)) {
219 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
220 unlock_page(backpage);
221 }
222 goto success;
223
224 /* if the backing page is already present, it can be in one of
225 * three states: read in progress, read failed or read okay */
226backing_page_already_present:
227 _debug("- present");
228
229 if (newpage) {
230 page_cache_release(newpage);
231 newpage = NULL;
232 }
233
234 if (PageError(backpage))
235 goto io_error;
236
237 if (PageUptodate(backpage))
238 goto backing_page_already_uptodate;
239
240 if (!trylock_page(backpage))
241 goto monitor_backing_page;
242 _debug("read %p {%lx}", backpage, backpage->flags);
243 goto read_backing_page;
244
245 /* the backing page is already up to date, attach the netfs
246 * page to the pagecache and LRU and copy the data across */
247backing_page_already_uptodate:
248 _debug("- uptodate");
249
250 pagevec_add(pagevec, netpage);
251 fscache_mark_pages_cached(op, pagevec);
252
253 copy_highpage(netpage, backpage);
254 fscache_end_io(op, netpage, 0);
255
256success:
257 _debug("success");
258 ret = 0;
259
260out:
261 if (backpage)
262 page_cache_release(backpage);
263 if (monitor) {
264 fscache_put_retrieval(monitor->op);
265 kfree(monitor);
266 }
267 _leave(" = %d", ret);
268 return ret;
269
270read_error:
271 _debug("read error %d", ret);
272 if (ret == -ENOMEM)
273 goto out;
274io_error:
275 cachefiles_io_error_obj(object, "Page read error on backing file");
276 ret = -ENOBUFS;
277 goto out;
278
279nomem_page:
280 page_cache_release(newpage);
281nomem_monitor:
282 fscache_put_retrieval(monitor->op);
283 kfree(monitor);
284nomem:
285 _leave(" = -ENOMEM");
286 return -ENOMEM;
287}
288
289/*
290 * read a page from the cache or allocate a block in which to store it
291 * - cache withdrawal is prevented by the caller
292 * - returns -EINTR if interrupted
293 * - returns -ENOMEM if we ran out of memory
294 * - returns -ENOBUFS if no buffers can be made available
295 * - returns -ENOBUFS if page is beyond EOF
296 * - if the page is backed by a block in the cache:
297 * - a read will be started which will call the callback on completion
298 * - 0 will be returned
299 * - else if the page is unbacked:
300 * - the metadata will be retained
301 * - -ENODATA will be returned
302 */
303int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
304 struct page *page,
305 gfp_t gfp)
306{
307 struct cachefiles_object *object;
308 struct cachefiles_cache *cache;
309 struct pagevec pagevec;
310 struct inode *inode;
311 sector_t block0, block;
312 unsigned shift;
313 int ret;
314
315 object = container_of(op->op.object,
316 struct cachefiles_object, fscache);
317 cache = container_of(object->fscache.cache,
318 struct cachefiles_cache, cache);
319
320 _enter("{%p},{%lx},,,", object, page->index);
321
322 if (!object->backer)
323 return -ENOBUFS;
324
325 inode = object->backer->d_inode;
326 ASSERT(S_ISREG(inode->i_mode));
327 ASSERT(inode->i_mapping->a_ops->bmap);
328 ASSERT(inode->i_mapping->a_ops->readpages);
329
330 /* calculate the shift required to use bmap */
331 if (inode->i_sb->s_blocksize > PAGE_SIZE)
332 return -ENOBUFS;
333
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335
336 op->op.flags = FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier;
338
339 pagevec_init(&pagevec, 0);
340
341 /* we assume the absence or presence of the first block is a good
342 * enough indication for the page as a whole
343	 * - TODO: don't use bmap() for this as it is _not_ actually good
344	 *   enough: it doesn't indicate errors, but it's all we've got
345	 *   for the moment
346 */
347 block0 = page->index;
348 block0 <<= shift;
349
350 block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
351 _debug("%llx -> %llx",
352 (unsigned long long) block0,
353 (unsigned long long) block);
354
355 if (block) {
356 /* submit the apparently valid page to the backing fs to be
357 * read from disk */
358 ret = cachefiles_read_backing_file_one(object, op, page,
359 &pagevec);
360 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
361 /* there's space in the cache we can use */
362 pagevec_add(&pagevec, page);
363 fscache_mark_pages_cached(op, &pagevec);
364 ret = -ENODATA;
365 } else {
366 ret = -ENOBUFS;
367 }
368
369 _leave(" = %d", ret);
370 return ret;
371}
372
373/*
374 * read the backing-file pages corresponding to the given set of netfs pages
375 * - any uncertain pages are simply discarded, to be tried again another time
376 */
377static int cachefiles_read_backing_file(struct cachefiles_object *object,
378 struct fscache_retrieval *op,
379 struct list_head *list,
380 struct pagevec *mark_pvec)
381{
382 struct cachefiles_one_read *monitor = NULL;
383 struct address_space *bmapping = object->backer->d_inode->i_mapping;
384 struct pagevec lru_pvec;
385 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
386 int ret = 0;
387
388 _enter("");
389
390 pagevec_init(&lru_pvec, 0);
391
392 list_for_each_entry_safe(netpage, _n, list, lru) {
393 list_del(&netpage->lru);
394
395 _debug("read back %p{%lu,%d}",
396 netpage, netpage->index, page_count(netpage));
397
398 if (!monitor) {
399 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
400 if (!monitor)
401 goto nomem;
402
403 monitor->op = fscache_get_retrieval(op);
404 init_waitqueue_func_entry(&monitor->monitor,
405 cachefiles_read_waiter);
406 }
407
408 for (;;) {
409 backpage = find_get_page(bmapping, netpage->index);
410 if (backpage)
411 goto backing_page_already_present;
412
413 if (!newpage) {
414 newpage = page_cache_alloc_cold(bmapping);
415 if (!newpage)
416 goto nomem;
417 }
418
419 ret = add_to_page_cache(newpage, bmapping,
420 netpage->index, GFP_KERNEL);
421 if (ret == 0)
422 goto installed_new_backing_page;
423 if (ret != -EEXIST)
424 goto nomem;
425 }
426
427 /* we've installed a new backing page, so now we need to add it
428 * to the LRU list and start it reading */
429 installed_new_backing_page:
430 _debug("- new %p", newpage);
431
432 backpage = newpage;
433 newpage = NULL;
434
435 page_cache_get(backpage);
436 if (!pagevec_add(&lru_pvec, backpage))
437 __pagevec_lru_add_file(&lru_pvec);
438
439 reread_backing_page:
440 ret = bmapping->a_ops->readpage(NULL, backpage);
441 if (ret < 0)
442 goto read_error;
443
444 /* add the netfs page to the pagecache and LRU, and set the
445 * monitor to transfer the data across */
446 monitor_backing_page:
447 _debug("- monitor add");
448
449 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
450 GFP_KERNEL);
451 if (ret < 0) {
452 if (ret == -EEXIST) {
453 page_cache_release(netpage);
454 continue;
455 }
456 goto nomem;
457 }
458
459 page_cache_get(netpage);
460 if (!pagevec_add(&lru_pvec, netpage))
461 __pagevec_lru_add_file(&lru_pvec);
462
463 /* install a monitor */
464 page_cache_get(netpage);
465 monitor->netfs_page = netpage;
466
467 page_cache_get(backpage);
468 monitor->back_page = backpage;
469 monitor->monitor.private = backpage;
470 add_page_wait_queue(backpage, &monitor->monitor);
471 monitor = NULL;
472
473 /* but the page may have been read before the monitor was
474 * installed, so the monitor may miss the event - so we have to
475 * ensure that we do get one in such a case */
476 if (trylock_page(backpage)) {
477 _debug("2unlock %p {%lx}", backpage, backpage->flags);
478 unlock_page(backpage);
479 }
480
481 page_cache_release(backpage);
482 backpage = NULL;
483
484 page_cache_release(netpage);
485 netpage = NULL;
486 continue;
487
488 /* if the backing page is already present, it can be in one of
489 * three states: read in progress, read failed or read okay */
490 backing_page_already_present:
491 _debug("- present %p", backpage);
492
493 if (PageError(backpage))
494 goto io_error;
495
496 if (PageUptodate(backpage))
497 goto backing_page_already_uptodate;
498
499 _debug("- not ready %p{%lx}", backpage, backpage->flags);
500
501 if (!trylock_page(backpage))
502 goto monitor_backing_page;
503
504 if (PageError(backpage)) {
505 _debug("error %lx", backpage->flags);
506 unlock_page(backpage);
507 goto io_error;
508 }
509
510 if (PageUptodate(backpage))
511 goto backing_page_already_uptodate_unlock;
512
513 /* we've locked a page that's neither up to date nor erroneous,
514 * so we need to attempt to read it again */
515 goto reread_backing_page;
516
517 /* the backing page is already up to date, attach the netfs
518 * page to the pagecache and LRU and copy the data across */
519 backing_page_already_uptodate_unlock:
520 _debug("uptodate %lx", backpage->flags);
521 unlock_page(backpage);
522 backing_page_already_uptodate:
523 _debug("- uptodate");
524
525 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
526 GFP_KERNEL);
527 if (ret < 0) {
528 if (ret == -EEXIST) {
529 page_cache_release(netpage);
530 continue;
531 }
532 goto nomem;
533 }
534
535 copy_highpage(netpage, backpage);
536
537 page_cache_release(backpage);
538 backpage = NULL;
539
540 if (!pagevec_add(mark_pvec, netpage))
541 fscache_mark_pages_cached(op, mark_pvec);
542
543 page_cache_get(netpage);
544 if (!pagevec_add(&lru_pvec, netpage))
545 __pagevec_lru_add_file(&lru_pvec);
546
547 fscache_end_io(op, netpage, 0);
548 page_cache_release(netpage);
549 netpage = NULL;
550 continue;
551 }
552
553 netpage = NULL;
554
555 _debug("out");
556
557out:
558 /* tidy up */
559 pagevec_lru_add_file(&lru_pvec);
560
561 if (newpage)
562 page_cache_release(newpage);
563 if (netpage)
564 page_cache_release(netpage);
565 if (backpage)
566 page_cache_release(backpage);
567 if (monitor) {
568 fscache_put_retrieval(op);
569 kfree(monitor);
570 }
571
572 list_for_each_entry_safe(netpage, _n, list, lru) {
573 list_del(&netpage->lru);
574 page_cache_release(netpage);
575 }
576
577 _leave(" = %d", ret);
578 return ret;
579
580nomem:
581 _debug("nomem");
582 ret = -ENOMEM;
583 goto out;
584
585read_error:
586 _debug("read error %d", ret);
587 if (ret == -ENOMEM)
588 goto out;
589io_error:
590 cachefiles_io_error_obj(object, "Page read error on backing file");
591 ret = -ENOBUFS;
592 goto out;
593}
594
595/*
596 * read a list of pages from the cache or allocate blocks in which to store
597 * them
598 */
599int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
600 struct list_head *pages,
601 unsigned *nr_pages,
602 gfp_t gfp)
603{
604 struct cachefiles_object *object;
605 struct cachefiles_cache *cache;
606 struct list_head backpages;
607 struct pagevec pagevec;
608 struct inode *inode;
609 struct page *page, *_n;
610 unsigned shift, nrbackpages;
611 int ret, ret2, space;
612
613 object = container_of(op->op.object,
614 struct cachefiles_object, fscache);
615 cache = container_of(object->fscache.cache,
616 struct cachefiles_cache, cache);
617
618 _enter("{OBJ%x,%d},,%d,,",
619 object->fscache.debug_id, atomic_read(&op->op.usage),
620 *nr_pages);
621
622 if (!object->backer)
623 return -ENOBUFS;
624
625 space = 1;
626 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
627 space = 0;
628
629 inode = object->backer->d_inode;
630 ASSERT(S_ISREG(inode->i_mode));
631 ASSERT(inode->i_mapping->a_ops->bmap);
632 ASSERT(inode->i_mapping->a_ops->readpages);
633
634 /* calculate the shift required to use bmap */
635 if (inode->i_sb->s_blocksize > PAGE_SIZE)
636 return -ENOBUFS;
637
638 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
639
640 pagevec_init(&pagevec, 0);
641
642 op->op.flags = FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier;
644
645 INIT_LIST_HEAD(&backpages);
646 nrbackpages = 0;
647
648 ret = space ? -ENODATA : -ENOBUFS;
649 list_for_each_entry_safe(page, _n, pages, lru) {
650 sector_t block0, block;
651
652 /* we assume the absence or presence of the first block is a
653 * good enough indication for the page as a whole
654		 * - TODO: don't use bmap() for this as it is _not_ actually
655		 *   good enough: it doesn't indicate errors, but it's all
656		 *   we've got for the moment
657 */
658 block0 = page->index;
659 block0 <<= shift;
660
661 block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
662 block0);
663 _debug("%llx -> %llx",
664 (unsigned long long) block0,
665 (unsigned long long) block);
666
667 if (block) {
668 /* we have data - add it to the list to give to the
669 * backing fs */
670 list_move(&page->lru, &backpages);
671 (*nr_pages)--;
672 nrbackpages++;
673 } else if (space && pagevec_add(&pagevec, page) == 0) {
674 fscache_mark_pages_cached(op, &pagevec);
675 ret = -ENODATA;
676 }
677 }
678
679 if (pagevec_count(&pagevec) > 0)
680 fscache_mark_pages_cached(op, &pagevec);
681
682 if (list_empty(pages))
683 ret = 0;
684
685 /* submit the apparently valid pages to the backing fs to be read from
686 * disk */
687 if (nrbackpages > 0) {
688 ret2 = cachefiles_read_backing_file(object, op, &backpages,
689 &pagevec);
690 if (ret2 == -ENOMEM || ret2 == -EINTR)
691 ret = ret2;
692 }
693
694 if (pagevec_count(&pagevec) > 0)
695 fscache_mark_pages_cached(op, &pagevec);
696
697 _leave(" = %d [nr=%u%s]",
698 ret, *nr_pages, list_empty(pages) ? " empty" : "");
699 return ret;
700}
701
702/*
703 * allocate a block in the cache in which to store a page
704 * - cache withdrawal is prevented by the caller
705 * - returns -EINTR if interrupted
706 * - returns -ENOMEM if we ran out of memory
707 * - returns -ENOBUFS if no buffers can be made available
708 * - returns -ENOBUFS if page is beyond EOF
709 * - otherwise:
710 * - the metadata will be retained
711 * - 0 will be returned
712 */
713int cachefiles_allocate_page(struct fscache_retrieval *op,
714 struct page *page,
715 gfp_t gfp)
716{
717 struct cachefiles_object *object;
718 struct cachefiles_cache *cache;
719 struct pagevec pagevec;
720 int ret;
721
722 object = container_of(op->op.object,
723 struct cachefiles_object, fscache);
724 cache = container_of(object->fscache.cache,
725 struct cachefiles_cache, cache);
726
727 _enter("%p,{%lx},", object, page->index);
728
729 ret = cachefiles_has_space(cache, 0, 1);
730 if (ret == 0) {
731 pagevec_init(&pagevec, 0);
732 pagevec_add(&pagevec, page);
733 fscache_mark_pages_cached(op, &pagevec);
734 } else {
735 ret = -ENOBUFS;
736 }
737
738 _leave(" = %d", ret);
739 return ret;
740}
741
742/*
743 * allocate blocks in the cache in which to store a set of pages
744 * - cache withdrawal is prevented by the caller
745 * - returns -EINTR if interrupted
746 * - returns -ENOMEM if we ran out of memory
747 * - returns -ENOBUFS if some buffers couldn't be made available
748 * - returns -ENOBUFS if some pages are beyond EOF
749 * - otherwise:
750 * - -ENODATA will be returned
751 * - metadata will be retained for any page marked
752 */
753int cachefiles_allocate_pages(struct fscache_retrieval *op,
754 struct list_head *pages,
755 unsigned *nr_pages,
756 gfp_t gfp)
757{
758 struct cachefiles_object *object;
759 struct cachefiles_cache *cache;
760 struct pagevec pagevec;
761 struct page *page;
762 int ret;
763
764 object = container_of(op->op.object,
765 struct cachefiles_object, fscache);
766 cache = container_of(object->fscache.cache,
767 struct cachefiles_cache, cache);
768
769 _enter("%p,,,%d,", object, *nr_pages);
770
771 ret = cachefiles_has_space(cache, 0, *nr_pages);
772 if (ret == 0) {
773 pagevec_init(&pagevec, 0);
774
775 list_for_each_entry(page, pages, lru) {
776 if (pagevec_add(&pagevec, page) == 0)
777 fscache_mark_pages_cached(op, &pagevec);
778 }
779
780 if (pagevec_count(&pagevec) > 0)
781 fscache_mark_pages_cached(op, &pagevec);
782 ret = -ENODATA;
783 } else {
784 ret = -ENOBUFS;
785 }
786
787 _leave(" = %d", ret);
788 return ret;
789}
790
791/*
792 * request a page be stored in the cache
793 * - cache withdrawal is prevented by the caller
794 * - this request may be ignored if there's no cache block available, in which
795 * case -ENOBUFS will be returned
796 * - if the op is in progress, 0 will be returned
797 */
798int cachefiles_write_page(struct fscache_storage *op, struct page *page)
799{
800 struct cachefiles_object *object;
801 struct cachefiles_cache *cache;
802 mm_segment_t old_fs;
803 struct file *file;
804 loff_t pos;
805 void *data;
806 int ret;
807
808 ASSERT(op != NULL);
809 ASSERT(page != NULL);
810
811 object = container_of(op->op.object,
812 struct cachefiles_object, fscache);
813
814 _enter("%p,%p{%lx},,,", object, page, page->index);
815
816 if (!object->backer) {
817 _leave(" = -ENOBUFS");
818 return -ENOBUFS;
819 }
820
821 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
822
823 cache = container_of(object->fscache.cache,
824 struct cachefiles_cache, cache);
825
826 /* write the page to the backing filesystem and let it store it in its
827 * own time */
828 dget(object->backer);
829 mntget(cache->mnt);
830 file = dentry_open(object->backer, cache->mnt, O_RDWR,
831 cache->cache_cred);
832 if (IS_ERR(file)) {
833 ret = PTR_ERR(file);
834 } else {
835 ret = -EIO;
836 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT;
838 data = kmap(page);
839 old_fs = get_fs();
840 set_fs(KERNEL_DS);
841 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE,
843 &pos);
844 set_fs(old_fs);
845 kunmap(page);
846 if (ret != PAGE_SIZE)
847 ret = -EIO;
848 }
849 fput(file);
850 }
851
852 if (ret < 0) {
853 if (ret == -EIO)
854 cachefiles_io_error_obj(
855 object, "Write page to backing file failed");
856 ret = -ENOBUFS;
857 }
858
859 _leave(" = %d", ret);
860 return ret;
861}
862
863/*
864 * detach a backing block from a page
865 * - cache withdrawal is prevented by the caller
866 */
867void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
868{
869 struct cachefiles_object *object;
870 struct cachefiles_cache *cache;
871
872 object = container_of(_object, struct cachefiles_object, fscache);
873 cache = container_of(object->fscache.cache,
874 struct cachefiles_cache, cache);
875
876 _enter("%p,{%lu}", object, page->index);
877
878 spin_unlock(&object->fscache.cookie->lock);
879}
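
Both read paths above probe the backing file with bmap(): the page index is shifted by (PAGE_SHIFT - blocksize_bits) to get the first block of the page, and a result of zero is taken to mean the page is a hole with no cached data. A userspace sketch of that probe, with a hypothetical fake_bmap() in place of a_ops->bmap():

#include <stdio.h>

#define PAGE_SHIFT 12			/* 4096-byte pages */

/* hypothetical stand-in for a_ops->bmap(): blocks 0-3 are a hole */
static unsigned long long fake_bmap(unsigned long long block)
{
	return block < 4 ? 0 : 1000 + block;
}

int main(void)
{
	unsigned blocksize_bits = 10;	/* 1024-byte filesystem blocks */
	unsigned shift = PAGE_SHIFT - blocksize_bits;
	unsigned long page_index;

	for (page_index = 0; page_index < 3; page_index++) {
		/* the first block of the page stands for the whole page,
		 * exactly as in the code above */
		unsigned long long block0 =
			(unsigned long long)page_index << shift;
		unsigned long long block = fake_bmap(block0);

		printf("page %lu -> block0 %llu -> %s\n",
		       page_index, block0,
		       block ? "backed" : "hole (no data)");
	}
	return 0;
}
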
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
1/* CacheFiles security management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/cred.h>
14#include "internal.h"
15
16/*
17 * determine the security context within which we access the cache from within
18 * the kernel
19 */
20int cachefiles_get_security_ID(struct cachefiles_cache *cache)
21{
22 struct cred *new;
23 int ret;
24
25 _enter("{%s}", cache->secctx);
26
27 new = prepare_kernel_cred(current);
28 if (!new) {
29 ret = -ENOMEM;
30 goto error;
31 }
32
33 if (cache->secctx) {
34 ret = set_security_override_from_ctx(new, cache->secctx);
35 if (ret < 0) {
36 put_cred(new);
37 printk(KERN_ERR "CacheFiles:"
38 " Security denies permission to nominate"
39 " security context: error %d\n",
40 ret);
41 goto error;
42 }
43 }
44
45 cache->cache_cred = new;
46 ret = 0;
47error:
48 _leave(" = %d", ret);
49 return ret;
50}
51
52/*
53 * see if mkdir and create can be performed in the root directory
54 */
55static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
56 struct dentry *root)
57{
58 int ret;
59
60 ret = security_inode_mkdir(root->d_inode, root, 0);
61 if (ret < 0) {
62 printk(KERN_ERR "CacheFiles:"
63	       " Security denies permission to make dirs: error %d\n",
64 ret);
65 return ret;
66 }
67
68 ret = security_inode_create(root->d_inode, root, 0);
69 if (ret < 0)
70 printk(KERN_ERR "CacheFiles:"
71	       " Security denies permission to create files: error %d\n",
72 ret);
73
74 return ret;
75}
76
77/*
78 * check the security details of the on-disk cache
79 * - must be called with security override in force
80 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root,
83 const struct cred **_saved_cred)
84{
85 struct cred *new;
86 int ret;
87
88 _enter("");
89
90 /* duplicate the cache creds for COW (the override is currently in
91 * force, so we can use prepare_creds() to do this) */
92 new = prepare_creds();
93 if (!new)
94 return -ENOMEM;
95
96 cachefiles_end_secure(cache, *_saved_cred);
97
98 /* use the cache root dir's security context as the basis with
99	 * which to create files */
100 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) {
102 _leave(" = %d [cfa]", ret);
103 return ret;
104 }
105
106 put_cred(cache->cache_cred);
107 cache->cache_cred = new;
108
109 cachefiles_begin_secure(cache, _saved_cred);
110 ret = cachefiles_check_cache_dir(cache, root);
111
112 if (ret == -EOPNOTSUPP)
113 ret = 0;
114 _leave(" = %d", ret);
115 return ret;
116}
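
cachefiles_begin_secure() and cachefiles_end_secure(), used above to bracket cache accesses with the cache's credentials, are thin wrappers around the kernel's cred-override API. A sketch of what they amount to (the real inlines live in fs/cachefiles/internal.h; the names here are illustrative, not verbatim):

#include <linux/cred.h>

static inline void example_begin_secure(struct cachefiles_cache *cache,
					const struct cred **_saved_cred)
{
	*_saved_cred = override_creds(cache->cache_cred);
}

static inline void example_end_secure(struct cachefiles_cache *cache,
				      const struct cred *saved_cred)
{
	revert_creds(saved_cred);
}

This is why cachefiles_determine_cache_security() ends the override before swapping cache->cache_cred and begins it again afterwards: the saved/restored cred pointers must stay balanced across the replacement.
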
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
1/* CacheFiles extended attribute management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include "internal.h"
20
21static const char cachefiles_xattr_cache[] =
22 XATTR_USER_PREFIX "CacheFiles.cache";
23
24/*
25 * check the type label on an object
26 * - done using xattrs
27 */
28int cachefiles_check_object_type(struct cachefiles_object *object)
29{
30 struct dentry *dentry = object->dentry;
31 char type[3], xtype[3];
32 int ret;
33
34 ASSERT(dentry);
35 ASSERT(dentry->d_inode);
36
37 if (!object->fscache.cookie)
38 strcpy(type, "C3");
39 else
40 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
41
42 _enter("%p{%s}", object, type);
43
44 /* attempt to install a type label directly */
45 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
46 XATTR_CREATE);
47 if (ret == 0) {
48 _debug("SET"); /* we succeeded */
49 goto error;
50 }
51
52 if (ret != -EEXIST) {
53 kerror("Can't set xattr on %*.*s [%lu] (err %d)",
54 dentry->d_name.len, dentry->d_name.len,
55 dentry->d_name.name, dentry->d_inode->i_ino,
56 -ret);
57 goto error;
58 }
59
60 /* read the current type label */
61 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
62 if (ret < 0) {
63 if (ret == -ERANGE)
64 goto bad_type_length;
65
66 kerror("Can't read xattr on %*.*s [%lu] (err %d)",
67 dentry->d_name.len, dentry->d_name.len,
68 dentry->d_name.name, dentry->d_inode->i_ino,
69 -ret);
70 goto error;
71 }
72
73 /* check the type is what we're expecting */
74 if (ret != 2)
75 goto bad_type_length;
76
77 if (xtype[0] != type[0] || xtype[1] != type[1])
78 goto bad_type;
79
80 ret = 0;
81
82error:
83 _leave(" = %d", ret);
84 return ret;
85
86bad_type_length:
87 kerror("Cache object %lu type xattr length incorrect",
88 dentry->d_inode->i_ino);
89 ret = -EIO;
90 goto error;
91
92bad_type:
93 xtype[2] = 0;
94 kerror("Cache object %*.*s [%lu] type %s not %s",
95 dentry->d_name.len, dentry->d_name.len,
96 dentry->d_name.name, dentry->d_inode->i_ino,
97 xtype, type);
98 ret = -EIO;
99 goto error;
100}
101
102/*
103 * set the state xattr on a cache file
104 */
105int cachefiles_set_object_xattr(struct cachefiles_object *object,
106 struct cachefiles_xattr *auxdata)
107{
108 struct dentry *dentry = object->dentry;
109 int ret;
110
111 ASSERT(object->fscache.cookie);
112 ASSERT(dentry);
113
114 _enter("%p,#%d", object, auxdata->len);
115
116 /* attempt to install the cache metadata directly */
117 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len,
121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj(
124 object,
125 "Failed to set xattr with error %d", ret);
126
127 _leave(" = %d", ret);
128 return ret;
129}
130
131/*
132 * update the state xattr on a cache file
133 */
134int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata)
136{
137 struct dentry *dentry = object->dentry;
138 int ret;
139
140 ASSERT(object->fscache.cookie);
141 ASSERT(dentry);
142
143 _enter("%p,#%d", object, auxdata->len);
144
145 /* attempt to install the cache metadata directly */
146 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
147
148 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
149 &auxdata->type, auxdata->len,
150 XATTR_REPLACE);
151 if (ret < 0 && ret != -ENOMEM)
152 cachefiles_io_error_obj(
153 object,
154 "Failed to update xattr with error %d", ret);
155
156 _leave(" = %d", ret);
157 return ret;
158}
159
160/*
161 * check the state xattr on a cache file
162 * - return -ESTALE if the object should be deleted
163 */
164int cachefiles_check_object_xattr(struct cachefiles_object *object,
165 struct cachefiles_xattr *auxdata)
166{
167 struct cachefiles_xattr *auxbuf;
168 struct dentry *dentry = object->dentry;
169 int ret;
170
171 _enter("%p,#%d", object, auxdata->len);
172
173 ASSERT(dentry);
174 ASSERT(dentry->d_inode);
175
176 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
177 if (!auxbuf) {
178 _leave(" = -ENOMEM");
179 return -ENOMEM;
180 }
181
182 /* read the current type label */
183 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
184 &auxbuf->type, 512 + 1);
185 if (ret < 0) {
186 if (ret == -ENODATA)
187 goto stale; /* no attribute - power went off
188 * mid-cull? */
189
190 if (ret == -ERANGE)
191 goto bad_type_length;
192
193 cachefiles_io_error_obj(object,
194 "Can't read xattr on %lu (err %d)",
195 dentry->d_inode->i_ino, -ret);
196 goto error;
197 }
198
199 /* check the on-disk object */
200 if (ret < 1)
201 goto bad_type_length;
202
203 if (auxbuf->type != auxdata->type)
204 goto stale;
205
206 auxbuf->len = ret;
207
208 /* consult the netfs */
209 if (object->fscache.cookie->def->check_aux) {
210 enum fscache_checkaux result;
211 unsigned int dlen;
212
213 dlen = auxbuf->len - 1;
214
215 _debug("checkaux %s #%u",
216 object->fscache.cookie->def->name, dlen);
217
218 result = fscache_check_aux(&object->fscache,
219 &auxbuf->data, dlen);
220
221 switch (result) {
222 /* entry okay as is */
223 case FSCACHE_CHECKAUX_OKAY:
224 goto okay;
225
226 /* entry requires update */
227 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
228 break;
229
230 /* entry requires deletion */
231 case FSCACHE_CHECKAUX_OBSOLETE:
232 goto stale;
233
234 default:
235 BUG();
236 }
237
238 /* update the current label */
239 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
240 &auxdata->type, auxdata->len,
241 XATTR_REPLACE);
242 if (ret < 0) {
243 cachefiles_io_error_obj(object,
244 "Can't update xattr on %lu"
245 " (error %d)",
246 dentry->d_inode->i_ino, -ret);
247 goto error;
248 }
249 }
250
251okay:
252 ret = 0;
253
254error:
255 kfree(auxbuf);
256 _leave(" = %d", ret);
257 return ret;
258
259bad_type_length:
260 kerror("Cache object %lu xattr length incorrect",
261 dentry->d_inode->i_ino);
262 ret = -EIO;
263 goto error;
264
265stale:
266 ret = -ESTALE;
267 goto error;
268}
269
270/*
271 * remove the object's xattr to mark it stale
272 */
273int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
274 struct dentry *dentry)
275{
276 int ret;
277
278 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
279 if (ret < 0) {
280 if (ret == -ENOENT || ret == -ENODATA)
281 ret = 0;
282 else if (ret != -ENOMEM)
283 cachefiles_io_error(cache,
284 "Can't remove xattr from %lu"
285 " (error %d)",
286 dentry->d_inode->i_ino, -ret);
287 }
288
289 _leave(" = %d", ret);
290 return ret;
291}
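
The auxdata buffers handled above are small length-prefixed blobs. A sketch of the layout implied by the `&auxdata->type, auxdata->len` calls and the `512 + 1` read (the real struct is in fs/cachefiles/internal.h; the field widths here are assumptions):

struct example_cachefiles_xattr {
	uint16_t	len;	/* bytes used: the type byte plus data[] */
	uint8_t		type;	/* object type label */
	uint8_t		data[];	/* netfs auxiliary data, up to 512 bytes */
};

That len counts the type byte is why cachefiles_check_object_xattr() computes dlen = auxbuf->len - 1 before handing the data to fscache_check_aux().
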
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8d..54dce78fbb73 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
254 return -ENOMEM; 254 return -ENOMEM;
255 } 255 }
256 256
257 mode &= ~current->fs->umask; 257 mode &= ~current_umask();
258 if (oplockEnabled) 258 if (oplockEnabled)
259 oplock = REQ_OPLOCK; 259 oplock = REQ_OPLOCK;
260 260
@@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
479 rc = -ENOMEM; 479 rc = -ENOMEM;
480 else if (pTcon->unix_ext) { 480 else if (pTcon->unix_ext) {
481 struct cifs_unix_set_info_args args = { 481 struct cifs_unix_set_info_args args = {
482 .mode = mode & ~current->fs->umask, 482 .mode = mode & ~current_umask(),
483 .ctime = NO_CHANGE_64, 483 .ctime = NO_CHANGE_64,
484 .atime = NO_CHANGE_64, 484 .atime = NO_CHANGE_64,
485 .mtime = NO_CHANGE_64, 485 .mtime = NO_CHANGE_64,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc60805..f121a80fdd6f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1125 goto mkdir_out; 1125 goto mkdir_out;
1126 } 1126 }
1127 1127
1128 mode &= ~current->fs->umask; 1128 mode &= ~current_umask();
1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, 1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
1130 mode, NULL /* netfid */, pInfo, &oplock, 1130 mode, NULL /* netfid */, pInfo, &oplock,
1131 full_path, cifs_sb->local_nls, 1131 full_path, cifs_sb->local_nls,
@@ -1204,7 +1204,7 @@ mkdir_get_info:
1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1205 direntry->d_inode->i_nlink = 2; 1205 direntry->d_inode->i_nlink = 2;
1206 1206
1207 mode &= ~current->fs->umask; 1207 mode &= ~current_umask();
1208 /* must turn on setgid bit if parent dir has it */ 1208 /* must turn on setgid bit if parent dir has it */
1209 if (inode->i_mode & S_ISGID) 1209 if (inode->i_mode & S_ISGID)
1210 mode |= S_ISGID; 1210 mode |= S_ISGID;
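
Both cifs hunks replace direct current->fs->umask dereferences with current_umask(), a helper added by the fs_struct rework elsewhere in this series. A sketch of the accessor, roughly as in include/linux/fs_struct.h:

static inline int current_umask(void)
{
	return current->fs->umask;
}

Routing every reader through one accessor is what allows fs_struct's locking and lifetime rules to change (see the fs/compat.c and fs/exec.c hunks below) without touching each filesystem again.
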
diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5a..3f84d5f15889 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
51#include <linux/poll.h> 51#include <linux/poll.h>
52#include <linux/mm.h> 52#include <linux/mm.h>
53#include <linux/eventpoll.h> 53#include <linux/eventpoll.h>
54#include <linux/fs_struct.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
@@ -1195,16 +1196,12 @@ out:
1195 return ret; 1196 return ret;
1196} 1197}
1197 1198
1198asmlinkage ssize_t 1199static ssize_t compat_readv(struct file *file,
1199compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1200 const struct compat_iovec __user *vec,
1201 unsigned long vlen, loff_t *pos)
1200{ 1202{
1201 struct file *file;
1202 ssize_t ret = -EBADF; 1203 ssize_t ret = -EBADF;
1203 1204
1204 file = fget(fd);
1205 if (!file)
1206 return -EBADF;
1207
1208 if (!(file->f_mode & FMODE_READ)) 1205 if (!(file->f_mode & FMODE_READ))
1209 goto out; 1206 goto out;
1210 1207
@@ -1212,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1212 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 1209 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1213 goto out; 1210 goto out;
1214 1211
1215 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1212 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1216 1213
1217out: 1214out:
1218 if (ret > 0) 1215 if (ret > 0)
1219 add_rchar(current, ret); 1216 add_rchar(current, ret);
1220 inc_syscr(current); 1217 inc_syscr(current);
1221 fput(file);
1222 return ret; 1218 return ret;
1223} 1219}
1224 1220
1225asmlinkage ssize_t 1221asmlinkage ssize_t
1226compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1222compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1223 unsigned long vlen)
1227{ 1224{
1228 struct file *file; 1225 struct file *file;
1229 ssize_t ret = -EBADF; 1226 int fput_needed;
1227 ssize_t ret;
1230 1228
1231 file = fget(fd); 1229 file = fget_light(fd, &fput_needed);
1232 if (!file) 1230 if (!file)
1233 return -EBADF; 1231 return -EBADF;
1232 ret = compat_readv(file, vec, vlen, &file->f_pos);
1233 fput_light(file, fput_needed);
1234 return ret;
1235}
1236
1237asmlinkage ssize_t
1238compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1239 unsigned long vlen, u32 pos_low, u32 pos_high)
1240{
1241 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1242 struct file *file;
1243 int fput_needed;
1244 ssize_t ret;
1245
1246 if (pos < 0)
1247 return -EINVAL;
1248 file = fget_light(fd, &fput_needed);
1249 if (!file)
1250 return -EBADF;
1251 ret = compat_readv(file, vec, vlen, &pos);
1252 fput_light(file, fput_needed);
1253 return ret;
1254}
1255
1256static ssize_t compat_writev(struct file *file,
1257 const struct compat_iovec __user *vec,
1258 unsigned long vlen, loff_t *pos)
1259{
1260 ssize_t ret = -EBADF;
1261
1234 if (!(file->f_mode & FMODE_WRITE)) 1262 if (!(file->f_mode & FMODE_WRITE))
1235 goto out; 1263 goto out;
1236 1264
@@ -1238,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1238 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 1266 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1239 goto out; 1267 goto out;
1240 1268
1241 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1269 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1242 1270
1243out: 1271out:
1244 if (ret > 0) 1272 if (ret > 0)
1245 add_wchar(current, ret); 1273 add_wchar(current, ret);
1246 inc_syscw(current); 1274 inc_syscw(current);
1247 fput(file); 1275 return ret;
1276}
1277
1278asmlinkage ssize_t
1279compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1280 unsigned long vlen)
1281{
1282 struct file *file;
1283 int fput_needed;
1284 ssize_t ret;
1285
1286 file = fget_light(fd, &fput_needed);
1287 if (!file)
1288 return -EBADF;
1289 ret = compat_writev(file, vec, vlen, &file->f_pos);
1290 fput_light(file, fput_needed);
1291 return ret;
1292}
1293
1294asmlinkage ssize_t
1295compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1296 unsigned long vlen, u32 pos_low, u32 pos_high)
1297{
1298 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1299 struct file *file;
1300 int fput_needed;
1301 ssize_t ret;
1302
1303 if (pos < 0)
1304 return -EINVAL;
1305 file = fget_light(fd, &fput_needed);
1306 if (!file)
1307 return -EBADF;
1308 ret = compat_writev(file, vec, vlen, &pos);
1309 fput_light(file, fput_needed);
1248 return ret; 1310 return ret;
1249} 1311}
1250 1312
@@ -1441,12 +1503,15 @@ int compat_do_execve(char * filename,
1441 bprm->cred = prepare_exec_creds(); 1503 bprm->cred = prepare_exec_creds();
1442 if (!bprm->cred) 1504 if (!bprm->cred)
1443 goto out_unlock; 1505 goto out_unlock;
1444 check_unsafe_exec(bprm); 1506
1507 retval = check_unsafe_exec(bprm);
1508 if (retval)
1509 goto out_unlock;
1445 1510
1446 file = open_exec(filename); 1511 file = open_exec(filename);
1447 retval = PTR_ERR(file); 1512 retval = PTR_ERR(file);
1448 if (IS_ERR(file)) 1513 if (IS_ERR(file))
1449 goto out_unlock; 1514 goto out_unmark;
1450 1515
1451 sched_exec(); 1516 sched_exec();
1452 1517
@@ -1488,6 +1553,9 @@ int compat_do_execve(char * filename,
1488 goto out; 1553 goto out;
1489 1554
1490 /* execve succeeded */ 1555 /* execve succeeded */
1556 write_lock(&current->fs->lock);
1557 current->fs->in_exec = 0;
1558 write_unlock(&current->fs->lock);
1491 current->in_execve = 0; 1559 current->in_execve = 0;
1492 mutex_unlock(&current->cred_exec_mutex); 1560 mutex_unlock(&current->cred_exec_mutex);
1493 acct_update_integrals(current); 1561 acct_update_integrals(current);
@@ -1506,6 +1574,11 @@ out_file:
1506 fput(bprm->file); 1574 fput(bprm->file);
1507 } 1575 }
1508 1576
1577out_unmark:
1578 write_lock(&current->fs->lock);
1579 current->fs->in_exec = 0;
1580 write_unlock(&current->fs->lock);
1581
1509out_unlock: 1582out_unlock:
1510 current->in_execve = 0; 1583 current->in_execve = 0;
1511 mutex_unlock(&current->cred_exec_mutex); 1584 mutex_unlock(&current->cred_exec_mutex);
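
compat_sys_preadv() and compat_sys_pwritev() above receive the 64-bit file position as two 32-bit halves because 32-bit ABIs pass syscall arguments in 32-bit registers. A worked sketch of the reassembly (illustrative helper name):

#include <linux/types.h>

static inline loff_t example_join_pos(u32 pos_low, u32 pos_high)
{
	/* pos_high = 0x00000001, pos_low = 0x80000000 -> 0x180000000 (6 GiB) */
	return ((loff_t)pos_high << 32) | pos_low;
}

Factoring the bodies into compat_readv()/compat_writev() lets the fd-relative and positioned variants share the FMODE and f_op checks; only the source of the loff_t differs.
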
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ff786687e93b..3e87ce443ea2 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
23#include <linux/if.h> 23#include <linux/if.h>
24#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/raid/md.h> 26#include <linux/raid/md_u.h>
27#include <linux/kd.h> 27#include <linux/kd.h>
28#include <linux/route.h> 28#include <linux/route.h>
29#include <linux/in6.h> 29#include <linux/in6.h>
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
319{ 319{
320 struct super_block *sb = dentry->d_sb; 320 struct super_block *sb = dentry->d_sb;
321 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
321 322
322 buf->f_type = CRAMFS_MAGIC; 323 buf->f_type = CRAMFS_MAGIC;
323 buf->f_bsize = PAGE_CACHE_SIZE; 324 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
326 buf->f_bavail = 0; 327 buf->f_bavail = 0;
327 buf->f_files = CRAMFS_SB(sb)->files; 328 buf->f_files = CRAMFS_SB(sb)->files;
328 buf->f_ffree = 0; 329 buf->f_ffree = 0;
330 buf->f_fsid.val[0] = (u32)id;
331 buf->f_fsid.val[1] = (u32)(id >> 32);
329 buf->f_namelen = CRAMFS_MAXPATHLEN; 332 buf->f_namelen = CRAMFS_MAXPATHLEN;
330 return 0; 333 return 0;
331} 334}
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
459static int cramfs_readpage(struct file *file, struct page * page) 462static int cramfs_readpage(struct file *file, struct page * page)
460{ 463{
461 struct inode *inode = page->mapping->host; 464 struct inode *inode = page->mapping->host;
462 u32 maxblock, bytes_filled; 465 u32 maxblock;
466 int bytes_filled;
463 void *pgdata; 467 void *pgdata;
464 468
465 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 469 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
466 bytes_filled = 0; 470 bytes_filled = 0;
471 pgdata = kmap(page);
472
467 if (page->index < maxblock) { 473 if (page->index < maxblock) {
468 struct super_block *sb = inode->i_sb; 474 struct super_block *sb = inode->i_sb;
469 u32 blkptr_offset = OFFSET(inode) + page->index*4; 475 u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
472 start_offset = OFFSET(inode) + maxblock*4; 478 start_offset = OFFSET(inode) + maxblock*4;
473 mutex_lock(&read_mutex); 479 mutex_lock(&read_mutex);
474 if (page->index) 480 if (page->index)
475 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); 481 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
476 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); 482 4);
483 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
484 start_offset);
477 mutex_unlock(&read_mutex); 485 mutex_unlock(&read_mutex);
478 pgdata = kmap(page); 486
479 if (compr_len == 0) 487 if (compr_len == 0)
480 ; /* hole */ 488 ; /* hole */
481 else if (compr_len > (PAGE_CACHE_SIZE << 1)) 489 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
482 printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); 490 pr_err("cramfs: bad compressed blocksize %u\n",
483 else { 491 compr_len);
492 goto err;
493 } else {
484 mutex_lock(&read_mutex); 494 mutex_lock(&read_mutex);
485 bytes_filled = cramfs_uncompress_block(pgdata, 495 bytes_filled = cramfs_uncompress_block(pgdata,
486 PAGE_CACHE_SIZE, 496 PAGE_CACHE_SIZE,
487 cramfs_read(sb, start_offset, compr_len), 497 cramfs_read(sb, start_offset, compr_len),
488 compr_len); 498 compr_len);
489 mutex_unlock(&read_mutex); 499 mutex_unlock(&read_mutex);
500 if (unlikely(bytes_filled < 0))
501 goto err;
490 } 502 }
491 } else 503 }
492 pgdata = kmap(page); 504
493 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); 505 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
494 kunmap(page);
495 flush_dcache_page(page); 506 flush_dcache_page(page);
507 kunmap(page);
496 SetPageUptodate(page); 508 SetPageUptodate(page);
497 unlock_page(page); 509 unlock_page(page);
498 return 0; 510 return 0;
511
512err:
513 kunmap(page);
514 ClearPageUptodate(page);
515 SetPageError(page);
516 unlock_page(page);
517 return 0;
499} 518}
500 519
501static const struct address_space_operations cramfs_aops = { 520static const struct address_space_operations cramfs_aops = {
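
For orientation, the block-pointer arithmetic cramfs_readpage() walks above, summarized as a sketch of the on-disk convention (not code from this patch):

/*
 * Each regular file's data is preceded by maxblock little-endian u32s,
 * where entry n holds the byte offset just past compressed block n:
 *
 *	start_offset(0) = OFFSET(inode) + maxblock * 4
 *	start_offset(n) = ptr[n - 1]			(n > 0)
 *	compr_len(n)	= ptr[n] - start_offset(n)
 *
 * compr_len == 0 marks a hole; more than two pages' worth is treated
 * as corruption and now takes the new err path instead of silently
 * zero-filling the page.
 */
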
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
50err: 50err:
51 printk("Error %d while decompressing!\n", err); 51 printk("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return 0; 53 return -EIO;
54} 54}
55 55
56int cramfs_uncompress_init(void) 56int cramfs_uncompress_init(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b116..761d30be2683 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/fdtable.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
@@ -32,6 +31,7 @@
32#include <linux/seqlock.h> 31#include <linux/seqlock.h>
33#include <linux/swap.h> 32#include <linux/swap.h>
34#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h>
35#include "internal.h" 35#include "internal.h"
36 36
37int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 44d725f612cf..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
329} 329}
330 330
331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
332 struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); 332 struct super_block *sb = dentry->d_sb;
333 struct efs_sb_info *sbi = SUPER_INFO(sb);
334 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
333 335
334 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ 336 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
335 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ 337 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
336 buf->f_blocks = sb->total_groups * /* total data blocks */ 338 buf->f_blocks = sbi->total_groups * /* total data blocks */
337 (sb->group_size - sb->inode_blocks); 339 (sbi->group_size - sbi->inode_blocks);
338 buf->f_bfree = sb->data_free; /* free data blocks */ 340 buf->f_bfree = sbi->data_free; /* free data blocks */
339 buf->f_bavail = sb->data_free; /* free blocks for non-root */ 341 buf->f_bavail = sbi->data_free; /* free blocks for non-root */
340 buf->f_files = sb->total_groups * /* total inodes */ 342 buf->f_files = sbi->total_groups * /* total inodes */
341 sb->inode_blocks * 343 sbi->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 344 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 345 buf->f_ffree = sbi->inode_free; /* free inodes */
346 buf->f_fsid.val[0] = (u32)id;
347 buf->f_fsid.val[1] = (u32)(id >> 32);
344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 348 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
345 349
346 return 0; 350 return 0;
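
The f_fsid lines added here (and in the cramfs hunk above) pack the backing device number into the two 32-bit fsid words: huge_encode_dev() from include/linux/kdev_t.h widens the dev_t encoding to 64 bits, with val[0] taking the low half and val[1] the high half. A worked sketch under that encoding (illustrative values):

/*
 * For sda1 (dev_t 8:1), new_encode_dev() yields
 * (1 & 0xff) | (8 << 8) = 0x801, so statfs() reports
 * f_fsid = { .val = { 0x801, 0 } }.
 */
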
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc9165..052a961e41aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h> 55#include <linux/fsnotify.h>
56#include <linux/fs_struct.h>
56 57
57#include <asm/uaccess.h> 58#include <asm/uaccess.h>
58#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
@@ -1056,28 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds);
1056 * - the caller must hold current->cred_exec_mutex to protect against 1057 * - the caller must hold current->cred_exec_mutex to protect against
1057 * PTRACE_ATTACH 1058 * PTRACE_ATTACH
1058 */ 1059 */
1059void check_unsafe_exec(struct linux_binprm *bprm) 1060int check_unsafe_exec(struct linux_binprm *bprm)
1060{ 1061{
1061 struct task_struct *p = current, *t; 1062 struct task_struct *p = current, *t;
1062 unsigned long flags; 1063 unsigned long flags;
1063 unsigned n_fs, n_sighand; 1064 unsigned n_fs;
1065 int res = 0;
1064 1066
1065 bprm->unsafe = tracehook_unsafe_exec(p); 1067 bprm->unsafe = tracehook_unsafe_exec(p);
1066 1068
1067 n_fs = 1; 1069 n_fs = 1;
1068 n_sighand = 1; 1070 write_lock(&p->fs->lock);
1069 lock_task_sighand(p, &flags); 1071 lock_task_sighand(p, &flags);
1070 for (t = next_thread(p); t != p; t = next_thread(t)) { 1072 for (t = next_thread(p); t != p; t = next_thread(t)) {
1071 if (t->fs == p->fs) 1073 if (t->fs == p->fs)
1072 n_fs++; 1074 n_fs++;
1073 n_sighand++;
1074 } 1075 }
1075 1076
1076 if (atomic_read(&p->fs->count) > n_fs || 1077 if (p->fs->users > n_fs) {
1077 atomic_read(&p->sighand->count) > n_sighand)
1078 bprm->unsafe |= LSM_UNSAFE_SHARE; 1078 bprm->unsafe |= LSM_UNSAFE_SHARE;
1079 } else {
1080 if (p->fs->in_exec)
1081 res = -EAGAIN;
1082 p->fs->in_exec = 1;
1083 }
1079 1084
1080 unlock_task_sighand(p, &flags); 1085 unlock_task_sighand(p, &flags);
1086 write_unlock(&p->fs->lock);
1087
1088 return res;
1081} 1089}
1082 1090
1083/* 1091/*
@@ -1296,12 +1304,15 @@ int do_execve(char * filename,
1296 bprm->cred = prepare_exec_creds(); 1304 bprm->cred = prepare_exec_creds();
1297 if (!bprm->cred) 1305 if (!bprm->cred)
1298 goto out_unlock; 1306 goto out_unlock;
1299 check_unsafe_exec(bprm); 1307
1308 retval = check_unsafe_exec(bprm);
1309 if (retval)
1310 goto out_unlock;
1300 1311
1301 file = open_exec(filename); 1312 file = open_exec(filename);
1302 retval = PTR_ERR(file); 1313 retval = PTR_ERR(file);
1303 if (IS_ERR(file)) 1314 if (IS_ERR(file))
1304 goto out_unlock; 1315 goto out_unmark;
1305 1316
1306 sched_exec(); 1317 sched_exec();
1307 1318
@@ -1344,6 +1355,9 @@ int do_execve(char * filename,
1344 goto out; 1355 goto out;
1345 1356
1346 /* execve succeeded */ 1357 /* execve succeeded */
1358 write_lock(&current->fs->lock);
1359 current->fs->in_exec = 0;
1360 write_unlock(&current->fs->lock);
1347 current->in_execve = 0; 1361 current->in_execve = 0;
1348 mutex_unlock(&current->cred_exec_mutex); 1362 mutex_unlock(&current->cred_exec_mutex);
1349 acct_update_integrals(current); 1363 acct_update_integrals(current);
@@ -1362,6 +1376,11 @@ out_file:
1362 fput(bprm->file); 1376 fput(bprm->file);
1363 } 1377 }
1364 1378
1379out_unmark:
1380 write_lock(&current->fs->lock);
1381 current->fs->in_exec = 0;
1382 write_unlock(&current->fs->lock);
1383
1365out_unlock: 1384out_unlock:
1366 current->in_execve = 0; 1385 current->in_execve = 0;
1367 mutex_unlock(&current->cred_exec_mutex); 1386 mutex_unlock(&current->cred_exec_mutex);
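
check_unsafe_exec() above now returns a result and sets fs->in_exec under fs->lock; every exec exit path (success, open_exec() failure via out_unmark, and the later error paths) clears it again. A sketch of the gate in isolation (illustrative helper, assuming the rwlock-protected fs_struct this series introduces):

#include <linux/fs_struct.h>

static int example_mark_in_exec(struct fs_struct *fs)
{
	int res = 0;

	write_lock(&fs->lock);
	if (fs->in_exec)
		res = -EAGAIN;	/* a sibling thread is already execing */
	else
		fs->in_exec = 1;
	write_unlock(&fs->lock);
	return res;
}

The n_fs census decides which branch applies: if the fs_struct is shared beyond the exec'ing thread group, the exec is merely flagged LSM_UNSAFE_SHARE; otherwise it is serialized through in_exec.
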
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
1- Out-of-space may cause a severe problem if the object (and directory entry)
2 were written, but writing the inode attributes failed. Then, if the filesystem
3 is unmounted and remounted, the kernel can get into an endless loop doing readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
1#
2# Kbuild for the EXOFS module
3#
4# Copyright (C) 2008 Panasas Inc. All rights reserved.
5#
6# Authors:
7# Boaz Harrosh <bharrosh@panasas.com>
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License version 2
11#
12# Kbuild - gets included from the kernel's Makefile and build system
13#
14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
1config EXOFS_FS
2 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD
4 help
5 EXOFS is a file system that uses an OSD storage device
6 as its backing storage.
7
8# Debugging-related stuff
9config EXOFS_DEBUG
10 bool "Enable debugging"
11 depends on EXOFS_FS
12 help
13 This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
1/*
2 * common.h - Common definitions for both Kernel and user-mode utilities
3 *
4 * Copyright (C) 2005, 2006
5 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
6 * Copyright (C) 2005, 2006
7 * International Business Machines
8 * Copyright (C) 2008, 2009
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * Copyrights for code taken from ext2:
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 * from
17 * linux/fs/minix/inode.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
19 *
20 * This file is part of exofs.
21 *
22 * exofs is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation. Since it is based on ext2, and the only
25 * valid version of GPL for the Linux kernel is version 2, the only valid
26 * version of GPL for exofs is version 2.
27 *
28 * exofs is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with exofs; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
36 */
37
38#ifndef __EXOFS_COM_H__
39#define __EXOFS_COM_H__
40
41#include <linux/types.h>
42
43#include <scsi/osd_attributes.h>
44#include <scsi/osd_initiator.h>
45#include <scsi/osd_sec.h>
46
47/****************************************************************************
48 * Object ID related defines
49 * NOTE: inode# = object ID - EXOFS_OBJ_OFF
50 ****************************************************************************/
51#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
52#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
53#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
54#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
55
56/* exofs Application specific page/attribute */
57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
58# define EXOFS_ATTR_INODE_DATA 1
59
60/*
61 * The maximum number of files we can have is limited by the size of the
62 * inode number. This is the largest object ID that the file system supports.
63 * Object IDs 0, 1, and 2 are always in use (see above defines).
64 */
65enum {
66 EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
67 (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
68 EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
69};
70
71/****************************************************************************
72 * Misc.
73 ****************************************************************************/
74#define EXOFS_BLKSHIFT 12
75#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
76
77/****************************************************************************
78 * superblock-related things
79 ****************************************************************************/
80#define EXOFS_SUPER_MAGIC 0x5DF5
81
82/*
83 * The file system control block - stored in an object's data (mainly, the one
84 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
85 * on disk. Right now it just has a magic value, which is basically a sanity
86 * check on our ability to communicate with the object store.
87 */
88struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */
90 __le32 s_numfiles; /* Number of files on fs */
91 __le16 s_magic; /* Magic signature */
92 __le16 s_newfs; /* Non-zero if this is a new fs */
93};
94
95/****************************************************************************
96 * inode-related things
97 ****************************************************************************/
98#define EXOFS_IDATA 5
99
100/*
101 * The file control block - stored in an object's attributes. This is where
102 * the in-memory inode is stored on disk.
103 */
104struct exofs_fcb {
105 __le64 i_size; /* Size of the file */
106 __le16 i_mode; /* File mode */
107 __le16 i_links_count; /* Links count */
108 __le32 i_uid; /* Owner Uid */
109 __le32 i_gid; /* Group Id */
110 __le32 i_atime; /* Access time */
111 __le32 i_ctime; /* Creation time */
112 __le32 i_mtime; /* Modification time */
113 __le32 i_flags; /* File flags (unused for now)*/
114 __le32 i_generation; /* File version (for NFS) */
115 __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
116};
117
118#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
119
120/* This is the Attribute the fcb is stored in */
121static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
122 EXOFS_APAGE_FS_DATA,
123 EXOFS_ATTR_INODE_DATA,
124 EXOFS_INO_ATTR_SIZE);
125
126/****************************************************************************
127 * dentry-related things
128 ****************************************************************************/
129#define EXOFS_NAME_LEN 255
130
131/*
132 * The on-disk directory entry
133 */
134struct exofs_dir_entry {
135 __le64 inode_no; /* inode number */
136 __le16 rec_len; /* directory entry length */
137 u8 name_len; /* name length */
138 u8 file_type; /* file type (EXOFS_FT_*) */
139 char name[EXOFS_NAME_LEN]; /* file name */
140};
141
142enum {
143 EXOFS_FT_UNKNOWN,
144 EXOFS_FT_REG_FILE,
145 EXOFS_FT_DIR,
146 EXOFS_FT_CHRDEV,
147 EXOFS_FT_BLKDEV,
148 EXOFS_FT_FIFO,
149 EXOFS_FT_SOCK,
150 EXOFS_FT_SYMLINK,
151 EXOFS_FT_MAX
152};
153
154#define EXOFS_DIR_PAD 4
155#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
156#define EXOFS_DIR_REC_LEN(name_len) \
157 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
158 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
159
160/*************************
161 * function declarations *
162 *************************/
163/* osd.c */
164void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
165 const struct osd_obj_id *obj);
166
167int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
168static inline int exofs_check_ok(struct osd_request *or)
169{
170 return exofs_check_ok_resid(or, NULL, NULL);
171}
172int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
173int exofs_async_op(struct osd_request *or,
174 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/
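
A worked example of EXOFS_DIR_REC_LEN() above, assuming the declared exofs_dir_entry layout (a fixed header of 8 + 2 + 1 + 1 = 12 bytes before name[]):

/*
 *	EXOFS_DIR_REC_LEN(1)   = (1   + 12 + 3) & ~3 = 16
 *	EXOFS_DIR_REC_LEN(5)   = (5   + 12 + 3) & ~3 = 20
 *	EXOFS_DIR_REC_LEN(255) = (255 + 12 + 3) & ~3 = 268
 *
 * Every record is thus padded to a 4-byte boundary, which is exactly
 * what dir.c's "rec_len & 3" sanity check relies on.
 */
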
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline unsigned exofs_chunk_size(struct inode *inode)
39{
40 return inode->i_sb->s_blocksize;
41}
42
43static inline void exofs_put_page(struct page *page)
44{
45 kunmap(page);
46 page_cache_release(page);
47}
48
49/* Accesses to dir's inode->i_size must be made under the inode lock */
50static inline unsigned long dir_pages(struct inode *inode)
51{
52 return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
53}
54
55static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
56{
57 loff_t last_byte = inode->i_size;
58
59 last_byte -= page_nr << PAGE_CACHE_SHIFT;
60 if (last_byte > PAGE_CACHE_SIZE)
61 last_byte = PAGE_CACHE_SIZE;
62 return last_byte;
63}
64
65static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
66{
67 struct address_space *mapping = page->mapping;
68 struct inode *dir = mapping->host;
69 int err = 0;
70
71 dir->i_version++;
72
73 if (!PageUptodate(page))
74 SetPageUptodate(page);
75
76 if (pos+len > dir->i_size) {
77 i_size_write(dir, pos+len);
78 mark_inode_dirty(dir);
79 }
80 set_page_dirty(page);
81
82 if (IS_DIRSYNC(dir))
83 err = write_one_page(page, 1);
84 else
85 unlock_page(page);
86
87 return err;
88}
89
90static void exofs_check_page(struct page *page)
91{
92 struct inode *dir = page->mapping->host;
93 unsigned chunk_size = exofs_chunk_size(dir);
94 char *kaddr = page_address(page);
95 unsigned offs, rec_len;
96 unsigned limit = PAGE_CACHE_SIZE;
97 struct exofs_dir_entry *p;
98 char *error;
99
100 /* if the page is the last one in the directory */
101 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
102 limit = dir->i_size & ~PAGE_CACHE_MASK;
103 if (limit & (chunk_size - 1))
104 goto Ebadsize;
105 if (!limit)
106 goto out;
107 }
108 for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
109 p = (struct exofs_dir_entry *)(kaddr + offs);
110 rec_len = le16_to_cpu(p->rec_len);
111
112 if (rec_len < EXOFS_DIR_REC_LEN(1))
113 goto Eshort;
114 if (rec_len & 3)
115 goto Ealign;
116 if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
117 goto Enamelen;
118 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
119 goto Espan;
120 }
121 if (offs != limit)
122 goto Eend;
123out:
124 SetPageChecked(page);
125 return;
126
127Ebadsize:
128 EXOFS_ERR("ERROR [exofs_check_page]: "
129 "size of directory #%lu is not a multiple of chunk size",
130 dir->i_ino
131 );
132 goto fail;
133Eshort:
134 error = "rec_len is smaller than minimal";
135 goto bad_entry;
136Ealign:
137 error = "unaligned directory entry";
138 goto bad_entry;
139Enamelen:
140 error = "rec_len is too small for name_len";
141 goto bad_entry;
142Espan:
143 error = "directory entry across blocks";
144 goto bad_entry;
145bad_entry:
146 EXOFS_ERR(
147 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
148 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
149 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
150 _LLU(le64_to_cpu(p->inode_no)),
151 rec_len, p->name_len);
152 goto fail;
153Eend:
154 p = (struct exofs_dir_entry *)(kaddr + offs);
155 EXOFS_ERR("ERROR [exofs_check_page]: "
156 "entry in directory #%lu spans the page boundary"
157 "offset=%lu, inode=%llu",
158 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
159 _LLU(le64_to_cpu(p->inode_no)));
160fail:
161 SetPageChecked(page);
162 SetPageError(page);
163}
164
165static struct page *exofs_get_page(struct inode *dir, unsigned long n)
166{
167 struct address_space *mapping = dir->i_mapping;
168 struct page *page = read_mapping_page(mapping, n, NULL);
169
170 if (!IS_ERR(page)) {
171 kmap(page);
172 if (!PageChecked(page))
173 exofs_check_page(page);
174 if (PageError(page))
175 goto fail;
176 }
177 return page;
178
179fail:
180 exofs_put_page(page);
181 return ERR_PTR(-EIO);
182}
183
184static inline int exofs_match(int len, const unsigned char *name,
185 struct exofs_dir_entry *de)
186{
187 if (len != de->name_len)
188 return 0;
189 if (!de->inode_no)
190 return 0;
191 return !memcmp(name, de->name, len);
192}
193
194static inline
195struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
196{
197 return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
198}
199
200static inline unsigned
201exofs_validate_entry(char *base, unsigned offset, unsigned mask)
202{
203 struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
204 struct exofs_dir_entry *p =
205 (struct exofs_dir_entry *)(base + (offset&mask));
206 while ((char *)p < (char *)de) {
207 if (p->rec_len == 0)
208 break;
209 p = exofs_next_entry(p);
210 }
211 return (char *)p - base;
212}
213
214static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
215 [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
216 [EXOFS_FT_REG_FILE] = DT_REG,
217 [EXOFS_FT_DIR] = DT_DIR,
218 [EXOFS_FT_CHRDEV] = DT_CHR,
219 [EXOFS_FT_BLKDEV] = DT_BLK,
220 [EXOFS_FT_FIFO] = DT_FIFO,
221 [EXOFS_FT_SOCK] = DT_SOCK,
222 [EXOFS_FT_SYMLINK] = DT_LNK,
223};
224
225#define S_SHIFT 12
226static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
227 [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
228 [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
229 [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
230 [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
231 [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
232 [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
233 [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
234};
235
236static inline
237void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
238{
239 mode_t mode = inode->i_mode;
240 de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
241}
242
243static int
244exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
245{
246 loff_t pos = filp->f_pos;
247 struct inode *inode = filp->f_path.dentry->d_inode;
248 unsigned int offset = pos & ~PAGE_CACHE_MASK;
249 unsigned long n = pos >> PAGE_CACHE_SHIFT;
250 unsigned long npages = dir_pages(inode);
251 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
252 unsigned char *types = NULL;
253 int need_revalidate = (filp->f_version != inode->i_version);
254
255 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
256 return 0;
257
258 types = exofs_filetype_table;
259
260 for ( ; n < npages; n++, offset = 0) {
261 char *kaddr, *limit;
262 struct exofs_dir_entry *de;
263 struct page *page = exofs_get_page(inode, n);
264
265 if (IS_ERR(page)) {
266 EXOFS_ERR("ERROR: "
267 "bad page in #%lu",
268 inode->i_ino);
269 filp->f_pos += PAGE_CACHE_SIZE - offset;
270 return PTR_ERR(page);
271 }
272 kaddr = page_address(page);
273 if (unlikely(need_revalidate)) {
274 if (offset) {
275 offset = exofs_validate_entry(kaddr, offset,
276 chunk_mask);
277 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
278 }
279 filp->f_version = inode->i_version;
280 need_revalidate = 0;
281 }
282 de = (struct exofs_dir_entry *)(kaddr + offset);
283 limit = kaddr + exofs_last_byte(inode, n) -
284 EXOFS_DIR_REC_LEN(1);
285 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
286 if (de->rec_len == 0) {
287 EXOFS_ERR("ERROR: "
288 "zero-length directory entry");
289 exofs_put_page(page);
290 return -EIO;
291 }
292 if (de->inode_no) {
293 int over;
294 unsigned char d_type = DT_UNKNOWN;
295
296 if (types && de->file_type < EXOFS_FT_MAX)
297 d_type = types[de->file_type];
298
299 offset = (char *)de - kaddr;
300 over = filldir(dirent, de->name, de->name_len,
301 (n<<PAGE_CACHE_SHIFT) | offset,
302 le64_to_cpu(de->inode_no),
303 d_type);
304 if (over) {
305 exofs_put_page(page);
306 return 0;
307 }
308 }
309 filp->f_pos += le16_to_cpu(de->rec_len);
310 }
311 exofs_put_page(page);
312 }
313
314 return 0;
315}
316
317struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
318 struct dentry *dentry, struct page **res_page)
319{
320 const unsigned char *name = dentry->d_name.name;
321 int namelen = dentry->d_name.len;
322 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
323 unsigned long start, n;
324 unsigned long npages = dir_pages(dir);
325 struct page *page = NULL;
326 struct exofs_i_info *oi = exofs_i(dir);
327 struct exofs_dir_entry *de;
328
329 if (npages == 0)
330 goto out;
331
332 *res_page = NULL;
333
334 start = oi->i_dir_start_lookup;
335 if (start >= npages)
336 start = 0;
337 n = start;
338 do {
339 char *kaddr;
340 page = exofs_get_page(dir, n);
341 if (!IS_ERR(page)) {
342 kaddr = page_address(page);
343 de = (struct exofs_dir_entry *) kaddr;
344 kaddr += exofs_last_byte(dir, n) - reclen;
345 while ((char *) de <= kaddr) {
346 if (de->rec_len == 0) {
347 EXOFS_ERR(
348 "ERROR: exofs_find_entry: "
349 "zero-length directory entry");
350 exofs_put_page(page);
351 goto out;
352 }
353 if (exofs_match(namelen, name, de))
354 goto found;
355 de = exofs_next_entry(de);
356 }
357 exofs_put_page(page);
358 }
359 if (++n >= npages)
360 n = 0;
361 } while (n != start);
362out:
363 return NULL;
364
365found:
366 *res_page = page;
367 oi->i_dir_start_lookup = n;
368 return de;
369}
370
371struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
372{
373 struct page *page = exofs_get_page(dir, 0);
374 struct exofs_dir_entry *de = NULL;
375
376 if (!IS_ERR(page)) {
377 de = exofs_next_entry(
378 (struct exofs_dir_entry *)page_address(page));
379 *p = page;
380 }
381 return de;
382}
383
384ino_t exofs_parent_ino(struct dentry *child)
385{
386 struct page *page;
387 struct exofs_dir_entry *de;
388 ino_t ino;
389
390 de = exofs_dotdot(child->d_inode, &page);
391 if (!de)
392 return 0;
393
394 ino = le64_to_cpu(de->inode_no);
395 exofs_put_page(page);
396 return ino;
397}
398
399ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
400{
401 ino_t res = 0;
402 struct exofs_dir_entry *de;
403 struct page *page;
404
405 de = exofs_find_entry(dir, dentry, &page);
406 if (de) {
407 res = le64_to_cpu(de->inode_no);
408 exofs_put_page(page);
409 }
410 return res;
411}
412
413int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
414 struct page *page, struct inode *inode)
415{
416 loff_t pos = page_offset(page) +
417 (char *) de - (char *) page_address(page);
418 unsigned len = le16_to_cpu(de->rec_len);
419 int err;
420
421 lock_page(page);
422 err = exofs_write_begin(NULL, page->mapping, pos, len,
423 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
424 if (err)
425 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
426 err);
427
428 de->inode_no = cpu_to_le64(inode->i_ino);
429 exofs_set_de_type(de, inode);
430 if (likely(!err))
431 err = exofs_commit_chunk(page, pos, len);
432 exofs_put_page(page);
433 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
434 mark_inode_dirty(dir);
435 return err;
436}
437
438int exofs_add_link(struct dentry *dentry, struct inode *inode)
439{
440 struct inode *dir = dentry->d_parent->d_inode;
441 const unsigned char *name = dentry->d_name.name;
442 int namelen = dentry->d_name.len;
443 unsigned chunk_size = exofs_chunk_size(dir);
444 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
445 unsigned short rec_len, name_len;
446 struct page *page = NULL;
447 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
448 struct exofs_dir_entry *de;
449 unsigned long npages = dir_pages(dir);
450 unsigned long n;
451 char *kaddr;
452 loff_t pos;
453 int err;
454
455 for (n = 0; n <= npages; n++) {
456 char *dir_end;
457
458 page = exofs_get_page(dir, n);
459 err = PTR_ERR(page);
460 if (IS_ERR(page))
461 goto out;
462 lock_page(page);
463 kaddr = page_address(page);
464 dir_end = kaddr + exofs_last_byte(dir, n);
465 de = (struct exofs_dir_entry *)kaddr;
466 kaddr += PAGE_CACHE_SIZE - reclen;
467 while ((char *)de <= kaddr) {
468 if ((char *)de == dir_end) {
469 name_len = 0;
470 rec_len = chunk_size;
471 de->rec_len = cpu_to_le16(chunk_size);
472 de->inode_no = 0;
473 goto got_it;
474 }
475 if (de->rec_len == 0) {
476 EXOFS_ERR("ERROR: exofs_add_link: "
477 "zero-length directory entry");
478 err = -EIO;
479 goto out_unlock;
480 }
481 err = -EEXIST;
482 if (exofs_match(namelen, name, de))
483 goto out_unlock;
484 name_len = EXOFS_DIR_REC_LEN(de->name_len);
485 rec_len = le16_to_cpu(de->rec_len);
486 if (!de->inode_no && rec_len >= reclen)
487 goto got_it;
488 if (rec_len >= name_len + reclen)
489 goto got_it;
490 de = (struct exofs_dir_entry *) ((char *) de + rec_len);
491 }
492 unlock_page(page);
493 exofs_put_page(page);
494 }
495
496 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
497 return -EINVAL;
498
499got_it:
500 pos = page_offset(page) +
501 (char *)de - (char *)page_address(page);
502 err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
503 &page, NULL);
504 if (err)
505 goto out_unlock;
506 if (de->inode_no) {
507 struct exofs_dir_entry *de1 =
508 (struct exofs_dir_entry *)((char *)de + name_len);
509 de1->rec_len = cpu_to_le16(rec_len - name_len);
510 de->rec_len = cpu_to_le16(name_len);
511 de = de1;
512 }
513 de->name_len = namelen;
514 memcpy(de->name, name, namelen);
515 de->inode_no = cpu_to_le64(inode->i_ino);
516 exofs_set_de_type(de, inode);
517 err = exofs_commit_chunk(page, pos, rec_len);
518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
519 mark_inode_dirty(dir);
520 sbi->s_numfiles++;
521
522out_put:
523 exofs_put_page(page);
524out:
525 return err;
526out_unlock:
527 unlock_page(page);
528 goto out_put;
529}
530
531int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
532{
533 struct address_space *mapping = page->mapping;
534 struct inode *inode = mapping->host;
535 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
536 char *kaddr = page_address(page);
537 unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
538 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
539 loff_t pos;
540 struct exofs_dir_entry *pde = NULL;
541 struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
542 int err;
543
544 while (de < dir) {
545 if (de->rec_len == 0) {
546 EXOFS_ERR("ERROR: exofs_delete_entry: "
547 "zero-length directory entry");
548 err = -EIO;
549 goto out;
550 }
551 pde = de;
552 de = exofs_next_entry(de);
553 }
554 if (pde)
555 from = (char *)pde - (char *)page_address(page);
556 pos = page_offset(page) + from;
557 lock_page(page);
558 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
559 &page, NULL);
560 if (err)
561 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
562 err);
563 if (pde)
564 pde->rec_len = cpu_to_le16(to - from);
565 dir->inode_no = 0;
566 if (likely(!err))
567 err = exofs_commit_chunk(page, pos, to - from);
568 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
569 mark_inode_dirty(inode);
570 sbi->s_numfiles--;
571out:
572 exofs_put_page(page);
573 return err;
574}
575
576/* kept aligned on 4 bytes */
577#define THIS_DIR ".\0\0"
578#define PARENT_DIR "..\0"
579
580int exofs_make_empty(struct inode *inode, struct inode *parent)
581{
582 struct address_space *mapping = inode->i_mapping;
583 struct page *page = grab_cache_page(mapping, 0);
584 unsigned chunk_size = exofs_chunk_size(inode);
585 struct exofs_dir_entry *de;
586 int err;
587 void *kaddr;
588
589 if (!page)
590 return -ENOMEM;
591
592 err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
593 &page, NULL);
594 if (err) {
595 unlock_page(page);
596 goto fail;
597 }
598
599 kaddr = kmap_atomic(page, KM_USER0);
600 de = (struct exofs_dir_entry *)kaddr;
601 de->name_len = 1;
602 de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
603 memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
604 de->inode_no = cpu_to_le64(inode->i_ino);
605 exofs_set_de_type(de, inode);
606
607 de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
608 de->name_len = 2;
609 de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
610 de->inode_no = cpu_to_le64(parent->i_ino);
611 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
612 exofs_set_de_type(de, inode);
613 kunmap_atomic(page, KM_USER0);
614 err = exofs_commit_chunk(page, 0, chunk_size);
615fail:
616 page_cache_release(page);
617 return err;
618}
619
620int exofs_empty_dir(struct inode *inode)
621{
622 struct page *page = NULL;
623 unsigned long i, npages = dir_pages(inode);
624
625 for (i = 0; i < npages; i++) {
626 char *kaddr;
627 struct exofs_dir_entry *de;
628 page = exofs_get_page(inode, i);
629
630 if (IS_ERR(page))
631 continue;
632
633 kaddr = page_address(page);
634 de = (struct exofs_dir_entry *)kaddr;
635 kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
636
637 while ((char *)de <= kaddr) {
638 if (de->rec_len == 0) {
639 EXOFS_ERR("ERROR: exofs_empty_dir: "
640 "zero-length directory entry"
641 "kaddr=%p, de=%p\n", kaddr, de);
642 goto not_empty;
643 }
644 if (de->inode_no != 0) {
645 /* check for . and .. */
646 if (de->name[0] != '.')
647 goto not_empty;
648 if (de->name_len > 2)
649 goto not_empty;
650 if (de->name_len < 2) {
651 if (le64_to_cpu(de->inode_no) !=
652 inode->i_ino)
653 goto not_empty;
654 } else if (de->name[1] != '.')
655 goto not_empty;
656 }
657 de = exofs_next_entry(de);
658 }
659 exofs_put_page(page);
660 }
661 return 1;
662
663not_empty:
664 exofs_put_page(page);
665 return 0;
666}
667
668const struct file_operations exofs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = exofs_readdir,
672};
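
The invariants exofs_check_page() enforces on each directory chunk, summarized (a reading of the checks above, not new rules):

/*
 *  - rec_len >= EXOFS_DIR_REC_LEN(1)		(minimum record size)
 *  - rec_len % 4 == 0				(alignment)
 *  - rec_len >= EXOFS_DIR_REC_LEN(name_len)	(the name fits)
 *  - no record crosses a chunk boundary
 *  - the records tile the page exactly (offs ends at limit)
 *
 * exofs_delete_entry() preserves these by folding a dead record into
 * its predecessor's rec_len instead of compacting the chunk, and
 * exofs_make_empty() seeds "." and ".." so they tile one full chunk.
 */
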
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/fs.h>
37#include <linux/time.h>
38#include "common.h"
39
40#ifndef __EXOFS_H__
41#define __EXOFS_H__
42
43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
44
45#ifdef CONFIG_EXOFS_DEBUG
46#define EXOFS_DBGMSG(fmt, a...) \
47 printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
48#else
49#define EXOFS_DBGMSG(fmt, a...) \
50 do { if (0) printk(fmt, ##a); } while (0)
51#endif
52
53/* u64 has problems with printk; this will cast it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x)
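/* Usage sketch (illustrative): print a u64 with the standard %llu/%llx
 * specifiers without compiler format warnings:
 *
 *	printk(KERN_INFO "size=0x%llx\n", _LLU(some_u64));
 */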
55
56/*
57 * our extension to the in-memory superblock
58 */
59struct exofs_sb_info {
60 struct osd_dev *s_dev; /* returned by get_osd_dev */
61 osd_id s_pid; /* partition ID of file system*/
62 int s_timeout; /* timeout for OSD operations */
63 uint64_t s_nextid; /* highest object ID used */
64 uint32_t s_numfiles; /* number of files on fs */
65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
66 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */
68 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
69};
70
71/*
72 * our extension to the in-memory inode
73 */
74struct exofs_i_info {
75 unsigned long i_flags; /* various atomic flags */
76 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
77 uint32_t i_dir_start_lookup; /* which page to start lookup */
78 wait_queue_head_t i_wq; /* wait queue for inode */
79 uint64_t i_commit_size; /* the object's written length */
80 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
81 struct inode vfs_inode; /* normal in-memory inode */
82};
83
84/*
85 * our inode flags
86 */
87#define OBJ_2BCREATED 0 /* object will be created soon*/
88#define OBJ_CREATED 1 /* object has been created on the osd*/
89
90static inline int obj_2bcreated(struct exofs_i_info *oi)
91{
92 return test_bit(OBJ_2BCREATED, &oi->i_flags);
93}
94
95static inline void set_obj_2bcreated(struct exofs_i_info *oi)
96{
97 set_bit(OBJ_2BCREATED, &oi->i_flags);
98}
99
100static inline int obj_created(struct exofs_i_info *oi)
101{
102 return test_bit(OBJ_CREATED, &oi->i_flags);
103}
104
105static inline void set_obj_created(struct exofs_i_info *oi)
106{
107 set_bit(OBJ_CREATED, &oi->i_flags);
108}
109
110int __exofs_wait_obj_created(struct exofs_i_info *oi);
111static inline int wait_obj_created(struct exofs_i_info *oi)
112{
113 if (likely(obj_created(oi)))
114 return 0;
115
116 return __exofs_wait_obj_created(oi);
117}
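/* Typical use, a sketch mirroring writepage_strip() in inode.c (a nonzero
 * return means the inode went bad while we waited):
 *
 *	ret = wait_obj_created(exofs_i(inode));
 *	if (unlikely(ret))
 *		goto fail;
 */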
118
119/*
120 * get to our inode from the vfs inode
121 */
122static inline struct exofs_i_info *exofs_i(struct inode *inode)
123{
124 return container_of(inode, struct exofs_i_info, vfs_inode);
125}
126
127/*
128 * Maximum count of links to a file
129 */
130#define EXOFS_LINK_MAX 32000
131
132/*************************
133 * function declarations *
134 *************************/
135/* inode.c */
136void exofs_truncate(struct inode *inode);
137int exofs_setattr(struct dentry *, struct iattr *);
138int exofs_write_begin(struct file *file, struct address_space *mapping,
139 loff_t pos, unsigned len, unsigned flags,
140 struct page **pagep, void **fsdata);
141extern struct inode *exofs_iget(struct super_block *, unsigned long);
142struct inode *exofs_new_inode(struct inode *, int);
143extern int exofs_write_inode(struct inode *, int);
144extern void exofs_delete_inode(struct inode *);
145
146/* dir.c: */
147int exofs_add_link(struct dentry *, struct inode *);
148ino_t exofs_inode_by_name(struct inode *, struct dentry *);
149int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
150int exofs_make_empty(struct inode *, struct inode *);
151struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
152 struct page **);
153int exofs_empty_dir(struct inode *);
154struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
155ino_t exofs_parent_ino(struct dentry *child);
156int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
157 struct inode *);
158
159/*********************
160 * operation vectors *
161 *********************/
162/* dir.c: */
163extern const struct file_operations exofs_dir_operations;
164
165/* file.c */
166extern const struct inode_operations exofs_file_inode_operations;
167extern const struct file_operations exofs_file_operations;
168
169/* inode.c */
170extern const struct address_space_operations exofs_aops;
171
172/* namei.c */
173extern const struct inode_operations exofs_dir_inode_operations;
174extern const struct inode_operations exofs_special_inode_operations;
175
176/* symlink.c */
177extern const struct inode_operations exofs_symlink_inode_operations;
178extern const struct inode_operations exofs_fast_symlink_inode_operations;
179
180#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/buffer_head.h>
37
38#include "exofs.h"
39
40static int exofs_release_file(struct inode *inode, struct file *filp)
41{
42 return 0;
43}
44
45static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
46 int datasync)
47{
48 int ret;
49 struct address_space *mapping = filp->f_mapping;
50
51 ret = filemap_write_and_wait(mapping);
52 if (ret)
53 return ret;
54
55 /* Note: file_fsync below also calls sync_blockdev, which is a no-op
56 * for exofs; other than that it does sync_inode and
57 * sync_superblock, which is what we need here.
58 */
59 return file_fsync(filp, dentry, datasync);
60}
61
62static int exofs_flush(struct file *file, fl_owner_t id)
63{
64 exofs_file_fsync(file, file->f_path.dentry, 1);
65 /* TODO: Flush the OSD target */
66 return 0;
67}
68
69const struct file_operations exofs_file_operations = {
70 .llseek = generic_file_llseek,
71 .read = do_sync_read,
72 .write = do_sync_write,
73 .aio_read = generic_file_aio_read,
74 .aio_write = generic_file_aio_write,
75 .mmap = generic_file_mmap,
76 .open = generic_file_open,
77 .release = exofs_release_file,
78 .fsync = exofs_file_fsync,
79 .flush = exofs_flush,
80 .splice_read = generic_file_splice_read,
81 .splice_write = generic_file_splice_write,
82};
83
84const struct inode_operations exofs_file_inode_operations = {
85 .truncate = exofs_truncate,
86 .setattr = exofs_setattr,
87};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/writeback.h>
37#include <linux/buffer_head.h>
38#include <scsi/scsi_device.h>
39
40#include "exofs.h"
41
42#ifdef CONFIG_EXOFS_DEBUG
43# define EXOFS_DEBUG_OBJ_ISIZE 1
44#endif
45
46struct page_collect {
47 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode;
50 unsigned expected_pages;
51
52 struct bio *bio;
53 unsigned nr_pages;
54 unsigned long length;
55 loff_t pg_first; /* keep 64bit also in 32-arches */
56};
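/* Lifecycle sketch: a page_collect is set up once by _pcol_init(), pages
 * are appended one at a time by pcol_add_page(), the collected contiguous
 * segment is submitted by read_exec()/write_exec(), and _pcol_reset()
 * readies the struct for the next segment.
 */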
57
58static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode)
60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63
64 pcol->sbi = sbi;
65 pcol->req_q = req_q;
66 pcol->inode = inode;
67 pcol->expected_pages = expected_pages;
68
69 pcol->bio = NULL;
70 pcol->nr_pages = 0;
71 pcol->length = 0;
72 pcol->pg_first = -1;
73
74 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
75 expected_pages);
76}
77
78static void _pcol_reset(struct page_collect *pcol)
79{
80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
81
82 pcol->bio = NULL;
83 pcol->nr_pages = 0;
84 pcol->length = 0;
85 pcol->pg_first = -1;
86 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
87 pcol->inode->i_ino, pcol->expected_pages);
88
89 /* this is probably the end of the loop, but for writes
90 * it might not end here. Don't be left with nothing.
91 */
92 if (!pcol->expected_pages)
93 pcol->expected_pages = 128;
94}
95
96static int pcol_try_alloc(struct page_collect *pcol)
97{
98 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
99
100 for (; pages; pages >>= 1) {
101 pcol->bio = bio_alloc(GFP_KERNEL, pages);
102 if (likely(pcol->bio))
103 return 0;
104 }
105
106 EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
107 pcol->expected_pages);
108 return -ENOMEM;
109}
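/* Design note: bio_alloc() can fail for large page counts under memory
 * pressure, so the loop above retries with half the pages each time,
 * trading one big bio for several smaller submissions.
 */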
110
111static void pcol_free(struct page_collect *pcol)
112{
113 bio_put(pcol->bio);
114 pcol->bio = NULL;
115}
116
117static int pcol_add_page(struct page_collect *pcol, struct page *page,
118 unsigned len)
119{
120 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
121 if (unlikely(len != added_len))
122 return -ENOMEM;
123
124 ++pcol->nr_pages;
125 pcol->length += len;
126 return 0;
127}
128
129static int update_read_page(struct page *page, int ret)
130{
131 if (ret == 0) {
132 /* Everything is OK */
133 SetPageUptodate(page);
134 if (PageError(page))
135 ClearPageError(page);
136 } else if (ret == -EFAULT) {
137 /* In this case we were trying to read something that wasn't on
138 * disk yet - return a page full of zeroes. This should be OK,
139 * because the object should be empty (if there was a write
140 * before this read, the read would be waiting with the page
141 * locked). */
142 clear_highpage(page);
143
144 SetPageUptodate(page);
145 if (PageError(page))
146 ClearPageError(page);
147 ret = 0; /* recovered error */
148 EXOFS_DBGMSG("recovered read error\n");
149 } else /* Error */
150 SetPageError(page);
151
152 return ret;
153}
154
155static void update_write_page(struct page *page, int ret)
156{
157 if (ret) {
158 mapping_set_error(page->mapping, ret);
159 SetPageError(page);
160 }
161 end_page_writeback(page);
162}
163
164/* Called at the end of reads, to optionally unlock pages and update their
165 * status.
166 */
167static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
168 bool do_unlock)
169{
170 struct bio_vec *bvec;
171 int i;
172 u64 resid;
173 u64 good_bytes;
174 u64 length = 0;
175 int ret = exofs_check_ok_resid(or, &resid, NULL);
176
177 osd_end_request(or);
178
179 if (likely(!ret))
180 good_bytes = pcol->length;
181 else if (!resid)
182 good_bytes = 0;
183 else
184 good_bytes = pcol->length - resid;
185
186 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
187 " length=0x%lx nr_pages=%u\n",
188 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
189 pcol->nr_pages);
190
191 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
192 struct page *page = bvec->bv_page;
193 struct inode *inode = page->mapping->host;
194 int page_stat;
195
196 if (inode != pcol->inode)
197 continue; /* osd might add more pages at end */
198
199 if (likely(length < good_bytes))
200 page_stat = 0;
201 else
202 page_stat = ret;
203
204 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
205 inode->i_ino, page->index,
206 page_stat ? "bad_bytes" : "good_bytes");
207
208 ret = update_read_page(page, page_stat);
209 if (do_unlock)
210 unlock_page(page);
211 length += bvec->bv_len;
212 }
213
214 pcol_free(pcol);
215 EXOFS_DBGMSG("readpages_done END\n");
216 return ret;
217}
218
219/* callback of async reads */
220static void readpages_done(struct osd_request *or, void *p)
221{
222 struct page_collect *pcol = p;
223
224 __readpages_done(or, pcol, true);
225 atomic_dec(&pcol->sbi->s_curr_pending);
226 kfree(p);
227}
228
229static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
230{
231 struct bio_vec *bvec;
232 int i;
233
234 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
235 struct page *page = bvec->bv_page;
236
237 if (rw == READ)
238 update_read_page(page, ret);
239 else
240 update_write_page(page, ret);
241
242 unlock_page(page);
243 }
244 pcol_free(pcol);
245}
246
247static int read_exec(struct page_collect *pcol, bool is_sync)
248{
249 struct exofs_i_info *oi = exofs_i(pcol->inode);
250 struct osd_obj_id obj = {pcol->sbi->s_pid,
251 pcol->inode->i_ino + EXOFS_OBJ_OFF};
252 struct osd_request *or = NULL;
253 struct page_collect *pcol_copy = NULL;
254 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
255 int ret;
256
257 if (!pcol->bio)
258 return 0;
259
260 /* see comment in _readpage() about sync reads */
261 WARN_ON(is_sync && (pcol->nr_pages != 1));
262
263 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
264 if (unlikely(!or)) {
265 ret = -ENOMEM;
266 goto err;
267 }
268
269 osd_req_read(or, &obj, pcol->bio, i_start);
270
271 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
273 return __readpages_done(or, pcol, false);
274 }
275
276 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
277 if (!pcol_copy) {
278 ret = -ENOMEM;
279 goto err;
280 }
281
282 *pcol_copy = *pcol;
283 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
284 if (unlikely(ret))
285 goto err;
286
287 atomic_inc(&pcol->sbi->s_curr_pending);
288
289 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
290 obj.id, _LLU(i_start), pcol->length);
291
292 /* ownership of the pages has passed to pcol_copy */
293 _pcol_reset(pcol);
294 return 0;
295
296err:
297 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ);
299 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret;
303}
304
305/* readpage_strip is called either directly from readpage() or by the VFS from
306 * within read_cache_pages(), to add one more page to be read. It will try to
307 * collect as many contiguous pages as possible. If a discontinuity is
308 * encountered, or it runs out of resources, it will submit the previous segment
309 * and will start a new collection. Eventually caller must submit the last
310 * segment if present.
311 */
312static int readpage_strip(void *data, struct page *page)
313{
314 struct page_collect *pcol = data;
315 struct inode *inode = pcol->inode;
316 struct exofs_i_info *oi = exofs_i(inode);
317 loff_t i_size = i_size_read(inode);
318 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
319 size_t len;
320 int ret;
321
322 /* FIXME: Just for debugging, will be removed */
323 if (PageUptodate(page))
324 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
325 page->index);
326
327 if (page->index < end_index)
328 len = PAGE_CACHE_SIZE;
329 else if (page->index == end_index)
330 len = i_size & ~PAGE_CACHE_MASK;
331 else
332 len = 0;
333
334 if (!len || !obj_created(oi)) {
335 /* this will be out of bounds, or doesn't exist yet.
336 * Current page is cleared and the request is split
337 */
338 clear_highpage(page);
339
340 SetPageUptodate(page);
341 if (PageError(page))
342 ClearPageError(page);
343
344 unlock_page(page);
345 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
346 " splitting\n", inode->i_ino, page->index);
347
348 return read_exec(pcol, false);
349 }
350
351try_again:
352
353 if (unlikely(pcol->pg_first == -1)) {
354 pcol->pg_first = page->index;
355 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
356 page->index)) {
357 /* Discontinuity detected, split the request */
358 ret = read_exec(pcol, false);
359 if (unlikely(ret))
360 goto fail;
361 goto try_again;
362 }
363
364 if (!pcol->bio) {
365 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret))
367 goto fail;
368 }
369
370 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len);
372
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len);
375
376 ret = pcol_add_page(pcol, page, len);
377 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length);
381
382 /* split the request, and start again with the current page */
383 ret = read_exec(pcol, false);
384 if (unlikely(ret))
385 goto fail;
386
387 goto try_again;
388 }
389
390 return 0;
391
392fail:
393 /* SetPageError(page); ??? */
394 unlock_page(page);
395 return ret;
396}
397
398static int exofs_readpages(struct file *file, struct address_space *mapping,
399 struct list_head *pages, unsigned nr_pages)
400{
401 struct page_collect pcol;
402 int ret;
403
404 _pcol_init(&pcol, nr_pages, mapping->host);
405
406 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
407 if (ret) {
408 EXOFS_ERR("read_cache_pages => %d\n", ret);
409 return ret;
410 }
411
412 return read_exec(&pcol, false);
413}
414
415static int _readpage(struct page *page, bool is_sync)
416{
417 struct page_collect pcol;
418 int ret;
419
420 _pcol_init(&pcol, 1, page->mapping->host);
421
422 /* readpage_strip might call read_exec(, async) internally at several
423 * places, but this is safe in the is_sync case since, with a single
424 * page, those internal calls find nothing collected and do nothing.
425 */
426 ret = readpage_strip(&pcol, page);
427 if (ret) {
428 EXOFS_ERR("_readpage => %d\n", ret);
429 return ret;
430 }
431
432 return read_exec(&pcol, is_sync);
433}
434
435/*
436 * We don't need the file
437 */
438static int exofs_readpage(struct file *file, struct page *page)
439{
440 return _readpage(page, false);
441}
442
443/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p)
445{
446 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i;
449 u64 resid;
450 u64 good_bytes;
451 u64 length = 0;
452
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending);
457
458 if (likely(!ret))
459 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else
463 good_bytes = pcol->length - resid;
464
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages);
469
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
471 struct page *page = bvec->bv_page;
472 struct inode *inode = page->mapping->host;
473 int page_stat;
474
475 if (inode != pcol->inode)
476 continue; /* osd might add more pages to a bio */
477
478 if (likely(length < good_bytes))
479 page_stat = 0;
480 else
481 page_stat = ret;
482
483 update_write_page(page, page_stat);
484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat);
487
488 length += bvec->bv_len;
489 }
490
491 pcol_free(pcol);
492 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n");
494}
495
496static int write_exec(struct page_collect *pcol)
497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid,
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret;
505
506 if (!pcol->bio)
507 return 0;
508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) {
518 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
519 ret = -ENOMEM;
520 goto err;
521 }
522
523 *pcol_copy = *pcol;
524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() failed\n");
529 goto err;
530 }
531
532 atomic_inc(&pcol->sbi->s_curr_pending);
533 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
534 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
535 pcol->length);
536 /* ownership of the pages has passed to pcol_copy */
537 _pcol_reset(pcol);
538 return 0;
539
540err:
541 _unlock_pcol_pages(pcol, ret, WRITE);
542 kfree(pcol_copy);
543 if (or)
544 osd_end_request(or);
545 return ret;
546}
547
548/* writepage_strip is called either directly from writepage() or by the VFS from
549 * within write_cache_pages(), to add one more page to be written to storage.
550 * It will try to collect as many contiguous pages as possible. If a
551 * discontinuity is encountered or it runs out of resources it will submit the
552 * previous segment and will start a new collection.
553 * Eventually caller must submit the last segment if present.
554 */
555static int writepage_strip(struct page *page,
556 struct writeback_control *wbc_unused, void *data)
557{
558 struct page_collect *pcol = data;
559 struct inode *inode = pcol->inode;
560 struct exofs_i_info *oi = exofs_i(inode);
561 loff_t i_size = i_size_read(inode);
562 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
563 size_t len;
564 int ret;
565
566 BUG_ON(!PageLocked(page));
567
568 ret = wait_obj_created(oi);
569 if (unlikely(ret))
570 goto fail;
571
572 if (page->index < end_index)
573 /* in this case, the page is within the limits of the file */
574 len = PAGE_CACHE_SIZE;
575 else {
576 len = i_size & ~PAGE_CACHE_MASK;
577
578 if (page->index > end_index || !len) {
579 /* in this case, the page is outside the limits
580 * (truncate in progress)
581 */
582 ret = write_exec(pcol);
583 if (unlikely(ret))
584 goto fail;
585 if (PageError(page))
586 ClearPageError(page);
587 unlock_page(page);
588 return 0;
589 }
590 }
591
592try_again:
593
594 if (unlikely(pcol->pg_first == -1)) {
595 pcol->pg_first = page->index;
596 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
597 page->index)) {
598 /* Discontinuity detected, split the request */
599 ret = write_exec(pcol);
600 if (unlikely(ret))
601 goto fail;
602 goto try_again;
603 }
604
605 if (!pcol->bio) {
606 ret = pcol_try_alloc(pcol);
607 if (unlikely(ret))
608 goto fail;
609 }
610
611 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
612 inode->i_ino, page->index, len);
613
614 ret = pcol_add_page(pcol, page, len);
615 if (unlikely(ret)) {
616 EXOFS_DBGMSG("Failed pcol_add_page "
617 "nr_pages=%u total_length=0x%lx\n",
618 pcol->nr_pages, pcol->length);
619
620 /* split the request, next loop will start again */
621 ret = write_exec(pcol);
622 if (unlikely(ret)) {
623 EXOFS_DBGMSG("write_exec failed => %d\n", ret);
624 goto fail;
625 }
626
627 goto try_again;
628 }
629
630 BUG_ON(PageWriteback(page));
631 set_page_writeback(page);
632
633 return 0;
634
635fail:
636 set_bit(AS_EIO, &page->mapping->flags);
637 unlock_page(page);
638 return ret;
639}
640
641static int exofs_writepages(struct address_space *mapping,
642 struct writeback_control *wbc)
643{
644 struct page_collect pcol;
645 long start, end, expected_pages;
646 int ret;
647
648 start = wbc->range_start >> PAGE_CACHE_SHIFT;
649 end = (wbc->range_end == LLONG_MAX) ?
650 start + mapping->nrpages :
651 wbc->range_end >> PAGE_CACHE_SHIFT;
652
653 if (start || end)
654 expected_pages = min(end - start + 1, 32L);
655 else
656 expected_pages = mapping->nrpages;
657
658 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
659 " m->nrpages=%lu start=0x%lx end=0x%lx\n",
660 mapping->host->i_ino, wbc->range_start, wbc->range_end,
661 mapping->nrpages, start, end);
662
663 _pcol_init(&pcol, expected_pages, mapping->host);
664
665 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
666 if (ret) {
667 EXOFS_ERR("write_cache_pages => %d\n", ret);
668 return ret;
669 }
670
671 return write_exec(&pcol);
672}
673
674static int exofs_writepage(struct page *page, struct writeback_control *wbc)
675{
676 struct page_collect pcol;
677 int ret;
678
679 _pcol_init(&pcol, 1, page->mapping->host);
680
681 ret = writepage_strip(page, NULL, &pcol);
682 if (ret) {
683 EXOFS_ERR("exofs_writepage => %d\n", ret);
684 return ret;
685 }
686
687 return write_exec(&pcol);
688}
689
690int exofs_write_begin(struct file *file, struct address_space *mapping,
691 loff_t pos, unsigned len, unsigned flags,
692 struct page **pagep, void **fsdata)
693{
694 int ret = 0;
695 struct page *page;
696
697 page = *pagep;
698 if (page == NULL) {
699 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
700 fsdata);
701 if (ret) {
702 EXOFS_DBGMSG("simple_write_begin failed\n");
703 return ret;
704 }
705
706 page = *pagep;
707 }
708
709 /* read modify write */
710 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
711 ret = _readpage(page, true);
712 if (ret) {
713 /* SetPageError was done by _readpage. Is that OK? */
714 unlock_page(page);
715 EXOFS_DBGMSG("_readpage failed\n");
716 }
717 }
718
719 return ret;
720}
721
722static int exofs_write_begin_export(struct file *file,
723 struct address_space *mapping,
724 loff_t pos, unsigned len, unsigned flags,
725 struct page **pagep, void **fsdata)
726{
727 *pagep = NULL;
728
729 return exofs_write_begin(file, mapping, pos, len, flags, pagep,
730 fsdata);
731}
732
733const struct address_space_operations exofs_aops = {
734 .readpage = exofs_readpage,
735 .readpages = exofs_readpages,
736 .writepage = exofs_writepage,
737 .writepages = exofs_writepages,
738 .write_begin = exofs_write_begin_export,
739 .write_end = simple_write_end,
740};
741
742/******************************************************************************
743 * INODE OPERATIONS
744 *****************************************************************************/
745
746/*
747 * Test whether an inode is a fast symlink.
748 */
749static inline int exofs_inode_is_fast_symlink(struct inode *inode)
750{
751 struct exofs_i_info *oi = exofs_i(inode);
752
753 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
754}
755
756/*
757 * get_block_t - Fill in a buffer_head
758 * An OSD takes care of block allocation so we just fake an allocation by
759 * putting in the inode's sector_t in the buffer_head.
760 * TODO: What about the case where create==0 and @iblock does not exist in the
761 * object?
762 */
763static int exofs_get_block(struct inode *inode, sector_t iblock,
764 struct buffer_head *bh_result, int create)
765{
766 map_bh(bh_result, inode->i_sb, iblock);
767 return 0;
768}
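/* Note: within exofs this get_block is only handed to nobh_truncate_page()
 * in exofs_truncate() below; regular data I/O goes through exofs_aops and
 * never consults it.
 */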
769
770const struct osd_attr g_attr_logical_length = ATTR_DEF(
771 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
772
773/*
774 * Truncate a file to the specified size - all we have to do is set the size
775 * attribute. We make sure the object exists first.
776 */
777void exofs_truncate(struct inode *inode)
778{
779 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
780 struct exofs_i_info *oi = exofs_i(inode);
781 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
782 struct osd_request *or;
783 struct osd_attr attr;
784 loff_t isize = i_size_read(inode);
785 __be64 newsize;
786 int ret;
787
788 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
789 || S_ISLNK(inode->i_mode)))
790 return;
791 if (exofs_inode_is_fast_symlink(inode))
792 return;
793 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
794 return;
795 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
796
797 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
798
799 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
800 if (unlikely(!or)) {
801 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
802 goto fail;
803 }
804
805 osd_req_set_attributes(or, &obj);
806
807 newsize = cpu_to_be64((u64)isize);
808 attr = g_attr_logical_length;
809 attr.val_ptr = &newsize;
810 osd_req_add_set_attr_list(or, &attr, 1);
811
812 /* if we are about to truncate an object, and it hasn't been
813 * created yet, wait
814 */
815 if (unlikely(wait_obj_created(oi)))
816 goto fail;
817
818 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
819 osd_end_request(or);
820 if (ret)
821 goto fail;
822
823out:
824 mark_inode_dirty(inode);
825 return;
826fail:
827 make_bad_inode(inode);
828 goto out;
829}
830
831/*
832 * Set inode attributes - just call generic functions.
833 */
834int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
835{
836 struct inode *inode = dentry->d_inode;
837 int error;
838
839 error = inode_change_ok(inode, iattr);
840 if (error)
841 return error;
842
843 error = inode_setattr(inode, iattr);
844 return error;
845}
846
847/*
848 * Read an inode from the OSD, and return it as is. We also return the size
849 * attribute in the 'sanity' argument if we were compiled with debugging turned
850 * on.
851 */
852static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
853 struct exofs_fcb *inode, uint64_t *sanity)
854{
855 struct exofs_sb_info *sbi = sb->s_fs_info;
856 struct osd_request *or;
857 struct osd_attr attr;
858 struct osd_obj_id obj = {sbi->s_pid,
859 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
860 int ret;
861
862 exofs_make_credential(oi->i_cred, &obj);
863
864 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
865 if (unlikely(!or)) {
866 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
867 return -ENOMEM;
868 }
869 osd_req_get_attributes(or, &obj);
870
871 /* we need the inode attribute */
872 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
873
874#ifdef EXOFS_DEBUG_OBJ_ISIZE
875 /* we get the size attributes to do a sanity check */
876 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
877#endif
878
879 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
880 if (ret)
881 goto out;
882
883 attr = g_attr_inode_data;
884 ret = extract_attr_from_req(or, &attr);
885 if (ret) {
886 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
887 goto out;
888 }
889
890 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
891 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
892
893#ifdef EXOFS_DEBUG_OBJ_ISIZE
894 attr = g_attr_logical_length;
895 ret = extract_attr_from_req(or, &attr);
896 if (ret) {
897 EXOFS_ERR("ERROR: extract attr from or failed\n");
898 goto out;
899 }
900 *sanity = get_unaligned_be64(attr.val_ptr);
901#endif
902
903out:
904 osd_end_request(or);
905 return ret;
906}
907
908/*
909 * Fill in an inode read from the OSD and set it up for use
910 */
911struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
912{
913 struct exofs_i_info *oi;
914 struct exofs_fcb fcb;
915 struct inode *inode;
916 uint64_t uninitialized_var(sanity);
917 int ret;
918
919 inode = iget_locked(sb, ino);
920 if (!inode)
921 return ERR_PTR(-ENOMEM);
922 if (!(inode->i_state & I_NEW))
923 return inode;
924 oi = exofs_i(inode);
925
926 /* read the inode from the osd */
927 ret = exofs_get_inode(sb, oi, &fcb, &sanity);
928 if (ret)
929 goto bad_inode;
930
931 init_waitqueue_head(&oi->i_wq);
932 set_obj_created(oi);
933
934 /* copy stuff from on-disk struct to in-memory struct */
935 inode->i_mode = le16_to_cpu(fcb.i_mode);
936 inode->i_uid = le32_to_cpu(fcb.i_uid);
937 inode->i_gid = le32_to_cpu(fcb.i_gid);
938 inode->i_nlink = le16_to_cpu(fcb.i_links_count);
939 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
940 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
941 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
942 inode->i_ctime.tv_nsec =
943 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
944 oi->i_commit_size = le64_to_cpu(fcb.i_size);
945 i_size_write(inode, oi->i_commit_size);
946 inode->i_blkbits = EXOFS_BLKSHIFT;
947 inode->i_generation = le32_to_cpu(fcb.i_generation);
948
949#ifdef EXOFS_DEBUG_OBJ_ISIZE
950 if ((inode->i_size != sanity) &&
951 (!exofs_inode_is_fast_symlink(inode))) {
952 EXOFS_ERR("WARNING: Size of object from inode and "
953 "attributes differ (%lld != %llu)\n",
954 inode->i_size, _LLU(sanity));
955 }
956#endif
957
958 oi->i_dir_start_lookup = 0;
959
960 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
961 ret = -ESTALE;
962 goto bad_inode;
963 }
964
965 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
966 if (fcb.i_data[0])
967 inode->i_rdev =
968 old_decode_dev(le32_to_cpu(fcb.i_data[0]));
969 else
970 inode->i_rdev =
971 new_decode_dev(le32_to_cpu(fcb.i_data[1]));
972 } else {
973 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
974 }
975
976 if (S_ISREG(inode->i_mode)) {
977 inode->i_op = &exofs_file_inode_operations;
978 inode->i_fop = &exofs_file_operations;
979 inode->i_mapping->a_ops = &exofs_aops;
980 } else if (S_ISDIR(inode->i_mode)) {
981 inode->i_op = &exofs_dir_inode_operations;
982 inode->i_fop = &exofs_dir_operations;
983 inode->i_mapping->a_ops = &exofs_aops;
984 } else if (S_ISLNK(inode->i_mode)) {
985 if (exofs_inode_is_fast_symlink(inode))
986 inode->i_op = &exofs_fast_symlink_inode_operations;
987 else {
988 inode->i_op = &exofs_symlink_inode_operations;
989 inode->i_mapping->a_ops = &exofs_aops;
990 }
991 } else {
992 inode->i_op = &exofs_special_inode_operations;
993 if (fcb.i_data[0])
994 init_special_inode(inode, inode->i_mode,
995 old_decode_dev(le32_to_cpu(fcb.i_data[0])));
996 else
997 init_special_inode(inode, inode->i_mode,
998 new_decode_dev(le32_to_cpu(fcb.i_data[1])));
999 }
1000
1001 unlock_new_inode(inode);
1002 return inode;
1003
1004bad_inode:
1005 iget_failed(inode);
1006 return ERR_PTR(ret);
1007}
1008
1009int __exofs_wait_obj_created(struct exofs_i_info *oi)
1010{
1011 if (!obj_created(oi)) {
1012 BUG_ON(!obj_2bcreated(oi));
1013 wait_event(oi->i_wq, obj_created(oi));
1014 }
1015 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1016}
1017/*
1018 * Callback function from exofs_new_inode(). The important thing is that we
1019 * set the obj_created flag so that other methods know that the object exists on
1020 * the OSD.
1021 */
1022static void create_done(struct osd_request *or, void *p)
1023{
1024 struct inode *inode = p;
1025 struct exofs_i_info *oi = exofs_i(inode);
1026 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1027 int ret;
1028
1029 ret = exofs_check_ok(or);
1030 osd_end_request(or);
1031 atomic_dec(&sbi->s_curr_pending);
1032
1033 if (unlikely(ret)) {
1034 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
1035 _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
1036 make_bad_inode(inode);
1037 } else
1038 set_obj_created(oi);
1039
1040 atomic_dec(&inode->i_count);
1041 wake_up(&oi->i_wq);
1042}
1043
1044/*
1045 * Set up a new inode and create an object for it on the OSD
1046 */
1047struct inode *exofs_new_inode(struct inode *dir, int mode)
1048{
1049 struct super_block *sb;
1050 struct inode *inode;
1051 struct exofs_i_info *oi;
1052 struct exofs_sb_info *sbi;
1053 struct osd_request *or;
1054 struct osd_obj_id obj;
1055 int ret;
1056
1057 sb = dir->i_sb;
1058 inode = new_inode(sb);
1059 if (!inode)
1060 return ERR_PTR(-ENOMEM);
1061
1062 oi = exofs_i(inode);
1063
1064 init_waitqueue_head(&oi->i_wq);
1065 set_obj_2bcreated(oi);
1066
1067 sbi = sb->s_fs_info;
1068
1069 sb->s_dirt = 1;
1070 inode->i_uid = current->cred->fsuid;
1071 if (dir->i_mode & S_ISGID) {
1072 inode->i_gid = dir->i_gid;
1073 if (S_ISDIR(mode))
1074 mode |= S_ISGID;
1075 } else {
1076 inode->i_gid = current->cred->fsgid;
1077 }
1078 inode->i_mode = mode;
1079
1080 inode->i_ino = sbi->s_nextid++;
1081 inode->i_blkbits = EXOFS_BLKSHIFT;
1082 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1083 oi->i_commit_size = inode->i_size = 0;
1084 spin_lock(&sbi->s_next_gen_lock);
1085 inode->i_generation = sbi->s_next_generation++;
1086 spin_unlock(&sbi->s_next_gen_lock);
1087 insert_inode_hash(inode);
1088
1089 mark_inode_dirty(inode);
1090
1091 obj.partition = sbi->s_pid;
1092 obj.id = inode->i_ino + EXOFS_OBJ_OFF;
1093 exofs_make_credential(oi->i_cred, &obj);
1094
1095 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1096 if (unlikely(!or)) {
1097 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1098 return ERR_PTR(-ENOMEM);
1099 }
1100
1101 osd_req_create_object(or, &obj);
1102
1103 /* increment the refcount so that the inode will still be around when we
1104 * reach the callback
1105 */
1106 atomic_inc(&inode->i_count);
1107
1108 ret = exofs_async_op(or, create_done, inode, oi->i_cred);
1109 if (ret) {
1110 atomic_dec(&inode->i_count);
1111 osd_end_request(or);
1112 return ERR_PTR(-EIO);
1113 }
1114 atomic_inc(&sbi->s_curr_pending);
1115
1116 return inode;
1117}
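/* Note: object creation is asynchronous. The new inode is usable right
 * away; paths that need the object to actually exist on the OSD (write-out,
 * truncate, delete) block in wait_obj_created() until create_done() marks
 * it created and wakes them.
 */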
1118
1119/*
1120 * struct to pass two arguments to update_inode's callback
1121 */
1122struct updatei_args {
1123 struct exofs_sb_info *sbi;
1124 struct exofs_fcb fcb;
1125};
1126
1127/*
1128 * Callback function from exofs_update_inode().
1129 */
1130static void updatei_done(struct osd_request *or, void *p)
1131{
1132 struct updatei_args *args = p;
1133
1134 osd_end_request(or);
1135
1136 atomic_dec(&args->sbi->s_curr_pending);
1137
1138 kfree(args);
1139}
1140
1141/*
1142 * Write the inode to the OSD. Just fill up the struct, and set the attribute
1143 * synchronously or asynchronously depending on the do_sync flag.
1144 */
1145static int exofs_update_inode(struct inode *inode, int do_sync)
1146{
1147 struct exofs_i_info *oi = exofs_i(inode);
1148 struct super_block *sb = inode->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1150 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1151 struct osd_request *or;
1152 struct osd_attr attr;
1153 struct exofs_fcb *fcb;
1154 struct updatei_args *args;
1155 int ret;
1156
1157 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args)
1159 return -ENOMEM;
1160
1161 fcb = &args->fcb;
1162
1163 fcb->i_mode = cpu_to_le16(inode->i_mode);
1164 fcb->i_uid = cpu_to_le32(inode->i_uid);
1165 fcb->i_gid = cpu_to_le32(inode->i_gid);
1166 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1167 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1168 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1169 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1170 oi->i_commit_size = i_size_read(inode);
1171 fcb->i_size = cpu_to_le64(oi->i_commit_size);
1172 fcb->i_generation = cpu_to_le32(inode->i_generation);
1173
1174 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1175 if (old_valid_dev(inode->i_rdev)) {
1176 fcb->i_data[0] =
1177 cpu_to_le32(old_encode_dev(inode->i_rdev));
1178 fcb->i_data[1] = 0;
1179 } else {
1180 fcb->i_data[0] = 0;
1181 fcb->i_data[1] =
1182 cpu_to_le32(new_encode_dev(inode->i_rdev));
1183 fcb->i_data[2] = 0;
1184 }
1185 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187
1188 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1189 if (unlikely(!or)) {
1190 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1191 ret = -ENOMEM;
1192 goto free_args;
1193 }
1194
1195 osd_req_set_attributes(or, &obj);
1196
1197 attr = g_attr_inode_data;
1198 attr.val_ptr = fcb;
1199 osd_req_add_set_attr_list(or, &attr, 1);
1200
1201 if (!obj_created(oi)) {
1202 EXOFS_DBGMSG("!obj_created\n");
1203 BUG_ON(!obj_2bcreated(oi));
1204 wait_event(oi->i_wq, obj_created(oi));
1205 EXOFS_DBGMSG("wait_event done\n");
1206 }
1207
1208 if (do_sync) {
1209 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1210 osd_end_request(or);
1211 goto free_args;
1212 } else {
1213 args->sbi = sbi;
1214
1215 ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1216 if (ret) {
1217 osd_end_request(or);
1218 goto free_args;
1219 }
1220 atomic_inc(&sbi->s_curr_pending);
1221 goto out; /* deallocation in updatei_done */
1222 }
1223
1224free_args:
1225 kfree(args);
1226out:
1227 EXOFS_DBGMSG("ret=>%d\n", ret);
1228 return ret;
1229}
1230
1231int exofs_write_inode(struct inode *inode, int wait)
1232{
1233 return exofs_update_inode(inode, wait);
1234}
1235
1236/*
1237 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1238 * do.
1239 */
1240static void delete_done(struct osd_request *or, void *p)
1241{
1242 struct exofs_sb_info *sbi;
1243 osd_end_request(or);
1244 sbi = p;
1245 atomic_dec(&sbi->s_curr_pending);
1246}
1247
1248/*
1249 * Called when the refcount of an inode reaches zero. We remove the object
1250 * from the OSD here. We make sure the object was created before we try and
1251 * delete it.
1252 */
1253void exofs_delete_inode(struct inode *inode)
1254{
1255 struct exofs_i_info *oi = exofs_i(inode);
1256 struct super_block *sb = inode->i_sb;
1257 struct exofs_sb_info *sbi = sb->s_fs_info;
1258 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1259 struct osd_request *or;
1260 int ret;
1261
1262 truncate_inode_pages(&inode->i_data, 0);
1263
1264 if (is_bad_inode(inode))
1265 goto no_delete;
1266
1267 mark_inode_dirty(inode);
1268 exofs_update_inode(inode, inode_needs_sync(inode));
1269
1270 inode->i_size = 0;
1271 if (inode->i_blocks)
1272 exofs_truncate(inode);
1273
1274 clear_inode(inode);
1275
1276 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1277 if (unlikely(!or)) {
1278 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
1279 return;
1280 }
1281
1282 osd_req_remove_object(or, &obj);
1283
1284 /* if we are deleting an obj that hasn't been created yet, wait */
1285 if (!obj_created(oi)) {
1286 BUG_ON(!obj_2bcreated(oi));
1287 wait_event(oi->i_wq, obj_created(oi));
1288 }
1289
1290 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
1291 if (ret) {
1292 EXOFS_ERR(
1293 "ERROR: @exofs_delete_inode exofs_async_op failed\n");
1294 osd_end_request(or);
1295 return;
1296 }
1297 atomic_inc(&sbi->s_curr_pending);
1298
1299 return;
1300
1301no_delete:
1302 clear_inode(inode);
1303}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
39{
40 int err = exofs_add_link(dentry, inode);
41 if (!err) {
42 d_instantiate(dentry, inode);
43 return 0;
44 }
45 inode_dec_link_count(inode);
46 iput(inode);
47 return err;
48}
49
50static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
51 struct nameidata *nd)
52{
53 struct inode *inode;
54 ino_t ino;
55
56 if (dentry->d_name.len > EXOFS_NAME_LEN)
57 return ERR_PTR(-ENAMETOOLONG);
58
59 ino = exofs_inode_by_name(dir, dentry);
60 inode = NULL;
61 if (ino) {
62 inode = exofs_iget(dir->i_sb, ino);
63 if (IS_ERR(inode))
64 return ERR_CAST(inode);
65 }
66 return d_splice_alias(inode, dentry);
67}
68
69static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
70 struct nameidata *nd)
71{
72 struct inode *inode = exofs_new_inode(dir, mode);
73 int err = PTR_ERR(inode);
74 if (!IS_ERR(inode)) {
75 inode->i_op = &exofs_file_inode_operations;
76 inode->i_fop = &exofs_file_operations;
77 inode->i_mapping->a_ops = &exofs_aops;
78 mark_inode_dirty(inode);
79 err = exofs_add_nondir(dentry, inode);
80 }
81 return err;
82}
83
84static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
85 dev_t rdev)
86{
87 struct inode *inode;
88 int err;
89
90 if (!new_valid_dev(rdev))
91 return -EINVAL;
92
93 inode = exofs_new_inode(dir, mode);
94 err = PTR_ERR(inode);
95 if (!IS_ERR(inode)) {
96 init_special_inode(inode, inode->i_mode, rdev);
97 mark_inode_dirty(inode);
98 err = exofs_add_nondir(dentry, inode);
99 }
100 return err;
101}
102
103static int exofs_symlink(struct inode *dir, struct dentry *dentry,
104 const char *symname)
105{
106 struct super_block *sb = dir->i_sb;
107 int err = -ENAMETOOLONG;
108 unsigned l = strlen(symname)+1;
109 struct inode *inode;
110 struct exofs_i_info *oi;
111
112 if (l > sb->s_blocksize)
113 goto out;
114
115 inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
116 err = PTR_ERR(inode);
117 if (IS_ERR(inode))
118 goto out;
119
120 oi = exofs_i(inode);
121 if (l > sizeof(oi->i_data)) {
122 /* slow symlink */
123 inode->i_op = &exofs_symlink_inode_operations;
124 inode->i_mapping->a_ops = &exofs_aops;
125 memset(oi->i_data, 0, sizeof(oi->i_data));
126
127 err = page_symlink(inode, symname, l);
128 if (err)
129 goto out_fail;
130 } else {
131 /* fast symlink */
132 inode->i_op = &exofs_fast_symlink_inode_operations;
133 memcpy(oi->i_data, symname, l);
134 inode->i_size = l-1;
135 }
136 mark_inode_dirty(inode);
137
138 err = exofs_add_nondir(dentry, inode);
139out:
140 return err;
141
142out_fail:
143 inode_dec_link_count(inode);
144 iput(inode);
145 goto out;
146}
147
148static int exofs_link(struct dentry *old_dentry, struct inode *dir,
149 struct dentry *dentry)
150{
151 struct inode *inode = old_dentry->d_inode;
152
153 if (inode->i_nlink >= EXOFS_LINK_MAX)
154 return -EMLINK;
155
156 inode->i_ctime = CURRENT_TIME;
157 inode_inc_link_count(inode);
158 atomic_inc(&inode->i_count);
159
160 return exofs_add_nondir(dentry, inode);
161}
162
163static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
164{
165 struct inode *inode;
166 int err = -EMLINK;
167
168 if (dir->i_nlink >= EXOFS_LINK_MAX)
169 goto out;
170
171 inode_inc_link_count(dir);
172
173 inode = exofs_new_inode(dir, S_IFDIR | mode);
174 err = PTR_ERR(inode);
175 if (IS_ERR(inode))
176 goto out_dir;
177
178 inode->i_op = &exofs_dir_inode_operations;
179 inode->i_fop = &exofs_dir_operations;
180 inode->i_mapping->a_ops = &exofs_aops;
181
182 inode_inc_link_count(inode);
183
184 err = exofs_make_empty(inode, dir);
185 if (err)
186 goto out_fail;
187
188 err = exofs_add_link(dentry, inode);
189 if (err)
190 goto out_fail;
191
192 d_instantiate(dentry, inode);
193out:
194 return err;
195
196out_fail:
197 inode_dec_link_count(inode);
198 inode_dec_link_count(inode);
199 iput(inode);
200out_dir:
201 inode_dec_link_count(dir);
202 goto out;
203}
204
205static int exofs_unlink(struct inode *dir, struct dentry *dentry)
206{
207 struct inode *inode = dentry->d_inode;
208 struct exofs_dir_entry *de;
209 struct page *page;
210 int err = -ENOENT;
211
212 de = exofs_find_entry(dir, dentry, &page);
213 if (!de)
214 goto out;
215
216 err = exofs_delete_entry(de, page);
217 if (err)
218 goto out;
219
220 inode->i_ctime = dir->i_ctime;
221 inode_dec_link_count(inode);
222 err = 0;
223out:
224 return err;
225}
226
227static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
228{
229 struct inode *inode = dentry->d_inode;
230 int err = -ENOTEMPTY;
231
232 if (exofs_empty_dir(inode)) {
233 err = exofs_unlink(dir, dentry);
234 if (!err) {
235 inode->i_size = 0;
236 inode_dec_link_count(inode);
237 inode_dec_link_count(dir);
238 }
239 }
240 return err;
241}
242
243static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
244 struct inode *new_dir, struct dentry *new_dentry)
245{
246 struct inode *old_inode = old_dentry->d_inode;
247 struct inode *new_inode = new_dentry->d_inode;
248 struct page *dir_page = NULL;
249 struct exofs_dir_entry *dir_de = NULL;
250 struct page *old_page;
251 struct exofs_dir_entry *old_de;
252 int err = -ENOENT;
253
254 old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
255 if (!old_de)
256 goto out;
257
258 if (S_ISDIR(old_inode->i_mode)) {
259 err = -EIO;
260 dir_de = exofs_dotdot(old_inode, &dir_page);
261 if (!dir_de)
262 goto out_old;
263 }
264
265 if (new_inode) {
266 struct page *new_page;
267 struct exofs_dir_entry *new_de;
268
269 err = -ENOTEMPTY;
270 if (dir_de && !exofs_empty_dir(new_inode))
271 goto out_dir;
272
273 err = -ENOENT;
274 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
275 if (!new_de)
276 goto out_dir;
277 inode_inc_link_count(old_inode);
278 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
279 new_inode->i_ctime = CURRENT_TIME;
280 if (dir_de)
281 drop_nlink(new_inode);
282 inode_dec_link_count(new_inode);
283 if (err)
284 goto out_dir;
285 } else {
286 if (dir_de) {
287 err = -EMLINK;
288 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
289 goto out_dir;
290 }
291 inode_inc_link_count(old_inode);
292 err = exofs_add_link(new_dentry, old_inode);
293 if (err) {
294 inode_dec_link_count(old_inode);
295 goto out_dir;
296 }
297 if (dir_de)
298 inode_inc_link_count(new_dir);
299 }
300
301 old_inode->i_ctime = CURRENT_TIME;
302
303 exofs_delete_entry(old_de, old_page);
304 inode_dec_link_count(old_inode);
305
306 if (dir_de) {
307 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
308 inode_dec_link_count(old_dir);
309 if (err)
310 goto out_dir;
311 }
312 return 0;
313
314
315out_dir:
316 if (dir_de) {
317 kunmap(dir_page);
318 page_cache_release(dir_page);
319 }
320out_old:
321 kunmap(old_page);
322 page_cache_release(old_page);
323out:
324 return err;
325}
326
327const struct inode_operations exofs_dir_inode_operations = {
328 .create = exofs_create,
329 .lookup = exofs_lookup,
330 .link = exofs_link,
331 .unlink = exofs_unlink,
332 .symlink = exofs_symlink,
333 .mkdir = exofs_mkdir,
334 .rmdir = exofs_rmdir,
335 .mknod = exofs_mknod,
336 .rename = exofs_rename,
337 .setattr = exofs_setattr,
338};
339
340const struct inode_operations exofs_special_inode_operations = {
341 .setattr = exofs_setattr,
342};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * This file is part of exofs.
10 *
11 * exofs is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation. Since it is based on ext2, and the only
14 * valid version of GPL for the Linux kernel is version 2, the only valid
15 * version of GPL for exofs is version 2.
16 *
17 * exofs is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with exofs; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <scsi/scsi_device.h>
28#include <scsi/osd_sense.h>
29
30#include "exofs.h"
31
32int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
33{
34 struct osd_sense_info osi;
35 int ret = osd_req_decode_sense(or, &osi);
36
37 if (ret) { /* translate to Linux codes */
38 if (osi.additional_code == scsi_invalid_field_in_cdb) {
39 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
40 ret = -EFAULT;
41 else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
42 ret = -ENOENT;
43 else
44 ret = -EINVAL;
45 } else if (osi.additional_code == osd_quota_error)
46 ret = -ENOSPC;
47 else
48 ret = -EIO;
49 }
50
51 /* FIXME: should be included in osd_sense_info */
52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0;
54
55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0;
57
58 return ret;
59}
60
61void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
62{
63 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
64}
65
66/*
67 * Perform a synchronous OSD operation.
68 */
69int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
70{
71 int ret;
72
73 or->timeout = timeout;
74 ret = osd_finalize_request(or, 0, credential, NULL);
75 if (ret) {
76 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
77 return ret;
78 }
79
80 ret = osd_execute_request(or);
81
82 if (ret)
83 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
84 /* osd_req_decode_sense(or, ret); */
85 return ret;
86}
87
88/*
89 * Perform an asynchronous OSD operation.
90 */
91int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
92 void *caller_context, u8 *cred)
93{
94 int ret;
95
96 ret = osd_finalize_request(or, 0, cred, NULL);
97 if (ret) {
98 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
99 return ret;
100 }
101
102 ret = osd_execute_request_async(or, async_done, caller_context);
103
104 if (ret)
105 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
106 return ret;
107}
108
109int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
110{
111 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
112 void *iter = NULL;
113 int nelem;
114
115 do {
116 nelem = 1;
117 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
118 if ((cur_attr.attr_page == attr->attr_page) &&
119 (cur_attr.attr_id == attr->attr_id)) {
120 attr->len = cur_attr.len;
121 attr->val_ptr = cur_attr.val_ptr;
122 return 0;
123 }
124 } while (iter);
125
126 return -EIO;
127}
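/* Usage sketch, as in exofs_get_inode() in inode.c: request an attribute,
 * execute the command, then locate the returned value in the request:
 *
 *	attr = g_attr_inode_data;
 *	ret = extract_attr_from_req(or, &attr);
 *	if (!ret)
 *		memcpy(&fcb, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
 */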
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
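/* Hypothetical usage sketch (names illustrative, error handling elided):
 * synchronously write a small kernel buffer to an object:
 *
 *	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
 *	osd_req_write_kern(or, &obj, offset, buf, len);
 *	ret = exofs_sync_op(or, sbi->s_timeout, cred);
 *	osd_end_request(or);
 */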
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/string.h>
37#include <linux/parser.h>
38#include <linux/vfs.h>
39#include <linux/random.h>
40#include <linux/exportfs.h>
41
42#include "exofs.h"
43
44/******************************************************************************
45 * MOUNT OPTIONS
46 *****************************************************************************/
47
48/*
49 * struct to hold what we get from mount options
50 */
51struct exofs_mountopt {
52 const char *dev_name;
53 uint64_t pid;
54 int timeout;
55};
56
57/*
58 * exofs-specific mount-time options.
59 */
60enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
61
62/*
63 * Our mount-time option match table. The pid should ideally be parsed as a
64 * 64-bit unsigned, but match_int() cannot handle that, so Opt_pid is copied
65 * out with match_strlcpy() and converted with simple_strtoull() below.
66 */
67static match_table_t tokens = {
68 {Opt_pid, "pid=%u"},
69 {Opt_to, "to=%u"},
70 {Opt_err, NULL}
71};
72
73/*
74 * The main option parsing method. Also makes sure that all of the mandatory
75 * mount options were set.
76 */
77static int parse_options(char *options, struct exofs_mountopt *opts)
78{
79 char *p;
80 substring_t args[MAX_OPT_ARGS];
81 int option;
82 bool s_pid = false;
83
84 EXOFS_DBGMSG("parse_options %s\n", options);
85 /* defaults */
86 memset(opts, 0, sizeof(*opts));
87 opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 char str[32];
92
93 if (!*p)
94 continue;
95
96 token = match_token(p, tokens, args);
97 switch (token) {
98 case Opt_pid:
99 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
100 return -EINVAL;
101 opts->pid = simple_strtoull(str, NULL, 0);
102 if (opts->pid < EXOFS_MIN_PID) {
103				EXOFS_ERR("Partition ID must be >= %u\n",
104 EXOFS_MIN_PID);
105 return -EINVAL;
106 }
107			s_pid = true;
108 break;
109 case Opt_to:
110 if (match_int(&args[0], &option))
111 return -EINVAL;
112 if (option <= 0) {
113				EXOFS_ERR("Timeout must be > 0\n");
114 return -EINVAL;
115 }
116 opts->timeout = option * HZ;
117 break;
118 }
119 }
120
121 if (!s_pid) {
122 EXOFS_ERR("Need to specify the following options:\n");
123 EXOFS_ERR(" -o pid=pid_no_to_use\n");
124 return -EINVAL;
125 }
126
127 return 0;
128}
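
As a worked example, mounting with something like "mount -t exofs -o pid=65600,to=60 /dev/osd0 /mnt" hands parse_options() the string "pid=65600,to=60". The numbers are invented for illustration; the pid merely has to be at or above EXOFS_MIN_PID. A sketch of the result:

	struct exofs_mountopt opts;
	char opt_str[] = "pid=65600,to=60";

	if (parse_options(opt_str, &opts) == 0) {
		/* opts.pid     == 65600     (checked against EXOFS_MIN_PID) */
		/* opts.timeout == 60 * HZ   (seconds converted to jiffies)  */
	}

opts.dev_name is not touched here; exofs_get_sb() fills it in afterwards.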
129
130/******************************************************************************
131 * INODE CACHE
132 *****************************************************************************/
133
134/*
135 * Our inode cache. Isn't it pretty?
136 */
137static struct kmem_cache *exofs_inode_cachep;
138
139/*
140 * Allocate an inode in the cache
141 */
142static struct inode *exofs_alloc_inode(struct super_block *sb)
143{
144 struct exofs_i_info *oi;
145
146 oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
147 if (!oi)
148 return NULL;
149
150 oi->vfs_inode.i_version = 1;
151 return &oi->vfs_inode;
152}
153
154/*
155 * Remove an inode from the cache
156 */
157static void exofs_destroy_inode(struct inode *inode)
158{
159 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
160}
161
162/*
163 * Initialize the inode
164 */
165static void exofs_init_once(void *foo)
166{
167 struct exofs_i_info *oi = foo;
168
169 inode_init_once(&oi->vfs_inode);
170}
171
172/*
173 * Create and initialize the inode cache
174 */
175static int init_inodecache(void)
176{
177 exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
178 sizeof(struct exofs_i_info), 0,
179 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
180 exofs_init_once);
181 if (exofs_inode_cachep == NULL)
182 return -ENOMEM;
183 return 0;
184}
185
186/*
187 * Destroy the inode cache
188 */
189static void destroy_inodecache(void)
190{
191 kmem_cache_destroy(exofs_inode_cachep);
192}
193
194/******************************************************************************
195 * SUPERBLOCK FUNCTIONS
196 *****************************************************************************/
197static const struct super_operations exofs_sops;
198static const struct export_operations exofs_export_ops;
199
200/*
201 * Write the superblock to the OSD
202 */
203static void exofs_write_super(struct super_block *sb)
204{
205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb;
207 struct osd_request *or;
208 struct osd_obj_id obj;
209 int ret;
210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return;
215 }
216
217 lock_kernel();
218 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0;
223
224 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
225 if (unlikely(!or)) {
226 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
227 goto out;
228 }
229
230 obj.partition = sbi->s_pid;
231 obj.id = EXOFS_SUPER_ID;
232 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
233 if (unlikely(ret)) {
234 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
235 goto out;
236 }
237
238 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
239 if (unlikely(ret)) {
240 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
241 goto out;
242 }
243 sb->s_dirt = 0;
244
245out:
246 if (or)
247 osd_end_request(or);
248 unlock_kernel();
249 kfree(fscb);
250}
251
252/*
253 * This function is called when the vfs is freeing the superblock. We just
254 * need to free our own part.
255 */
256static void exofs_put_super(struct super_block *sb)
257{
258 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info;
260
261	/* poll (100ms at a time) until all pending commands drain */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) {
264 wait_queue_head_t wq;
265 init_waitqueue_head(&wq);
266 wait_event_timeout(wq,
267 (atomic_read(&sbi->s_curr_pending) == 0),
268 msecs_to_jiffies(100));
269 }
270
271 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL;
274}
275
276/*
277 * Read the superblock from the OSD and fill in the fields
278 */
279static int exofs_fill_super(struct super_block *sb, void *data, int silent)
280{
281 struct inode *root;
282 struct exofs_mountopt *opts = data;
283	struct exofs_sb_info *sbi;	/* extended info */
284	struct exofs_fscb fscb;		/* on-disk superblock info */
285 struct osd_request *or = NULL;
286 struct osd_obj_id obj;
287 int ret;
288
289 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
290 if (!sbi)
291 return -ENOMEM;
292 sb->s_fs_info = sbi;
293
294 /* use mount options to fill superblock */
295 sbi->s_dev = osduld_path_lookup(opts->dev_name);
296 if (IS_ERR(sbi->s_dev)) {
297 ret = PTR_ERR(sbi->s_dev);
298 sbi->s_dev = NULL;
299 goto free_sbi;
300 }
301
302 sbi->s_pid = opts->pid;
303 sbi->s_timeout = opts->timeout;
304
305 /* fill in some other data by hand */
306 memset(sb->s_id, 0, sizeof(sb->s_id));
307 strcpy(sb->s_id, "exofs");
308 sb->s_blocksize = EXOFS_BLKSIZE;
309 sb->s_blocksize_bits = EXOFS_BLKSHIFT;
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 atomic_set(&sbi->s_curr_pending, 0);
312 sb->s_bdev = NULL;
313 sb->s_dev = 0;
314
315 /* read data from on-disk superblock object */
316 obj.partition = sbi->s_pid;
317 obj.id = EXOFS_SUPER_ID;
318 exofs_make_credential(sbi->s_cred, &obj);
319
320 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
321 if (unlikely(!or)) {
322 if (!silent)
323 EXOFS_ERR(
324 "exofs_fill_super: osd_start_request failed.\n");
325 ret = -ENOMEM;
326 goto free_sbi;
327 }
328 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
329 if (unlikely(ret)) {
330 if (!silent)
331 EXOFS_ERR(
332 "exofs_fill_super: osd_req_read_kern failed.\n");
333 ret = -ENOMEM;
334 goto free_sbi;
335 }
336
337 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
338 if (unlikely(ret)) {
339 if (!silent)
340 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
341 ret = -EIO;
342 goto free_sbi;
343 }
344
345 sb->s_magic = le16_to_cpu(fscb.s_magic);
346 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
347 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
348
349 /* make sure what we read from the object store is correct */
350 if (sb->s_magic != EXOFS_SUPER_MAGIC) {
351 if (!silent)
352 EXOFS_ERR("ERROR: Bad magic value\n");
353 ret = -EINVAL;
354 goto free_sbi;
355 }
356
357 /* start generation numbers from a random point */
358 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
359 spin_lock_init(&sbi->s_next_gen_lock);
360
361 /* set up operation vectors */
362 sb->s_op = &exofs_sops;
363 sb->s_export_op = &exofs_export_ops;
364 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
365 if (IS_ERR(root)) {
366 EXOFS_ERR("ERROR: exofs_iget failed\n");
367 ret = PTR_ERR(root);
368 goto free_sbi;
369 }
370 sb->s_root = d_alloc_root(root);
371 if (!sb->s_root) {
372 iput(root);
373 EXOFS_ERR("ERROR: get root inode failed\n");
374 ret = -ENOMEM;
375 goto free_sbi;
376 }
377
378 if (!S_ISDIR(root->i_mode)) {
379 dput(sb->s_root);
380 sb->s_root = NULL;
381 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
382 root->i_mode);
383 ret = -EINVAL;
384 goto free_sbi;
385 }
386
387 ret = 0;
388out:
389 if (or)
390 osd_end_request(or);
391 return ret;
392
393free_sbi:
394 osduld_put_device(sbi->s_dev); /* NULL safe */
395 kfree(sbi);
396 goto out;
397}
398
399/*
400 * Set up the superblock (calls exofs_fill_super eventually)
401 */
402static int exofs_get_sb(struct file_system_type *type,
403 int flags, const char *dev_name,
404 void *data, struct vfsmount *mnt)
405{
406 struct exofs_mountopt opts;
407 int ret;
408
409 ret = parse_options(data, &opts);
410 if (ret)
411 return ret;
412
413 opts.dev_name = dev_name;
414 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
415}
416
417/*
418 * Return information about the file system state in the buffer. This is used
419 * by the 'df' command, for example.
420 */
421static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
422{
423 struct super_block *sb = dentry->d_sb;
424 struct exofs_sb_info *sbi = sb->s_fs_info;
425 struct osd_obj_id obj = {sbi->s_pid, 0};
426 struct osd_attr attrs[] = {
427 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
428 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
429 ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
430 OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
431 };
432 uint64_t capacity = ULLONG_MAX;
433 uint64_t used = ULLONG_MAX;
434 struct osd_request *or;
435 uint8_t cred_a[OSD_CAP_LEN];
436 int ret;
437
438 /* get used/capacity attributes */
439 exofs_make_credential(cred_a, &obj);
440
441 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
442 if (unlikely(!or)) {
443 EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
444 return -ENOMEM;
445 }
446
447 osd_req_get_attributes(or, &obj);
448 osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
449 ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
450 if (unlikely(ret))
451 goto out;
452
453 ret = extract_attr_from_req(or, &attrs[0]);
454 if (likely(!ret))
455 capacity = get_unaligned_be64(attrs[0].val_ptr);
456 else
457 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
458
459 ret = extract_attr_from_req(or, &attrs[1]);
460 if (likely(!ret))
461 used = get_unaligned_be64(attrs[1].val_ptr);
462 else
463 EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
464
465 /* fill in the stats buffer */
466 buf->f_type = EXOFS_SUPER_MAGIC;
467 buf->f_bsize = EXOFS_BLKSIZE;
468 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
469 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
470 buf->f_bavail = buf->f_bfree;
471 buf->f_files = sbi->s_numfiles;
472 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
473 buf->f_namelen = EXOFS_NAME_LEN;
474
475out:
476 osd_end_request(or);
477 return ret;
478}
479
480static const struct super_operations exofs_sops = {
481 .alloc_inode = exofs_alloc_inode,
482 .destroy_inode = exofs_destroy_inode,
483 .write_inode = exofs_write_inode,
484 .delete_inode = exofs_delete_inode,
485 .put_super = exofs_put_super,
486 .write_super = exofs_write_super,
487 .statfs = exofs_statfs,
488};
489
490/******************************************************************************
491 * EXPORT OPERATIONS
492 *****************************************************************************/
493
494struct dentry *exofs_get_parent(struct dentry *child)
495{
496 unsigned long ino = exofs_parent_ino(child);
497
498 if (!ino)
499 return NULL;
500
501 return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
502}
503
504static struct inode *exofs_nfs_get_inode(struct super_block *sb,
505 u64 ino, u32 generation)
506{
507 struct inode *inode;
508
509 inode = exofs_iget(sb, ino);
510 if (IS_ERR(inode))
511 return ERR_CAST(inode);
512 if (generation && inode->i_generation != generation) {
513		/* we didn't find the right inode */
514 iput(inode);
515 return ERR_PTR(-ESTALE);
516 }
517 return inode;
518}
519
520static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
521 struct fid *fid, int fh_len, int fh_type)
522{
523 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
524 exofs_nfs_get_inode);
525}
526
527static struct dentry *exofs_fh_to_parent(struct super_block *sb,
528 struct fid *fid, int fh_len, int fh_type)
529{
530 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
531 exofs_nfs_get_inode);
532}
533
534static const struct export_operations exofs_export_ops = {
535 .fh_to_dentry = exofs_fh_to_dentry,
536 .fh_to_parent = exofs_fh_to_parent,
537 .get_parent = exofs_get_parent,
538};
539
540/******************************************************************************
541 * INSMOD/RMMOD
542 *****************************************************************************/
543
544/*
545 * struct that describes this file system
546 */
547static struct file_system_type exofs_type = {
548 .owner = THIS_MODULE,
549 .name = "exofs",
550 .get_sb = exofs_get_sb,
551 .kill_sb = generic_shutdown_super,
552};
553
554static int __init init_exofs(void)
555{
556 int err;
557
558 err = init_inodecache();
559 if (err)
560 goto out;
561
562 err = register_filesystem(&exofs_type);
563 if (err)
564 goto out_d;
565
566 return 0;
567out_d:
568 destroy_inodecache();
569out:
570 return err;
571}
572
573static void __exit exit_exofs(void)
574{
575 unregister_filesystem(&exofs_type);
576 destroy_inodecache();
577}
578
579MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
580MODULE_DESCRIPTION("exofs");
581MODULE_LICENSE("GPL");
582
583module_init(init_exofs)
584module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/namei.h>
37
38#include "exofs.h"
39
40static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
41{
42 struct exofs_i_info *oi = exofs_i(dentry->d_inode);
43
44 nd_set_link(nd, (char *)oi->i_data);
45 return NULL;
46}
47
48const struct inode_operations exofs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52};
53
54const struct inode_operations exofs_fast_symlink_inode_operations = {
55 .readlink = generic_readlink,
56 .follow_link = exofs_follow_link,
57};
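
exofs thus keeps two symlink operation tables: exofs_follow_link() above resolves "fast" symlinks whose target is stored inline in the inode's i_data, while longer targets go through the generic page-cache helpers. A hedged sketch of how an inode-setup path picks between them; the predicate name is an assumption for illustration, the real selection lives in inode.c, outside this file:

	if (exofs_inode_is_fast_symlink(inode))	/* target inline in i_data? */
		inode->i_op = &exofs_fast_symlink_inode_operations;
	else
		inode->i_op = &exofs_symlink_inode_operations;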
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
318 return PTR_ERR(acl); 318 return PTR_ERR(acl);
319 } 319 }
320 if (!acl) 320 if (!acl)
321 inode->i_mode &= ~current->fs->umask; 321 inode->i_mode &= ~current_umask();
322 } 322 }
323 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 323 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
324 struct posix_acl *clone; 324 struct posix_acl *clone;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
323 return PTR_ERR(acl); 323 return PTR_ERR(acl);
324 } 324 }
325 if (!acl) 325 if (!acl)
326 inode->i_mode &= ~current->fs->umask; 326 inode->i_mode &= ~current_umask();
327 } 327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone; 329 struct posix_acl *clone;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
43 .read = generic_read_dir, 43 .read = generic_read_dir,
44 .readdir = ext3_readdir, /* we take BKL. needed?*/ 44 .readdir = ext3_readdir, /* we take BKL. needed?*/
45 .ioctl = ext3_ioctl, /* BKL held */ 45 .unlocked_ioctl = ext3_ioctl,
46#ifdef CONFIG_COMPAT 46#ifdef CONFIG_COMPAT
47 .compat_ioctl = ext3_compat_ioctl, 47 .compat_ioctl = ext3_compat_ioctl,
48#endif 48#endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
33 */ 33 */
34static int ext3_release_file (struct inode * inode, struct file * filp) 34static int ext3_release_file (struct inode * inode, struct file * filp)
35{ 35{
36 if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
37 filemap_flush(inode->i_mapping);
38 EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
39 }
36 /* if we are the last writer on the inode, drop the block reservation */ 40 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 41 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 42 (atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
112 .write = do_sync_write, 116 .write = do_sync_write,
113 .aio_read = generic_file_aio_read, 117 .aio_read = generic_file_aio_read,
114 .aio_write = ext3_file_write, 118 .aio_write = ext3_file_write,
115 .ioctl = ext3_ioctl, 119 .unlocked_ioctl = ext3_ioctl,
116#ifdef CONFIG_COMPAT 120#ifdef CONFIG_COMPAT
117 .compat_ioctl = ext3_compat_ioctl, 121 .compat_ioctl = ext3_compat_ioctl,
118#endif 122#endif
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4a09ff169870..466a332e0bd1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1149 struct page **pagep, void **fsdata) 1149 struct page **pagep, void **fsdata)
1150{ 1150{
1151 struct inode *inode = mapping->host; 1151 struct inode *inode = mapping->host;
1152 int ret, needed_blocks = ext3_writepage_trans_blocks(inode); 1152 int ret;
1153 handle_t *handle; 1153 handle_t *handle;
1154 int retries = 0; 1154 int retries = 0;
1155 struct page *page; 1155 struct page *page;
1156 pgoff_t index; 1156 pgoff_t index;
1157 unsigned from, to; 1157 unsigned from, to;
1158 /* Reserve one block more for addition to orphan list in case
1159 * we allocate blocks but write fails for some reason */
1160 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1158 1161
1159 index = pos >> PAGE_CACHE_SHIFT; 1162 index = pos >> PAGE_CACHE_SHIFT;
1160 from = pos & (PAGE_CACHE_SIZE - 1); 1163 from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
1184 } 1187 }
1185write_begin_failed: 1188write_begin_failed:
1186 if (ret) { 1189 if (ret) {
1187 ext3_journal_stop(handle);
1188 unlock_page(page);
1189 page_cache_release(page);
1190 /* 1190 /*
1191 * block_write_begin may have instantiated a few blocks 1191 * block_write_begin may have instantiated a few blocks
1192 * outside i_size. Trim these off again. Don't need 1192 * outside i_size. Trim these off again. Don't need
1193 * i_size_read because we hold i_mutex. 1193 * i_size_read because we hold i_mutex.
1194 *
1195 * Add inode to orphan list in case we crash before truncate
1196 * finishes.
1194 */ 1197 */
1195 if (pos + len > inode->i_size) 1198 if (pos + len > inode->i_size)
1199 ext3_orphan_add(handle, inode);
1200 ext3_journal_stop(handle);
1201 unlock_page(page);
1202 page_cache_release(page);
1203 if (pos + len > inode->i_size)
1196 vmtruncate(inode, inode->i_size); 1204 vmtruncate(inode, inode->i_size);
1197 } 1205 }
1198 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1206 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1211 return err; 1219 return err;
1212} 1220}
1213 1221
1222/* For ordered writepage and write_end functions */
1223static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1224{
1225 /*
1226 * Write could have mapped the buffer but it didn't copy the data in
1227 * yet. So avoid filing such buffer into a transaction.
1228 */
1229 if (buffer_mapped(bh) && buffer_uptodate(bh))
1230 return ext3_journal_dirty_data(handle, bh);
1231 return 0;
1232}
1233
1214/* For write_end() in data=journal mode */ 1234/* For write_end() in data=journal mode */
1215static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1235static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1216{ 1236{
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1221} 1241}
1222 1242
1223/* 1243/*
1224 * Generic write_end handler for ordered and writeback ext3 journal modes. 1244 * This is nasty and subtle: ext3_write_begin() could have allocated blocks
1225 * We can't use generic_write_end, because that unlocks the page and we need to 1245 * for the whole page but later we failed to copy the data in. Update inode
1226 * unlock the page after ext3_journal_stop, but ext3_journal_stop must run 1246 * size according to what we managed to copy. The rest is going to be
1227 * after block_write_end. 1247 * truncated in write_end function.
1228 */ 1248 */
1229static int ext3_generic_write_end(struct file *file, 1249static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
1230 struct address_space *mapping,
1231 loff_t pos, unsigned len, unsigned copied,
1232 struct page *page, void *fsdata)
1233{ 1250{
1234 struct inode *inode = file->f_mapping->host; 1251 /* What matters to us is i_disksize. We don't write i_size anywhere */
1235 1252 if (pos + copied > inode->i_size)
1236 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1253 i_size_write(inode, pos + copied);
1237 1254 if (pos + copied > EXT3_I(inode)->i_disksize) {
1238 if (pos+copied > inode->i_size) { 1255 EXT3_I(inode)->i_disksize = pos + copied;
1239 i_size_write(inode, pos+copied);
1240 mark_inode_dirty(inode); 1256 mark_inode_dirty(inode);
1241 } 1257 }
1242
1243 return copied;
1244} 1258}
1245 1259
1246/* 1260/*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
1260 unsigned from, to; 1274 unsigned from, to;
1261 int ret = 0, ret2; 1275 int ret = 0, ret2;
1262 1276
1263 from = pos & (PAGE_CACHE_SIZE - 1); 1277 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1264 to = from + len;
1265 1278
1279 from = pos & (PAGE_CACHE_SIZE - 1);
1280 to = from + copied;
1266 ret = walk_page_buffers(handle, page_buffers(page), 1281 ret = walk_page_buffers(handle, page_buffers(page),
1267 from, to, NULL, ext3_journal_dirty_data); 1282 from, to, NULL, journal_dirty_data_fn);
1268 1283
1269 if (ret == 0) { 1284 if (ret == 0)
1270 /* 1285 update_file_sizes(inode, pos, copied);
1271 * generic_write_end() will run mark_inode_dirty() if i_size 1286 /*
1272 * changes. So let's piggyback the i_disksize mark_inode_dirty 1287 * There may be allocated blocks outside of i_size because
1273 * into that. 1288 * we failed to copy some data. Prepare for truncate.
1274 */ 1289 */
1275 loff_t new_i_size; 1290 if (pos + len > inode->i_size)
1276 1291 ext3_orphan_add(handle, inode);
1277 new_i_size = pos + copied;
1278 if (new_i_size > EXT3_I(inode)->i_disksize)
1279 EXT3_I(inode)->i_disksize = new_i_size;
1280 ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
1281 page, fsdata);
1282 copied = ret2;
1283 if (ret2 < 0)
1284 ret = ret2;
1285 }
1286 ret2 = ext3_journal_stop(handle); 1292 ret2 = ext3_journal_stop(handle);
1287 if (!ret) 1293 if (!ret)
1288 ret = ret2; 1294 ret = ret2;
1289 unlock_page(page); 1295 unlock_page(page);
1290 page_cache_release(page); 1296 page_cache_release(page);
1291 1297
1298 if (pos + len > inode->i_size)
1299 vmtruncate(inode, inode->i_size);
1292 return ret ? ret : copied; 1300 return ret ? ret : copied;
1293} 1301}
1294 1302
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
1299{ 1307{
1300 handle_t *handle = ext3_journal_current_handle(); 1308 handle_t *handle = ext3_journal_current_handle();
1301 struct inode *inode = file->f_mapping->host; 1309 struct inode *inode = file->f_mapping->host;
1302 int ret = 0, ret2; 1310 int ret;
1303 loff_t new_i_size;
1304
1305 new_i_size = pos + copied;
1306 if (new_i_size > EXT3_I(inode)->i_disksize)
1307 EXT3_I(inode)->i_disksize = new_i_size;
1308
1309 ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
1310 page, fsdata);
1311 copied = ret2;
1312 if (ret2 < 0)
1313 ret = ret2;
1314 1311
1315 ret2 = ext3_journal_stop(handle); 1312 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1316 if (!ret) 1313 update_file_sizes(inode, pos, copied);
1317 ret = ret2; 1314 /*
1315 * There may be allocated blocks outside of i_size because
1316 * we failed to copy some data. Prepare for truncate.
1317 */
1318 if (pos + len > inode->i_size)
1319 ext3_orphan_add(handle, inode);
1320 ret = ext3_journal_stop(handle);
1318 unlock_page(page); 1321 unlock_page(page);
1319 page_cache_release(page); 1322 page_cache_release(page);
1320 1323
1324 if (pos + len > inode->i_size)
1325 vmtruncate(inode, inode->i_size);
1321 return ret ? ret : copied; 1326 return ret ? ret : copied;
1322} 1327}
1323 1328
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
1338 if (copied < len) { 1343 if (copied < len) {
1339 if (!PageUptodate(page)) 1344 if (!PageUptodate(page))
1340 copied = 0; 1345 copied = 0;
1341 page_zero_new_buffers(page, from+copied, to); 1346 page_zero_new_buffers(page, from + copied, to);
1347 to = from + copied;
1342 } 1348 }
1343 1349
1344 ret = walk_page_buffers(handle, page_buffers(page), from, 1350 ret = walk_page_buffers(handle, page_buffers(page), from,
1345 to, &partial, write_end_fn); 1351 to, &partial, write_end_fn);
1346 if (!partial) 1352 if (!partial)
1347 SetPageUptodate(page); 1353 SetPageUptodate(page);
1348 if (pos+copied > inode->i_size) 1354
1349 i_size_write(inode, pos+copied); 1355 if (pos + copied > inode->i_size)
1356 i_size_write(inode, pos + copied);
1357 /*
1358 * There may be allocated blocks outside of i_size because
1359 * we failed to copy some data. Prepare for truncate.
1360 */
1361 if (pos + len > inode->i_size)
1362 ext3_orphan_add(handle, inode);
1350 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1363 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1351 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1364 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1352 EXT3_I(inode)->i_disksize = inode->i_size; 1365 EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
1361 unlock_page(page); 1374 unlock_page(page);
1362 page_cache_release(page); 1375 page_cache_release(page);
1363 1376
1377 if (pos + len > inode->i_size)
1378 vmtruncate(inode, inode->i_size);
1364 return ret ? ret : copied; 1379 return ret ? ret : copied;
1365} 1380}
1366 1381
@@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
1428 return 0; 1443 return 0;
1429} 1444}
1430 1445
1431static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1432{
1433 if (buffer_mapped(bh))
1434 return ext3_journal_dirty_data(handle, bh);
1435 return 0;
1436}
1437
1438static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) 1446static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1439{ 1447{
1440 return !buffer_mapped(bh); 1448 return !buffer_mapped(bh);
1441} 1449}
1450
1442/* 1451/*
1443 * Note that we always start a transaction even if we're not journalling 1452 * Note that we always start a transaction even if we're not journalling
1444 * data. This is to preserve ordering: any hole instantiation within 1453 * data. This is to preserve ordering: any hole instantiation within
@@ -2354,6 +2363,9 @@ void ext3_truncate(struct inode *inode)
2354 if (!ext3_can_truncate(inode)) 2363 if (!ext3_can_truncate(inode))
2355 return; 2364 return;
2356 2365
2366 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2367 ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
2368
2357 /* 2369 /*
2358 * We have to lock the EOF page here, because lock_page() nests 2370 * We have to lock the EOF page here, because lock_page() nests
2359 * outside journal_start(). 2371 * outside journal_start().
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
15#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/compat.h> 17#include <linux/compat.h>
18#include <linux/smp_lock.h>
19#include <asm/uaccess.h> 18#include <asm/uaccess.h>
20 19
21int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, 20long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
22 unsigned long arg)
23{ 21{
22 struct inode *inode = filp->f_dentry->d_inode;
24 struct ext3_inode_info *ei = EXT3_I(inode); 23 struct ext3_inode_info *ei = EXT3_I(inode);
25 unsigned int flags; 24 unsigned int flags;
26 unsigned short rsv_window_size; 25 unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
39 unsigned int oldflags; 38 unsigned int oldflags;
40 unsigned int jflag; 39 unsigned int jflag;
41 40
41 if (!is_owner_or_cap(inode))
42 return -EACCES;
43
44 if (get_user(flags, (int __user *) arg))
45 return -EFAULT;
46
42 err = mnt_want_write(filp->f_path.mnt); 47 err = mnt_want_write(filp->f_path.mnt);
43 if (err) 48 if (err)
44 return err; 49 return err;
45 50
46 if (!is_owner_or_cap(inode)) {
47 err = -EACCES;
48 goto flags_out;
49 }
50
51 if (get_user(flags, (int __user *) arg)) {
52 err = -EFAULT;
53 goto flags_out;
54 }
55
56 flags = ext3_mask_flags(inode->i_mode, flags); 51 flags = ext3_mask_flags(inode->i_mode, flags);
57 52
58 mutex_lock(&inode->i_mutex); 53 mutex_lock(&inode->i_mutex);
54
59 /* Is it quota file? Do not allow user to mess with it */ 55 /* Is it quota file? Do not allow user to mess with it */
60 if (IS_NOQUOTA(inode)) { 56 err = -EPERM;
61 mutex_unlock(&inode->i_mutex); 57 if (IS_NOQUOTA(inode))
62 err = -EPERM;
63 goto flags_out; 58 goto flags_out;
64 } 59
65 oldflags = ei->i_flags; 60 oldflags = ei->i_flags;
66 61
67 /* The JOURNAL_DATA flag is modifiable only by root */ 62 /* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
74 * This test looks nicer. Thanks to Pauline Middelink 69 * This test looks nicer. Thanks to Pauline Middelink
75 */ 70 */
76 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { 71 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
77 if (!capable(CAP_LINUX_IMMUTABLE)) { 72 if (!capable(CAP_LINUX_IMMUTABLE))
78 mutex_unlock(&inode->i_mutex);
79 err = -EPERM;
80 goto flags_out; 73 goto flags_out;
81 }
82 } 74 }
83 75
84 /* 76 /*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
86 * the relevant capability. 78 * the relevant capability.
87 */ 79 */
88 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { 80 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
89 if (!capable(CAP_SYS_RESOURCE)) { 81 if (!capable(CAP_SYS_RESOURCE))
90 mutex_unlock(&inode->i_mutex);
91 err = -EPERM;
92 goto flags_out; 82 goto flags_out;
93 }
94 } 83 }
95 84
96
97 handle = ext3_journal_start(inode, 1); 85 handle = ext3_journal_start(inode, 1);
98 if (IS_ERR(handle)) { 86 if (IS_ERR(handle)) {
99 mutex_unlock(&inode->i_mutex);
100 err = PTR_ERR(handle); 87 err = PTR_ERR(handle);
101 goto flags_out; 88 goto flags_out;
102 } 89 }
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
116 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 103 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
117flags_err: 104flags_err:
118 ext3_journal_stop(handle); 105 ext3_journal_stop(handle);
119 if (err) { 106 if (err)
120 mutex_unlock(&inode->i_mutex); 107 goto flags_out;
121 return err;
122 }
123 108
124 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) 109 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
125 err = ext3_change_inode_journal_flag(inode, jflag); 110 err = ext3_change_inode_journal_flag(inode, jflag);
126 mutex_unlock(&inode->i_mutex);
127flags_out: 111flags_out:
112 mutex_unlock(&inode->i_mutex);
128 mnt_drop_write(filp->f_path.mnt); 113 mnt_drop_write(filp->f_path.mnt);
129 return err; 114 return err;
130 } 115 }
@@ -140,6 +125,7 @@ flags_out:
140 125
141 if (!is_owner_or_cap(inode)) 126 if (!is_owner_or_cap(inode))
142 return -EPERM; 127 return -EPERM;
128
143 err = mnt_want_write(filp->f_path.mnt); 129 err = mnt_want_write(filp->f_path.mnt);
144 if (err) 130 if (err)
145 return err; 131 return err;
@@ -147,6 +133,7 @@ flags_out:
147 err = -EFAULT; 133 err = -EFAULT;
148 goto setversion_out; 134 goto setversion_out;
149 } 135 }
136
150 handle = ext3_journal_start(inode, 1); 137 handle = ext3_journal_start(inode, 1);
151 if (IS_ERR(handle)) { 138 if (IS_ERR(handle)) {
152 err = PTR_ERR(handle); 139 err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
299#ifdef CONFIG_COMPAT 286#ifdef CONFIG_COMPAT
300long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 287long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
301{ 288{
302 struct inode *inode = file->f_path.dentry->d_inode;
303 int ret;
304
305 /* These are just misnamed, they actually get/put from/to user an int */ 289 /* These are just misnamed, they actually get/put from/to user an int */
306 switch (cmd) { 290 switch (cmd) {
307 case EXT3_IOC32_GETFLAGS: 291 case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
341 default: 325 default:
342 return -ENOIOCTLCMD; 326 return -ENOIOCTLCMD;
343 } 327 }
344 lock_kernel(); 328 return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
345 ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
346 unlock_kernel();
347 return ret;
348} 329}
349#endif 330#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e2fc63cbba8b..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
161 struct dx_frame *frame, 161 struct dx_frame *frame,
162 int *err); 162 int *err);
163static void dx_release (struct dx_frame *frames); 163static void dx_release (struct dx_frame *frames);
164static int dx_make_map (struct ext3_dir_entry_2 *de, int size, 164static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count); 166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, 167static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
168 struct dx_map_entry *offsets, int count); 168 struct dx_map_entry *offsets, int count);
169static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); 169static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); 170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
171static int ext3_htree_next_block(struct inode *dir, __u32 hash, 171static int ext3_htree_next_block(struct inode *dir, __u32 hash,
172 struct dx_frame *frame, 172 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
708 * Create map of hash values, offsets, and sizes, stored at end of block. 708 * Create map of hash values, offsets, and sizes, stored at end of block.
709 * Returns number of entries mapped. 709 * Returns number of entries mapped.
710 */ 710 */
711static int dx_make_map (struct ext3_dir_entry_2 *de, int size, 711static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
712 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) 712 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
713{ 713{
714 int count = 0; 714 int count = 0;
715 char *base = (char *) de; 715 char *base = (char *) de;
716 struct dx_hash_info h = *hinfo; 716 struct dx_hash_info h = *hinfo;
717 717
718 while ((char *) de < base + size) 718 while ((char *) de < base + blocksize)
719 { 719 {
720 if (de->name_len && de->inode) { 720 if (de->name_len && de->inode) {
721 ext3fs_dirhash(de->name, de->name_len, &h); 721 ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1047 return ERR_PTR(-EIO);
1048 } 1048 }
1049 inode = ext3_iget(dir->i_sb, ino); 1049 inode = ext3_iget(dir->i_sb, ino);
1050 if (IS_ERR(inode)) 1050 if (unlikely(IS_ERR(inode))) {
1051 return ERR_CAST(inode); 1051 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu",
1054 ino);
1055 return ERR_PTR(-EIO);
1056 } else {
1057 return ERR_CAST(inode);
1058 }
1059 }
1052 } 1060 }
1053 return d_splice_alias(inode, dentry); 1061 return d_splice_alias(inode, dentry);
1054} 1062}
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1120 * Compact each dir entry in the range to the minimal rec_len. 1128 * Compact each dir entry in the range to the minimal rec_len.
1121 * Returns pointer to last entry in range. 1129 * Returns pointer to last entry in range.
1122 */ 1130 */
1123static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) 1131static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
1124{ 1132{
1125 struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; 1133 struct ext3_dir_entry_2 *next, *to, *prev;
1134 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
1126 unsigned rec_len = 0; 1135 unsigned rec_len = 0;
1127 1136
1128 prev = to = de; 1137 prev = to = de;
1129 while ((char*)de < base + size) { 1138 while ((char *)de < base + blocksize) {
1130 next = ext3_next_entry(de); 1139 next = ext3_next_entry(de);
1131 if (de->inode && de->name_len) { 1140 if (de->inode && de->name_len) {
1132 rec_len = EXT3_DIR_REC_LEN(de->name_len); 1141 rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2265,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2265 struct inode * old_inode, * new_inode; 2274 struct inode * old_inode, * new_inode;
2266 struct buffer_head * old_bh, * new_bh, * dir_bh; 2275 struct buffer_head * old_bh, * new_bh, * dir_bh;
2267 struct ext3_dir_entry_2 * old_de, * new_de; 2276 struct ext3_dir_entry_2 * old_de, * new_de;
2268 int retval; 2277 int retval, flush_file = 0;
2269 2278
2270 old_bh = new_bh = dir_bh = NULL; 2279 old_bh = new_bh = dir_bh = NULL;
2271 2280
@@ -2401,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2401 ext3_mark_inode_dirty(handle, new_inode); 2410 ext3_mark_inode_dirty(handle, new_inode);
2402 if (!new_inode->i_nlink) 2411 if (!new_inode->i_nlink)
2403 ext3_orphan_add(handle, new_inode); 2412 ext3_orphan_add(handle, new_inode);
2413 if (ext3_should_writeback_data(new_inode))
2414 flush_file = 1;
2404 } 2415 }
2405 retval = 0; 2416 retval = 0;
2406 2417
@@ -2409,6 +2420,8 @@ end_rename:
2409 brelse (old_bh); 2420 brelse (old_bh);
2410 brelse (new_bh); 2421 brelse (new_bh);
2411 ext3_journal_stop(handle); 2422 ext3_journal_stop(handle);
2423 if (retval == 0 && flush_file)
2424 filemap_flush(old_inode->i_mapping);
2412 return retval; 2425 return retval;
2413} 2426}
2414 2427
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
18 filesystem; while there will be some performance gains from 18 filesystem; while there will be some performance gains from
19 the delayed allocation and inode table readahead, the best 19 the delayed allocation and inode table readahead, the best
20 performance gains will require enabling ext4 features in the 20 performance gains will require enabling ext4 features in the
21 filesystem, or formating a new filesystem as an ext4 21 filesystem, or formatting a new filesystem as an ext4
22 filesystem initially. 22 filesystem initially.
23 23
24 To compile this file system support as a module, choose M here. The 24 To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
323 return PTR_ERR(acl); 323 return PTR_ERR(acl);
324 } 324 }
325 if (!acl) 325 if (!acl)
326 inode->i_mode &= ~current->fs->umask; 326 inode->i_mode &= ~current_umask();
327 } 327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone; 329 struct posix_acl *clone;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e00..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
523 523
524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) 524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
525{ 525{
526 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 526 struct super_block *sb = dentry->d_sb;
527 struct msdos_sb_info *sbi = MSDOS_SB(sb);
528 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
527 529
528 /* If the count of free cluster is still unknown, counts it here. */ 530 /* If the count of free cluster is still unknown, counts it here. */
529 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { 531 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
537 buf->f_blocks = sbi->max_cluster - FAT_START_ENT; 539 buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
538 buf->f_bfree = sbi->free_clusters; 540 buf->f_bfree = sbi->free_clusters;
539 buf->f_bavail = sbi->free_clusters; 541 buf->f_bavail = sbi->free_clusters;
542 buf->f_fsid.val[0] = (u32)id;
543 buf->f_fsid.val[1] = (u32)(id >> 32);
540 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 544 buf->f_namelen = sbi->options.isvfat ? 260 : 12;
541 545
542 return 0; 546 return 0;
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
930 934
931 opts->fs_uid = current_uid(); 935 opts->fs_uid = current_uid();
932 opts->fs_gid = current_gid(); 936 opts->fs_gid = current_gid();
933 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 937 opts->fs_fmask = current_umask();
934 opts->allow_utime = -1; 938 opts->allow_utime = -1;
935 opts->codepage = fat_default_codepage; 939 opts->codepage = fat_default_codepage;
936 opts->iocharset = fat_default_iocharset; 940 opts->iocharset = fat_default_iocharset;
diff --git a/fs/file_table.c b/fs/file_table.c
index b74a8e1da913..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
169 fmode_t mode, const struct file_operations *fop) 169 fmode_t mode, const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 struct path;
173 172
174 file = get_empty_filp(); 173 file = get_empty_filp();
175 if (!file) 174 if (!file)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e3fe9918faaf..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
196 struct inode *tail_inode; 196 struct inode *tail_inode;
197 197
198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
199 if (!time_after_eq(inode->dirtied_when, 199 if (time_before(inode->dirtied_when,
200 tail_inode->dirtied_when)) 200 tail_inode->dirtied_when))
201 inode->dirtied_when = jiffies; 201 inode->dirtied_when = jiffies;
202 } 202 }
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
220 wake_up_bit(&inode->i_state, __I_SYNC); 220 wake_up_bit(&inode->i_state, __I_SYNC);
221} 221}
222 222
223static bool inode_dirtied_after(struct inode *inode, unsigned long t)
224{
225 bool ret = time_after(inode->dirtied_when, t);
226#ifndef CONFIG_64BIT
227 /*
228 * For inodes being constantly redirtied, dirtied_when can get stuck.
229 * It _appears_ to be in the future, but is actually in distant past.
230 * This test is necessary to prevent such wrapped-around relative times
231 * from permanently stopping the whole pdflush writeback.
232 */
233 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
234#endif
235 return ret;
236}
237
223/* 238/*
224 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 239 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
225 */ 240 */
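
The 32-bit guard in inode_dirtied_after() is easiest to see with concrete numbers. The userspace sketch below re-creates the comparison macros in the spirit of include/linux/jiffies.h (typechecking dropped, an assumption for illustration) and shows a dirtied_when stamp from the distant past masquerading as a future time:

#include <stdio.h>

#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before_eq(a, b)	((long)((b) - (a)) >= 0)

int main(void)
{
	unsigned long jiffies = 1000;	/* "now" */
	/* dirtied more than LONG_MAX ticks ago - the difference wraps */
	unsigned long dirtied_when = jiffies - (~0UL >> 1) - 10;

	/* prints 1: appears to have been dirtied in the future... */
	printf("%d\n", time_after(dirtied_when, jiffies));
	/* prints 0: ...but the extra test sees through the wrap-around */
	printf("%d\n", time_before_eq(dirtied_when, jiffies));
	return 0;
}

With the two tests ANDed together, inode_dirtied_after() reports false for such a stamp, so the inode is treated as expired instead of stalling pdflush forever.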
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
231 struct inode *inode = list_entry(delaying_queue->prev, 246 struct inode *inode = list_entry(delaying_queue->prev,
232 struct inode, i_list); 247 struct inode, i_list);
233 if (older_than_this && 248 if (older_than_this &&
234 time_after(inode->dirtied_when, *older_than_this)) 249 inode_dirtied_after(inode, *older_than_this))
235 break; 250 break;
236 list_move(&inode->i_list, dispatch_queue); 251 list_move(&inode->i_list, dispatch_queue);
237 } 252 }
@@ -420,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
420 * If older_than_this is non-NULL, then only write out inodes which 435 * If older_than_this is non-NULL, then only write out inodes which
421 * had their first dirtying at a time earlier than *older_than_this. 436 * had their first dirtying at a time earlier than *older_than_this.
422 * 437 *
423 * If we're a pdlfush thread, then implement pdflush collision avoidance 438 * If we're a pdflush thread, then implement pdflush collision avoidance
424 * against the entire list. 439 * against the entire list.
425 * 440 *
426 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 441 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
492 continue; /* blockdev has wrong queue */ 507 continue; /* blockdev has wrong queue */
493 } 508 }
494 509
495 /* Was this inode dirtied after sync_sb_inodes was called? */ 510 /*
496 if (time_after(inode->dirtied_when, start)) 511 * Was this inode dirtied after sync_sb_inodes was called?
512 * This keeps sync from extra jobs and livelock.
513 */
514 if (inode_dirtied_after(inode, start))
497 break; 515 break;
498 516
499 /* Is another pdflush already flushing this queue? */ 517 /* Is another pdflush already flushing this queue? */
@@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 556 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
539 struct address_space *mapping; 557 struct address_space *mapping;
540 558
541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 559 if (inode->i_state &
560 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
542 continue; 561 continue;
543 mapping = inode->i_mapping; 562 mapping = inode->i_mapping;
544 if (mapping->nrpages == 0) 563 if (mapping->nrpages == 0)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/fs.h>
4#include <linux/path.h>
5#include <linux/slab.h>
6#include <linux/fs_struct.h>
7
8/*
9 * Replace fs->root with *path and drop a reference on the old root.
10 * It can block.
11 */
12void set_fs_root(struct fs_struct *fs, struct path *path)
13{
14 struct path old_root;
15
16 write_lock(&fs->lock);
17 old_root = fs->root;
18 fs->root = *path;
19 path_get(path);
20 write_unlock(&fs->lock);
21 if (old_root.dentry)
22 path_put(&old_root);
23}
24
25/*
26 * Replace fs->pwd with *path and drop a reference on the old pwd.
27 * It can block.
28 */
29void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{
31 struct path old_pwd;
32
33 write_lock(&fs->lock);
34 old_pwd = fs->pwd;
35 fs->pwd = *path;
36 path_get(path);
37 write_unlock(&fs->lock);
38
39 if (old_pwd.dentry)
40 path_put(&old_pwd);
41}
42
43void chroot_fs_refs(struct path *old_root, struct path *new_root)
44{
45 struct task_struct *g, *p;
46 struct fs_struct *fs;
47 int count = 0;
48
49 read_lock(&tasklist_lock);
50 do_each_thread(g, p) {
51 task_lock(p);
52 fs = p->fs;
53 if (fs) {
54 write_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root);
58 fs->root = *new_root;
59 count++;
60 }
61 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root);
64 fs->pwd = *new_root;
65 count++;
66 }
67 write_unlock(&fs->lock);
68 }
69 task_unlock(p);
70 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock);
72 while (count--)
73 path_put(old_root);
74}
75
76void free_fs_struct(struct fs_struct *fs)
77{
78 path_put(&fs->root);
79 path_put(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs);
81}
82
83void exit_fs(struct task_struct *tsk)
84{
85 struct fs_struct *fs = tsk->fs;
86
87 if (fs) {
88 int kill;
89 task_lock(tsk);
90 write_lock(&fs->lock);
91 tsk->fs = NULL;
92 kill = !--fs->users;
93 write_unlock(&fs->lock);
94 task_unlock(tsk);
95 if (kill)
96 free_fs_struct(fs);
97 }
98}
99
100struct fs_struct *copy_fs_struct(struct fs_struct *old)
101{
102 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
103	/* No need to lock the new fs - nobody else can see it yet */
104 if (fs) {
105 fs->users = 1;
106 fs->in_exec = 0;
107 rwlock_init(&fs->lock);
108 fs->umask = old->umask;
109 read_lock(&old->lock);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 }
116 return fs;
117}
118
119int unshare_fs_struct(void)
120{
121 struct fs_struct *fs = current->fs;
122 struct fs_struct *new_fs = copy_fs_struct(fs);
123 int kill;
124
125 if (!new_fs)
126 return -ENOMEM;
127
128 task_lock(current);
129 write_lock(&fs->lock);
130 kill = !--fs->users;
131 current->fs = new_fs;
132 write_unlock(&fs->lock);
133 task_unlock(current);
134
135 if (kill)
136 free_fs_struct(fs);
137
138 return 0;
139}
140EXPORT_SYMBOL_GPL(unshare_fs_struct);
141
142int current_umask(void)
143{
144 return current->fs->umask;
145}
146EXPORT_SYMBOL(current_umask);
147
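
current_umask() is what the ext2/ext3/ext4 ACL and FAT hunks earlier in this patch switch to, so filesystems stop dereferencing current->fs themselves. The caller-side pattern, lifted from those hunks, is simply:

	if (!acl)
		inode->i_mode &= ~current_umask();

Keeping the read behind a helper means fs_struct's layout is no longer part of every filesystem's contract.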
148/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = {
150 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022,
153};
154
155void daemonize_fs_struct(void)
156{
157 struct fs_struct *fs = current->fs;
158
159 if (fs) {
160 int kill;
161
162 task_lock(current);
163
164 write_lock(&init_fs.lock);
165 init_fs.users++;
166 write_unlock(&init_fs.lock);
167
168 write_lock(&fs->lock);
169 current->fs = &init_fs;
170 kill = !--fs->users;
171 write_unlock(&fs->lock);
172
173 task_unlock(current);
174 if (kill)
175 free_fs_struct(fs);
176 }
177}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
1
2config FSCACHE
3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK
6 help
7 This option enables a generic filesystem caching manager that can be
8 used by various network and other filesystems to cache data locally.
9 Different sorts of caches can be plugged in, depending on the
10 resources available.
11
12 See Documentation/filesystems/caching/fscache.txt for more information.
13
14config FSCACHE_STATS
15 bool "Gather statistical information on local caching"
16 depends on FSCACHE && PROC_FS
17 help
18 This option causes statistical information to be gathered on local
19 caching and exported through the file:
20
21 /proc/fs/fscache/stats
22
23 The gathering of statistics adds a certain amount of overhead to
24 execution as there are quite a few stats gathered, and on a
25 multi-CPU system these may be on cachelines that keep bouncing
26 between CPUs. On the other hand, the stats are very useful for
27 debugging purposes. Saying 'Y' here is recommended.
28
29 See Documentation/filesystems/caching/fscache.txt for more information.
30
31config FSCACHE_HISTOGRAM
32 bool "Gather latency information on local caching"
33 depends on FSCACHE && PROC_FS
34 help
35 This option causes latency information to be gathered on local
36 caching and exported through the file:
37
38 /proc/fs/fscache/histogram
39
40 The generation of this histogram adds a certain amount of overhead to
41 execution as there are a number of points at which data is gathered,
42 and on a multi-CPU system these may be on cachelines that keep
43 bouncing between CPUs. On the other hand, the histogram may be
44 useful for debugging purposes. Saying 'N' here is recommended.
45
46 See Documentation/filesystems/caching/fscache.txt for more information.
47
48config FSCACHE_DEBUG
49 bool "Debug FS-Cache"
50 depends on FSCACHE
51 help
52 This permits debugging to be dynamically enabled in the local caching
53 management module. If this is set, the debugging output may be
 54 enabled by setting bits in /sys/module/fscache/parameters/debug.
55
56 See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for general filesystem caching code
3#
4
5fscache-y := \
6 cache.o \
7 cookie.o \
8 fsdef.o \
9 main.o \
10 netfs.o \
11 object.o \
12 operation.o \
13 page.o
14
15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18
19obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
1/* FS-Cache cache handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17LIST_HEAD(fscache_cache_list);
18DECLARE_RWSEM(fscache_addremove_sem);
19DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
20EXPORT_SYMBOL(fscache_cache_cleared_wq);
21
22static LIST_HEAD(fscache_cache_tag_list);
23
24/*
25 * look up a cache tag
26 */
27struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
28{
29 struct fscache_cache_tag *tag, *xtag;
30
31 /* firstly check for the existence of the tag under read lock */
32 down_read(&fscache_addremove_sem);
33
34 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
35 if (strcmp(tag->name, name) == 0) {
36 atomic_inc(&tag->usage);
37 up_read(&fscache_addremove_sem);
38 return tag;
39 }
40 }
41
42 up_read(&fscache_addremove_sem);
43
44 /* the tag does not exist - create a candidate */
45 xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
46 if (!xtag)
 47 /* return an error-encoded dummy tag if out of memory */
48 return ERR_PTR(-ENOMEM);
49
50 atomic_set(&xtag->usage, 1);
51 strcpy(xtag->name, name);
52
53 /* write lock, search again and add if still not present */
54 down_write(&fscache_addremove_sem);
55
56 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
57 if (strcmp(tag->name, name) == 0) {
58 atomic_inc(&tag->usage);
59 up_write(&fscache_addremove_sem);
60 kfree(xtag);
61 return tag;
62 }
63 }
64
65 list_add_tail(&xtag->link, &fscache_cache_tag_list);
66 up_write(&fscache_addremove_sem);
67 return xtag;
68}
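The lookup above is the classic optimistic double-checked pattern: search under the read lock, allocate a candidate with no lock held, then re-search under the write lock before inserting so that a racing caller's tag wins. A minimal user-space sketch of the same idiom in portable C with pthreads (not part of this patch; tag_list, tag_lock and tag_find are hypothetical stand-ins for the fscache structures):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdlib.h>
	#include <string.h>

	struct tag {
		struct tag *next;
		atomic_int usage;
		char name[];		/* flexible array, like the kernel struct */
	};

	static struct tag *tag_list;
	static pthread_rwlock_t tag_lock = PTHREAD_RWLOCK_INITIALIZER;

	static struct tag *tag_find(const char *name)	/* caller holds tag_lock */
	{
		struct tag *t;

		for (t = tag_list; t; t = t->next)
			if (strcmp(t->name, name) == 0)
				return t;
		return NULL;
	}

	struct tag *tag_lookup_or_create(const char *name)
	{
		struct tag *t, *cand;

		/* fast path: existence check under the read lock; the usage
		 * count must be atomic as it is bumped under a shared lock */
		pthread_rwlock_rdlock(&tag_lock);
		t = tag_find(name);
		if (t)
			atomic_fetch_add(&t->usage, 1);
		pthread_rwlock_unlock(&tag_lock);
		if (t)
			return t;

		/* allocate the candidate with no lock held */
		cand = calloc(1, sizeof(*cand) + strlen(name) + 1);
		if (!cand)
			return NULL;
		atomic_init(&cand->usage, 1);
		strcpy(cand->name, name);

		/* slow path: recheck under the write lock before inserting */
		pthread_rwlock_wrlock(&tag_lock);
		t = tag_find(name);
		if (t) {
			atomic_fetch_add(&t->usage, 1);
			pthread_rwlock_unlock(&tag_lock);
			free(cand);
			return t;
		}
		cand->next = tag_list;
		tag_list = cand;
		pthread_rwlock_unlock(&tag_lock);
		return cand;
	}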
69
70/*
71 * release a reference to a cache tag
72 */
73void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
74{
75 if (tag != ERR_PTR(-ENOMEM)) {
76 down_write(&fscache_addremove_sem);
77
78 if (atomic_dec_and_test(&tag->usage))
79 list_del_init(&tag->link);
80 else
81 tag = NULL;
82
83 up_write(&fscache_addremove_sem);
84
85 kfree(tag);
86 }
87}
88
89/*
90 * select a cache in which to store an object
91 * - the cache addremove semaphore must be at least read-locked by the caller
92 * - the object will never be an index
93 */
94struct fscache_cache *fscache_select_cache_for_object(
95 struct fscache_cookie *cookie)
96{
97 struct fscache_cache_tag *tag;
98 struct fscache_object *object;
99 struct fscache_cache *cache;
100
101 _enter("");
102
103 if (list_empty(&fscache_cache_list)) {
104 _leave(" = NULL [no cache]");
105 return NULL;
106 }
107
108 /* we check the parent to determine the cache to use */
109 spin_lock(&cookie->lock);
110
111 /* the first in the parent's backing list should be the preferred
112 * cache */
113 if (!hlist_empty(&cookie->backing_objects)) {
114 object = hlist_entry(cookie->backing_objects.first,
115 struct fscache_object, cookie_link);
116
117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING ||
119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL;
121
122 spin_unlock(&cookie->lock);
123 _leave(" = %p [parent]", cache);
124 return cache;
125 }
126
127 /* the parent is unbacked */
128 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
129 /* cookie not an index and is unbacked */
130 spin_unlock(&cookie->lock);
131 _leave(" = NULL [cookie ub,ni]");
132 return NULL;
133 }
134
135 spin_unlock(&cookie->lock);
136
137 if (!cookie->def->select_cache)
138 goto no_preference;
139
140 /* ask the netfs for its preference */
141 tag = cookie->def->select_cache(cookie->parent->netfs_data,
142 cookie->netfs_data);
143 if (!tag)
144 goto no_preference;
145
146 if (tag == ERR_PTR(-ENOMEM)) {
147 _leave(" = NULL [nomem tag]");
148 return NULL;
149 }
150
151 if (!tag->cache) {
152 _leave(" = NULL [unbacked tag]");
153 return NULL;
154 }
155
156 if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
157 return NULL;
158
159 _leave(" = %p [specific]", tag->cache);
160 return tag->cache;
161
162no_preference:
163 /* netfs has no preference - just select first cache */
164 cache = list_entry(fscache_cache_list.next,
165 struct fscache_cache, link);
166 _leave(" = %p [first]", cache);
167 return cache;
168}
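The ->select_cache() hook consulted above lets a netfs steer an object at a particular cache by returning a tag. A hedged sketch of the netfs side, assuming the fscache_lookup_cache_tag()/fscache_release_cache_tag() wrappers and the hook signature from the netfs API headers (the myfs names are hypothetical):

	/* looked up once, e.g. from a mount option naming the tag, and
	 * released again with fscache_release_cache_tag() at unmount */
	static struct fscache_cache_tag *myfs_cache_tag;

	static struct fscache_cache_tag *myfs_select_cache(
		const void *parent_netfs_data,
		const void *cookie_netfs_data)
	{
		/* returning NULL means "no preference - use the first cache" */
		return myfs_cache_tag;
	}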
169
170/**
171 * fscache_init_cache - Initialise a cache record
172 * @cache: The cache record to be initialised
173 * @ops: The cache operations to be installed in that record
174 * @idfmt: Format string to define identifier
175 * @...: sprintf-style arguments
176 *
177 * Initialise a record of a cache and fill in the name.
178 *
179 * See Documentation/filesystems/caching/backend-api.txt for a complete
180 * description.
181 */
182void fscache_init_cache(struct fscache_cache *cache,
183 const struct fscache_cache_ops *ops,
184 const char *idfmt,
185 ...)
186{
187 va_list va;
188
189 memset(cache, 0, sizeof(*cache));
190
191 cache->ops = ops;
192
193 va_start(va, idfmt);
194 vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
195 va_end(va);
196
197 INIT_WORK(&cache->op_gc, fscache_operation_gc);
198 INIT_LIST_HEAD(&cache->link);
199 INIT_LIST_HEAD(&cache->object_list);
200 INIT_LIST_HEAD(&cache->op_gc_list);
201 spin_lock_init(&cache->object_list_lock);
202 spin_lock_init(&cache->op_gc_list_lock);
203}
204EXPORT_SYMBOL(fscache_init_cache);
205
206/**
207 * fscache_add_cache - Declare a cache as being open for business
208 * @cache: The record describing the cache
209 * @ifsdef: The record of the cache object describing the top-level index
210 * @tagname: The tag describing this cache
211 *
212 * Add a cache to the system, making it available for netfs use.
213 *
214 * See Documentation/filesystems/caching/backend-api.txt for a complete
215 * description.
216 */
217int fscache_add_cache(struct fscache_cache *cache,
218 struct fscache_object *ifsdef,
219 const char *tagname)
220{
221 struct fscache_cache_tag *tag;
222
223 BUG_ON(!cache->ops);
224 BUG_ON(!ifsdef);
225
226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
229
230 if (!tagname)
231 tagname = cache->identifier;
232
233 BUG_ON(!tagname[0]);
234
235 _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
236
237 /* we use the cache tag to uniquely identify caches */
238 tag = __fscache_lookup_cache_tag(tagname);
239 if (IS_ERR(tag))
240 goto nomem;
241
242 if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
243 goto tag_in_use;
244
245 cache->kobj = kobject_create_and_add(tagname, fscache_root);
246 if (!cache->kobj)
247 goto error;
248
249 ifsdef->cookie = &fscache_fsdef_index;
250 ifsdef->cache = cache;
251 cache->fsdef = ifsdef;
252
253 down_write(&fscache_addremove_sem);
254
255 tag->cache = cache;
256 cache->tag = tag;
257
258 /* add the cache to the list */
259 list_add(&cache->link, &fscache_cache_list);
260
261 /* add the cache's netfs definition index object to the cache's
262 * list */
263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock);
266
267 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */
269 spin_lock(&fscache_fsdef_index.lock);
270
271 hlist_add_head(&ifsdef->cookie_link,
272 &fscache_fsdef_index.backing_objects);
273
274 atomic_inc(&fscache_fsdef_index.usage);
275
276 /* done */
277 spin_unlock(&fscache_fsdef_index.lock);
278 up_write(&fscache_addremove_sem);
279
280 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
281 cache->tag->name, cache->ops->name);
282 kobject_uevent(cache->kobj, KOBJ_ADD);
283
284 _leave(" = 0 [%s]", cache->identifier);
285 return 0;
286
287tag_in_use:
288 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
289 __fscache_release_cache_tag(tag);
290 _leave(" = -EXIST");
291 return -EEXIST;
292
293error:
294 __fscache_release_cache_tag(tag);
295 _leave(" = -EINVAL");
296 return -EINVAL;
297
298nomem:
299 _leave(" = -ENOMEM");
300 return -ENOMEM;
301}
302EXPORT_SYMBOL(fscache_add_cache);
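Together, fscache_init_cache() and fscache_add_cache() make up the backend bring-up sequence; CacheFiles, added in the same patch series, is the in-tree user. A hedged sketch built only on the signatures documented above (my_cache, my_cache_ops and the pre-allocated fsdef object are hypothetical backend details):

	static const struct fscache_cache_ops my_cache_ops;	/* hypothetical ops table */
	static struct fscache_cache my_cache;

	static int my_backend_bring_up(struct fscache_object *fsdef)
	{
		int ret;

		/* install the ops table and the printf-style identifier */
		fscache_init_cache(&my_cache, &my_cache_ops, "mybackend(%d)", 0);

		/* fsdef backs the top-level index; a NULL tagname falls
		 * back to cache->identifier */
		ret = fscache_add_cache(&my_cache, fsdef, NULL);
		if (ret < 0)
			return ret;	/* -EEXIST, -EINVAL or -ENOMEM, as above */
		return 0;
	}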
303
304/**
305 * fscache_io_error - Note a cache I/O error
306 * @cache: The record describing the cache
307 *
308 * Note that an I/O error occurred in a cache and that it should no longer be
309 * used for anything. This also reports the error into the kernel log.
310 *
311 * See Documentation/filesystems/caching/backend-api.txt for a complete
312 * description.
313 */
314void fscache_io_error(struct fscache_cache *cache)
315{
316 set_bit(FSCACHE_IOERROR, &cache->flags);
317
318 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
319 cache->ops->name);
320}
321EXPORT_SYMBOL(fscache_io_error);
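A backend is expected to call this as soon as the underlying storage misbehaves; once FSCACHE_IOERROR is set, fscache_select_cache_for_object() above refuses to hand the cache out again. A hedged sketch of a call site (my_backend_complete_io is hypothetical):

	static void my_backend_complete_io(struct fscache_cache *cache, int error)
	{
		/* any unexpected error takes the whole cache out of service;
		 * -ENODATA is a normal "not in the cache" result, not a fault */
		if (error < 0 && error != -ENODATA)
			fscache_io_error(cache);
	}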
322
323/*
324 * request withdrawal of all the objects in a cache
325 * - all the objects being withdrawn are moved onto the supplied list
326 */
327static void fscache_withdraw_all_objects(struct fscache_cache *cache,
328 struct list_head *dying_objects)
329{
330 struct fscache_object *object;
331
332 spin_lock(&cache->object_list_lock);
333
334 while (!list_empty(&cache->object_list)) {
335 object = list_entry(cache->object_list.next,
336 struct fscache_object, cache_link);
337 list_move_tail(&object->cache_link, dying_objects);
338
339 _debug("withdraw %p", object->cookie);
340
341 spin_lock(&object->lock);
342 spin_unlock(&cache->object_list_lock);
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
344 spin_unlock(&object->lock);
345
346 cond_resched();
347 spin_lock(&cache->object_list_lock);
348 }
349
350 spin_unlock(&cache->object_list_lock);
351}
352
353/**
354 * fscache_withdraw_cache - Withdraw a cache from the active service
355 * @cache: The record describing the cache
356 *
357 * Withdraw a cache from service, unbinding all its cache objects from the
358 * netfs cookies they're currently representing.
359 *
360 * See Documentation/filesystems/caching/backend-api.txt for a complete
361 * description.
362 */
363void fscache_withdraw_cache(struct fscache_cache *cache)
364{
365 LIST_HEAD(dying_objects);
366
367 _enter("");
368
369 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
370 cache->tag->name);
371
372 /* make the cache unavailable for cookie acquisition */
373 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
374 BUG();
375
376 down_write(&fscache_addremove_sem);
377 list_del_init(&cache->link);
378 cache->tag->cache = NULL;
379 up_write(&fscache_addremove_sem);
380
381 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */
383 cache->ops->sync_cache(cache);
384
385 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */
387 cache->ops->dissociate_pages(cache);
388
389 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be
391 * disposed of */
392 _debug("destroy");
393
394 fscache_withdraw_all_objects(cache, &dying_objects);
395
396 /* wait for all extant objects to finish their outstanding operations
397 * and go away */
398 _debug("wait for finish");
399 wait_event(fscache_cache_cleared_wq,
400 atomic_read(&cache->object_count) == 0);
401 _debug("wait for clearance");
402 wait_event(fscache_cache_cleared_wq,
403 list_empty(&cache->object_list));
404 _debug("cleared");
405 ASSERT(list_empty(&dying_objects));
406
407 kobject_put(cache->kobj);
408
409 clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
410 fscache_release_cache_tag(cache->tag);
411 cache->tag = NULL;
412
413 _leave("");
414}
415EXPORT_SYMBOL(fscache_withdraw_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
1/* netfs cookie management
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/netfs-api.txt for more information on
12 * the netfs API.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20struct kmem_cache *fscache_cookie_jar;
21
22static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
23
24static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
25static int fscache_alloc_object(struct fscache_cache *cache,
26 struct fscache_cookie *cookie);
27static int fscache_attach_object(struct fscache_cookie *cookie,
28 struct fscache_object *object);
29
30/*
 31 * initialise a cookie jar slab element prior to any use
32 */
33void fscache_cookie_init_once(void *_cookie)
34{
35 struct fscache_cookie *cookie = _cookie;
36
37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects);
40}
41
42/*
43 * request a cookie to represent an object (index, datafile, xattr, etc)
44 * - parent specifies the parent object
45 * - the top level index cookie for each netfs is stored in the fscache_netfs
46 * struct upon registration
47 * - def points to the definition
48 * - the netfs_data will be passed to the functions pointed to in *def
49 * - all attached caches will be searched to see if they contain this object
50 * - index objects aren't stored on disk until there's a dependent file that
51 * needs storing
52 * - other objects are stored in a selected cache immediately, and all the
53 * indices forming the path to it are instantiated if necessary
54 * - we never let on to the netfs about errors
55 * - we may set a negative cookie pointer, but that's okay
56 */
57struct fscache_cookie *__fscache_acquire_cookie(
58 struct fscache_cookie *parent,
59 const struct fscache_cookie_def *def,
60 void *netfs_data)
61{
62 struct fscache_cookie *cookie;
63
64 BUG_ON(!def);
65
66 _enter("{%s},{%s},%p",
67 parent ? (char *) parent->def->name : "<no-parent>",
68 def->name, netfs_data);
69
70 fscache_stat(&fscache_n_acquires);
71
72 /* if there's no parent cookie, then we don't create one here either */
73 if (!parent) {
74 fscache_stat(&fscache_n_acquires_null);
75 _leave(" [no parent]");
76 return NULL;
77 }
78
79 /* validate the definition */
80 BUG_ON(!def->get_key);
81 BUG_ON(!def->name[0]);
82
83 BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
84 parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
85
86 /* allocate and initialise a cookie */
87 cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
88 if (!cookie) {
89 fscache_stat(&fscache_n_acquires_oom);
90 _leave(" [ENOMEM]");
91 return NULL;
92 }
93
94 atomic_set(&cookie->usage, 1);
95 atomic_set(&cookie->n_children, 0);
96
97 atomic_inc(&parent->usage);
98 atomic_inc(&parent->n_children);
99
100 cookie->def = def;
101 cookie->parent = parent;
102 cookie->netfs_data = netfs_data;
103 cookie->flags = 0;
104
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
106
107 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX:
109 fscache_stat(&fscache_n_cookie_index);
110 break;
111 case FSCACHE_COOKIE_TYPE_DATAFILE:
112 fscache_stat(&fscache_n_cookie_data);
113 break;
114 default:
115 fscache_stat(&fscache_n_cookie_special);
116 break;
117 }
118
119 /* if the object is an index then we need do nothing more here - we
120 * create indices on disk when we need them as an index may exist in
121 * multiple caches */
122 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
123 if (fscache_acquire_non_index_cookie(cookie) < 0) {
124 atomic_dec(&parent->n_children);
125 __fscache_cookie_put(cookie);
126 fscache_stat(&fscache_n_acquires_nobufs);
127 _leave(" = NULL");
128 return NULL;
129 }
130 }
131
132 fscache_stat(&fscache_n_acquires_ok);
133 _leave(" = %p", cookie);
134 return cookie;
135}
136EXPORT_SYMBOL(__fscache_acquire_cookie);
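Netfs code reaches this through the fscache_acquire_cookie() wrapper. A hedged sketch of a filesystem acquiring a per-inode data-file cookie (the myfs names are hypothetical; the callback prototypes are assumed from the fsdef.c definitions later in this patch and from the get_attr call in fscache_acquire_non_index_cookie() below - get_key is mandatory per the BUG_ON above, and get_attr is used to set the store limit on non-index cookies):

	static uint16_t myfs_inode_get_key(const void *cookie_netfs_data,
					   void *buffer, uint16_t bufmax);
	static void myfs_inode_get_attr(const void *cookie_netfs_data,
					uint64_t *size);

	static const struct fscache_cookie_def myfs_inode_cookie_def = {
		.name		= "myfs.inode",
		.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
		.get_key	= myfs_inode_get_key,
		.get_attr	= myfs_inode_get_attr,
	};

	struct myfs_inode {
		struct fscache_cookie *fscache;
		/* ... */
	};

	static void myfs_cache_inode(struct myfs_inode *mi)
	{
		/* myfs_index_cookie would be the netfs's primary index
		 * cookie; a NULL return just means "this file isn't cached"
		 * - errors are never passed back to the netfs */
		mi->fscache = fscache_acquire_cookie(myfs_index_cookie,
						     &myfs_inode_cookie_def,
						     mi);
	}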
137
138/*
139 * acquire a non-index cookie
140 * - this must make sure the index chain is instantiated and instantiate the
141 * object representation too
142 */
143static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
144{
145 struct fscache_object *object;
146 struct fscache_cache *cache;
147 uint64_t i_size;
148 int ret;
149
150 _enter("");
151
152 cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
153
154 /* now we need to see whether the backing objects for this cookie exist
155 * yet; if not, there'll be nothing to search */
156 down_read(&fscache_addremove_sem);
157
158 if (list_empty(&fscache_cache_list)) {
159 up_read(&fscache_addremove_sem);
160 _leave(" = 0 [no caches]");
161 return 0;
162 }
163
164 /* select a cache in which to store the object */
165 cache = fscache_select_cache_for_object(cookie->parent);
166 if (!cache) {
167 up_read(&fscache_addremove_sem);
168 fscache_stat(&fscache_n_acquires_no_cache);
169 _leave(" = -ENOMEDIUM [no cache]");
170 return -ENOMEDIUM;
171 }
172
173 _debug("cache %s", cache->tag->name);
174
175 cookie->flags =
176 (1 << FSCACHE_COOKIE_LOOKING_UP) |
177 (1 << FSCACHE_COOKIE_CREATING) |
178 (1 << FSCACHE_COOKIE_NO_DATA_YET);
179
180 /* ask the cache to allocate objects for this cookie and its parent
181 * chain */
182 ret = fscache_alloc_object(cache, cookie);
183 if (ret < 0) {
184 up_read(&fscache_addremove_sem);
185 _leave(" = %d", ret);
186 return ret;
187 }
188
189 /* pass on how big the object we're caching is supposed to be */
190 cookie->def->get_attr(cookie->netfs_data, &i_size);
191
192 spin_lock(&cookie->lock);
193 if (hlist_empty(&cookie->backing_objects)) {
194 spin_unlock(&cookie->lock);
195 goto unavailable;
196 }
197
198 object = hlist_entry(cookie->backing_objects.first,
199 struct fscache_object, cookie_link);
200
201 fscache_set_store_limit(object, i_size);
202
203 /* initiate the process of looking up all the objects in the chain
204 * (done by fscache_initialise_object()) */
205 fscache_enqueue_object(object);
206
207 spin_unlock(&cookie->lock);
208
209 /* we may be required to wait for lookup to complete at this point */
210 if (!fscache_defer_lookup) {
211 _debug("non-deferred lookup %p", &cookie->flags);
212 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
213 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
214 _debug("complete");
215 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
216 goto unavailable;
217 }
218
219 up_read(&fscache_addremove_sem);
220 _leave(" = 0 [deferred]");
221 return 0;
222
223unavailable:
224 up_read(&fscache_addremove_sem);
225 _leave(" = -ENOBUFS");
226 return -ENOBUFS;
227}
228
229/*
230 * recursively allocate cache object records for a cookie/cache combination
231 * - caller must be holding the addremove sem
232 */
233static int fscache_alloc_object(struct fscache_cache *cache,
234 struct fscache_cookie *cookie)
235{
236 struct fscache_object *object;
237 struct hlist_node *_n;
238 int ret;
239
240 _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
241
242 spin_lock(&cookie->lock);
243 hlist_for_each_entry(object, _n, &cookie->backing_objects,
244 cookie_link) {
245 if (object->cache == cache)
246 goto object_already_extant;
247 }
248 spin_unlock(&cookie->lock);
249
250 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */
252 object = cache->ops->alloc_object(cache, cookie);
253 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object);
256 goto error;
257 }
258
259 fscache_stat(&fscache_n_object_alloc);
260
261 object->debug_id = atomic_inc_return(&fscache_object_debug_id);
262
263 _debug("ALLOC OBJ%x: %s {%lx}",
264 object->debug_id, cookie->def->name, object->events);
265
266 ret = fscache_alloc_object(cache, cookie->parent);
267 if (ret < 0)
268 goto error_put;
269
270 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one
272 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0)
274 cache->ops->put_object(object);
275
276 _leave(" = 0");
277 return 0;
278
279object_already_extant:
280 ret = -ENOBUFS;
281 if (object->state >= FSCACHE_OBJECT_DYING) {
282 spin_unlock(&cookie->lock);
283 goto error;
284 }
285 spin_unlock(&cookie->lock);
286 _leave(" = 0 [found]");
287 return 0;
288
289error_put:
290 cache->ops->put_object(object);
291error:
292 _leave(" = %d", ret);
293 return ret;
294}
295
296/*
297 * attach a cache object to a cookie
298 */
299static int fscache_attach_object(struct fscache_cookie *cookie,
300 struct fscache_object *object)
301{
302 struct fscache_object *p;
303 struct fscache_cache *cache = object->cache;
304 struct hlist_node *_n;
305 int ret;
306
307 _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
308
309 spin_lock(&cookie->lock);
310
311 /* there may be multiple initial creations of this object, but we only
312 * want one */
313 ret = -EEXIST;
314 hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
315 if (p->cache == object->cache) {
316 if (p->state >= FSCACHE_OBJECT_DYING)
317 ret = -ENOBUFS;
318 goto cant_attach_object;
319 }
320 }
321
322 /* pin the parent object */
323 spin_lock_nested(&cookie->parent->lock, 1);
324 hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
325 cookie_link) {
326 if (p->cache == object->cache) {
327 if (p->state >= FSCACHE_OBJECT_DYING) {
328 ret = -ENOBUFS;
329 spin_unlock(&cookie->parent->lock);
330 goto cant_attach_object;
331 }
332 object->parent = p;
333 spin_lock(&p->lock);
334 p->n_children++;
335 spin_unlock(&p->lock);
336 break;
337 }
338 }
339 spin_unlock(&cookie->parent->lock);
340
341 /* attach to the cache's object list */
342 if (list_empty(&object->cache_link)) {
343 spin_lock(&cache->object_list_lock);
344 list_add(&object->cache_link, &cache->object_list);
345 spin_unlock(&cache->object_list_lock);
346 }
347
348 /* attach to the cookie */
349 object->cookie = cookie;
350 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
352 ret = 0;
353
354cant_attach_object:
355 spin_unlock(&cookie->lock);
356 _leave(" = %d", ret);
357 return ret;
358}
359
360/*
361 * update the index entries backing a cookie
362 */
363void __fscache_update_cookie(struct fscache_cookie *cookie)
364{
365 struct fscache_object *object;
366 struct hlist_node *_p;
367
368 fscache_stat(&fscache_n_updates);
369
370 if (!cookie) {
371 fscache_stat(&fscache_n_updates_null);
372 _leave(" [no cookie]");
373 return;
374 }
375
376 _enter("{%s}", cookie->def->name);
377
378 BUG_ON(!cookie->def->get_aux);
379
380 spin_lock(&cookie->lock);
381
382 /* update the index entry on disk in each cache backing this cookie */
383 hlist_for_each_entry(object, _p,
384 &cookie->backing_objects, cookie_link) {
385 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
386 }
387
388 spin_unlock(&cookie->lock);
389 _leave("");
390}
391EXPORT_SYMBOL(__fscache_update_cookie);
392
393/*
394 * release a cookie back to the cache
395 * - the object will be marked as recyclable on disk if retire is true
396 * - all dependents of this cookie must have already been unregistered
397 * (indices/files/pages)
398 */
399void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
400{
401 struct fscache_cache *cache;
402 struct fscache_object *object;
403 unsigned long event;
404
405 fscache_stat(&fscache_n_relinquishes);
406
407 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null);
409 _leave(" [no cookie]");
410 return;
411 }
412
413 _enter("%p{%s,%p},%d",
414 cookie, cookie->def->name, cookie->netfs_data, retire);
415
416 if (atomic_read(&cookie->n_children) != 0) {
417 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
418 cookie->def->name);
419 BUG();
420 }
421
422 /* wait for the cookie to finish being instantiated (or to fail) */
423 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
424 fscache_stat(&fscache_n_relinquishes_waitcrt);
425 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
426 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
427 }
428
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock);
433
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first,
440 struct fscache_object,
441 cookie_link);
442
443 _debug("RELEASE OBJ%x", object->debug_id);
444
445 /* detach each cache object from the object cookie */
446 spin_lock(&object->lock);
447 hlist_del_init(&object->cookie_link);
448
449 cache = object->cache;
450 object->cookie = NULL;
451 fscache_raise_event(object, event);
452 spin_unlock(&object->lock);
453
454 if (atomic_dec_and_test(&cookie->usage))
455 /* the cookie refcount shouldn't be reduced to 0 yet */
456 BUG();
457 }
458
459 spin_unlock(&cookie->lock);
460
461 if (cookie->parent) {
462 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
463 ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
464 atomic_dec(&cookie->parent->n_children);
465 }
466
467 /* finally dispose of the cookie */
468 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
469 fscache_cookie_put(cookie);
470
471 _leave("");
472}
473EXPORT_SYMBOL(__fscache_relinquish_cookie);
474
475/*
476 * destroy a cookie
477 */
478void __fscache_cookie_put(struct fscache_cookie *cookie)
479{
480 struct fscache_cookie *parent;
481
482 _enter("%p", cookie);
483
484 for (;;) {
485 _debug("FREE COOKIE %p", cookie);
486 parent = cookie->parent;
487 BUG_ON(!hlist_empty(&cookie->backing_objects));
488 kmem_cache_free(fscache_cookie_jar, cookie);
489
490 if (!parent)
491 break;
492
493 cookie = parent;
494 BUG_ON(atomic_read(&cookie->usage) <= 0);
495 if (!atomic_dec_and_test(&cookie->usage))
496 break;
497 }
498
499 _leave("");
500}
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
1/* Filesystem index definition
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include "internal.h"
15
16static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
17 void *buffer, uint16_t bufmax);
18
19static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
20 void *buffer, uint16_t bufmax);
21
22static
23enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
24 const void *data,
25 uint16_t datalen);
26
27/*
28 * The root index is owned by FS-Cache itself.
29 *
30 * When a netfs requests caching facilities, FS-Cache will, if one doesn't
31 * already exist, create an entry in the root index with the key being the name
32 * of the netfs ("AFS" for example), and the auxiliary data holding the index
33 * structure version supplied by the netfs:
34 *
35 * FSDEF
36 * |
37 * +-----------+
38 * | |
39 * NFS AFS
40 * [v=1] [v=1]
41 *
42 * If an entry with the appropriate name does already exist, the version is
43 * compared. If the version is different, the entire subtree from that entry
44 * will be discarded and a new entry created.
45 *
46 * The new entry will be an index, and a cookie referring to it will be passed
47 * to the netfs. This is then the root handle by which the netfs accesses the
48 * cache. It can create whatever objects it likes in that index, including
49 * further indices.
50 */
51static struct fscache_cookie_def fscache_fsdef_index_def = {
52 .name = ".FS-Cache",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54};
55
56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def,
61};
62EXPORT_SYMBOL(fscache_fsdef_index);
63
64/*
65 * Definition of an entry in the root index. Each entry is an index, keyed to
66 * a specific netfs and only applicable to a particular version of the index
67 * structure used by that netfs.
68 */
69struct fscache_cookie_def fscache_fsdef_netfs_def = {
70 .name = "FSDEF.netfs",
71 .type = FSCACHE_COOKIE_TYPE_INDEX,
72 .get_key = fscache_fsdef_netfs_get_key,
73 .get_aux = fscache_fsdef_netfs_get_aux,
74 .check_aux = fscache_fsdef_netfs_check_aux,
75};
76
77/*
78 * get the key data for an FSDEF index record - this is the name of the netfs
79 * for which this entry is created
80 */
81static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
82 void *buffer, uint16_t bufmax)
83{
84 const struct fscache_netfs *netfs = cookie_netfs_data;
85 unsigned klen;
86
87 _enter("{%s.%u},", netfs->name, netfs->version);
88
89 klen = strlen(netfs->name);
90 if (klen > bufmax)
91 return 0;
92
93 memcpy(buffer, netfs->name, klen);
94 return klen;
95}
96
97/*
98 * get the auxiliary data for an FSDEF index record - this is the index
 99 * structure version number of the netfs for which this entry is created
100 */
101static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
102 void *buffer, uint16_t bufmax)
103{
104 const struct fscache_netfs *netfs = cookie_netfs_data;
105 unsigned dlen;
106
107 _enter("{%s.%u},", netfs->name, netfs->version);
108
109 dlen = sizeof(uint32_t);
110 if (dlen > bufmax)
111 return 0;
112
113 memcpy(buffer, &netfs->version, dlen);
114 return dlen;
115}
116
117/*
118 * check that the index structure version number stored in the auxiliary data
119 * matches the one the netfs gave us
120 */
121static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
122 void *cookie_netfs_data,
123 const void *data,
124 uint16_t datalen)
125{
126 struct fscache_netfs *netfs = cookie_netfs_data;
127 uint32_t version;
128
129 _enter("{%s},,%hu", netfs->name, datalen);
130
131 if (datalen != sizeof(version)) {
132 _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
133 return FSCACHE_CHECKAUX_OBSOLETE;
134 }
135
136 memcpy(&version, data, sizeof(version));
137 if (version != netfs->version) {
138 _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
139 return FSCACHE_CHECKAUX_OBSOLETE;
140 }
141
142 _leave(" = OKAY");
143 return FSCACHE_CHECKAUX_OKAY;
144}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
1/* FS-Cache latency histogram
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18atomic_t fscache_obj_instantiate_histogram[HZ];
19atomic_t fscache_objs_histogram[HZ];
20atomic_t fscache_ops_histogram[HZ];
21atomic_t fscache_retrieval_delay_histogram[HZ];
22atomic_t fscache_retrieval_histogram[HZ];
23
24/*
25 * display the time-taken histogram
26 */
27static int fscache_histogram_show(struct seq_file *m, void *v)
28{
29 unsigned long index;
30 unsigned n[5], t;
31
32 switch ((unsigned long) v) {
33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
35 " RETRV DLY RETRIEVLS\n");
36 return 0;
37 case 2:
38 seq_puts(m, "===== ===== ========= ========= ========="
39 " ========= =========\n");
40 return 0;
41 default:
42 index = (unsigned long) v - 3;
43 n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
44 n[1] = atomic_read(&fscache_ops_histogram[index]);
45 n[2] = atomic_read(&fscache_objs_histogram[index]);
46 n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
47 n[4] = atomic_read(&fscache_retrieval_histogram[index]);
48 if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
49 return 0;
50
51 t = (index * 1000) / HZ;
52
53 seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
54 index, t, n[0], n[1], n[2], n[3], n[4]);
55 return 0;
56 }
57}
58
59/*
60 * set up the iterator to start reading from the first line
61 */
62static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
63{
64 if ((unsigned long long)*_pos >= HZ + 2)
65 return NULL;
66 if (*_pos == 0)
67 *_pos = 1;
68 return (void *)(unsigned long) *_pos;
69}
70
71/*
72 * move to the next line
73 */
74static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
75{
76 (*pos)++;
77 return (unsigned long long)*pos > HZ + 2 ?
78 NULL : (void *)(unsigned long) *pos;
79}
80
81/*
82 * clean up after reading
83 */
84static void fscache_histogram_stop(struct seq_file *m, void *v)
85{
86}
87
88static const struct seq_operations fscache_histogram_ops = {
89 .start = fscache_histogram_start,
90 .stop = fscache_histogram_stop,
91 .next = fscache_histogram_next,
92 .show = fscache_histogram_show,
93};
94
95/*
96 * open "/proc/fs/fscache/histogram" to provide latency data
97 */
98static int fscache_histogram_open(struct inode *inode, struct file *file)
99{
100 return seq_open(file, &fscache_histogram_ops);
101}
102
103const struct file_operations fscache_histogram_fops = {
104 .owner = THIS_MODULE,
105 .open = fscache_histogram_open,
106 .read = seq_read,
107 .llseek = seq_lseek,
108 .release = seq_release,
109};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..e0cbd16f6dc9
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
1/* Internal definitions for FS-Cache
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/*
13 * Lock order, in the order in which multiple locks should be obtained:
14 * - fscache_addremove_sem
15 * - cookie->lock
16 * - cookie->parent->lock
17 * - cache->object_list_lock
18 * - object->lock
19 * - object->parent->lock
20 * - fscache_thread_lock
21 *
22 */
23
24#include <linux/fscache-cache.h>
25#include <linux/sched.h>
26
27#define FSCACHE_MIN_THREADS 4
28#define FSCACHE_MAX_THREADS 32
29
30/*
 31 * cache.c
32 */
33extern struct list_head fscache_cache_list;
34extern struct rw_semaphore fscache_addremove_sem;
35
36extern struct fscache_cache *fscache_select_cache_for_object(
37 struct fscache_cookie *);
38
39/*
 40 * cookie.c
41 */
42extern struct kmem_cache *fscache_cookie_jar;
43
44extern void fscache_cookie_init_once(void *);
45extern void __fscache_cookie_put(struct fscache_cookie *);
46
47/*
 48 * fsdef.c
49 */
50extern struct fscache_cookie fscache_fsdef_index;
51extern struct fscache_cookie_def fscache_fsdef_netfs_def;
52
53/*
 54 * histogram.c
55 */
56#ifdef CONFIG_FSCACHE_HISTOGRAM
57extern atomic_t fscache_obj_instantiate_histogram[HZ];
58extern atomic_t fscache_objs_histogram[HZ];
59extern atomic_t fscache_ops_histogram[HZ];
60extern atomic_t fscache_retrieval_delay_histogram[HZ];
61extern atomic_t fscache_retrieval_histogram[HZ];
62
63static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
64{
65 unsigned long jif = jiffies - start_jif;
66 if (jif >= HZ)
67 jif = HZ - 1;
68 atomic_inc(&histogram[jif]);
69}
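fscache_hist() banks an elapsed time into jiffy buckets, saturating at HZ-1 so that anything taking a second or more lands in the last bucket. Use is a sample-then-record pair, sketched here against one of the histograms declared above:

	unsigned long start_jif = jiffies;

	/* ... the operation being timed ... */

	fscache_hist(fscache_ops_histogram, start_jif);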
70
71extern const struct file_operations fscache_histogram_fops;
72
73#else
74#define fscache_hist(hist, start_jif) do {} while (0)
75#endif
76
77/*
 78 * main.c
79 */
80extern unsigned fscache_defer_lookup;
81extern unsigned fscache_defer_create;
82extern unsigned fscache_debug;
83extern struct kobject *fscache_root;
84
85extern int fscache_wait_bit(void *);
86extern int fscache_wait_bit_interruptible(void *);
87
88/*
 89 * object.c
90 */
91extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *);
94
95/*
 96 * operation.c
97 */
98extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *);
105
106/*
107 * proc.c
108 */
109#ifdef CONFIG_PROC_FS
110extern int __init fscache_proc_init(void);
111extern void fscache_proc_cleanup(void);
112#else
113#define fscache_proc_init() (0)
114#define fscache_proc_cleanup() do {} while (0)
115#endif
116
117/*
118 * fsc-stats.c
119 */
120#ifdef CONFIG_FSCACHE_STATS
121extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
122extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
123
124extern atomic_t fscache_n_op_pend;
125extern atomic_t fscache_n_op_run;
126extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc;
130
131extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok;
133extern atomic_t fscache_n_attr_changed_nobufs;
134extern atomic_t fscache_n_attr_changed_nomem;
135extern atomic_t fscache_n_attr_changed_calls;
136
137extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs;
141extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits;
143
144extern atomic_t fscache_n_retrievals;
145extern atomic_t fscache_n_retrievals_ok;
146extern atomic_t fscache_n_retrievals_wait;
147extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem;
151extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits;
153
154extern atomic_t fscache_n_stores;
155extern atomic_t fscache_n_stores_ok;
156extern atomic_t fscache_n_stores_again;
157extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls;
161
162extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches;
164
165extern atomic_t fscache_n_acquires;
166extern atomic_t fscache_n_acquires_null;
167extern atomic_t fscache_n_acquires_no_cache;
168extern atomic_t fscache_n_acquires_ok;
169extern atomic_t fscache_n_acquires_nobufs;
170extern atomic_t fscache_n_acquires_oom;
171
172extern atomic_t fscache_n_updates;
173extern atomic_t fscache_n_updates_null;
174extern atomic_t fscache_n_updates_run;
175
176extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt;
179
180extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data;
182extern atomic_t fscache_n_cookie_special;
183
184extern atomic_t fscache_n_object_alloc;
185extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive;
189extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead;
192
193extern atomic_t fscache_n_checkaux_none;
194extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete;
197
198static inline void fscache_stat(atomic_t *stat)
199{
200 atomic_inc(stat);
201}
202
203extern const struct file_operations fscache_stats_fops;
204#else
205
206#define fscache_stat(stat) do {} while (0)
207#endif
208
209/*
210 * raise an event on an object
211 * - if the event is not masked for that object, then the object is
212 * queued for attention by the thread pool.
213 */
214static inline void fscache_raise_event(struct fscache_object *object,
215 unsigned event)
216{
217 if (!test_and_set_bit(event, &object->events) &&
218 test_bit(event, &object->event_mask))
219 fscache_enqueue_object(object);
220}
221
222/*
223 * drop a reference to a cookie
224 */
225static inline void fscache_cookie_put(struct fscache_cookie *cookie)
226{
227 BUG_ON(atomic_read(&cookie->usage) <= 0);
228 if (atomic_dec_and_test(&cookie->usage))
229 __fscache_cookie_put(cookie);
230}
231
232/*
233 * get an extra reference to a netfs retrieval context
234 */
235static inline
236void *fscache_get_context(struct fscache_cookie *cookie, void *context)
237{
238 if (cookie->def->get_context)
239 cookie->def->get_context(cookie->netfs_data, context);
240 return context;
241}
242
243/*
244 * release a reference to a netfs retrieval context
245 */
246static inline
247void fscache_put_context(struct fscache_cookie *cookie, void *context)
248{
249 if (cookie->def->put_context)
250 cookie->def->put_context(cookie->netfs_data, context);
251}
252
253/*****************************************************************************/
254/*
255 * debug tracing
256 */
257#define dbgprintk(FMT, ...) \
258 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
259
260/* make sure we maintain the format strings, even when debugging is disabled */
261static inline __attribute__((format(printf, 1, 2)))
262void _dbprintk(const char *fmt, ...)
263{
264}
265
266#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
267#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
268#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
269
270#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
271
272#ifdef __KDEBUG
273#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
274#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
275#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
276
277#elif defined(CONFIG_FSCACHE_DEBUG)
278#define _enter(FMT, ...) \
279do { \
280 if (__do_kdebug(ENTER)) \
281 kenter(FMT, ##__VA_ARGS__); \
282} while (0)
283
284#define _leave(FMT, ...) \
285do { \
286 if (__do_kdebug(LEAVE)) \
287 kleave(FMT, ##__VA_ARGS__); \
288} while (0)
289
290#define _debug(FMT, ...) \
291do { \
292 if (__do_kdebug(DEBUG)) \
293 kdebug(FMT, ##__VA_ARGS__); \
294} while (0)
295
296#else
297#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
298#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
299#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
300#endif
301
302/*
303 * determine whether a particular optional debugging point should be logged
304 * - we need to go through three steps to persuade cpp to correctly join the
305 * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
306 */
307#define ____do_kdebug(LEVEL, POINT) \
308 unlikely((fscache_debug & \
309 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
310#define ___do_kdebug(LEVEL, POINT) \
311 ____do_kdebug(LEVEL, POINT)
312#define __do_kdebug(POINT) \
313 ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
314
315#define FSCACHE_DEBUG_CACHE 0
316#define FSCACHE_DEBUG_COOKIE 1
317#define FSCACHE_DEBUG_PAGE 2
318#define FSCACHE_DEBUG_OPERATION 3
319
320#define FSCACHE_POINT_ENTER 1
321#define FSCACHE_POINT_LEAVE 2
322#define FSCACHE_POINT_DEBUG 4
323
324#ifndef FSCACHE_DEBUG_LEVEL
325#define FSCACHE_DEBUG_LEVEL CACHE
326#endif
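To see why the three macro layers are needed, here is the expansion, step by step, for a file compiled with FSCACHE_DEBUG_LEVEL COOKIE:

	__do_kdebug(ENTER)
	  => ___do_kdebug(COOKIE, ENTER)	/* FSCACHE_DEBUG_LEVEL expands first */
	  => ____do_kdebug(COOKIE, ENTER)
	  => unlikely((fscache_debug &
	       (FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3))))
	  => unlikely((fscache_debug & (1 << 3)))

so setting bit 3 of the debug module parameter enables _enter() tracing in the cookie-level source files; each debug level owns a group of three bits (enter/leave/debug).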
327
328/*
329 * assertions
330 */
331#if 1 /* defined(__KDEBUGALL) */
332
333#define ASSERT(X) \
334do { \
335 if (unlikely(!(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTCMP(X, OP, Y) \
343do { \
344 if (unlikely(!((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#define ASSERTIF(C, X) \
354do { \
355 if (unlikely((C) && !(X))) { \
356 printk(KERN_ERR "\n"); \
357 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
358 BUG(); \
359 } \
360} while (0)
361
362#define ASSERTIFCMP(C, X, OP, Y) \
363do { \
364 if (unlikely((C) && !((X) OP (Y)))) { \
365 printk(KERN_ERR "\n"); \
366 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
367 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
368 (unsigned long)(X), (unsigned long)(Y)); \
369 BUG(); \
370 } \
371} while (0)
372
373#else
374
375#define ASSERT(X) do {} while (0)
376#define ASSERTCMP(X, OP, Y) do {} while (0)
377#define ASSERTIF(C, X) do {} while (0)
378#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
379
380#endif /* assert or not */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
1/* General filesystem local caching manager
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20MODULE_DESCRIPTION("FS Cache Manager");
21MODULE_AUTHOR("Red Hat, Inc.");
22MODULE_LICENSE("GPL");
23
24unsigned fscache_defer_lookup = 1;
25module_param_named(defer_lookup, fscache_defer_lookup, uint,
26 S_IWUSR | S_IRUGO);
 27MODULE_PARM_DESC(defer_lookup,
28 "Defer cookie lookup to background thread");
29
30unsigned fscache_defer_create = 1;
31module_param_named(defer_create, fscache_defer_create, uint,
32 S_IWUSR | S_IRUGO);
 33MODULE_PARM_DESC(defer_create,
34 "Defer cookie creation to background thread");
35
36unsigned fscache_debug;
37module_param_named(debug, fscache_debug, uint,
38 S_IWUSR | S_IRUGO);
 39MODULE_PARM_DESC(debug,
40 "FS-Cache debugging mask");
41
42struct kobject *fscache_root;
43
44/*
45 * initialise the fs caching module
46 */
47static int __init fscache_init(void)
48{
49 int ret;
50
51 ret = slow_work_register_user();
52 if (ret < 0)
53 goto error_slow_work;
54
55 ret = fscache_proc_init();
56 if (ret < 0)
57 goto error_proc;
58
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie),
61 0,
62 0,
63 fscache_cookie_init_once);
64 if (!fscache_cookie_jar) {
65 printk(KERN_NOTICE
66 "FS-Cache: Failed to allocate a cookie jar\n");
67 ret = -ENOMEM;
68 goto error_cookie_jar;
69 }
70
71 fscache_root = kobject_create_and_add("fscache", kernel_kobj);
 72 if (!fscache_root) {
 73 ret = -ENOMEM;
 74 goto error_kobj;
 75 }
74
75 printk(KERN_NOTICE "FS-Cache: Loaded\n");
76 return 0;
77
78error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar:
81 fscache_proc_cleanup();
82error_proc:
83 slow_work_unregister_user();
84error_slow_work:
85 return ret;
86}
87
88fs_initcall(fscache_init);
89
90/*
91 * clean up on module removal
92 */
93static void __exit fscache_exit(void)
94{
95 _enter("");
96
97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup();
100 slow_work_unregister_user();
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102}
103
104module_exit(fscache_exit);
105
106/*
107 * wait_on_bit() sleep function for uninterruptible waiting
108 */
109int fscache_wait_bit(void *flags)
110{
111 schedule();
112 return 0;
113}
114EXPORT_SYMBOL(fscache_wait_bit);
115
116/*
117 * wait_on_bit() sleep function for interruptible waiting
118 */
119int fscache_wait_bit_interruptible(void *flags)
120{
121 schedule();
122 return signal_pending(current);
123}
124EXPORT_SYMBOL(fscache_wait_bit_interruptible);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
1/* FS-Cache netfs (client) registration
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17static LIST_HEAD(fscache_netfs_list);
18
19/*
20 * register a network filesystem for caching
21 */
22int __fscache_register_netfs(struct fscache_netfs *netfs)
23{
24 struct fscache_netfs *ptr;
25 int ret;
26
27 _enter("{%s}", netfs->name);
28
29 INIT_LIST_HEAD(&netfs->link);
30
31 /* allocate a cookie for the primary index */
32 netfs->primary_index =
33 kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
34
35 if (!netfs->primary_index) {
36 _leave(" = -ENOMEM");
37 return -ENOMEM;
38 }
39
40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0);
43
44 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index;
46 netfs->primary_index->netfs_data = netfs;
47
48 atomic_inc(&netfs->primary_index->parent->usage);
49 atomic_inc(&netfs->primary_index->parent->n_children);
50
51 spin_lock_init(&netfs->primary_index->lock);
52 INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
53
54 /* check the netfs type is not already present */
55 down_write(&fscache_addremove_sem);
56
57 ret = -EEXIST;
58 list_for_each_entry(ptr, &fscache_netfs_list, link) {
59 if (strcmp(ptr->name, netfs->name) == 0)
60 goto already_registered;
61 }
62
63 list_add(&netfs->link, &fscache_netfs_list);
64 ret = 0;
65
66 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
67 netfs->name);
68
69already_registered:
70 up_write(&fscache_addremove_sem);
71
72 if (ret < 0) {
73 netfs->primary_index->parent = NULL;
74 __fscache_cookie_put(netfs->primary_index);
75 netfs->primary_index = NULL;
76 }
77
78 _leave(" = %d", ret);
79 return ret;
80}
81EXPORT_SYMBOL(__fscache_register_netfs);
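Registration itself is reached through the fscache_register_netfs() wrapper. A hedged sketch of the registration pairing (the myfs structure is hypothetical; version is the index-structure version that fscache_fsdef_netfs_check_aux() compares on later lookups):

	static struct fscache_netfs myfs_cache_netfs = {
		.name		= "myfs",
		.version	= 1,
	};

	static int __init myfs_init(void)
	{
		/* creates a "myfs" entry under the FSDEF root index; the
		 * resulting primary_index cookie parents all other cookies */
		return fscache_register_netfs(&myfs_cache_netfs);
	}

	static void __exit myfs_exit(void)
	{
		fscache_unregister_netfs(&myfs_cache_netfs);
	}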
82
83/*
84 * unregister a network filesystem from the cache
85 * - all cookies must have been released first
86 */
87void __fscache_unregister_netfs(struct fscache_netfs *netfs)
88{
89 _enter("{%s.%u}", netfs->name, netfs->version);
90
91 down_write(&fscache_addremove_sem);
92
93 list_del(&netfs->link);
94 fscache_relinquish_cookie(netfs->primary_index, 0);
95
96 up_write(&fscache_addremove_sem);
97
98 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
99 netfs->name);
100
101 _leave("");
102}
103EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
1/* FS-Cache object state machine handler
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/object.txt for a description of the
12 * object state machine and the in-kernel representations.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include "internal.h"
18
19const char *fscache_object_states[] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
28 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
29 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
30 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
31 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
32 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
33};
34EXPORT_SYMBOL(fscache_object_states);
35
36static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *);
39static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *);
42static void fscache_release_object(struct fscache_object *);
43static void fscache_withdraw_object(struct fscache_object *);
44static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *);
46
47const struct slow_work_ops fscache_object_slow_work_ops = {
48 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute,
51};
52EXPORT_SYMBOL(fscache_object_slow_work_ops);
53
54/*
 55 * we need to notify the parent when an op that we had outstanding on it
 56 * completes
57 */
58static inline void fscache_done_parent_op(struct fscache_object *object)
59{
60 struct fscache_object *parent = object->parent;
61
62 _enter("OBJ%x {OBJ%x,%x}",
63 object->debug_id, parent->debug_id, parent->n_ops);
64
65 spin_lock_nested(&parent->lock, 1);
66 parent->n_ops--;
67 parent->n_obj_ops--;
68 if (parent->n_ops == 0)
69 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
70 spin_unlock(&parent->lock);
71}
72
73/*
74 * process events that have been sent to an object's state machine
75 * - initiates parent lookup
76 * - does object lookup
77 * - does object creation
78 * - does object recycling and retirement
79 * - does object withdrawal
80 */
81static void fscache_object_state_machine(struct fscache_object *object)
82{
83 enum fscache_object_state new_state;
84
85 ASSERT(object != NULL);
86
87 _enter("{OBJ%x,%s,%lx}",
88 object->debug_id, fscache_object_states[object->state],
89 object->events);
90
91 switch (object->state) {
92 /* wait for the parent object to become ready */
93 case FSCACHE_OBJECT_INIT:
94 object->event_mask =
95 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
96 fscache_initialise_object(object);
97 goto done;
98
99 /* look up the object metadata on disk */
100 case FSCACHE_OBJECT_LOOKING_UP:
101 fscache_lookup_object(object);
102 goto lookup_transit;
103
104 /* create the object metadata on disk */
105 case FSCACHE_OBJECT_CREATING:
106 fscache_lookup_object(object);
107 goto lookup_transit;
108
109 /* handle an object becoming available; start pending
 110	 * operations and queue dependent objects for processing */
111 case FSCACHE_OBJECT_AVAILABLE:
112 fscache_object_available(object);
113 goto active_transit;
114
115 /* normal running state */
116 case FSCACHE_OBJECT_ACTIVE:
117 goto active_transit;
118
119 /* update the object metadata on disk */
120 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run);
123 object->cache->ops->update_object(object);
124 goto active_transit;
125
126 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
129 object->cache->ops->lookup_complete(object);
130
131 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
134 &object->cookie->flags))
135 wake_up_bit(&object->cookie->flags,
136 FSCACHE_COOKIE_CREATING);
137 spin_unlock(&object->lock);
138
139 fscache_done_parent_op(object);
140
141 /* wait for completion of all active operations on this object
142 * and the death of all child objects of this object */
143 case FSCACHE_OBJECT_DYING:
144 dying:
145 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
146 spin_lock(&object->lock);
147 _debug("dying OBJ%x {%d,%d}",
148 object->debug_id, object->n_ops, object->n_children);
149 if (object->n_ops == 0 && object->n_children == 0) {
150 object->event_mask &=
151 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
152 object->event_mask |=
153 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
154 (1 << FSCACHE_OBJECT_EV_RETIRE) |
155 (1 << FSCACHE_OBJECT_EV_RELEASE) |
156 (1 << FSCACHE_OBJECT_EV_ERROR);
157 } else {
158 object->event_mask &=
159 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
160 (1 << FSCACHE_OBJECT_EV_RETIRE) |
161 (1 << FSCACHE_OBJECT_EV_RELEASE) |
162 (1 << FSCACHE_OBJECT_EV_ERROR));
163 object->event_mask |=
164 1 << FSCACHE_OBJECT_EV_CLEARED;
165 }
166 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object);
168 goto terminal_transit;
169
170 /* handle an abort during initialisation */
171 case FSCACHE_OBJECT_ABORT_INIT:
172 _debug("handle abort init %lx", object->events);
173 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
174
175 spin_lock(&object->lock);
176 fscache_dequeue_object(object);
177
178 object->state = FSCACHE_OBJECT_DYING;
179 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
180 &object->cookie->flags))
181 wake_up_bit(&object->cookie->flags,
182 FSCACHE_COOKIE_CREATING);
183 spin_unlock(&object->lock);
184 goto dying;
185
186 /* handle the netfs releasing an object and possibly marking it
187 * obsolete too */
188 case FSCACHE_OBJECT_RELEASING:
189 case FSCACHE_OBJECT_RECYCLING:
190 object->event_mask &=
191 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
192 (1 << FSCACHE_OBJECT_EV_RETIRE) |
193 (1 << FSCACHE_OBJECT_EV_RELEASE) |
194 (1 << FSCACHE_OBJECT_EV_ERROR));
195 fscache_release_object(object);
196 spin_lock(&object->lock);
197 object->state = FSCACHE_OBJECT_DEAD;
198 spin_unlock(&object->lock);
199 fscache_stat(&fscache_n_object_dead);
200 goto terminal_transit;
201
202 /* handle the parent cache of this object being withdrawn from
203 * active service */
204 case FSCACHE_OBJECT_WITHDRAWING:
205 object->event_mask &=
206 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
207 (1 << FSCACHE_OBJECT_EV_RETIRE) |
208 (1 << FSCACHE_OBJECT_EV_RELEASE) |
209 (1 << FSCACHE_OBJECT_EV_ERROR));
210 fscache_withdraw_object(object);
211 spin_lock(&object->lock);
212 object->state = FSCACHE_OBJECT_DEAD;
213 spin_unlock(&object->lock);
214 fscache_stat(&fscache_n_object_dead);
215 goto terminal_transit;
216
217 /* complain about the object being woken up once it is
218 * deceased */
219 case FSCACHE_OBJECT_DEAD:
220 printk(KERN_ERR "FS-Cache:"
221 " Unexpected event in dead state %lx\n",
222 object->events & object->event_mask);
223 BUG();
224
225 default:
226 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
227 object->state);
228 BUG();
229 }
230
231 /* determine the transition from a lookup state */
232lookup_transit:
233 switch (fls(object->events & object->event_mask) - 1) {
234 case FSCACHE_OBJECT_EV_WITHDRAW:
235 case FSCACHE_OBJECT_EV_RETIRE:
236 case FSCACHE_OBJECT_EV_RELEASE:
237 case FSCACHE_OBJECT_EV_ERROR:
238 new_state = FSCACHE_OBJECT_LC_DYING;
239 goto change_state;
240 case FSCACHE_OBJECT_EV_REQUEUE:
241 goto done;
242 case -1:
243 goto done; /* sleep until event */
244 default:
245 goto unsupported_event;
246 }
247
248 /* determine the transition from an active state */
249active_transit:
250 switch (fls(object->events & object->event_mask) - 1) {
251 case FSCACHE_OBJECT_EV_WITHDRAW:
252 case FSCACHE_OBJECT_EV_RETIRE:
253 case FSCACHE_OBJECT_EV_RELEASE:
254 case FSCACHE_OBJECT_EV_ERROR:
255 new_state = FSCACHE_OBJECT_DYING;
256 goto change_state;
257 case FSCACHE_OBJECT_EV_UPDATE:
258 new_state = FSCACHE_OBJECT_UPDATING;
259 goto change_state;
260 case -1:
261 new_state = FSCACHE_OBJECT_ACTIVE;
262 goto change_state; /* sleep until event */
263 default:
264 goto unsupported_event;
265 }
266
267 /* determine the transition from a terminal state */
268terminal_transit:
269 switch (fls(object->events & object->event_mask) - 1) {
270 case FSCACHE_OBJECT_EV_WITHDRAW:
271 new_state = FSCACHE_OBJECT_WITHDRAWING;
272 goto change_state;
273 case FSCACHE_OBJECT_EV_RETIRE:
274 new_state = FSCACHE_OBJECT_RECYCLING;
275 goto change_state;
276 case FSCACHE_OBJECT_EV_RELEASE:
277 new_state = FSCACHE_OBJECT_RELEASING;
278 goto change_state;
279 case FSCACHE_OBJECT_EV_ERROR:
280 new_state = FSCACHE_OBJECT_WITHDRAWING;
281 goto change_state;
282 case FSCACHE_OBJECT_EV_CLEARED:
283 new_state = FSCACHE_OBJECT_DYING;
284 goto change_state;
285 case -1:
286 goto done; /* sleep until event */
287 default:
288 goto unsupported_event;
289 }
290
291change_state:
292 spin_lock(&object->lock);
293 object->state = new_state;
294 spin_unlock(&object->lock);
295
296done:
297 _leave(" [->%s]", fscache_object_states[object->state]);
298 return;
299
300unsupported_event:
301 printk(KERN_ERR "FS-Cache:"
302 " Unsupported event %lx [mask %lx] in state %s\n",
303 object->events, object->event_mask,
304 fscache_object_states[object->state]);
305 BUG();
306}
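/*
 * Editorial sketch (not part of the original patch): how an event drives the
 * machine above.  fls() picks the highest-numbered pending, unmasked event,
 * so e.g. EV_WITHDRAW outranks EV_UPDATE.  fscache_raise_event(), defined
 * elsewhere in this series, is assumed here to set the event bit and requeue
 * the object if the bit isn't masked off:
 *
 *	fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
 *		=> object->events gains 1 << FSCACHE_OBJECT_EV_UPDATE
 *		=> the object is requeued as slow work
 *		=> fscache_object_state_machine() runs, sees OBJECT_ACTIVE
 *		   plus the update event, transits to OBJECT_UPDATING and
 *		   calls ->update_object()
 */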
307
308/*
309 * execute an object
310 */
311static void fscache_object_slow_work_execute(struct slow_work *work)
312{
313 struct fscache_object *object =
314 container_of(work, struct fscache_object, work);
315 unsigned long start;
316
317 _enter("{OBJ%x}", object->debug_id);
318
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies;
322 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask)
325 fscache_enqueue_object(object);
326}
327
328/*
329 * initialise an object
330 * - check the specified object's parent to see if we can make use of it
331 * immediately to do a creation
332 * - we may need to start the process of creating a parent and we need to wait
333 * for the parent's lookup and creation to complete if it's not there yet
334 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
335 * leaf-most cookies of the object and all its children
336 */
337static void fscache_initialise_object(struct fscache_object *object)
338{
339 struct fscache_object *parent;
340
341 _enter("");
342 ASSERT(object->cookie != NULL);
343 ASSERT(object->cookie->parent != NULL);
344 ASSERT(list_empty(&object->work.link));
345
346 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
347 (1 << FSCACHE_OBJECT_EV_RELEASE) |
348 (1 << FSCACHE_OBJECT_EV_RETIRE) |
349 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
350 _debug("abort init %lx", object->events);
351 spin_lock(&object->lock);
352 object->state = FSCACHE_OBJECT_ABORT_INIT;
353 spin_unlock(&object->lock);
354 return;
355 }
356
357 spin_lock(&object->cookie->lock);
358 spin_lock_nested(&object->cookie->parent->lock, 1);
359
360 parent = object->parent;
361 if (!parent) {
362 _debug("no parent");
363 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
364 } else {
365 spin_lock(&object->lock);
366 spin_lock_nested(&parent->lock, 1);
367 _debug("parent %s", fscache_object_states[parent->state]);
368
369 if (parent->state >= FSCACHE_OBJECT_DYING) {
370 _debug("bad parent");
371 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
372 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
373 _debug("wait");
374
375 /* we may get woken up in this state by child objects
376 * binding on to us, so we need to make sure we don't
377 * add ourself to the list multiple times */
378 if (list_empty(&object->dep_link)) {
379 object->cache->ops->grab_object(object);
380 list_add(&object->dep_link,
381 &parent->dependents);
382
383 /* fscache_acquire_non_index_cookie() uses this
384 * to wake the chain up */
385 if (parent->state == FSCACHE_OBJECT_INIT)
386 fscache_enqueue_object(parent);
387 }
388 } else {
389 _debug("go");
390 parent->n_ops++;
391 parent->n_obj_ops++;
392 object->lookup_jif = jiffies;
393 object->state = FSCACHE_OBJECT_LOOKING_UP;
394 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
395 }
396
397 spin_unlock(&parent->lock);
398 spin_unlock(&object->lock);
399 }
400
401 spin_unlock(&object->cookie->parent->lock);
402 spin_unlock(&object->cookie->lock);
403 _leave("");
404}
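/*
 * Editorial sketch of the dependency chain set up above: a child whose
 * parent is not yet OBJECT_AVAILABLE parks itself on parent->dependents;
 * once the parent becomes available, fscache_enqueue_dependents() (below)
 * requeues each child, whose INIT state re-runs this function and now finds
 * the parent ready:
 *
 *	child INIT:  parent < AVAILABLE -> list_add(&child->dep_link,
 *						    &parent->dependents)
 *	parent AVAILABLE:  fscache_enqueue_dependents(parent)
 *	child INIT (re-run):  parent AVAILABLE -> child -> OBJECT_LOOKING_UP
 */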
405
406/*
407 * look an object up in the cache from which it was allocated
408 * - we hold an "access lock" on the parent object, so the parent object cannot
409 * be withdrawn by either party till we've finished
410 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
411 * leaf-most cookies of the object and all its children
412 */
413static void fscache_lookup_object(struct fscache_object *object)
414{
415 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent;
417
418 _enter("");
419
420 parent = object->parent;
421 ASSERT(parent != NULL);
422 ASSERTCMP(parent->n_ops, >, 0);
423 ASSERTCMP(parent->n_obj_ops, >, 0);
424
425 /* make sure the parent is still available */
426 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
427
428 if (parent->state >= FSCACHE_OBJECT_DYING ||
429 test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
430 _debug("unavailable");
431 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
432 _leave("");
433 return;
434 }
435
436 _debug("LOOKUP \"%s/%s\" in \"%s\"",
437 parent->cookie->def->name, cookie->def->name,
438 object->cache->tag->name);
439
440 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object);
442
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445
446 _leave("");
447}
448
449/**
450 * fscache_object_lookup_negative - Note negative cookie lookup
451 * @object: Object pointing to cookie to mark
452 *
453 * Note negative lookup, permitting those waiting to read data from an already
454 * existing backing object to continue as there's no data for them to read.
455 */
456void fscache_object_lookup_negative(struct fscache_object *object)
457{
458 struct fscache_cookie *cookie = object->cookie;
459
460 _enter("{OBJ%x,%s}",
461 object->debug_id, fscache_object_states[object->state]);
462
463 spin_lock(&object->lock);
464 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
465 fscache_stat(&fscache_n_object_lookups_negative);
466
467 /* transit here to allow write requests to begin stacking up
468 * and read requests to begin returning ENODATA */
469 object->state = FSCACHE_OBJECT_CREATING;
470 spin_unlock(&object->lock);
471
472 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
473 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
474
475 _debug("wake up lookup %p", &cookie->flags);
476 smp_mb__before_clear_bit();
477 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
478 smp_mb__after_clear_bit();
479 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
480 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
481 } else {
482 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
483 spin_unlock(&object->lock);
484 }
485
486 _leave("");
487}
488EXPORT_SYMBOL(fscache_object_lookup_negative);
489
490/**
491 * fscache_obtained_object - Note successful object lookup or creation
492 * @object: Object pointing to cookie to mark
493 *
494 * Note successful lookup and/or creation, permitting those waiting to write
495 * data to a backing object to continue.
496 *
497 * Note that after calling this, an object's cookie may be relinquished by the
498 * netfs, and so must be accessed with object lock held.
499 */
500void fscache_obtained_object(struct fscache_object *object)
501{
502 struct fscache_cookie *cookie = object->cookie;
503
504 _enter("{OBJ%x,%s}",
505 object->debug_id, fscache_object_states[object->state]);
506
507 /* if we were still looking up, then we must have a positive lookup
508 * result, in which case there may be data available */
509 spin_lock(&object->lock);
510 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
511 fscache_stat(&fscache_n_object_lookups_positive);
512
513 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
514
515 object->state = FSCACHE_OBJECT_AVAILABLE;
516 spin_unlock(&object->lock);
517
518 smp_mb__before_clear_bit();
519 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
520 smp_mb__after_clear_bit();
521 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
522 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
523 } else {
524 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
525 fscache_stat(&fscache_n_object_created);
526
527 object->state = FSCACHE_OBJECT_AVAILABLE;
528 spin_unlock(&object->lock);
529 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
530 smp_wmb();
531 }
532
533 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
534 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
535
536 _leave("");
537}
538EXPORT_SYMBOL(fscache_obtained_object);
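/*
 * Editorial sketch (hypothetical backend code, not part of the patch): a
 * cache backend's ->lookup_object() is expected to pair the two calls above,
 * reporting a miss before creating the backing store and then announcing the
 * object.  The helper names here are invented for illustration:
 *
 *	static void my_lookup_object(struct fscache_object *object)
 *	{
 *		if (!my_backing_file_exists(object)) {
 *			fscache_object_lookup_negative(object);
 *			my_create_backing_file(object);
 *		}
 *		fscache_obtained_object(object);
 *	}
 */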
539
540/*
541 * handle an object that has just become available
542 */
543static void fscache_object_available(struct fscache_object *object)
544{
545 _enter("{OBJ%x}", object->debug_id);
546
547 spin_lock(&object->lock);
548
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551
552 fscache_done_parent_op(object);
553 if (object->n_in_progress == 0) {
554 if (object->n_ops > 0) {
555 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
556 ASSERTIF(object->n_ops > object->n_obj_ops,
557 !list_empty(&object->pending_ops));
558 fscache_start_operations(object);
559 } else {
560 ASSERT(list_empty(&object->pending_ops));
561 }
562 }
563 spin_unlock(&object->lock);
564
565 object->cache->ops->lookup_complete(object);
566 fscache_enqueue_dependents(object);
567
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
569 fscache_stat(&fscache_n_object_avail);
570
571 _leave("");
572}
573
574/*
575 * drop an object's attachments
576 */
577static void fscache_drop_object(struct fscache_object *object)
578{
579 struct fscache_object *parent = object->parent;
580 struct fscache_cache *cache = object->cache;
581
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583
584 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock);
587
588 cache->ops->drop_object(object);
589
590 if (parent) {
591 _debug("release parent OBJ%x {%d}",
592 parent->debug_id, parent->n_children);
593
594 spin_lock(&parent->lock);
595 parent->n_children--;
596 if (parent->n_children == 0)
597 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
598 spin_unlock(&parent->lock);
599 object->parent = NULL;
600 }
601
602 /* this just shifts the object release to the slow work processor */
603 object->cache->ops->put_object(object);
604
605 _leave("");
606}
607
608/*
609 * release or recycle an object that the netfs has discarded
610 */
611static void fscache_release_object(struct fscache_object *object)
612{
613 _enter("");
614
615 fscache_drop_object(object);
616}
617
618/*
619 * withdraw an object from active service
620 */
621static void fscache_withdraw_object(struct fscache_object *object)
622{
623 struct fscache_cookie *cookie;
624 bool detached;
625
626 _enter("");
627
628 spin_lock(&object->lock);
629 cookie = object->cookie;
630 if (cookie) {
631 /* need to get the cookie lock before the object lock, starting
632 * from the object pointer */
633 atomic_inc(&cookie->usage);
634 spin_unlock(&object->lock);
635
636 detached = false;
637 spin_lock(&cookie->lock);
638 spin_lock(&object->lock);
639
640 if (object->cookie == cookie) {
641 hlist_del_init(&object->cookie_link);
642 object->cookie = NULL;
643 detached = true;
644 }
645 spin_unlock(&cookie->lock);
646 fscache_cookie_put(cookie);
647 if (detached)
648 fscache_cookie_put(cookie);
649 }
650
651 spin_unlock(&object->lock);
652
653 fscache_drop_object(object);
654}
655
656/*
657 * withdraw an object from active service at the behest of the cache
 658 * - need to break the links to a cached object cookie
 659 * - called under two situations:
 660 *   (1) recycler decides to reclaim an in-use object
 661 *   (2) a cache is unmounted
 662 * - have to take care as the cookie may be relinquished by the netfs at the
 663 *   same time
664 * - the object is pinned by the caller holding a refcount on it
665 */
666void fscache_withdrawing_object(struct fscache_cache *cache,
667 struct fscache_object *object)
668{
669 bool enqueue = false;
670
671 _enter(",OBJ%x", object->debug_id);
672
673 spin_lock(&object->lock);
674 if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
675 object->state = FSCACHE_OBJECT_WITHDRAWING;
676 enqueue = true;
677 }
678 spin_unlock(&object->lock);
679
680 if (enqueue)
681 fscache_enqueue_object(object);
682
683 _leave("");
684}
685
686/*
687 * allow the slow work item processor to get a ref on an object
688 */
689static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{
691 struct fscache_object *object =
692 container_of(work, struct fscache_object, work);
693
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
695}
696
697/*
698 * allow the slow work item processor to discard a ref on a work item
699 */
700static void fscache_object_slow_work_put_ref(struct slow_work *work)
701{
702 struct fscache_object *object =
703 container_of(work, struct fscache_object, work);
704
 705	object->cache->ops->put_object(object);
706}
707
708/*
709 * enqueue an object for metadata-type processing
710 */
711void fscache_enqueue_object(struct fscache_object *object)
712{
713 _enter("{OBJ%x}", object->debug_id);
714
715 slow_work_enqueue(&object->work);
716}
717
718/*
719 * enqueue the dependents of an object for metadata-type processing
720 * - the caller must hold the object's lock
721 * - this may cause an already locked object to wind up being processed again
722 */
723static void fscache_enqueue_dependents(struct fscache_object *object)
724{
725 struct fscache_object *dep;
726
727 _enter("{OBJ%x}", object->debug_id);
728
729 if (list_empty(&object->dependents))
730 return;
731
732 spin_lock(&object->lock);
733
734 while (!list_empty(&object->dependents)) {
735 dep = list_entry(object->dependents.next,
736 struct fscache_object, dep_link);
737 list_del_init(&dep->dep_link);
738
739
740 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep);
742 dep->cache->ops->put_object(dep);
743
744 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock);
746 }
747
748 spin_unlock(&object->lock);
749}
750
751/*
752 * remove an object from whatever queue it's waiting on
753 * - the caller must hold object->lock
754 */
755void fscache_dequeue_object(struct fscache_object *object)
756{
757 _enter("{OBJ%x}", object->debug_id);
758
759 if (!list_empty(&object->dep_link)) {
760 spin_lock(&object->parent->lock);
761 list_del_init(&object->dep_link);
762 spin_unlock(&object->parent->lock);
763 }
764
765 _leave("");
766}
767
768/**
769 * fscache_check_aux - Ask the netfs whether an object on disk is still valid
770 * @object: The object to ask about
771 * @data: The auxiliary data for the object
772 * @datalen: The size of the auxiliary data
773 *
774 * This function consults the netfs about the coherency state of an object
775 */
776enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
777 const void *data, uint16_t datalen)
778{
779 enum fscache_checkaux result;
780
781 if (!object->cookie->def->check_aux) {
782 fscache_stat(&fscache_n_checkaux_none);
783 return FSCACHE_CHECKAUX_OKAY;
784 }
785
786 result = object->cookie->def->check_aux(object->cookie->netfs_data,
787 data, datalen);
788 switch (result) {
789 /* entry okay as is */
790 case FSCACHE_CHECKAUX_OKAY:
791 fscache_stat(&fscache_n_checkaux_okay);
792 break;
793
794 /* entry requires update */
795 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
796 fscache_stat(&fscache_n_checkaux_update);
797 break;
798
799 /* entry requires deletion */
800 case FSCACHE_CHECKAUX_OBSOLETE:
801 fscache_stat(&fscache_n_checkaux_obsolete);
802 break;
803
804 default:
805 BUG();
806 }
807
808 return result;
809}
810EXPORT_SYMBOL(fscache_check_aux);
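/*
 * Editorial sketch (hypothetical netfs callback, not part of the patch): a
 * typical ->check_aux() compares the auxiliary blob stored with the object
 * against the netfs's current idea of the inode, e.g. an {mtime,size} pair:
 */
#if 0	/* illustration only; my_inode and its fields are invented */
struct my_aux { u64 mtime; u64 size; };

static enum fscache_checkaux my_check_aux(void *netfs_data,
					  const void *data, uint16_t datalen)
{
	struct my_inode *inode = netfs_data;
	const struct my_aux *aux = data;

	if (datalen != sizeof(*aux))
		return FSCACHE_CHECKAUX_OBSOLETE;	/* junk on disk */
	if (aux->mtime != inode->mtime)
		return FSCACHE_CHECKAUX_OBSOLETE;	/* data is stale */
	if (aux->size != inode->size)
		return FSCACHE_CHECKAUX_NEEDS_UPDATE;	/* rewrite the aux */
	return FSCACHE_CHECKAUX_OKAY;
}
#endif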
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
1/* FS-Cache worker operation management routines
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/operations.txt
12 */
13
14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h>
16#include "internal.h"
17
18atomic_t fscache_op_debug_id;
19EXPORT_SYMBOL(fscache_op_debug_id);
20
21/**
22 * fscache_enqueue_operation - Enqueue an operation for processing
23 * @op: The operation to enqueue
24 *
25 * Enqueue an operation for processing by the FS-Cache thread pool.
26 *
27 * This will get its own ref on the object.
28 */
29void fscache_enqueue_operation(struct fscache_operation *op)
30{
31 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33
34 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0);
37
38 if (list_empty(&op->pend_link)) {
39 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST:
41 _debug("queue fast");
42 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op);
45 break;
46 case FSCACHE_OP_SLOW:
47 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work);
49 break;
50 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention");
52 break;
53 default:
 54			printk(KERN_ERR "FS-Cache: Unexpected op type %lx\n",
55 op->flags);
56 BUG();
57 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 }
61}
62EXPORT_SYMBOL(fscache_enqueue_operation);
63
64/*
65 * start an op running
66 */
67static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op)
69{
70 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
73 if (op->processor)
74 fscache_enqueue_operation(op);
75 fscache_stat(&fscache_n_op_run);
76}
77
78/*
79 * submit an exclusive operation for an object
80 * - other ops are excluded from running simultaneously with this one
81 * - this gets any extra refs it needs on an op
82 */
83int fscache_submit_exclusive_op(struct fscache_object *object,
84 struct fscache_operation *op)
85{
86 int ret;
87
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89
90 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
93
94 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) {
96 op->object = object;
97 object->n_ops++;
98 object->n_exclusive++; /* reads and writes must wait */
99
 100		if (object->n_in_progress > 0) {
101 atomic_inc(&op->usage);
102 list_add_tail(&op->pend_link, &object->pending_ops);
103 fscache_stat(&fscache_n_op_pend);
104 } else if (!list_empty(&object->pending_ops)) {
105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend);
108 fscache_start_operations(object);
109 } else {
110 ASSERTCMP(object->n_in_progress, ==, 0);
111 fscache_run_op(object, op);
112 }
113
114 /* need to issue a new write op after this */
115 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
116 ret = 0;
117 } else if (object->state == FSCACHE_OBJECT_CREATING) {
118 op->object = object;
119 object->n_ops++;
120 object->n_exclusive++; /* reads and writes must wait */
121 atomic_inc(&op->usage);
122 list_add_tail(&op->pend_link, &object->pending_ops);
123 fscache_stat(&fscache_n_op_pend);
124 ret = 0;
125 } else {
126 /* not allowed to submit ops in any other state */
127 BUG();
128 }
129
130 spin_unlock(&object->lock);
131 return ret;
132}
133
134/*
135 * report an unexpected submission
136 */
137static void fscache_report_unexpected_submission(struct fscache_object *object,
138 struct fscache_operation *op,
139 unsigned long ostate)
140{
141 static bool once_only;
142 struct fscache_operation *p;
143 unsigned n;
144
145 if (once_only)
146 return;
147 once_only = true;
148
149 kdebug("unexpected submission OP%x [OBJ%x %s]",
150 op->debug_id, object->debug_id,
151 fscache_object_states[object->state]);
152 kdebug("objstate=%s [%s]",
153 fscache_object_states[object->state],
154 fscache_object_states[ostate]);
155 kdebug("objflags=%lx", object->flags);
156 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
157 kdebug("ops=%u inp=%u exc=%u",
158 object->n_ops, object->n_in_progress, object->n_exclusive);
159
160 if (!list_empty(&object->pending_ops)) {
161 n = 0;
162 list_for_each_entry(p, &object->pending_ops, pend_link) {
163 ASSERTCMP(p->object, ==, object);
 164			kdebug("%p %p", p->processor, p->release);
165 n++;
166 }
167
168 kdebug("n=%u", n);
169 }
170
171 dump_stack();
172}
173
174/*
175 * submit an operation for an object
176 * - objects may be submitted only in the following states:
177 * - during object creation (write ops may be submitted)
178 * - whilst the object is active
179 * - after an I/O error incurred in one of the two above states (op rejected)
180 * - this gets any extra refs it needs on an op
181 */
182int fscache_submit_op(struct fscache_object *object,
183 struct fscache_operation *op)
184{
185 unsigned long ostate;
186 int ret;
187
188 _enter("{OBJ%x OP%x},{%u}",
189 object->debug_id, op->debug_id, atomic_read(&op->usage));
190
191 ASSERTCMP(atomic_read(&op->usage), >, 0);
192
193 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
196
197 ostate = object->state;
198 smp_rmb();
199
200 if (fscache_object_is_active(object)) {
201 op->object = object;
202 object->n_ops++;
203
204 if (object->n_exclusive > 0) {
205 atomic_inc(&op->usage);
206 list_add_tail(&op->pend_link, &object->pending_ops);
207 fscache_stat(&fscache_n_op_pend);
208 } else if (!list_empty(&object->pending_ops)) {
209 atomic_inc(&op->usage);
210 list_add_tail(&op->pend_link, &object->pending_ops);
211 fscache_stat(&fscache_n_op_pend);
212 fscache_start_operations(object);
213 } else {
214 ASSERTCMP(object->n_exclusive, ==, 0);
215 fscache_run_op(object, op);
216 }
217 ret = 0;
218 } else if (object->state == FSCACHE_OBJECT_CREATING) {
219 op->object = object;
220 object->n_ops++;
221 atomic_inc(&op->usage);
222 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend);
224 ret = 0;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object));
228 ret = -ENOBUFS;
229 } else {
230 ret = -ENOBUFS;
231 }
232
233 spin_unlock(&object->lock);
234 return ret;
235}
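/*
 * Editorial sketch: the reference discipline expected by the two submit
 * functions above, mirroring __fscache_attr_changed() later in this series.
 * The caller allocates an op with one ref; submission takes any extra refs
 * it needs; the caller then drops its own ref (the release function name is
 * hypothetical):
 *
 *	op = kzalloc(sizeof(*op), GFP_NOIO);
 *	fscache_operation_init(op, my_release_op);
 *	op->flags = FSCACHE_OP_MYTHREAD;
 *	if (fscache_submit_op(object, op) < 0)
 *		kfree(op);			// never submitted: no refs taken
 *	else
 *		fscache_put_operation(op);	// drop the allocation ref
 */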
236
237/*
238 * queue an object for withdrawal on error, aborting all following asynchronous
239 * operations
240 */
241void fscache_abort_object(struct fscache_object *object)
242{
243 _enter("{OBJ%x}", object->debug_id);
244
245 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
246}
247
248/*
249 * jump start the operation processing on an object
250 * - caller must hold object->lock
251 */
252void fscache_start_operations(struct fscache_object *object)
253{
254 struct fscache_operation *op;
255 bool stop = false;
256
257 while (!list_empty(&object->pending_ops) && !stop) {
258 op = list_entry(object->pending_ops.next,
259 struct fscache_operation, pend_link);
260
261 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
262 if (object->n_in_progress > 0)
263 break;
264 stop = true;
265 }
266 list_del_init(&op->pend_link);
267 object->n_in_progress++;
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273
274 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op);
276 }
277
278 ASSERTCMP(object->n_in_progress, <=, object->n_ops);
279
280 _debug("woke %d ops on OBJ%x",
281 object->n_in_progress, object->debug_id);
282}
283
284/*
285 * release an operation
286 * - queues pending ops if this is the last in-progress op
287 */
288void fscache_put_operation(struct fscache_operation *op)
289{
290 struct fscache_object *object;
291 struct fscache_cache *cache;
292
293 _enter("{OBJ%x OP%x,%d}",
294 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
295
296 ASSERTCMP(atomic_read(&op->usage), >, 0);
297
298 if (!atomic_dec_and_test(&op->usage))
299 return;
300
301 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG();
304
305 fscache_stat(&fscache_n_op_release);
306
307 if (op->release) {
308 op->release(op);
309 op->release = NULL;
310 }
311
312 object = op->object;
313
314 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */
317 if (!spin_trylock(&object->lock)) {
318 _debug("defer put");
319 fscache_stat(&fscache_n_op_deferred_release);
320
321 cache = object->cache;
322 spin_lock(&cache->op_gc_list_lock);
323 list_add_tail(&op->pend_link, &cache->op_gc_list);
324 spin_unlock(&cache->op_gc_list_lock);
325 schedule_work(&cache->op_gc);
326 _leave(" [defer]");
327 return;
328 }
329
330 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
331 ASSERTCMP(object->n_exclusive, >, 0);
332 object->n_exclusive--;
333 }
334
335 ASSERTCMP(object->n_in_progress, >, 0);
336 object->n_in_progress--;
337 if (object->n_in_progress == 0)
338 fscache_start_operations(object);
339
340 ASSERTCMP(object->n_ops, >, 0);
341 object->n_ops--;
342 if (object->n_ops == 0)
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
344
345 spin_unlock(&object->lock);
346
347 kfree(op);
348 _leave(" [done]");
349}
350EXPORT_SYMBOL(fscache_put_operation);
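/*
 * Editorial note, restating the trylock logic above: a put may happen with
 * object->lock already held (e.g. from within the state machine), so rather
 * than risk self-deadlock, the op is parked on cache->op_gc_list and
 * fscache_operation_gc() (below) finishes the accounting from process
 * context:
 *
 *	spin_trylock fails -> list_add_tail(&op->pend_link,
 *					    &cache->op_gc_list)
 *			   -> schedule_work(&cache->op_gc)
 */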
351
352/*
353 * garbage collect operations that have had their release deferred
354 */
355void fscache_operation_gc(struct work_struct *work)
356{
357 struct fscache_operation *op;
358 struct fscache_object *object;
359 struct fscache_cache *cache =
360 container_of(work, struct fscache_cache, op_gc);
361 int count = 0;
362
363 _enter("");
364
365 do {
366 spin_lock(&cache->op_gc_list_lock);
367 if (list_empty(&cache->op_gc_list)) {
368 spin_unlock(&cache->op_gc_list_lock);
369 break;
370 }
371
372 op = list_entry(cache->op_gc_list.next,
373 struct fscache_operation, pend_link);
374 list_del(&op->pend_link);
375 spin_unlock(&cache->op_gc_list_lock);
376
377 object = op->object;
378
379 _debug("GC DEFERRED REL OBJ%x OP%x",
380 object->debug_id, op->debug_id);
381 fscache_stat(&fscache_n_op_gc);
382
383 ASSERTCMP(atomic_read(&op->usage), ==, 0);
384
385 spin_lock(&object->lock);
386 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
387 ASSERTCMP(object->n_exclusive, >, 0);
388 object->n_exclusive--;
389 }
390
391 ASSERTCMP(object->n_in_progress, >, 0);
392 object->n_in_progress--;
393 if (object->n_in_progress == 0)
394 fscache_start_operations(object);
395
396 ASSERTCMP(object->n_ops, >, 0);
397 object->n_ops--;
398 if (object->n_ops == 0)
399 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
400
401 spin_unlock(&object->lock);
402
403 } while (count++ < 20);
404
405 if (!list_empty(&cache->op_gc_list))
406 schedule_work(&cache->op_gc);
407
408 _leave("");
409}
410
411/*
412 * allow the slow work item processor to get a ref on an operation
413 */
414static int fscache_op_get_ref(struct slow_work *work)
415{
416 struct fscache_operation *op =
417 container_of(work, struct fscache_operation, slow_work);
418
419 atomic_inc(&op->usage);
420 return 0;
421}
422
423/*
424 * allow the slow work item processor to discard a ref on an operation
425 */
426static void fscache_op_put_ref(struct slow_work *work)
427{
428 struct fscache_operation *op =
429 container_of(work, struct fscache_operation, slow_work);
430
431 fscache_put_operation(op);
432}
433
434/*
435 * execute an operation using the slow thread pool to provide processing context
436 * - the caller holds a ref to this object, so we don't need to hold one
437 */
438static void fscache_op_execute(struct slow_work *work)
439{
440 struct fscache_operation *op =
441 container_of(work, struct fscache_operation, slow_work);
442 unsigned long start;
443
444 _enter("{OBJ%x OP%x,%d}",
445 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
446
447 ASSERT(op->processor != NULL);
448 start = jiffies;
449 op->processor(op);
450 fscache_hist(fscache_ops_histogram, start);
451
452 _leave("");
453}
454
455const struct slow_work_ops fscache_op_slow_work_ops = {
456 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute,
459};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
1/* Cache page management and data I/O routines
2 *
3 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL PAGE
13#include <linux/module.h>
14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19/*
20 * check to see if a page is being written to the cache
21 */
22bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
23{
24 void *val;
25
26 rcu_read_lock();
27 val = radix_tree_lookup(&cookie->stores, page->index);
28 rcu_read_unlock();
29
30 return val != NULL;
31}
32EXPORT_SYMBOL(__fscache_check_page_write);
33
34/*
35 * wait for a page to finish being written to the cache
36 */
37void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
38{
39 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
40
41 wait_event(*wq, !__fscache_check_page_write(cookie, page));
42}
43EXPORT_SYMBOL(__fscache_wait_on_page_write);
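/*
 * Editorial sketch (hypothetical netfs ->releasepage(), assuming the
 * fscache_check_page_write()/fscache_wait_on_page_write() wrappers provided
 * by the headers in this series): a netfs must not let the VM recycle a page
 * that is still being written to the cache:
 *
 *	static int my_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		if (PageFsCache(page)) {
 *			if (fscache_check_page_write(cookie, page)) {
 *				if (!(gfp & __GFP_WAIT))
 *					return 0;
 *				fscache_wait_on_page_write(cookie, page);
 *			}
 *			fscache_uncache_page(cookie, page);
 *		}
 *		return 1;
 *	}
 */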
44
45/*
46 * note that a page has finished being written to the cache
47 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
49{
50 struct page *xpage;
51
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock);
55 ASSERT(xpage != NULL);
56
57 wake_up_bit(&cookie->flags, 0);
58}
59
60/*
61 * actually apply the changed attributes to a cache object
62 */
63static void fscache_attr_changed_op(struct fscache_operation *op)
64{
65 struct fscache_object *object = op->object;
66
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68
69 fscache_stat(&fscache_n_attr_changed_calls);
70
71 if (fscache_object_is_active(object) &&
72 object->cache->ops->attr_changed(object) < 0)
73 fscache_abort_object(object);
74
75 _leave("");
76}
77
78/*
79 * notification that the attributes on an object have changed
80 */
81int __fscache_attr_changed(struct fscache_cookie *cookie)
82{
83 struct fscache_operation *op;
84 struct fscache_object *object;
85
86 _enter("%p", cookie);
87
88 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
89
90 fscache_stat(&fscache_n_attr_changed);
91
92 op = kzalloc(sizeof(*op), GFP_KERNEL);
93 if (!op) {
94 fscache_stat(&fscache_n_attr_changed_nomem);
95 _leave(" = -ENOMEM");
96 return -ENOMEM;
97 }
98
99 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
102
103 spin_lock(&cookie->lock);
104
105 if (hlist_empty(&cookie->backing_objects))
106 goto nobufs;
107 object = hlist_entry(cookie->backing_objects.first,
108 struct fscache_object, cookie_link);
109
110 if (fscache_submit_exclusive_op(object, op) < 0)
111 goto nobufs;
112 spin_unlock(&cookie->lock);
113 fscache_stat(&fscache_n_attr_changed_ok);
114 fscache_put_operation(op);
115 _leave(" = 0");
116 return 0;
117
118nobufs:
119 spin_unlock(&cookie->lock);
120 kfree(op);
121 fscache_stat(&fscache_n_attr_changed_nobufs);
122 _leave(" = %d", -ENOBUFS);
123 return -ENOBUFS;
124}
125EXPORT_SYMBOL(__fscache_attr_changed);
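/*
 * Editorial sketch: a netfs would typically call the (assumed) wrapper
 * fscache_attr_changed() after changing an attribute the cache depends on,
 * such as i_size, so the backend can resize the backing file:
 *
 *	i_size_write(inode, newsize);
 *	fscache_attr_changed(my_cookie(inode));	// hypothetical accessor
 */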
126
127/*
128 * handle secondary execution given to a retrieval op on behalf of the
129 * cache
130 */
131static void fscache_retrieval_work(struct work_struct *work)
132{
133 struct fscache_retrieval *op =
134 container_of(work, struct fscache_retrieval, op.fast_work);
135 unsigned long start;
136
137 _enter("{OP%x}", op->op.debug_id);
138
139 start = jiffies;
140 op->op.processor(&op->op);
141 fscache_hist(fscache_ops_histogram, start);
142 fscache_put_operation(&op->op);
143}
144
145/*
146 * release a retrieval op reference
147 */
148static void fscache_release_retrieval_op(struct fscache_operation *_op)
149{
150 struct fscache_retrieval *op =
151 container_of(_op, struct fscache_retrieval, op);
152
153 _enter("{OP%x}", op->op.debug_id);
154
155 fscache_hist(fscache_retrieval_histogram, op->start_time);
156 if (op->context)
157 fscache_put_context(op->op.object->cookie, op->context);
158
159 _leave("");
160}
161
162/*
163 * allocate a retrieval op
164 */
165static struct fscache_retrieval *fscache_alloc_retrieval(
166 struct address_space *mapping,
167 fscache_rw_complete_t end_io_func,
168 void *context)
169{
170 struct fscache_retrieval *op;
171
 172	/* allocate a retrieval operation */
173 op = kzalloc(sizeof(*op), GFP_NOIO);
174 if (!op) {
175 fscache_stat(&fscache_n_retrievals_nomem);
176 return NULL;
177 }
178
179 fscache_operation_init(&op->op, fscache_release_retrieval_op);
180 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
181 op->mapping = mapping;
182 op->end_io_func = end_io_func;
183 op->context = context;
184 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do);
187 return op;
188}
189
190/*
191 * wait for a deferred lookup to complete
192 */
193static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
194{
195 unsigned long jif;
196
197 _enter("");
198
199 if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
200 _leave(" = 0 [imm]");
201 return 0;
202 }
203
204 fscache_stat(&fscache_n_retrievals_wait);
205
206 jif = jiffies;
207 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
208 fscache_wait_bit_interruptible,
209 TASK_INTERRUPTIBLE) != 0) {
210 fscache_stat(&fscache_n_retrievals_intr);
211 _leave(" = -ERESTARTSYS");
212 return -ERESTARTSYS;
213 }
214
215 ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
216
217 smp_rmb();
218 fscache_hist(fscache_retrieval_delay_histogram, jif);
219 _leave(" = 0 [dly]");
220 return 0;
221}
222
223/*
224 * read a page from the cache or allocate a block in which to store it
225 * - we return:
226 * -ENOMEM - out of memory, nothing done
227 * -ERESTARTSYS - interrupted
228 * -ENOBUFS - no backing object available in which to cache the block
229 * -ENODATA - no data available in the backing object for this block
230 * 0 - dispatched a read - it'll call end_io_func() when finished
231 */
232int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
233 struct page *page,
234 fscache_rw_complete_t end_io_func,
235 void *context,
236 gfp_t gfp)
237{
238 struct fscache_retrieval *op;
239 struct fscache_object *object;
240 int ret;
241
242 _enter("%p,%p,,,", cookie, page);
243
244 fscache_stat(&fscache_n_retrievals);
245
246 if (hlist_empty(&cookie->backing_objects))
247 goto nobufs;
248
249 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
250 ASSERTCMP(page, !=, NULL);
251
252 if (fscache_wait_for_deferred_lookup(cookie) < 0)
253 return -ERESTARTSYS;
254
255 op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
256 if (!op) {
257 _leave(" = -ENOMEM");
258 return -ENOMEM;
259 }
260
261 spin_lock(&cookie->lock);
262
263 if (hlist_empty(&cookie->backing_objects))
264 goto nobufs_unlock;
265 object = hlist_entry(cookie->backing_objects.first,
266 struct fscache_object, cookie_link);
267
268 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
269
270 if (fscache_submit_op(object, &op->op) < 0)
271 goto nobufs_unlock;
272 spin_unlock(&cookie->lock);
273
274 fscache_stat(&fscache_n_retrieval_ops);
275
276 /* pin the netfs read context in case we need to do the actual netfs
277 * read because we've encountered a cache read failure */
278 fscache_get_context(object->cookie, op->context);
279
280 /* we wait for the operation to become active, and then process it
281 * *here*, in this thread, and not in the thread pool */
282 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
283 _debug(">>> WT");
284 fscache_stat(&fscache_n_retrieval_op_waits);
285 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
286 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
287 _debug("<<< GO");
288 }
289
290 /* ask the cache to honour the operation */
291 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
292 ret = object->cache->ops->allocate_page(op, page, gfp);
293 if (ret == 0)
294 ret = -ENODATA;
295 } else {
296 ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
297 }
298
299 if (ret == -ENOMEM)
300 fscache_stat(&fscache_n_retrievals_nomem);
301 else if (ret == -ERESTARTSYS)
302 fscache_stat(&fscache_n_retrievals_intr);
303 else if (ret == -ENODATA)
304 fscache_stat(&fscache_n_retrievals_nodata);
305 else if (ret < 0)
306 fscache_stat(&fscache_n_retrievals_nobufs);
307 else
308 fscache_stat(&fscache_n_retrievals_ok);
309
310 fscache_put_retrieval(op);
311 _leave(" = %d", ret);
312 return ret;
313
314nobufs_unlock:
315 spin_unlock(&cookie->lock);
316 kfree(op);
317nobufs:
318 fscache_stat(&fscache_n_retrievals_nobufs);
319 _leave(" = -ENOBUFS");
320 return -ENOBUFS;
321}
322EXPORT_SYMBOL(__fscache_read_or_alloc_page);
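/*
 * Editorial sketch (hypothetical netfs ->readpage() glue, assuming the
 * fscache_read_or_alloc_page() wrapper): the return codes documented above
 * map onto netfs behaviour like so:
 *
 *	ret = fscache_read_or_alloc_page(cookie, page, my_end_io, ctx, gfp);
 *	switch (ret) {
 *	case 0:		// read dispatched; my_end_io() completes the page
 *		return 0;
 *	case -ENODATA:	// block allocated but empty: read from the server,
 *			// then fscache_write_page() the result
 *	case -ENOBUFS:	// no cache available: just read from the server
 *	default:
 *		return my_read_from_server(page);	// hypothetical
 *	}
 */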
323
 324 * read a list of pages from the cache or allocate blocks in which to store
325 * read a list of page from the cache or allocate a block in which to store
326 * them
327 * - we return:
328 * -ENOMEM - out of memory, some pages may be being read
329 * -ERESTARTSYS - interrupted, some pages may be being read
330 * -ENOBUFS - no backing object or space available in which to cache any
331 * pages not being read
332 * -ENODATA - no data available in the backing object for some or all of
333 * the pages
334 * 0 - dispatched a read on all pages
335 *
 336 * end_io_func() will be called for each page read from the cache as it
337 * finishes being read
338 *
339 * any pages for which a read is dispatched will be removed from pages and
340 * nr_pages
341 */
342int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned *nr_pages,
346 fscache_rw_complete_t end_io_func,
347 void *context,
348 gfp_t gfp)
349{
350 fscache_pages_retrieval_func_t func;
351 struct fscache_retrieval *op;
352 struct fscache_object *object;
353 int ret;
354
355 _enter("%p,,%d,,,", cookie, *nr_pages);
356
357 fscache_stat(&fscache_n_retrievals);
358
359 if (hlist_empty(&cookie->backing_objects))
360 goto nobufs;
361
362 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
363 ASSERTCMP(*nr_pages, >, 0);
364 ASSERT(!list_empty(pages));
365
366 if (fscache_wait_for_deferred_lookup(cookie) < 0)
367 return -ERESTARTSYS;
368
369 op = fscache_alloc_retrieval(mapping, end_io_func, context);
370 if (!op)
371 return -ENOMEM;
372
373 spin_lock(&cookie->lock);
374
375 if (hlist_empty(&cookie->backing_objects))
376 goto nobufs_unlock;
377 object = hlist_entry(cookie->backing_objects.first,
378 struct fscache_object, cookie_link);
379
380 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock;
382 spin_unlock(&cookie->lock);
383
384 fscache_stat(&fscache_n_retrieval_ops);
385
386 /* pin the netfs read context in case we need to do the actual netfs
387 * read because we've encountered a cache read failure */
388 fscache_get_context(object->cookie, op->context);
389
390 /* we wait for the operation to become active, and then process it
391 * *here*, in this thread, and not in the thread pool */
392 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
393 _debug(">>> WT");
394 fscache_stat(&fscache_n_retrieval_op_waits);
395 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
396 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
397 _debug("<<< GO");
398 }
399
400 /* ask the cache to honour the operation */
401 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
402 func = object->cache->ops->allocate_pages;
403 else
404 func = object->cache->ops->read_or_alloc_pages;
405 ret = func(op, pages, nr_pages, gfp);
406
407 if (ret == -ENOMEM)
408 fscache_stat(&fscache_n_retrievals_nomem);
409 else if (ret == -ERESTARTSYS)
410 fscache_stat(&fscache_n_retrievals_intr);
411 else if (ret == -ENODATA)
412 fscache_stat(&fscache_n_retrievals_nodata);
413 else if (ret < 0)
414 fscache_stat(&fscache_n_retrievals_nobufs);
415 else
416 fscache_stat(&fscache_n_retrievals_ok);
417
418 fscache_put_retrieval(op);
419 _leave(" = %d", ret);
420 return ret;
421
422nobufs_unlock:
423 spin_unlock(&cookie->lock);
424 kfree(op);
425nobufs:
426 fscache_stat(&fscache_n_retrievals_nobufs);
427 _leave(" = -ENOBUFS");
428 return -ENOBUFS;
429}
430EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
431
432/*
433 * allocate a block in the cache on which to store a page
434 * - we return:
435 * -ENOMEM - out of memory, nothing done
436 * -ERESTARTSYS - interrupted
437 * -ENOBUFS - no backing object available in which to cache the block
438 * 0 - block allocated
439 */
440int __fscache_alloc_page(struct fscache_cookie *cookie,
441 struct page *page,
442 gfp_t gfp)
443{
444 struct fscache_retrieval *op;
445 struct fscache_object *object;
446 int ret;
447
448 _enter("%p,%p,,,", cookie, page);
449
450 fscache_stat(&fscache_n_allocs);
451
452 if (hlist_empty(&cookie->backing_objects))
453 goto nobufs;
454
455 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
456 ASSERTCMP(page, !=, NULL);
457
458 if (fscache_wait_for_deferred_lookup(cookie) < 0)
459 return -ERESTARTSYS;
460
461 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
462 if (!op)
463 return -ENOMEM;
464
465 spin_lock(&cookie->lock);
466
467 if (hlist_empty(&cookie->backing_objects))
468 goto nobufs_unlock;
469 object = hlist_entry(cookie->backing_objects.first,
470 struct fscache_object, cookie_link);
471
472 if (fscache_submit_op(object, &op->op) < 0)
473 goto nobufs_unlock;
474 spin_unlock(&cookie->lock);
475
476 fscache_stat(&fscache_n_alloc_ops);
477
478 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
479 _debug(">>> WT");
480 fscache_stat(&fscache_n_alloc_op_waits);
481 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
482 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
483 _debug("<<< GO");
484 }
485
486 /* ask the cache to honour the operation */
487 ret = object->cache->ops->allocate_page(op, page, gfp);
488
489 if (ret < 0)
490 fscache_stat(&fscache_n_allocs_nobufs);
491 else
492 fscache_stat(&fscache_n_allocs_ok);
493
494 fscache_put_retrieval(op);
495 _leave(" = %d", ret);
496 return ret;
497
498nobufs_unlock:
499 spin_unlock(&cookie->lock);
500 kfree(op);
501nobufs:
502 fscache_stat(&fscache_n_allocs_nobufs);
503 _leave(" = -ENOBUFS");
504 return -ENOBUFS;
505}
506EXPORT_SYMBOL(__fscache_alloc_page);
507
508/*
509 * release a write op reference
510 */
511static void fscache_release_write_op(struct fscache_operation *_op)
512{
513 _enter("{OP%x}", _op->debug_id);
514}
515
516/*
517 * perform the background storage of a page into the cache
518 */
519static void fscache_write_op(struct fscache_operation *_op)
520{
521 struct fscache_storage *op =
522 container_of(_op, struct fscache_storage, op);
523 struct fscache_object *object = op->op.object;
524 struct fscache_cookie *cookie = object->cookie;
525 struct page *page;
526 unsigned n;
527 void *results[1];
528 int ret;
529
530 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
531
532 spin_lock(&cookie->lock);
533 spin_lock(&object->lock);
534
535 if (!fscache_object_is_active(object)) {
536 spin_unlock(&object->lock);
537 spin_unlock(&cookie->lock);
538 _leave("");
539 return;
540 }
541
542 fscache_stat(&fscache_n_store_calls);
543
544 /* find a page to store */
545 page = NULL;
546 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
547 FSCACHE_COOKIE_PENDING_TAG);
548 if (n != 1)
549 goto superseded;
550 page = results[0];
551 _debug("gang %d [%lx]", n, page->index);
552 if (page->index > op->store_limit)
553 goto superseded;
554
555 radix_tree_tag_clear(&cookie->stores, page->index,
556 FSCACHE_COOKIE_PENDING_TAG);
557
558 spin_unlock(&object->lock);
559 spin_unlock(&cookie->lock);
560
561 if (page) {
562 ret = object->cache->ops->write_page(op, page);
563 fscache_end_page_write(cookie, page);
564 page_cache_release(page);
565 if (ret < 0)
566 fscache_abort_object(object);
567 else
568 fscache_enqueue_operation(&op->op);
569 }
570
571 _leave("");
572 return;
573
574superseded:
575 /* this writer is going away and there aren't any more things to
576 * write */
577 _debug("cease");
578 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
579 spin_unlock(&object->lock);
580 spin_unlock(&cookie->lock);
581 _leave("");
582}
583
584/*
585 * request a page be stored in the cache
586 * - returns:
587 * -ENOMEM - out of memory, nothing done
588 * -ENOBUFS - no backing object available in which to cache the page
589 * 0 - dispatched a write - it'll call end_io_func() when finished
590 *
591 * if the cookie still has a backing object at this point, that object can be
592 * in one of a few states with respect to storage processing:
593 *
594 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
595 * set)
596 *
597 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
598 * fill op)
599 *
600 * (b) writes deferred till post-creation (mark page for writing and
601 * return immediately)
602 *
603 * (2) negative lookup, object created, initial fill being made from netfs
604 * (FSCACHE_COOKIE_INITIAL_FILL is set)
605 *
606 * (a) fill point not yet reached this page (mark page for writing and
607 * return)
608 *
609 * (b) fill point passed this page (queue op to store this page)
610 *
611 * (3) object extant (queue op to store this page)
612 *
613 * any other state is invalid
614 */
615int __fscache_write_page(struct fscache_cookie *cookie,
616 struct page *page,
617 gfp_t gfp)
618{
619 struct fscache_storage *op;
620 struct fscache_object *object;
621 int ret;
622
623 _enter("%p,%x,", cookie, (u32) page->flags);
624
625 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
626 ASSERT(PageFsCache(page));
627
628 fscache_stat(&fscache_n_stores);
629
630 op = kzalloc(sizeof(*op), GFP_NOIO);
631 if (!op)
632 goto nomem;
633
634 fscache_operation_init(&op->op, fscache_release_write_op);
635 fscache_operation_init_slow(&op->op, fscache_write_op);
636 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
637
638 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
639 if (ret < 0)
640 goto nomem_free;
641
642 ret = -ENOBUFS;
643 spin_lock(&cookie->lock);
644
645 if (hlist_empty(&cookie->backing_objects))
646 goto nobufs;
647 object = hlist_entry(cookie->backing_objects.first,
648 struct fscache_object, cookie_link);
649 if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
650 goto nobufs;
651
652 /* add the page to the pending-storage radix tree on the backing
653 * object */
654 spin_lock(&object->lock);
655
656 _debug("store limit %llx", (unsigned long long) object->store_limit);
657
658 ret = radix_tree_insert(&cookie->stores, page->index, page);
659 if (ret < 0) {
660 if (ret == -EEXIST)
661 goto already_queued;
662 _debug("insert failed %d", ret);
663 goto nobufs_unlock_obj;
664 }
665
666 radix_tree_tag_set(&cookie->stores, page->index,
667 FSCACHE_COOKIE_PENDING_TAG);
668 page_cache_get(page);
669
670 /* we only want one writer at a time, but we do need to queue new
671 * writers after exclusive ops */
672 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
673 goto already_pending;
674
675 spin_unlock(&object->lock);
676
677 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
678 op->store_limit = object->store_limit;
679
680 if (fscache_submit_op(object, &op->op) < 0)
681 goto submit_failed;
682
683 spin_unlock(&cookie->lock);
684 radix_tree_preload_end();
685 fscache_stat(&fscache_n_store_ops);
686 fscache_stat(&fscache_n_stores_ok);
687
688 /* the slow work queue now carries its own ref on the object */
689 fscache_put_operation(&op->op);
690 _leave(" = 0");
691 return 0;
692
693already_queued:
694 fscache_stat(&fscache_n_stores_again);
695already_pending:
696 spin_unlock(&object->lock);
697 spin_unlock(&cookie->lock);
698 radix_tree_preload_end();
699 kfree(op);
700 fscache_stat(&fscache_n_stores_ok);
701 _leave(" = 0");
702 return 0;
703
704submit_failed:
705 radix_tree_delete(&cookie->stores, page->index);
706 page_cache_release(page);
707 ret = -ENOBUFS;
708 goto nobufs;
709
710nobufs_unlock_obj:
711 spin_unlock(&object->lock);
712nobufs:
713 spin_unlock(&cookie->lock);
714 radix_tree_preload_end();
715 kfree(op);
716 fscache_stat(&fscache_n_stores_nobufs);
717 _leave(" = -ENOBUFS");
718 return -ENOBUFS;
719
720nomem_free:
721 kfree(op);
722nomem:
723 fscache_stat(&fscache_n_stores_oom);
724 _leave(" = -ENOMEM");
725 return -ENOMEM;
726}
727EXPORT_SYMBOL(__fscache_write_page);
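/*
 * Editorial sketch: a netfs hands a page to the cache once the page's data
 * is valid, e.g. after its read from the server completes, assuming the
 * fscache_write_page() wrapper:
 *
 *	if (fscache_write_page(cookie, page, GFP_KERNEL) != 0)
 *		fscache_uncache_page(cookie, page);	// drop the mark
 *
 * On success the page stays marked (PG_fscache) and radix-tagged until
 * fscache_write_op() has stored it and fscache_end_page_write() runs.
 */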
728
729/*
730 * remove a page from the cache
731 */
732void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
733{
734 struct fscache_object *object;
735
736 _enter(",%p", page);
737
738 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
739 ASSERTCMP(page, !=, NULL);
740
741 fscache_stat(&fscache_n_uncaches);
742
743 /* cache withdrawal may beat us to it */
744 if (!PageFsCache(page))
745 goto done;
746
747 /* get the object */
748 spin_lock(&cookie->lock);
749
750 if (hlist_empty(&cookie->backing_objects)) {
751 ClearPageFsCache(page);
752 goto done_unlock;
753 }
754
755 object = hlist_entry(cookie->backing_objects.first,
756 struct fscache_object, cookie_link);
757
758 /* there might now be stuff on disk we could read */
759 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
760
761 /* only invoke the cache backend if we managed to mark the page
762 * uncached here; this deals with synchronisation vs withdrawal */
763 if (TestClearPageFsCache(page) &&
764 object->cache->ops->uncache_page) {
765 /* the cache backend releases the cookie lock */
766 object->cache->ops->uncache_page(object, page);
767 goto done;
768 }
769
770done_unlock:
771 spin_unlock(&cookie->lock);
772done:
773 _leave("");
774}
775EXPORT_SYMBOL(__fscache_uncache_page);
776
777/**
778 * fscache_mark_pages_cached - Mark pages as being cached
779 * @op: The retrieval op pages are being marked for
780 * @pagevec: The pages to be marked
781 *
782 * Mark a bunch of netfs pages as being cached. After this is called,
783 * the netfs must call fscache_uncache_page() to remove the mark.
784 */
785void fscache_mark_pages_cached(struct fscache_retrieval *op,
786 struct pagevec *pagevec)
787{
788 struct fscache_cookie *cookie = op->op.object->cookie;
789 unsigned long loop;
790
791#ifdef CONFIG_FSCACHE_STATS
792 atomic_add(pagevec->nr, &fscache_n_marks);
793#endif
794
795 for (loop = 0; loop < pagevec->nr; loop++) {
796 struct page *page = pagevec->pages[loop];
797
798 _debug("- mark %p{%lx}", page, page->index);
799 if (TestSetPageFsCache(page)) {
800 static bool once_only;
801 if (!once_only) {
802 once_only = true;
803 printk(KERN_WARNING "FS-Cache:"
804 " Cookie type %s marked page %lx"
805 " multiple times\n",
806 cookie->def->name, page->index);
807 }
808 }
809 }
810
811 if (cookie->def->mark_pages_cached)
812 cookie->def->mark_pages_cached(cookie->netfs_data,
813 op->mapping, pagevec);
814 pagevec_reinit(pagevec);
815}
816EXPORT_SYMBOL(fscache_mark_pages_cached);
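A cache backend hands pages over to fscache_mark_pages_cached() a pagevec at a time; since the function reinitialises the pagevec before returning (see above), the caller can keep reusing one vector. A minimal sketch of that calling pattern, assuming a hypothetical retrieval op and page array (the pagevec helpers are the standard ones):

	struct pagevec pvec;
	unsigned i;

	pagevec_init(&pvec, 0);
	for (i = 0; i < npages; i++) {
		/* pagevec_add() returns the space remaining, i.e. 0 when full */
		if (!pagevec_add(&pvec, pages[i]))
			fscache_mark_pages_cached(op, &pvec);
	}
	if (pagevec_count(&pvec))
		fscache_mark_pages_cached(op, &pvec);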
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
1/* FS-Cache statistics viewing interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL OPERATION
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * initialise the /proc/fs/fscache/ directory
20 */
21int __init fscache_proc_init(void)
22{
23 _enter("");
24
25 if (!proc_mkdir("fs/fscache", NULL))
26 goto error_dir;
27
28#ifdef CONFIG_FSCACHE_STATS
29 if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
30 &fscache_stats_fops))
31 goto error_stats;
32#endif
33
34#ifdef CONFIG_FSCACHE_HISTOGRAM
35 if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
36 &fscache_histogram_fops))
37 goto error_histogram;
38#endif
39
40 _leave(" = 0");
41 return 0;
42
43#ifdef CONFIG_FSCACHE_HISTOGRAM
44error_histogram:
45#endif
46#ifdef CONFIG_FSCACHE_STATS
47 remove_proc_entry("fs/fscache/stats", NULL);
48error_stats:
49#endif
50 remove_proc_entry("fs/fscache", NULL);
51error_dir:
52 _leave(" = -ENOMEM");
53 return -ENOMEM;
54}
55
56/*
57 * clean up the /proc/fs/fscache/ directory
58 */
59void fscache_proc_cleanup(void)
60{
61#ifdef CONFIG_FSCACHE_HISTOGRAM
62 remove_proc_entry("fs/fscache/histogram", NULL);
63#endif
64#ifdef CONFIG_FSCACHE_STATS
65 remove_proc_entry("fs/fscache/stats", NULL);
66#endif
67 remove_proc_entry("fs/fscache", NULL);
68}
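fscache_proc_init() above uses the usual reverse-order unwind idiom, with the error labels themselves wrapped in #ifdef so that each label only exists when the entry it unwinds can have been created. A reduced sketch of the same pattern for a hypothetical module with one optional entry (the config symbol and names are made up; proc_mkdir(), proc_create() and remove_proc_entry() are the real interfaces used above):

	static int __init example_proc_init(void)
	{
		if (!proc_mkdir("fs/example", NULL))
			goto error_dir;
	#ifdef CONFIG_EXAMPLE_STATS
		if (!proc_create("fs/example/stats", S_IFREG | 0444, NULL,
				 &example_stats_fops))
			goto error_stats;
	#endif
		return 0;

	#ifdef CONFIG_EXAMPLE_STATS
	error_stats:
	#endif
		remove_proc_entry("fs/example", NULL);
	error_dir:
		return -ENOMEM;
	}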
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
1/* FS-Cache statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * operation counters
20 */
21atomic_t fscache_n_op_pend;
22atomic_t fscache_n_op_run;
23atomic_t fscache_n_op_enqueue;
24atomic_t fscache_n_op_requeue;
25atomic_t fscache_n_op_deferred_release;
26atomic_t fscache_n_op_release;
27atomic_t fscache_n_op_gc;
28
29atomic_t fscache_n_attr_changed;
30atomic_t fscache_n_attr_changed_ok;
31atomic_t fscache_n_attr_changed_nobufs;
32atomic_t fscache_n_attr_changed_nomem;
33atomic_t fscache_n_attr_changed_calls;
34
35atomic_t fscache_n_allocs;
36atomic_t fscache_n_allocs_ok;
37atomic_t fscache_n_allocs_wait;
38atomic_t fscache_n_allocs_nobufs;
39atomic_t fscache_n_alloc_ops;
40atomic_t fscache_n_alloc_op_waits;
41
42atomic_t fscache_n_retrievals;
43atomic_t fscache_n_retrievals_ok;
44atomic_t fscache_n_retrievals_wait;
45atomic_t fscache_n_retrievals_nodata;
46atomic_t fscache_n_retrievals_nobufs;
47atomic_t fscache_n_retrievals_intr;
48atomic_t fscache_n_retrievals_nomem;
49atomic_t fscache_n_retrieval_ops;
50atomic_t fscache_n_retrieval_op_waits;
51
52atomic_t fscache_n_stores;
53atomic_t fscache_n_stores_ok;
54atomic_t fscache_n_stores_again;
55atomic_t fscache_n_stores_nobufs;
56atomic_t fscache_n_stores_oom;
57atomic_t fscache_n_store_ops;
58atomic_t fscache_n_store_calls;
59
60atomic_t fscache_n_marks;
61atomic_t fscache_n_uncaches;
62
63atomic_t fscache_n_acquires;
64atomic_t fscache_n_acquires_null;
65atomic_t fscache_n_acquires_no_cache;
66atomic_t fscache_n_acquires_ok;
67atomic_t fscache_n_acquires_nobufs;
68atomic_t fscache_n_acquires_oom;
69
70atomic_t fscache_n_updates;
71atomic_t fscache_n_updates_null;
72atomic_t fscache_n_updates_run;
73
74atomic_t fscache_n_relinquishes;
75atomic_t fscache_n_relinquishes_null;
76atomic_t fscache_n_relinquishes_waitcrt;
77
78atomic_t fscache_n_cookie_index;
79atomic_t fscache_n_cookie_data;
80atomic_t fscache_n_cookie_special;
81
82atomic_t fscache_n_object_alloc;
83atomic_t fscache_n_object_no_alloc;
84atomic_t fscache_n_object_lookups;
85atomic_t fscache_n_object_lookups_negative;
86atomic_t fscache_n_object_lookups_positive;
87atomic_t fscache_n_object_created;
88atomic_t fscache_n_object_avail;
89atomic_t fscache_n_object_dead;
90
91atomic_t fscache_n_checkaux_none;
92atomic_t fscache_n_checkaux_okay;
93atomic_t fscache_n_checkaux_update;
94atomic_t fscache_n_checkaux_obsolete;
95
96/*
97 * display the general statistics
98 */
99static int fscache_stats_show(struct seq_file *m, void *v)
100{
101 seq_puts(m, "FS-Cache statistics\n");
102
103 seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
104 atomic_read(&fscache_n_cookie_index),
105 atomic_read(&fscache_n_cookie_data),
106 atomic_read(&fscache_n_cookie_special));
107
108 seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
109 atomic_read(&fscache_n_object_alloc),
110 atomic_read(&fscache_n_object_no_alloc),
111 atomic_read(&fscache_n_object_avail),
112 atomic_read(&fscache_n_object_dead));
113 seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
114 atomic_read(&fscache_n_checkaux_none),
115 atomic_read(&fscache_n_checkaux_okay),
116 atomic_read(&fscache_n_checkaux_update),
117 atomic_read(&fscache_n_checkaux_obsolete));
118
119 seq_printf(m, "Pages : mrk=%u unc=%u\n",
120 atomic_read(&fscache_n_marks),
121 atomic_read(&fscache_n_uncaches));
122
123 seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
124 " oom=%u\n",
125 atomic_read(&fscache_n_acquires),
126 atomic_read(&fscache_n_acquires_null),
127 atomic_read(&fscache_n_acquires_no_cache),
128 atomic_read(&fscache_n_acquires_ok),
129 atomic_read(&fscache_n_acquires_nobufs),
130 atomic_read(&fscache_n_acquires_oom));
131
132 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
133 atomic_read(&fscache_n_object_lookups),
134 atomic_read(&fscache_n_object_lookups_negative),
135 atomic_read(&fscache_n_object_lookups_positive),
136 atomic_read(&fscache_n_object_created));
137
138 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
139 atomic_read(&fscache_n_updates),
140 atomic_read(&fscache_n_updates_null),
141 atomic_read(&fscache_n_updates_run));
142
143 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
144 atomic_read(&fscache_n_relinquishes),
145 atomic_read(&fscache_n_relinquishes_null),
146 atomic_read(&fscache_n_relinquishes_waitcrt));
147
148 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
149 atomic_read(&fscache_n_attr_changed),
150 atomic_read(&fscache_n_attr_changed_ok),
151 atomic_read(&fscache_n_attr_changed_nobufs),
152 atomic_read(&fscache_n_attr_changed_nomem),
153 atomic_read(&fscache_n_attr_changed_calls));
154
155 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
156 atomic_read(&fscache_n_allocs),
157 atomic_read(&fscache_n_allocs_ok),
158 atomic_read(&fscache_n_allocs_wait),
159 atomic_read(&fscache_n_allocs_nobufs));
160 seq_printf(m, "Allocs : ops=%u owt=%u\n",
161 atomic_read(&fscache_n_alloc_ops),
162 atomic_read(&fscache_n_alloc_op_waits));
163
164 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
165 " int=%u oom=%u\n",
166 atomic_read(&fscache_n_retrievals),
167 atomic_read(&fscache_n_retrievals_ok),
168 atomic_read(&fscache_n_retrievals_wait),
169 atomic_read(&fscache_n_retrievals_nodata),
170 atomic_read(&fscache_n_retrievals_nobufs),
171 atomic_read(&fscache_n_retrievals_intr),
172 atomic_read(&fscache_n_retrievals_nomem));
173 seq_printf(m, "Retrvls: ops=%u owt=%u\n",
174 atomic_read(&fscache_n_retrieval_ops),
175 atomic_read(&fscache_n_retrieval_op_waits));
176
177 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
178 atomic_read(&fscache_n_stores),
179 atomic_read(&fscache_n_stores_ok),
180 atomic_read(&fscache_n_stores_again),
181 atomic_read(&fscache_n_stores_nobufs),
182 atomic_read(&fscache_n_stores_oom));
183 seq_printf(m, "Stores : ops=%u run=%u\n",
184 atomic_read(&fscache_n_store_ops),
185 atomic_read(&fscache_n_store_calls));
186
187 seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
188 atomic_read(&fscache_n_op_pend),
189 atomic_read(&fscache_n_op_run),
190 atomic_read(&fscache_n_op_enqueue));
191 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
192 atomic_read(&fscache_n_op_deferred_release),
193 atomic_read(&fscache_n_op_release),
194 atomic_read(&fscache_n_op_gc));
195 return 0;
196}
197
198/*
199 * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
200 */
201static int fscache_stats_open(struct inode *inode, struct file *file)
202{
203 return single_open(file, fscache_stats_show, NULL);
204}
205
206const struct file_operations fscache_stats_fops = {
207 .owner = THIS_MODULE,
208 .open = fscache_stats_open,
209 .read = seq_read,
210 .llseek = seq_lseek,
 211	.release	= single_release,
212};
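stats.c follows the standard seq_file pattern for a read-only /proc file: single_open() binds a show function that renders the entire file in one pass. Note that single_open() kmallocs a private seq_operations table, so it must be paired with single_release() rather than plain seq_release(), as reflected in the file_operations above. A minimal sketch of the same pattern with hypothetical names:

	static int example_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "value=%u\n", atomic_read(&example_counter));
		return 0;
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		return single_open(file, example_show, NULL);
	}

	static const struct file_operations example_fops = {
		.owner		= THIS_MODULE,
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};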
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06da05261e04..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1032 fuse_put_request(fc, req); 1032 fuse_put_request(fc, req);
1033 return -ENOMEM; 1033 return -ENOMEM;
1034 } 1034 }
1035 req->out.argpages = 1;
1035 req->num_pages = 1; 1036 req->num_pages = 1;
1036 req->pages[0] = page; 1037 req->pages[0] = page;
1037 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); 1038 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4e340fedf768..2b25133524a3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
386 req->in.numargs = 1; 386 req->in.numargs = 1;
387 req->in.args[0].size = sizeof(struct fuse_read_in); 387 req->in.args[0].size = sizeof(struct fuse_read_in);
388 req->in.args[0].value = inarg; 388 req->in.args[0].value = inarg;
389 req->out.argpages = 1;
390 req->out.argvar = 1; 389 req->out.argvar = 1;
391 req->out.numargs = 1; 390 req->out.numargs = 1;
392 req->out.args[0].size = count; 391 req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
453 attr_ver = fuse_get_attr_version(fc); 452 attr_ver = fuse_get_attr_version(fc);
454 453
455 req->out.page_zeroing = 1; 454 req->out.page_zeroing = 1;
455 req->out.argpages = 1;
456 req->num_pages = 1; 456 req->num_pages = 1;
457 req->pages[0] = page; 457 req->pages[0] = page;
458 num_read = fuse_send_read(req, file, inode, pos, count, NULL); 458 num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
510 struct fuse_conn *fc = get_fuse_conn(inode); 510 struct fuse_conn *fc = get_fuse_conn(inode);
511 loff_t pos = page_offset(req->pages[0]); 511 loff_t pos = page_offset(req->pages[0]);
512 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 512 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
513
514 req->out.argpages = 1;
513 req->out.page_zeroing = 1; 515 req->out.page_zeroing = 1;
514 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 516 fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
515 req->misc.read.attr_ver = fuse_get_attr_version(fc); 517 req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
621 inarg->flags = file ? file->f_flags : 0; 623 inarg->flags = file ? file->f_flags : 0;
622 req->in.h.opcode = FUSE_WRITE; 624 req->in.h.opcode = FUSE_WRITE;
623 req->in.h.nodeid = get_node_id(inode); 625 req->in.h.nodeid = get_node_id(inode);
624 req->in.argpages = 1;
625 req->in.numargs = 2; 626 req->in.numargs = 2;
626 if (fc->minor < 9) 627 if (fc->minor < 9)
627 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 628 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
695 if (IS_ERR(req)) 696 if (IS_ERR(req))
696 return PTR_ERR(req); 697 return PTR_ERR(req);
697 698
699 req->in.argpages = 1;
698 req->num_pages = 1; 700 req->num_pages = 1;
699 req->pages[0] = page; 701 req->pages[0] = page;
700 req->page_offset = offset; 702 req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
771 size_t count = 0; 773 size_t count = 0;
772 int err; 774 int err;
773 775
776 req->in.argpages = 1;
774 req->page_offset = offset; 777 req->page_offset = offset;
775 778
776 do { 779 do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
935} 938}
936 939
937static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, 940static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
938 unsigned nbytes, int write) 941 unsigned *nbytesp, int write)
939{ 942{
943 unsigned nbytes = *nbytesp;
940 unsigned long user_addr = (unsigned long) buf; 944 unsigned long user_addr = (unsigned long) buf;
941 unsigned offset = user_addr & ~PAGE_MASK; 945 unsigned offset = user_addr & ~PAGE_MASK;
942 int npages; 946 int npages;
943 947
944 /* This doesn't work with nfsd */ 948 /* Special case for kernel I/O: can copy directly into the buffer */
945 if (!current->mm) 949 if (segment_eq(get_fs(), KERNEL_DS)) {
946 return -EPERM; 950 if (write)
951 req->in.args[1].value = (void *) user_addr;
952 else
953 req->out.args[0].value = (void *) user_addr;
954
955 return 0;
956 }
947 957
948 nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 958 nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
949 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 959 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
950 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 960 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
951 down_read(&current->mm->mmap_sem); 961 down_read(&current->mm->mmap_sem);
952 npages = get_user_pages(current, current->mm, user_addr, npages, write, 962 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
953 0, req->pages, NULL); 963 0, req->pages, NULL);
954 up_read(&current->mm->mmap_sem); 964 up_read(&current->mm->mmap_sem);
955 if (npages < 0) 965 if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
957 967
958 req->num_pages = npages; 968 req->num_pages = npages;
959 req->page_offset = offset; 969 req->page_offset = offset;
970
971 if (write)
972 req->in.argpages = 1;
973 else
974 req->out.argpages = 1;
975
976 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
977 *nbytesp = min(*nbytesp, nbytes);
978
960 return 0; 979 return 0;
961} 980}
962 981
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
979 998
980 while (count) { 999 while (count) {
981 size_t nres; 1000 size_t nres;
982 size_t nbytes_limit = min(count, nmax); 1001 size_t nbytes = min(count, nmax);
983 size_t nbytes; 1002 int err = fuse_get_user_pages(req, buf, &nbytes, write);
984 int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
985 if (err) { 1003 if (err) {
986 res = err; 1004 res = err;
987 break; 1005 break;
988 } 1006 }
989 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; 1007
990 nbytes = min(nbytes_limit, nbytes);
991 if (write) 1008 if (write)
992 nres = fuse_send_write(req, file, inode, pos, nbytes, 1009 nres = fuse_send_write(req, file, inode, pos, nbytes,
993 current->files); 1010 current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
1163 fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); 1180 fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
1164 1181
1165 copy_highpage(tmp_page, page); 1182 copy_highpage(tmp_page, page);
1183 req->in.argpages = 1;
1166 req->num_pages = 1; 1184 req->num_pages = 1;
1167 req->pages[0] = tmp_page; 1185 req->pages[0] = tmp_page;
1168 req->page_offset = 0; 1186 req->page_offset = 0;
@@ -1274,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1274 return 0; 1292 return 0;
1275} 1293}
1276 1294
1295static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
1296{
1297 /* Can't provide the coherency needed for MAP_SHARED */
1298 if (vma->vm_flags & VM_MAYSHARE)
1299 return -ENODEV;
1300
1301 return generic_file_mmap(file, vma);
1302}
1303
1277static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, 1304static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
1278 struct file_lock *fl) 1305 struct file_lock *fl)
1279{ 1306{
@@ -1908,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
1908 .llseek = fuse_file_llseek, 1935 .llseek = fuse_file_llseek,
1909 .read = fuse_direct_read, 1936 .read = fuse_direct_read,
1910 .write = fuse_direct_write, 1937 .write = fuse_direct_write,
1938 .mmap = fuse_direct_mmap,
1911 .open = fuse_open, 1939 .open = fuse_open,
1912 .flush = fuse_flush, 1940 .flush = fuse_flush,
1913 .release = fuse_release, 1941 .release = fuse_release,
@@ -1917,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
1917 .unlocked_ioctl = fuse_file_ioctl, 1945 .unlocked_ioctl = fuse_file_ioctl,
1918 .compat_ioctl = fuse_file_compat_ioctl, 1946 .compat_ioctl = fuse_file_compat_ioctl,
1919 .poll = fuse_file_poll, 1947 .poll = fuse_file_poll,
1920 /* no mmap and splice_read */ 1948 /* no splice_read */
1921}; 1949};
1922 1950
1923static const struct address_space_operations fuse_file_aops = { 1951static const struct address_space_operations fuse_file_aops = {
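The new special case in fuse_get_user_pages() means a kernel-space caller that has widened its address limit can do direct I/O on a FUSE file without pinning pages via get_user_pages(). A hedged sketch of how such a caller would look (the set_fs()/KERNEL_DS dance is the standard idiom of this era; filp, kbuf, len and pos are hypothetical):

	mm_segment_t old_fs = get_fs();
	ssize_t ret;

	set_fs(KERNEL_DS);
	/* kbuf is a kernel buffer; segment_eq(get_fs(), KERNEL_DS) is now
	 * true, so fuse_get_user_pages() stores the pointer directly in
	 * req->out.args[0].value instead of pinning user pages */
	ret = vfs_read(filp, (char __user *)kbuf, len, &pos);
	set_fs(old_fs);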
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
134 mode_t mode = inode->i_mode; 134 mode_t mode = inode->i_mode;
135 int error; 135 int error;
136 136
137 inode->i_mode = mode & ~current->fs->umask; 137 inode->i_mode = mode & ~current_umask();
138 if (!S_ISLNK(inode->i_mode)) 138 if (!S_ISLNK(inode->i_mode))
139 acl = ops->getacl(dir, ACL_TYPE_DEFAULT); 139 acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
140 if (acl) { 140 if (acl) {
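current_umask() is the new accessor introduced by this series so that callers no longer reach into current->fs directly; the same substitution recurs in the gfs2, hpfs, jffs2, jfs and namei hunks below. Its definition is not shown in this diff; presumably it is a trivial wrapper along these lines:

	/* assumed shape of the helper (sketch; the real definition lives in
	 * the fs_struct headers introduced by this series) */
	#define current_umask() (current->fs->umask)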
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa763..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
215 if (error) 215 if (error)
216 return error; 216 return error;
217 if (!acl) { 217 if (!acl) {
218 mode &= ~current->fs->umask; 218 mode &= ~current_umask();
219 if (mode != ip->i_inode.i_mode) 219 if (mode != ip->i_inode.i_mode)
220 error = munge_mode(ip, mode); 220 error = munge_mode(ip, mode);
221 return error; 221 return error;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
82static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) 82static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
83{ 83{
84 struct super_block *sb = dentry->d_sb; 84 struct super_block *sb = dentry->d_sb;
85 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
85 86
86 buf->f_type = HFS_SUPER_MAGIC; 87 buf->f_type = HFS_SUPER_MAGIC;
87 buf->f_bsize = sb->s_blocksize; 88 buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
90 buf->f_bavail = buf->f_bfree; 91 buf->f_bavail = buf->f_bfree;
91 buf->f_files = HFS_SB(sb)->fs_ablocks; 92 buf->f_files = HFS_SB(sb)->fs_ablocks;
92 buf->f_ffree = HFS_SB(sb)->free_ablocks; 93 buf->f_ffree = HFS_SB(sb)->free_ablocks;
94 buf->f_fsid.val[0] = (u32)id;
95 buf->f_fsid.val[1] = (u32)(id >> 32);
93 buf->f_namelen = HFS_NAMELEN; 96 buf->f_namelen = HFS_NAMELEN;
94 97
95 return 0; 98 return 0;
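This hunk, and the matching ones in hfsplus, hpfs, isofs and minix below, fills statfs's previously-zero f_fsid from the backing device number: huge_encode_dev() folds the device's major/minor numbers into a u64, which is then split across the two 32-bit words of f_fsid. As a self-contained sketch of the encoding (the helper name here is hypothetical):

	static void example_fill_fsid(struct kstatfs *buf, struct super_block *sb)
	{
		u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

		buf->f_fsid.val[0] = (u32)id;		/* low 32 bits */
		buf->f_fsid.val[1] = (u32)(id >> 32);	/* high 32 bits */
	}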
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
48 48
49 opts->creator = HFSPLUS_DEF_CR_TYPE; 49 opts->creator = HFSPLUS_DEF_CR_TYPE;
50 opts->type = HFSPLUS_DEF_CR_TYPE; 50 opts->type = HFSPLUS_DEF_CR_TYPE;
51 opts->umask = current->fs->umask; 51 opts->umask = current_umask();
52 opts->uid = current_uid(); 52 opts->uid = current_uid();
53 opts->gid = current_gid(); 53 opts->gid = current_gid();
54 opts->part = -1; 54 opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
223static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 223static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
224{ 224{
225 struct super_block *sb = dentry->d_sb; 225 struct super_block *sb = dentry->d_sb;
226 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
226 227
227 buf->f_type = HFSPLUS_SUPER_MAGIC; 228 buf->f_type = HFSPLUS_SUPER_MAGIC;
228 buf->f_bsize = sb->s_blocksize; 229 buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
231 buf->f_bavail = buf->f_bfree; 232 buf->f_bavail = buf->f_bfree;
232 buf->f_files = 0xFFFFFFFF; 233 buf->f_files = 0xFFFFFFFF;
233 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 234 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
235 buf->f_fsid.val[0] = (u32)id;
236 buf->f_fsid.val[1] = (u32)(id >> 32);
234 buf->f_namelen = HFSPLUS_MAX_STRLEN; 237 buf->f_namelen = HFSPLUS_MAX_STRLEN;
235 238
236 return 0; 239 return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fecf402d7b8a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
136{ 136{
137 struct super_block *s = dentry->d_sb; 137 struct super_block *s = dentry->d_sb;
138 struct hpfs_sb_info *sbi = hpfs_sb(s); 138 struct hpfs_sb_info *sbi = hpfs_sb(s);
139 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
139 lock_kernel(); 140 lock_kernel();
140 141
141 /*if (sbi->sb_n_free == -1) {*/ 142 /*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
149 buf->f_bavail = sbi->sb_n_free; 150 buf->f_bavail = sbi->sb_n_free;
150 buf->f_files = sbi->sb_dirband_size / 4; 151 buf->f_files = sbi->sb_dirband_size / 4;
151 buf->f_ffree = sbi->sb_n_free_dnodes; 152 buf->f_ffree = sbi->sb_n_free_dnodes;
153 buf->f_fsid.val[0] = (u32)id;
154 buf->f_fsid.val[1] = (u32)(id >> 32);
152 buf->f_namelen = 254; 155 buf->f_namelen = 254;
153 156
154 unlock_kernel(); 157 unlock_kernel();
@@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
477 480
478 uid = current_uid(); 481 uid = current_uid();
479 gid = current_gid(); 482 gid = current_gid();
480 umask = current->fs->umask; 483 umask = current_umask();
481 lowercase = 0; 484 lowercase = 0;
482 conv = CONV_BINARY; 485 conv = CONV_BINARY;
483 eas = 2; 486 eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
280 "errno = %d\n", err); 280 "errno = %d\n", err);
281 return err; 281 return err;
282 } 282 }
283 count = hppfs_read_file(hppfs->host_fd, buf, count); 283 err = hppfs_read_file(hppfs->host_fd, buf, count);
284 if (err < 0) {
285 printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
286 return err;
287 }
288 count = err;
284 if (count > 0) 289 if (count > 0)
285 *ppos += count; 290 *ppos += count;
286 } 291 }
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
11 11
12struct super_block; 12struct super_block;
13struct linux_binprm; 13struct linux_binprm;
14struct path;
14 15
15/* 16/*
16 * block_dev.c 17 * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
43/* 44/*
44 * exec.c 45 * exec.c
45 */ 46 */
46extern void check_unsafe_exec(struct linux_binprm *); 47extern int check_unsafe_exec(struct linux_binprm *);
47 48
48/* 49/*
49 * namespace.c 50 * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
60extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 61extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
61 62
62extern void __init mnt_init(void); 63extern void __init mnt_init(void);
64
65/*
66 * fs_struct.c
67 */
68extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 13d2eddd0692..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -923,6 +923,7 @@ out_freesbi:
923static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) 923static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
924{ 924{
925 struct super_block *sb = dentry->d_sb; 925 struct super_block *sb = dentry->d_sb;
926 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
926 927
927 buf->f_type = ISOFS_SUPER_MAGIC; 928 buf->f_type = ISOFS_SUPER_MAGIC;
928 buf->f_bsize = sb->s_blocksize; 929 buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
932 buf->f_bavail = 0; 933 buf->f_bavail = 0;
933 buf->f_files = ISOFS_SB(sb)->s_ninodes; 934 buf->f_files = ISOFS_SB(sb)->s_ninodes;
934 buf->f_ffree = 0; 935 buf->f_ffree = 0;
936 buf->f_fsid.val[0] = (u32)id;
937 buf->f_fsid.val[1] = (u32)(id >> 32);
935 buf->f_namelen = NAME_MAX; 938 buf->f_namelen = NAME_MAX;
936 return 0; 939 return 0;
937} 940}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..f8077b9c8981 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bio.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
171 return (ret == -EIO); 172 return (ret == -EIO);
172} 173}
173 174
174static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 175static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
176 int write_op)
175{ 177{
176 int i; 178 int i;
177 179
178 for (i = 0; i < bufs; i++) { 180 for (i = 0; i < bufs; i++) {
179 wbuf[i]->b_end_io = end_buffer_write_sync; 181 wbuf[i]->b_end_io = end_buffer_write_sync;
180 /* We use-up our safety reference in submit_bh() */ 182 /* We use-up our safety reference in submit_bh() */
181 submit_bh(WRITE, wbuf[i]); 183 submit_bh(write_op, wbuf[i]);
182 } 184 }
183} 185}
184 186
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
186 * Submit all the data buffers to disk 188 * Submit all the data buffers to disk
187 */ 189 */
188static int journal_submit_data_buffers(journal_t *journal, 190static int journal_submit_data_buffers(journal_t *journal,
189 transaction_t *commit_transaction) 191 transaction_t *commit_transaction,
192 int write_op)
190{ 193{
191 struct journal_head *jh; 194 struct journal_head *jh;
192 struct buffer_head *bh; 195 struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
225 BUFFER_TRACE(bh, "needs blocking lock"); 228 BUFFER_TRACE(bh, "needs blocking lock");
226 spin_unlock(&journal->j_list_lock); 229 spin_unlock(&journal->j_list_lock);
227 /* Write out all data to prevent deadlocks */ 230 /* Write out all data to prevent deadlocks */
228 journal_do_submit_data(wbuf, bufs); 231 journal_do_submit_data(wbuf, bufs, write_op);
229 bufs = 0; 232 bufs = 0;
230 lock_buffer(bh); 233 lock_buffer(bh);
231 spin_lock(&journal->j_list_lock); 234 spin_lock(&journal->j_list_lock);
@@ -256,7 +259,7 @@ write_out_data:
256 jbd_unlock_bh_state(bh); 259 jbd_unlock_bh_state(bh);
257 if (bufs == journal->j_wbufsize) { 260 if (bufs == journal->j_wbufsize) {
258 spin_unlock(&journal->j_list_lock); 261 spin_unlock(&journal->j_list_lock);
259 journal_do_submit_data(wbuf, bufs); 262 journal_do_submit_data(wbuf, bufs, write_op);
260 bufs = 0; 263 bufs = 0;
261 goto write_out_data; 264 goto write_out_data;
262 } 265 }
@@ -286,7 +289,7 @@ write_out_data:
286 } 289 }
287 } 290 }
288 spin_unlock(&journal->j_list_lock); 291 spin_unlock(&journal->j_list_lock);
289 journal_do_submit_data(wbuf, bufs); 292 journal_do_submit_data(wbuf, bufs, write_op);
290 293
291 return err; 294 return err;
292} 295}
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
315 int first_tag = 0; 318 int first_tag = 0;
316 int tag_flag; 319 int tag_flag;
317 int i; 320 int i;
321 int write_op = WRITE;
318 322
319 /* 323 /*
320 * First job: lock down the current transaction and wait for 324 * First job: lock down the current transaction and wait for
@@ -347,6 +351,8 @@ void journal_commit_transaction(journal_t *journal)
347 spin_lock(&journal->j_state_lock); 351 spin_lock(&journal->j_state_lock);
348 commit_transaction->t_state = T_LOCKED; 352 commit_transaction->t_state = T_LOCKED;
349 353
354 if (commit_transaction->t_synchronous_commit)
355 write_op = WRITE_SYNC;
350 spin_lock(&commit_transaction->t_handle_lock); 356 spin_lock(&commit_transaction->t_handle_lock);
351 while (commit_transaction->t_updates) { 357 while (commit_transaction->t_updates) {
352 DEFINE_WAIT(wait); 358 DEFINE_WAIT(wait);
@@ -431,7 +437,8 @@ void journal_commit_transaction(journal_t *journal)
431 * Now start flushing things to disk, in the order they appear 437 * Now start flushing things to disk, in the order they appear
432 * on the transaction lists. Data blocks go first. 438 * on the transaction lists. Data blocks go first.
433 */ 439 */
434 err = journal_submit_data_buffers(journal, commit_transaction); 440 err = journal_submit_data_buffers(journal, commit_transaction,
441 write_op);
435 442
436 /* 443 /*
437 * Wait for all previously submitted IO to complete. 444 * Wait for all previously submitted IO to complete.
@@ -660,7 +667,7 @@ start_journal_io:
660 clear_buffer_dirty(bh); 667 clear_buffer_dirty(bh);
661 set_buffer_uptodate(bh); 668 set_buffer_uptodate(bh);
662 bh->b_end_io = journal_end_buffer_io_sync; 669 bh->b_end_io = journal_end_buffer_io_sync;
663 submit_bh(WRITE, bh); 670 submit_bh(write_op, bh);
664 } 671 }
665 cond_resched(); 672 cond_resched();
666 673
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
637 return NULL; 637 return NULL;
638 638
639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
640 if (!bh)
641 return NULL;
640 lock_buffer(bh); 642 lock_buffer(bh);
641 memset(bh->b_data, 0, journal->j_blocksize); 643 memset(bh->b_data, 0, journal->j_blocksize);
642 set_buffer_uptodate(bh); 644 set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
733 if (!journal->j_wbuf) { 735 if (!journal->j_wbuf) {
 734 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", 736 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
735 __func__); 737 __func__);
736 kfree(journal); 738 goto out_err;
737 journal = NULL;
738 goto out;
739 } 739 }
740 journal->j_dev = bdev; 740 journal->j_dev = bdev;
741 journal->j_fs_dev = fs_dev; 741 journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
743 journal->j_maxlen = len; 743 journal->j_maxlen = len;
744 744
745 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 745 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
746 J_ASSERT(bh != NULL); 746 if (!bh) {
747 printk(KERN_ERR
748 "%s: Cannot get buffer for journal superblock\n",
749 __func__);
750 goto out_err;
751 }
747 journal->j_sb_buffer = bh; 752 journal->j_sb_buffer = bh;
748 journal->j_superblock = (journal_superblock_t *)bh->b_data; 753 journal->j_superblock = (journal_superblock_t *)bh->b_data;
749out: 754
750 return journal; 755 return journal;
756out_err:
757 kfree(journal);
758 return NULL;
751} 759}
752 760
753/** 761/**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
787 if (!journal->j_wbuf) { 795 if (!journal->j_wbuf) {
 788 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", 796 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
789 __func__); 797 __func__);
790 kfree(journal); 798 goto out_err;
791 return NULL;
792 } 799 }
793 800
794 err = journal_bmap(journal, 0, &blocknr); 801 err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
796 if (err) { 803 if (err) {
 797 printk(KERN_ERR "%s: Cannot locate journal superblock\n", 804 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
798 __func__); 805 __func__);
799 kfree(journal); 806 goto out_err;
800 return NULL;
801 } 807 }
802 808
803 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 809 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
804 J_ASSERT(bh != NULL); 810 if (!bh) {
811 printk(KERN_ERR
812 "%s: Cannot get buffer for journal superblock\n",
813 __func__);
814 goto out_err;
815 }
805 journal->j_sb_buffer = bh; 816 journal->j_sb_buffer = bh;
806 journal->j_superblock = (journal_superblock_t *)bh->b_data; 817 journal->j_superblock = (journal_superblock_t *)bh->b_data;
807 818
808 return journal; 819 return journal;
820out_err:
821 kfree(journal);
822 return NULL;
809} 823}
810 824
811/* 825/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 if (handle->h_sync)
1444 transaction->t_synchronous_commit = 1;
1443 current->journal_info = NULL; 1445 current->journal_info = NULL;
1444 spin_lock(&journal->j_state_lock); 1446 spin_lock(&journal->j_state_lock);
1445 spin_lock(&transaction->t_handle_lock); 1447 spin_lock(&transaction->t_handle_lock);
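Taken together with the jbd/commit.c changes above, this completes the path by which a synchronous handle escalates the commit's block I/O priority: journal_stop() records h_sync in the transaction, and the commit thread then submits both data and metadata buffers with WRITE_SYNC instead of WRITE. A sketch of the triggering side, as a filesystem's fsync path would exercise it (error handling omitted):

	handle_t *handle = journal_start(journal, 1);

	/* ... modify metadata under the handle ... */

	handle->h_sync = 1;	/* request a synchronous commit */
	journal_stop(handle);	/* sets t_synchronous_commit (this patch) */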
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..77ccf8cb0823 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
336 return PTR_ERR(acl); 336 return PTR_ERR(acl);
337 337
338 if (!acl) { 338 if (!acl) {
339 *i_mode &= ~current->fs->umask; 339 *i_mode &= ~current_umask();
340 } else { 340 } else {
341 if (S_ISDIR(*i_mode)) 341 if (S_ISDIR(*i_mode))
342 jffs2_iset_acl(inode, &f->i_acl_default, acl); 342 jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e82..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
182cleanup: 182cleanup:
183 posix_acl_release(acl); 183 posix_acl_release(acl);
184 } else 184 } else
185 inode->i_mode &= ~current->fs->umask; 185 inode->i_mode &= ~current_umask();
186 186
187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | 187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
188 inode->i_mode; 188 inode->i_mode;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 618865b3128b..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -321,15 +321,20 @@ out:
321 321
322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) 322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
323{ 323{
324 struct minix_sb_info *sbi = minix_sb(dentry->d_sb); 324 struct super_block *sb = dentry->d_sb;
325 buf->f_type = dentry->d_sb->s_magic; 325 struct minix_sb_info *sbi = minix_sb(sb);
326 buf->f_bsize = dentry->d_sb->s_blocksize; 326 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
327 buf->f_type = sb->s_magic;
328 buf->f_bsize = sb->s_blocksize;
327 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 329 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
328 buf->f_bfree = minix_count_free_blocks(sbi); 330 buf->f_bfree = minix_count_free_blocks(sbi);
329 buf->f_bavail = buf->f_bfree; 331 buf->f_bavail = buf->f_bfree;
330 buf->f_files = sbi->s_ninodes; 332 buf->f_files = sbi->s_ninodes;
331 buf->f_ffree = minix_count_free_inodes(sbi); 333 buf->f_ffree = minix_count_free_inodes(sbi);
332 buf->f_namelen = sbi->s_namelen; 334 buf->f_namelen = sbi->s_namelen;
335 buf->f_fsid.val[0] = (u32)id;
336 buf->f_fsid.val[1] = (u32)(id >> 32);
337
333 return 0; 338 return 0;
334} 339}
335 340
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
94 93
95static struct bio * 94static struct bio *
96mpage_alloc(struct block_device *bdev, 95mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
439 * just allocate full-size (16-page) BIOs. 438 * just allocate full-size (16-page) BIOs.
440 */ 439 */
441 440
442int __mpage_writepage(struct page *page, struct writeback_control *wbc, 441struct mpage_data {
442 struct bio *bio;
443 sector_t last_block_in_bio;
444 get_block_t *get_block;
445 unsigned use_writepage;
446};
447
448static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 449 void *data)
444{ 450{
445 struct mpage_data *mpd = data; 451 struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
648 mpd->bio = bio; 654 mpd->bio = bio;
649 return ret; 655 return ret;
650} 656}
651EXPORT_SYMBOL(__mpage_writepage);
652 657
653/** 658/**
654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785d..b8433ebfae05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -1578,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1578 struct dentry *dir = nd->path.dentry; 1579 struct dentry *dir = nd->path.dentry;
1579 1580
1580 if (!IS_POSIXACL(dir->d_inode)) 1581 if (!IS_POSIXACL(dir->d_inode))
1581 mode &= ~current->fs->umask; 1582 mode &= ~current_umask();
1582 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1583 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1583 if (error) 1584 if (error)
1584 goto out_unlock; 1585 goto out_unlock;
@@ -1989,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1989 goto out_unlock; 1990 goto out_unlock;
1990 } 1991 }
1991 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1992 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1992 mode &= ~current->fs->umask; 1993 mode &= ~current_umask();
1993 error = may_mknod(mode); 1994 error = may_mknod(mode);
1994 if (error) 1995 if (error)
1995 goto out_dput; 1996 goto out_dput;
@@ -2067,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2067 goto out_unlock; 2068 goto out_unlock;
2068 2069
2069 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2070 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2070 mode &= ~current->fs->umask; 2071 mode &= ~current_umask();
2071 error = mnt_want_write(nd.path.mnt); 2072 error = mnt_want_write(nd.path.mnt);
2072 if (error) 2073 if (error)
2073 goto out_dput; 2074 goto out_dput;
@@ -2897,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink);
2897EXPORT_SYMBOL(vfs_unlink); 2898EXPORT_SYMBOL(vfs_unlink);
2898EXPORT_SYMBOL(dentry_unhash); 2899EXPORT_SYMBOL(dentry_unhash);
2899EXPORT_SYMBOL(generic_readlink); 2900EXPORT_SYMBOL(generic_readlink);
2900
2901/* to be mentioned only in INIT_TASK */
2902struct fs_struct init_fs = {
2903 .count = ATOMIC_INIT(1),
2904 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2905 .umask = 0022,
2906};
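init_fs does not disappear: this series consolidates fs_struct handling into a new fs/fs_struct.c, and the initializer presumably moves there unchanged (sketch of the assumed new location):

	/* fs/fs_struct.c (assumed; removed from namei.c above) */
	struct fs_struct init_fs = {
		.count	= ATOMIC_INIT(1),
		.lock	= __RW_LOCK_UNLOCKED(init_fs.lock),
		.umask	= 0022,
	};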
diff --git a/fs/namespace.c b/fs/namespace.c
index 0a42e0e96027..c6f54e4c4290 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
27#include <linux/ramfs.h> 27#include <linux/ramfs.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/idr.h> 29#include <linux/idr.h>
30#include <linux/fs_struct.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/unistd.h> 32#include <asm/unistd.h>
32#include "pnode.h" 33#include "pnode.h"
@@ -2093,66 +2094,6 @@ out1:
2093} 2094}
2094 2095
2095/* 2096/*
2096 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
2097 * It can block. Requires the big lock held.
2098 */
2099void set_fs_root(struct fs_struct *fs, struct path *path)
2100{
2101 struct path old_root;
2102
2103 write_lock(&fs->lock);
2104 old_root = fs->root;
2105 fs->root = *path;
2106 path_get(path);
2107 write_unlock(&fs->lock);
2108 if (old_root.dentry)
2109 path_put(&old_root);
2110}
2111
2112/*
2113 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
2114 * It can block. Requires the big lock held.
2115 */
2116void set_fs_pwd(struct fs_struct *fs, struct path *path)
2117{
2118 struct path old_pwd;
2119
2120 write_lock(&fs->lock);
2121 old_pwd = fs->pwd;
2122 fs->pwd = *path;
2123 path_get(path);
2124 write_unlock(&fs->lock);
2125
2126 if (old_pwd.dentry)
2127 path_put(&old_pwd);
2128}
2129
2130static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2131{
2132 struct task_struct *g, *p;
2133 struct fs_struct *fs;
2134
2135 read_lock(&tasklist_lock);
2136 do_each_thread(g, p) {
2137 task_lock(p);
2138 fs = p->fs;
2139 if (fs) {
2140 atomic_inc(&fs->count);
2141 task_unlock(p);
2142 if (fs->root.dentry == old_root->dentry
2143 && fs->root.mnt == old_root->mnt)
2144 set_fs_root(fs, new_root);
2145 if (fs->pwd.dentry == old_root->dentry
2146 && fs->pwd.mnt == old_root->mnt)
2147 set_fs_pwd(fs, new_root);
2148 put_fs_struct(fs);
2149 } else
2150 task_unlock(p);
2151 } while_each_thread(g, p);
2152 read_unlock(&tasklist_lock);
2153}
2154
2155/*
2156 * pivot_root Semantics: 2097 * pivot_root Semantics:
2157 * Moves the root file system of the current process to the directory put_old, 2098 * Moves the root file system of the current process to the directory put_old,
2158 * makes new_root as the new root file system of the current process, and sets 2099 * makes new_root as the new root file system of the current process, and sets
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
84 <file:Documentation/filesystems/nfsroot.txt>. 84 <file:Documentation/filesystems/nfsroot.txt>.
85 85
86 Most people say N here. 86 Most people say N here.
87
88config NFS_FSCACHE
89 bool "Provide NFS client caching support (EXPERIMENTAL)"
90 depends on EXPERIMENTAL
91 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
92 help
93 Say Y here if you want NFS data to be cached locally on disc through
94 the general filesystem cache manager
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 callback.o callback_xdr.o callback_proc.o \ 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_SYSCTL) += sysctl.o 17nfs-$(CONFIG_SYSCTL) += sysctl.o
18nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index aba38017bdef..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
45#include "delegation.h" 45#include "delegation.h"
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h"
48 49
49#define NFSDBG_FACILITY NFSDBG_CLIENT 50#define NFSDBG_FACILITY NFSDBG_CLIENT
50 51
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
154 if (!IS_ERR(cred)) 155 if (!IS_ERR(cred))
155 clp->cl_machine_cred = cred; 156 clp->cl_machine_cred = cred;
156 157
158 nfs_fscache_get_client_cookie(clp);
159
157 return clp; 160 return clp;
158 161
159error_3: 162error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
187 190
188 nfs4_shutdown_client(clp); 191 nfs4_shutdown_client(clp);
189 192
193 nfs_fscache_release_client_cookie(clp);
194
190 /* -EIO all pending I/O */ 195 /* -EIO all pending I/O */
191 if (!IS_ERR(clp->cl_rpcclient)) 196 if (!IS_ERR(clp->cl_rpcclient))
192 rpc_shutdown_client(clp->cl_rpcclient); 197 rpc_shutdown_client(clp->cl_rpcclient);
@@ -760,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
760 765
761 /* Initialise the client representation from the mount data */ 766 /* Initialise the client representation from the mount data */
762 server->flags = data->flags; 767 server->flags = data->flags;
768 server->options = data->options;
763 769
764 if (data->rsize) 770 if (data->rsize)
765 server->rsize = nfs_block_size(data->rsize, NULL); 771 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1148,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
1148 /* Initialise the client representation from the mount data */ 1154 /* Initialise the client representation from the mount data */
1149 server->flags = data->flags; 1155 server->flags = data->flags;
1150 server->caps |= NFS_CAP_ATOMIC_OPEN; 1156 server->caps |= NFS_CAP_ATOMIC_OPEN;
1157 server->options = data->options;
1151 1158
1152 /* Get a client record */ 1159 /* Get a client record */
1153 error = nfs4_set_client(server, 1160 error = nfs4_set_client(server,
@@ -1559,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1559 1566
1560 /* display header on line 1 */ 1567 /* display header on line 1 */
1561 if (v == &nfs_volume_list) { 1568 if (v == &nfs_volume_list) {
1562 seq_puts(m, "NV SERVER PORT DEV FSID\n"); 1569 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
1563 return 0; 1570 return 0;
1564 } 1571 }
1565 /* display one transport per line on subsequent lines */ 1572 /* display one transport per line on subsequent lines */
@@ -1573,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1573 (unsigned long long) server->fsid.major, 1580 (unsigned long long) server->fsid.major,
1574 (unsigned long long) server->fsid.minor); 1581 (unsigned long long) server->fsid.minor);
1575 1582
1576 seq_printf(m, "v%u %s %s %-7s %-17s\n", 1583 seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
1577 clp->rpc_ops->version, 1584 clp->rpc_ops->version,
1578 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1585 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1579 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), 1586 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1580 dev, 1587 dev,
1581 fsid); 1588 fsid,
1589 nfs_server_fscache_state(server));
1582 1590
1583 return 0; 1591 return 0;
1584} 1592}
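The cookie get/release hooks added here are defined in the new fs/nfs/fscache.c, which is not part of this hunk. Given the index definitions in fscache-index.c below, the get side presumably reduces to acquiring a server-index cookie under the NFS top-level index, roughly:

	/* sketch of the assumed implementation; the function name and the
	 * fscache_acquire_cookie() interface are real, the body is inferred */
	void nfs_fscache_get_client_cookie(struct nfs_client *clp)
	{
		clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
						      &nfs_fscache_server_index_def,
						      clp);
	}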
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0abf3f331f56..3523b895eb4b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
35#include "delegation.h" 35#include "delegation.h"
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h"
38 39
39#define NFSDBG_FACILITY NFSDBG_FILE 40#define NFSDBG_FACILITY NFSDBG_FILE
40 41
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
409 return copied; 410 return copied;
410} 411}
411 412
413/*
414 * Partially or wholly invalidate a page
415 * - Release the private state associated with a page if undergoing complete
416 * page invalidation
417 * - Called if either PG_private or PG_fscache is set on the page
418 * - Caller holds page lock
419 */
412static void nfs_invalidate_page(struct page *page, unsigned long offset) 420static void nfs_invalidate_page(struct page *page, unsigned long offset)
413{ 421{
 414 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 422 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
417 return; 425 return;
418 /* Cancel any unstarted writes on this page */ 426 /* Cancel any unstarted writes on this page */
419 nfs_wb_page_cancel(page->mapping->host, page); 427 nfs_wb_page_cancel(page->mapping->host, page);
428
429 nfs_fscache_invalidate_page(page, page->mapping->host);
420} 430}
421 431
432/*
433 * Attempt to release the private state associated with a page
434 * - Called if either PG_private or PG_fscache is set on the page
435 * - Caller holds page lock
436 * - Return true (may release page) or false (may not)
437 */
422static int nfs_release_page(struct page *page, gfp_t gfp) 438static int nfs_release_page(struct page *page, gfp_t gfp)
423{ 439{
424 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 440 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
425 441
426 /* If PagePrivate() is set, then the page is not freeable */ 442 /* If PagePrivate() is set, then the page is not freeable */
427 return 0; 443 if (PagePrivate(page))
444 return 0;
445 return nfs_fscache_release_page(page, gfp);
428} 446}
429 447
448/*
449 * Attempt to clear the private state associated with a page when an error
450 * occurs that requires the cached contents of an inode to be written back or
451 * destroyed
 452 * - Called if either PG_private or PG_fscache is set on the page
453 * - Caller holds page lock
454 * - Return 0 if successful, -error otherwise
455 */
430static int nfs_launder_page(struct page *page) 456static int nfs_launder_page(struct page *page)
431{ 457{
432 struct inode *inode = page->mapping->host; 458 struct inode *inode = page->mapping->host;
459 struct nfs_inode *nfsi = NFS_I(inode);
433 460
 434 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 461 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
435 inode->i_ino, (long long)page_offset(page)); 462 inode->i_ino, (long long)page_offset(page));
436 463
464 nfs_fscache_wait_on_page_write(nfsi, page);
437 return nfs_wb_page(inode, page); 465 return nfs_wb_page(inode, page);
438} 466}
439 467
@@ -451,6 +479,11 @@ const struct address_space_operations nfs_file_aops = {
451 .launder_page = nfs_launder_page, 479 .launder_page = nfs_launder_page,
452}; 480};
453 481
482/*
483 * Notification that a PTE pointing to an NFS page is about to be made
484 * writable, implying that someone is about to modify the page through a
485 * shared-writable mapping
486 */
454static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 487static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
455{ 488{
456 struct page *page = vmf->page; 489 struct page *page = vmf->page;
@@ -465,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
465 filp->f_mapping->host->i_ino, 498 filp->f_mapping->host->i_ino,
466 (long long)page_offset(page)); 499 (long long)page_offset(page));
467 500
501 /* make sure the cache has finished storing the page */
502 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
503
468 lock_page(page); 504 lock_page(page);
469 mapping = page->mapping; 505 mapping = page->mapping;
470 if (mapping != dentry->d_inode->i_mapping) 506 if (mapping != dentry->d_inode->i_mapping)
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
1/* NFS FS-Cache index structure definition
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19
20#include "internal.h"
21#include "fscache.h"
22
23#define NFSDBG_FACILITY NFSDBG_FSCACHE
24
25/*
26 * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
27 * the cookie for the top-level index object for NFS into here. The top-level
 28 * index can then have other cache objects inserted into it.
29 */
30struct fscache_netfs nfs_fscache_netfs = {
31 .name = "nfs",
32 .version = 0,
33};
34
35/*
36 * Register NFS for caching
37 */
38int nfs_fscache_register(void)
39{
40 return fscache_register_netfs(&nfs_fscache_netfs);
41}
42
43/*
44 * Unregister NFS for caching
45 */
46void nfs_fscache_unregister(void)
47{
48 fscache_unregister_netfs(&nfs_fscache_netfs);
49}
50
51/*
52 * Layout of the key for an NFS server cache object.
53 */
54struct nfs_server_key {
55 uint16_t nfsversion; /* NFS protocol version */
56 uint16_t family; /* address family */
57 uint16_t port; /* IP port */
58 union {
59 struct in_addr ipv4_addr; /* IPv4 address */
60 struct in6_addr ipv6_addr; /* IPv6 address */
61 } addr[0];
62};
63
64/*
65 * Generate a key to describe a server in the main NFS index
66 * - We return the length of the key, or 0 if we can't generate one
67 */
68static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
69 void *buffer, uint16_t bufmax)
70{
71 const struct nfs_client *clp = cookie_netfs_data;
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
73 const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key);
76
 77	memset(key, 0, len);
 78
 79	key->nfsversion = clp->rpc_ops->version;
 80	key->family = clp->cl_addr.ss_family;
81
82 switch (clp->cl_addr.ss_family) {
83 case AF_INET:
84 key->port = sin->sin_port;
85 key->addr[0].ipv4_addr = sin->sin_addr;
86 len += sizeof(key->addr[0].ipv4_addr);
87 break;
88
89 case AF_INET6:
90 key->port = sin6->sin6_port;
91 key->addr[0].ipv6_addr = sin6->sin6_addr;
92 len += sizeof(key->addr[0].ipv6_addr);
93 break;
94
95 default:
96 printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
97 clp->cl_addr.ss_family);
98 len = 0;
99 break;
100 }
101
102 return len;
103}
104
105/*
106 * Define the server object for FS-Cache. This is used to describe a server
107 * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
108 * server address parameters.
109 */
110const struct fscache_cookie_def nfs_fscache_server_index_def = {
111 .name = "NFS.server",
112 .type = FSCACHE_COOKIE_TYPE_INDEX,
113 .get_key = nfs_server_get_key,
114};
115
116/*
117 * Generate a key to describe a superblock key in the main NFS index
118 */
119static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
120 void *buffer, uint16_t bufmax)
121{
122 const struct nfs_fscache_key *key;
123 const struct nfs_server *nfss = cookie_netfs_data;
124 uint16_t len;
125
126 key = nfss->fscache_key;
127 len = sizeof(key->key) + key->key.uniq_len;
128 if (len > bufmax) {
129 len = 0;
130 } else {
131 memcpy(buffer, &key->key, sizeof(key->key));
132 memcpy(buffer + sizeof(key->key),
133 key->key.uniquifier, key->key.uniq_len);
134 }
135
136 return len;
137}
138
139/*
140 * Define the superblock object for FS-Cache. This is used to describe a
141 * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
142 * parameters that might cause a separate superblock.
143 */
144const struct fscache_cookie_def nfs_fscache_super_index_def = {
145 .name = "NFS.super",
146 .type = FSCACHE_COOKIE_TYPE_INDEX,
147 .get_key = nfs_super_get_key,
148};
149
150/*
151 * Definition of the auxiliary data attached to NFS inode storage objects
152 * within the cache.
153 *
154 * The contents of this struct are recorded in the on-disk local cache in the
155 * auxiliary data attached to the data storage object backing an inode. This
156 * permits coherency to be managed when a new inode binds to an already extant
157 * cache object.
158 */
159struct nfs_fscache_inode_auxdata {
160 struct timespec mtime;
161 struct timespec ctime;
162 loff_t size;
163 u64 change_attr;
164};
165
166/*
167 * Generate a key to describe an NFS inode in an NFS server's index
168 */
169static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
170 void *buffer, uint16_t bufmax)
171{
172 const struct nfs_inode *nfsi = cookie_netfs_data;
173 uint16_t nsize;
174
175 /* use the inode's NFS filehandle as the key */
176 nsize = nfsi->fh.size;
177 memcpy(buffer, nfsi->fh.data, nsize);
178 return nsize;
179}
180
181/*
182 * Get certain file attributes from the netfs data
183 * - This function can be absent for an index
184 * - Not permitted to return an error
185 * - The netfs data from the cookie being used as the source is presented
186 */
187static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
188 uint64_t *size)
189{
190 const struct nfs_inode *nfsi = cookie_netfs_data;
191
192 *size = nfsi->vfs_inode.i_size;
193}
194
195/*
196 * Get the auxiliary data from netfs data
197 * - This function can be absent if the index carries no state data
198 * - Should store the auxiliary data in the buffer
199 * - Should return the amount of data stored
200 * - Not permitted to return an error
201 * - The netfs data from the cookie being used as the source is presented
202 */
203static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
204 void *buffer, uint16_t bufmax)
205{
206 struct nfs_fscache_inode_auxdata auxdata;
207 const struct nfs_inode *nfsi = cookie_netfs_data;
208
209 memset(&auxdata, 0, sizeof(auxdata));
210 auxdata.size = nfsi->vfs_inode.i_size;
211 auxdata.mtime = nfsi->vfs_inode.i_mtime;
212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr;
216
217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata);
219
220 memcpy(buffer, &auxdata, bufmax);
221 return bufmax;
222}
223
224/*
225 * Consult the netfs about the state of an object
226 * - This function can be absent if the index carries no state data
227 * - The netfs data from the cookie being used as the target is
228 * presented, as is the auxiliary data
229 */
230static
231enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
232 const void *data,
233 uint16_t datalen)
234{
235 struct nfs_fscache_inode_auxdata auxdata;
236 struct nfs_inode *nfsi = cookie_netfs_data;
237
238 if (datalen != sizeof(auxdata))
239 return FSCACHE_CHECKAUX_OBSOLETE;
240
241 memset(&auxdata, 0, sizeof(auxdata));
242 auxdata.size = nfsi->vfs_inode.i_size;
243 auxdata.mtime = nfsi->vfs_inode.i_mtime;
244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr;
248
249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE;
251
252 return FSCACHE_CHECKAUX_OKAY;
253}
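
If check_aux() reports FSCACHE_CHECKAUX_OBSOLETE, FS-Cache discards the cached
data for that object, so a mismatch in size, mtime, ctime or (on NFSv4) the
change attribute between the on-disk auxiliary record and the in-memory inode
is sufficient to retire a stale cache object at bind time.
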
254
255/*
256 * Indication from FS-Cache that the cookie is no longer cached
257 * - This function is called when the backing store currently caching a cookie
258 * is removed
259 * - The netfs should use this to clean up any markers indicating cached pages
260 * - This is mandatory for any object that may have data
261 */
262static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
263{
264 struct nfs_inode *nfsi = cookie_netfs_data;
265 struct pagevec pvec;
266 pgoff_t first;
267 int loop, nr_pages;
268
269 pagevec_init(&pvec, 0);
270 first = 0;
271
272 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
273
274 for (;;) {
275 /* grab a bunch of pages to unmark */
276 nr_pages = pagevec_lookup(&pvec,
277 nfsi->vfs_inode.i_mapping,
278 first,
279 PAGEVEC_SIZE - pagevec_count(&pvec));
280 if (!nr_pages)
281 break;
282
283 for (loop = 0; loop < nr_pages; loop++)
284 ClearPageFsCache(pvec.pages[loop]);
285
286 first = pvec.pages[nr_pages - 1]->index + 1;
287
288 pvec.nr = nr_pages;
289 pagevec_release(&pvec);
290 cond_resched();
291 }
292}
293
294/*
295 * Get an extra reference on a read context.
296 * - This function can be absent if the completion function doesn't require a
297 * context.
298 * - The read context is passed back to NFS in the event that a data read on the
299 * cache fails with EIO - in which case the server must be contacted to
300 * retrieve the data, which requires the read context for security.
301 */
302static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
303{
304 get_nfs_open_context(context);
305}
306
307/*
308 * Release an extra reference on a read context.
309 * - This function can be absent if the completion function doesn't require a
310 * context.
311 */
312static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
313{
314 if (context)
315 put_nfs_open_context(context);
316}
317
318/*
319 * Define the inode object for FS-Cache. This is used to describe an inode
320 * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
321 * an inode.
322 *
323 * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
324 * held in the cache auxiliary data for the data storage object with those in
325 * the inode struct in memory.
326 */
327const struct fscache_cookie_def nfs_fscache_inode_object_def = {
328 .name = "NFS.fh",
329 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
330 .get_key = nfs_fscache_inode_get_key,
331 .get_attr = nfs_fscache_inode_get_attr,
332 .get_aux = nfs_fscache_inode_get_aux,
333 .check_aux = nfs_fscache_inode_check_aux,
334 .now_uncached = nfs_fscache_inode_now_uncached,
335 .get_context = nfs_fh_get_context,
336 .put_context = nfs_fh_put_context,
337};
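
Taken together, the cookie definitions in this file describe a three-level
index tree: a server index under the netfs primary index, a superblock index
under each server, and data-file objects under each superblock. A minimal
sketch of how the levels chain together, assuming the three-argument
fscache_acquire_cookie() of this API generation (clp, nfss and nfsi stand for
the nfs_client, nfs_server and nfs_inode used as netfs data in fscache.c
below):

	struct fscache_cookie *server_cookie, *super_cookie, *inode_cookie;

	/* the parent of the server index is the "nfs" primary index that
	 * fscache_register_netfs() installed in nfs_fscache_netfs */
	server_cookie = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
					       &nfs_fscache_server_index_def,
					       clp);
	super_cookie  = fscache_acquire_cookie(server_cookie,
					       &nfs_fscache_super_index_def,
					       nfss);
	inode_cookie  = fscache_acquire_cookie(super_cookie,
					       &nfs_fscache_inode_object_def,
					       nfsi);
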
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
1/* NFS filesystem cache interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19#include <linux/seq_file.h>
20
21#include "internal.h"
22#include "iostat.h"
23#include "fscache.h"
24
25#define NFSDBG_FACILITY NFSDBG_FSCACHE
26
27static struct rb_root nfs_fscache_keys = RB_ROOT;
28static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
29
30/*
31 * Get the per-client index cookie for an NFS client if the appropriate mount
32 * flag was set
33 * - We always try to get an index cookie for the client, but get filehandle
34 * cookies on a per-superblock basis, depending on the mount flags
35 */
36void nfs_fscache_get_client_cookie(struct nfs_client *clp)
37{
38 /* create a cache index for looking up filehandles */
39 clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
40 &nfs_fscache_server_index_def,
41 clp);
42 dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
43 clp, clp->fscache);
44}
45
46/*
47 * Dispose of a per-client cookie
48 */
49void nfs_fscache_release_client_cookie(struct nfs_client *clp)
50{
51 dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
52 clp, clp->fscache);
53
54 fscache_relinquish_cookie(clp->fscache, 0);
55 clp->fscache = NULL;
56}
57
58/*
59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us.
61 */
62void nfs_fscache_get_super_cookie(struct super_block *sb,
63 struct nfs_parsed_mount_data *data)
64{
65 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen;
70
71 ulen = strlen(uniq);
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key)
74 return;
75
76 key->nfs_client = nfss->nfs_client;
77 key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
78 key->key.nfs_server.flags = nfss->flags;
79 key->key.nfs_server.rsize = nfss->rsize;
80 key->key.nfs_server.wsize = nfss->wsize;
81 key->key.nfs_server.acregmin = nfss->acregmin;
82 key->key.nfs_server.acregmax = nfss->acregmax;
83 key->key.nfs_server.acdirmin = nfss->acdirmin;
84 key->key.nfs_server.acdirmax = nfss->acdirmax;
85 key->key.nfs_server.fsid = nfss->fsid;
86 key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
87
88 key->key.uniq_len = ulen;
89 memcpy(key->key.uniquifier, uniq, ulen);
90
91 spin_lock(&nfs_fscache_keys_lock);
92 p = &nfs_fscache_keys.rb_node;
93 parent = NULL;
94 while (*p) {
95 parent = *p;
96 xkey = rb_entry(parent, struct nfs_fscache_key, node);
97
98 if (key->nfs_client < xkey->nfs_client)
99 goto go_left;
100 if (key->nfs_client > xkey->nfs_client)
101 goto go_right;
102
103 diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
104 if (diff < 0)
105 goto go_left;
106 if (diff > 0)
107 goto go_right;
108
109 if (key->key.uniq_len == 0)
110 goto non_unique;
111 diff = memcmp(key->key.uniquifier,
112 xkey->key.uniquifier,
113 key->key.uniq_len);
114 if (diff < 0)
115 goto go_left;
116 if (diff > 0)
117 goto go_right;
118 goto non_unique;
119
120 go_left:
121 p = &(*p)->rb_left;
122 continue;
123 go_right:
124 p = &(*p)->rb_right;
125 }
126
127 rb_link_node(&key->node, parent, p);
128 rb_insert_color(&key->node, &nfs_fscache_keys);
129 spin_unlock(&nfs_fscache_keys_lock);
130 nfss->fscache_key = key;
131
132 /* create a cache index for looking up filehandles */
133 nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
134 &nfs_fscache_super_index_def,
135 nfss);
136 dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
137 nfss, nfss->fscache);
138 return;
139
140non_unique:
141 spin_unlock(&nfs_fscache_keys_lock);
142 kfree(key);
143 nfss->fscache_key = NULL;
144 nfss->fscache = NULL;
145 printk(KERN_WARNING "NFS:"
146 " Cache request denied due to non-unique superblock keys\n");
147}
148
149/*
150 * release a per-superblock cookie
151 */
152void nfs_fscache_release_super_cookie(struct super_block *sb)
153{
154 struct nfs_server *nfss = NFS_SB(sb);
155
156 dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
157 nfss, nfss->fscache);
158
159 fscache_relinquish_cookie(nfss->fscache, 0);
160 nfss->fscache = NULL;
161
162 if (nfss->fscache_key) {
163 spin_lock(&nfs_fscache_keys_lock);
164 rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
165 spin_unlock(&nfs_fscache_keys_lock);
166 kfree(nfss->fscache_key);
167 nfss->fscache_key = NULL;
168 }
169}
170
171/*
172 * Initialise the per-inode cache cookie pointer for an NFS inode.
173 */
174void nfs_fscache_init_inode_cookie(struct inode *inode)
175{
176 NFS_I(inode)->fscache = NULL;
177 if (S_ISREG(inode->i_mode))
178 set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
179}
180
181/*
182 * Get the per-inode cache cookie for an NFS inode.
183 */
184static void nfs_fscache_enable_inode_cookie(struct inode *inode)
185{
186 struct super_block *sb = inode->i_sb;
187 struct nfs_inode *nfsi = NFS_I(inode);
188
189 if (nfsi->fscache || !NFS_FSCACHE(inode))
190 return;
191
192 if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
193 nfsi->fscache = fscache_acquire_cookie(
194 NFS_SB(sb)->fscache,
195 &nfs_fscache_inode_object_def,
196 nfsi);
197
198 dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
199 sb, nfsi, nfsi->fscache);
200 }
201}
202
203/*
204 * Release a per-inode cookie.
205 */
206void nfs_fscache_release_inode_cookie(struct inode *inode)
207{
208 struct nfs_inode *nfsi = NFS_I(inode);
209
210 dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
211 nfsi, nfsi->fscache);
212
213 fscache_relinquish_cookie(nfsi->fscache, 0);
214 nfsi->fscache = NULL;
215}
216
217/*
218 * Retire a per-inode cookie, destroying the data attached to it.
219 */
220void nfs_fscache_zap_inode_cookie(struct inode *inode)
221{
222 struct nfs_inode *nfsi = NFS_I(inode);
223
224 dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
225 nfsi, nfsi->fscache);
226
227 fscache_relinquish_cookie(nfsi->fscache, 1);
228 nfsi->fscache = NULL;
229}
230
231/*
232 * Turn off caching for a per-inode cookie when the file is opened for
233 * writing, invalidating all pages of the associated inode in the page
234 * cache so that the per-page caching state is cleared.
235 */
236static void nfs_fscache_disable_inode_cookie(struct inode *inode)
237{
238 clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
239
240 if (NFS_I(inode)->fscache) {
241 dfprintk(FSCACHE,
242 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
243
244 /* Need to invalidate any mapped pages that were read in before
245 * turning off the cache.
246 */
247 if (inode->i_mapping && inode->i_mapping->nrpages)
248 invalidate_inode_pages2(inode->i_mapping);
249
250 nfs_fscache_zap_inode_cookie(inode);
251 }
252}
253
254/*
255 * wait_on_bit() sleep function for uninterruptible waiting
256 */
257static int nfs_fscache_wait_bit(void *flags)
258{
259 schedule();
260 return 0;
261}
262
263/*
264 * Lock against someone else trying to also acquire or relinquish a cookie
265 */
266static inline void nfs_fscache_inode_lock(struct inode *inode)
267{
268 struct nfs_inode *nfsi = NFS_I(inode);
269
270 while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
271 wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
272 nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
273}
274
275/*
276 * Unlock cookie management lock
277 */
278static inline void nfs_fscache_inode_unlock(struct inode *inode)
279{
280 struct nfs_inode *nfsi = NFS_I(inode);
281
282 smp_mb__before_clear_bit();
283 clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
286}
287
288/*
289 * Decide if we should enable or disable local caching for this inode.
290 * - For now, with NFS, only regular files that are open read-only will be able
291 * to use the cache.
292 * - May be invoked multiple times in parallel by concurrent nfs_open() calls.
293 */
294void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
295{
296 if (NFS_FSCACHE(inode)) {
297 nfs_fscache_inode_lock(inode);
298 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
299 nfs_fscache_disable_inode_cookie(inode);
300 else
301 nfs_fscache_enable_inode_cookie(inode);
302 nfs_fscache_inode_unlock(inode);
303 }
304}
305
306/*
307 * Replace a per-inode cookie due to revalidation detecting a file having
308 * changed on the server.
309 */
310void nfs_fscache_reset_inode_cookie(struct inode *inode)
311{
312 struct nfs_inode *nfsi = NFS_I(inode);
313 struct nfs_server *nfss = NFS_SERVER(inode);
314 struct fscache_cookie *old = nfsi->fscache;
315
316 nfs_fscache_inode_lock(inode);
317 if (nfsi->fscache) {
318 /* retire the current fscache cache and get a new one */
319 fscache_relinquish_cookie(nfsi->fscache, 1);
320
321 nfsi->fscache = fscache_acquire_cookie(
322			nfss->fscache,
323 &nfs_fscache_inode_object_def,
324 nfsi);
325
326 dfprintk(FSCACHE,
327 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
328 nfss, nfsi, old, nfsi->fscache);
329 }
330 nfs_fscache_inode_unlock(inode);
331}
332
333/*
334 * Release the caching state associated with a page, if the page isn't busy
335 * interacting with the cache.
336 * - Returns true (can release page) or false (page busy).
337 */
338int nfs_fscache_release_page(struct page *page, gfp_t gfp)
339{
340 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
341 struct fscache_cookie *cookie = nfsi->fscache;
342
343 BUG_ON(!cookie);
344
345 if (fscache_check_page_write(cookie, page)) {
346 if (!(gfp & __GFP_WAIT))
347 return 0;
348 fscache_wait_on_page_write(cookie, page);
349 }
350
351 if (PageFsCache(page)) {
352 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
353 cookie, page, nfsi);
354
355 fscache_uncache_page(cookie, page);
356 nfs_add_fscache_stats(page->mapping->host,
357 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
358 }
359
360 return 1;
361}
362
363/*
364 * Release the caching state associated with a page if undergoing complete page
365 * invalidation.
366 */
367void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
368{
369 struct nfs_inode *nfsi = NFS_I(inode);
370 struct fscache_cookie *cookie = nfsi->fscache;
371
372 BUG_ON(!cookie);
373
374 dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
375 cookie, page, nfsi);
376
377 fscache_wait_on_page_write(cookie, page);
378
379 BUG_ON(!PageLocked(page));
380 fscache_uncache_page(cookie, page);
381 nfs_add_fscache_stats(page->mapping->host,
382 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
383}
384
385/*
386 * Handle completion of a page being read from the cache.
387 * - Called in process (keventd) context.
388 */
389static void nfs_readpage_from_fscache_complete(struct page *page,
390 void *context,
391 int error)
392{
393 dfprintk(FSCACHE,
394 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
395 page, context, error);
396
397	/* if the read completes with an error, reissue the read to the server;
398	 * if that also fails, unlock the page and let the VM retry */
399 if (!error) {
400 SetPageUptodate(page);
401 unlock_page(page);
402 } else {
403 error = nfs_readpage_async(context, page->mapping->host, page);
404 if (error)
405 unlock_page(page);
406 }
407}
408
409/*
410 * Retrieve a page from fscache
411 */
412int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
413 struct inode *inode, struct page *page)
414{
415 int ret;
416
417 dfprintk(FSCACHE,
418 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
419 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
420
421 ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
422 page,
423 nfs_readpage_from_fscache_complete,
424 ctx,
425 GFP_KERNEL);
426
427 switch (ret) {
428 case 0: /* read BIO submitted (page in fscache) */
429 dfprintk(FSCACHE,
430 "NFS: readpage_from_fscache: BIO submitted\n");
431 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
432 return ret;
433
434 case -ENOBUFS: /* inode not in cache */
435 case -ENODATA: /* page not in cache */
436 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
437 dfprintk(FSCACHE,
438 "NFS: readpage_from_fscache %d\n", ret);
439 return 1;
440
441 default:
442 dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
443 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
444 }
445 return ret;
446}
447
448/*
449 * Retrieve a set of pages from fscache
450 */
451int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
452 struct inode *inode,
453 struct address_space *mapping,
454 struct list_head *pages,
455 unsigned *nr_pages)
456{
457 int ret, npages = *nr_pages;
458
459 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
460 NFS_I(inode)->fscache, npages, inode);
461
462 ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
463 mapping, pages, nr_pages,
464 nfs_readpage_from_fscache_complete,
465 ctx,
466 mapping_gfp_mask(mapping));
467 if (*nr_pages < npages)
468 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
469 npages);
470 if (*nr_pages > 0)
471 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
472 *nr_pages);
473
474 switch (ret) {
475 case 0: /* read submitted to the cache for all pages */
476 BUG_ON(!list_empty(pages));
477 BUG_ON(*nr_pages != 0);
478 dfprintk(FSCACHE,
479 "NFS: nfs_getpages_from_fscache: submitted\n");
480
481 return ret;
482
483 case -ENOBUFS: /* some pages aren't cached and can't be */
484 case -ENODATA: /* some pages aren't cached */
485 dfprintk(FSCACHE,
486 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
487 return 1;
488
489 default:
490 dfprintk(FSCACHE,
491 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
492 }
493
494 return ret;
495}
496
497/*
498 * Store a newly fetched page in fscache
499 * - PG_fscache must be set on the page
500 */
501void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
502{
503 int ret;
504
505 dfprintk(FSCACHE,
506 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
507 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
508
509 ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
510 dfprintk(FSCACHE,
511 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
512 page, page->index, page->flags, ret);
513
514 if (ret != 0) {
515 fscache_uncache_page(NFS_I(inode)->fscache, page);
516 nfs_add_fscache_stats(inode,
517 NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
518 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
519 } else {
520 nfs_add_fscache_stats(inode,
521 NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
522 }
523}
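
The read helpers above follow a three-way return convention: 0 means the read
was submitted to the cache (the completion handler will unlock the page), 1
means the cache had nothing (-ENOBUFS/-ENODATA) and the caller should fall
back to the server, and a negative value is a hard error. A sketch of the
caller side, mirroring the nfs_readpage() change in read.c further down (any
nonzero return falls back to a server read):

	error = nfs_readpage_from_fscache(ctx, inode, page);
	if (error == 0)
		goto out;	/* page will be filled from the cache */

	/* 1, -ENOBUFS or another error: read from the NFS server instead */
	error = nfs_readpage_async(ctx, inode, page);
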
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
1/* NFS filesystem cache interface definitions
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _NFS_FSCACHE_H
13#define _NFS_FSCACHE_H
14
15#include <linux/nfs_fs.h>
16#include <linux/nfs_mount.h>
17#include <linux/nfs4_mount.h>
18#include <linux/fscache.h>
19
20#ifdef CONFIG_NFS_FSCACHE
21
22/*
23 * set of NFS FS-Cache objects that form a superblock key
24 */
25struct nfs_fscache_key {
26 struct rb_node node;
27 struct nfs_client *nfs_client; /* the server */
28
29 /* the elements of the unique key - as used by nfs_compare_super() and
30 * nfs_compare_mount_options() to distinguish superblocks */
31 struct {
32 struct {
33 unsigned long s_flags; /* various flags
34 * (& NFS_MS_MASK) */
35 } super;
36
37 struct {
38 struct nfs_fsid fsid;
39 int flags;
40 unsigned int rsize; /* read size */
41 unsigned int wsize; /* write size */
42 unsigned int acregmin; /* attr cache timeouts */
43 unsigned int acregmax;
44 unsigned int acdirmin;
45 unsigned int acdirmax;
46 } nfs_server;
47
48 struct {
49 rpc_authflavor_t au_flavor;
50 } rpc_auth;
51
52 /* uniquifier - can be used if nfs_server.flags includes
53 * NFS_MOUNT_UNSHARED */
54 u8 uniq_len;
55 char uniquifier[0];
56 } key;
57};
58
59/*
60 * fscache-index.c
61 */
62extern struct fscache_netfs nfs_fscache_netfs;
63extern const struct fscache_cookie_def nfs_fscache_server_index_def;
64extern const struct fscache_cookie_def nfs_fscache_super_index_def;
65extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
66
67extern int nfs_fscache_register(void);
68extern void nfs_fscache_unregister(void);
69
70/*
71 * fscache.c
72 */
73extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75
76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *);
78extern void nfs_fscache_release_super_cookie(struct super_block *);
79
80extern void nfs_fscache_init_inode_cookie(struct inode *);
81extern void nfs_fscache_release_inode_cookie(struct inode *);
82extern void nfs_fscache_zap_inode_cookie(struct inode *);
83extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
84extern void nfs_fscache_reset_inode_cookie(struct inode *);
85
86extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
87extern int nfs_fscache_release_page(struct page *, gfp_t);
88
89extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
90 struct inode *, struct page *);
91extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
92 struct inode *, struct address_space *,
93 struct list_head *, unsigned *);
94extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
95
96/*
97 * wait for a page to complete writing to the cache
98 */
99static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
100 struct page *page)
101{
102 if (PageFsCache(page))
103 fscache_wait_on_page_write(nfsi->fscache, page);
104}
105
106/*
107 * release the caching state associated with a page if undergoing complete page
108 * invalidation
109 */
110static inline void nfs_fscache_invalidate_page(struct page *page,
111 struct inode *inode)
112{
113 if (PageFsCache(page))
114 __nfs_fscache_invalidate_page(page, inode);
115}
116
117/*
118 * Retrieve a page from an inode data storage object.
119 */
120static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
121 struct inode *inode,
122 struct page *page)
123{
124 if (NFS_I(inode)->fscache)
125 return __nfs_readpage_from_fscache(ctx, inode, page);
126 return -ENOBUFS;
127}
128
129/*
130 * Retrieve a set of pages from an inode data storage object.
131 */
132static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
133 struct inode *inode,
134 struct address_space *mapping,
135 struct list_head *pages,
136 unsigned *nr_pages)
137{
138 if (NFS_I(inode)->fscache)
139 return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
140 nr_pages);
141 return -ENOBUFS;
142}
143
144/*
145 * Store a page newly fetched from the server in an inode data storage object
146 * in the cache.
147 */
148static inline void nfs_readpage_to_fscache(struct inode *inode,
149 struct page *page,
150 int sync)
151{
152 if (PageFsCache(page))
153 __nfs_readpage_to_fscache(inode, page, sync);
154}
155
156/*
157 * indicate the client caching state as readable text
158 */
159static inline const char *nfs_server_fscache_state(struct nfs_server *server)
160{
161 if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
162 return "yes";
163 return "no ";
164}
165
166
167#else /* CONFIG_NFS_FSCACHE */
168static inline int nfs_fscache_register(void) { return 0; }
169static inline void nfs_fscache_unregister(void) {}
170
171static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
172static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173
174static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb,
176 struct nfs_parsed_mount_data *data)
177{
178}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
180
181static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
182static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
183static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
184static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
185 struct file *filp) {}
186static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
187
188static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
189{
190 return 1; /* True: may release page */
191}
192static inline void nfs_fscache_invalidate_page(struct page *page,
193 struct inode *inode) {}
194static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
195 struct page *page) {}
196
197static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
198 struct inode *inode,
199 struct page *page)
200{
201 return -ENOBUFS;
202}
203static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
204 struct inode *inode,
205 struct address_space *mapping,
206 struct list_head *pages,
207 unsigned *nr_pages)
208{
209 return -ENOBUFS;
210}
211static inline void nfs_readpage_to_fscache(struct inode *inode,
212 struct page *page, int sync) {}
213
214static inline const char *nfs_server_fscache_state(struct nfs_server *server)
215{
216 return "no ";
217}
218
219#endif /* CONFIG_NFS_FSCACHE */
220#endif /* _NFS_FSCACHE_H */
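
With CONFIG_NFS_FSCACHE=n every hook above collapses to an empty inline or a
constant return, so call sites need no #ifdefs of their own. For example, the
nfs_readpage_release() change in read.c below compiles identically in both
configurations; a sketch:

	/* an empty inline when FS-Cache is compiled out, so the compiler
	 * discards the call (and the PageUptodate() test feeding it) */
	if (PageUptodate(req->wb_page))
		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
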
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a834d1d850b7..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "delegation.h" 46#include "delegation.h"
47#include "iostat.h" 47#include "iostat.h"
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -121,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
121 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 BUG_ON(!list_empty(&NFS_I(inode)->open_files));
122 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
123 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
125 nfs_fscache_release_inode_cookie(inode);
124} 126}
125 127
126/** 128/**
@@ -355,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
355 nfsi->attrtimeo_timestamp = now; 357 nfsi->attrtimeo_timestamp = now;
356 nfsi->access_cache = RB_ROOT; 358 nfsi->access_cache = RB_ROOT;
357 359
360 nfs_fscache_init_inode_cookie(inode);
361
358 unlock_new_inode(inode); 362 unlock_new_inode(inode);
359 } else 363 } else
360 nfs_refresh_inode(inode, fattr); 364 nfs_refresh_inode(inode, fattr);
@@ -686,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 ctx->mode = filp->f_mode; 690 ctx->mode = filp->f_mode;
687 nfs_file_set_open_context(filp, ctx); 691 nfs_file_set_open_context(filp, ctx);
688 put_nfs_open_context(ctx); 692 put_nfs_open_context(ctx);
693 nfs_fscache_set_inode_cookie(inode, filp);
689 return 0; 694 return 0;
690} 695}
691 696
@@ -786,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
786 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 791 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
787 spin_unlock(&inode->i_lock); 792 spin_unlock(&inode->i_lock);
788 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 793 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
794 nfs_fscache_reset_inode_cookie(inode);
789 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 795 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
790 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 796 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
791 return 0; 797 return 0;
@@ -1030,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1030 spin_lock(&inode->i_lock); 1036 spin_lock(&inode->i_lock);
1031 status = nfs_refresh_inode_locked(inode, fattr); 1037 status = nfs_refresh_inode_locked(inode, fattr);
1032 spin_unlock(&inode->i_lock); 1038 spin_unlock(&inode->i_lock);
1039
1033 return status; 1040 return status;
1034} 1041}
1035 1042
@@ -1436,6 +1443,10 @@ static int __init init_nfs_fs(void)
1436{ 1443{
1437 int err; 1444 int err;
1438 1445
1446 err = nfs_fscache_register();
1447 if (err < 0)
1448 goto out7;
1449
1439 err = nfsiod_start(); 1450 err = nfsiod_start();
1440 if (err) 1451 if (err)
1441 goto out6; 1452 goto out6;
@@ -1488,6 +1499,8 @@ out4:
1488out5: 1499out5:
1489 nfsiod_stop(); 1500 nfsiod_stop();
1490out6: 1501out6:
1502 nfs_fscache_unregister();
1503out7:
1491 return err; 1504 return err;
1492} 1505}
1493 1506
@@ -1498,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
1498 nfs_destroy_readpagecache(); 1511 nfs_destroy_readpagecache();
1499 nfs_destroy_inodecache(); 1512 nfs_destroy_inodecache();
1500 nfs_destroy_nfspagecache(); 1513 nfs_destroy_nfspagecache();
1514 nfs_fscache_unregister();
1501#ifdef CONFIG_PROC_FS 1515#ifdef CONFIG_PROC_FS
1502 rpc_proc_unregister("nfs"); 1516 rpc_proc_unregister("nfs");
1503#endif 1517#endif
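
Note the unwind ordering in init_nfs_fs(): nfs_fscache_register() is now the
first initialisation step, so a new out7 label sits after out6, and the
registration is undone both on that error path and in exit_nfs_fs(), keeping
the goto chain strictly last-in, first-out.
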
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2041f68ff1cc..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6#include <linux/security.h> 6#include <linux/security.h>
7 7
8#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
9
8struct nfs_string; 10struct nfs_string;
9 11
10/* Maximum number of readahead requests 12/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
37 int acregmin, acregmax, 39 int acregmin, acregmax,
38 acdirmin, acdirmax; 40 acdirmin, acdirmax;
39 int namlen; 41 int namlen;
42 unsigned int options;
40 unsigned int bsize; 43 unsigned int bsize;
41 unsigned int auth_flavor_len; 44 unsigned int auth_flavor_len;
42 rpc_authflavor_t auth_flavors[1]; 45 rpc_authflavor_t auth_flavors[1];
43 char *client_address; 46 char *client_address;
47 char *fscache_uniq;
44 48
45 struct { 49 struct {
46 struct sockaddr_storage address; 50 struct sockaddr_storage address;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
16 16
17struct nfs_iostats { 17struct nfs_iostats {
18 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
19#ifdef CONFIG_NFS_FSCACHE
20 unsigned long long fscache[__NFSIOS_FSCACHEMAX];
21#endif
19 unsigned long events[__NFSIOS_COUNTSMAX]; 22 unsigned long events[__NFSIOS_COUNTSMAX];
20} ____cacheline_aligned; 23} ____cacheline_aligned;
21 24
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 60 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
58} 61}
59 62
63#ifdef CONFIG_NFS_FSCACHE
64static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat,
66 unsigned long addend)
67{
68 struct nfs_iostats *iostats;
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched();
75}
76#endif
77
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 78static inline struct nfs_iostats *nfs_alloc_iostats(void)
61{ 79{
62 return alloc_percpu(struct nfs_iostats); 80 return alloc_percpu(struct nfs_iostats);
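
nfs_add_fscache_stats() uses the standard lock-free per-CPU counter idiom:
pin the task to a CPU, bump that CPU's private slot, unpin. The same idiom in
generic form (a sketch; put_cpu() is the plain spelling of the
put_cpu_no_resched() variant used above):

	struct counters {
		unsigned long long n[8];
	};

	/* base comes from alloc_percpu(struct counters) */
	static void counters_add(struct counters *base, int idx,
				 unsigned long addend)
	{
		struct counters *c;
		int cpu;

		cpu = get_cpu();		/* disables preemption */
		c = per_cpu_ptr(base, cpu);	/* this CPU's private copy */
		c->n[idx] += addend;		/* no lock needed: CPU-local */
		put_cpu();			/* re-enables preemption */
	}

A reader then sums the slot across all possible CPUs, which is exactly what
the nfs_show_stats() loop in super.c below does.
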
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index b82fe6847f14..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
328 data->arg.create.verifier[1] = current->pid; 328 data->arg.create.verifier[1] = current->pid;
329 } 329 }
330 330
331 sattr->ia_mode &= ~current->fs->umask; 331 sattr->ia_mode &= ~current_umask();
332 332
333 for (;;) { 333 for (;;) {
334 status = nfs3_do_create(dir, dentry, data); 334 status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
528 528
529 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 529 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
530 530
531 sattr->ia_mode &= ~current->fs->umask; 531 sattr->ia_mode &= ~current_umask();
532 532
533 data = nfs3_alloc_createdata(); 533 data = nfs3_alloc_createdata();
534 if (data == NULL) 534 if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
640 MAJOR(rdev), MINOR(rdev)); 640 MAJOR(rdev), MINOR(rdev));
641 641
642 sattr->ia_mode &= ~current->fs->umask; 642 sattr->ia_mode &= ~current_umask();
643 643
644 data = nfs3_alloc_createdata(); 644 data = nfs3_alloc_createdata();
645 if (data == NULL) 645 if (data == NULL)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 97bacccff579..a4d242680299 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1501,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1501 attr.ia_mode = nd->intent.open.create_mode; 1501 attr.ia_mode = nd->intent.open.create_mode;
1502 attr.ia_valid = ATTR_MODE; 1502 attr.ia_valid = ATTR_MODE;
1503 if (!IS_POSIXACL(dir)) 1503 if (!IS_POSIXACL(dir))
1504 attr.ia_mode &= ~current->fs->umask; 1504 attr.ia_mode &= ~current_umask();
1505 } else { 1505 } else {
1506 attr.ia_valid = 0; 1506 attr.ia_valid = 0;
1507 BUG_ON(nd->intent.open.flags & O_CREAT); 1507 BUG_ON(nd->intent.open.flags & O_CREAT);
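
The hunks above (and the ocfs2/acl.c one below) replace open-coded reads of
current->fs->umask with current_umask(). Presumably the new helper, added by
the fs_struct rework elsewhere in this series, is just a trivial accessor
along these lines (an assumption based on its use here, not text from this
patch):

	static inline int current_umask(void)
	{
		return current->fs->umask;
	}

Funnelling every reader through one helper lets the fs_struct locking changes
adjust the representation without touching each filesystem again.
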
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
24 24
25#include "internal.h" 25#include "internal.h"
26#include "iostat.h" 26#include "iostat.h"
27#include "fscache.h"
27 28
28#define NFSDBG_FACILITY NFSDBG_PAGECACHE 29#define NFSDBG_FACILITY NFSDBG_PAGECACHE
29 30
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
111 } 112 }
112} 113}
113 114
114static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 115int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 116 struct page *page)
116{ 117{
117 LIST_HEAD(one_request); 118 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
139 140
140static void nfs_readpage_release(struct nfs_page *req) 141static void nfs_readpage_release(struct nfs_page *req)
141{ 142{
143 struct inode *d_inode = req->wb_context->path.dentry->d_inode;
144
145 if (PageUptodate(req->wb_page))
146 nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
147
142 unlock_page(req->wb_page); 148 unlock_page(req->wb_page);
143 149
144 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 150 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
510 } else 516 } else
511 ctx = get_nfs_open_context(nfs_file_open_context(file)); 517 ctx = get_nfs_open_context(nfs_file_open_context(file));
512 518
519 if (!IS_SYNC(inode)) {
520 error = nfs_readpage_from_fscache(ctx, inode, page);
521 if (error == 0)
522 goto out;
523 }
524
513 error = nfs_readpage_async(ctx, inode, page); 525 error = nfs_readpage_async(ctx, inode, page);
514 526
527out:
515 put_nfs_open_context(ctx); 528 put_nfs_open_context(ctx);
516 return error; 529 return error;
517out_unlock: 530out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
584 return -EBADF; 597 return -EBADF;
585 } else 598 } else
586 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); 599 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
600
601 /* attempt to read as many of the pages as possible from the cache
602 * - this returns -ENOBUFS immediately if the cookie is negative
603 */
604 ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
605 pages, &nr_pages);
606 if (ret == 0)
607 goto read_complete; /* all pages were read */
608
587 if (rsize < PAGE_CACHE_SIZE) 609 if (rsize < PAGE_CACHE_SIZE)
588 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 610 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
589 else 611 else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
594 nfs_pageio_complete(&pgio); 616 nfs_pageio_complete(&pgio);
595 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 617 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
596 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 618 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
619read_complete:
597 put_nfs_open_context(desc.ctx); 620 put_nfs_open_context(desc.ctx);
598out: 621out:
599 return ret; 622 return ret;
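
fscache_read_or_alloc_pages() removes from *pages any pages it manages to
submit and decrements *nr_pages to match. That is why a zero return
(everything submitted) lets nfs_readpages() jump straight to read_complete,
while a partial result simply falls through: whatever is left on the list is
handed to the normal pageio path, and the BUG_ONs in fscache.c assert the
all-consumed invariant for the zero-return case.
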
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0942fcbbad3c..82eaadbff408 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
60#include "delegation.h" 60#include "delegation.h"
61#include "iostat.h" 61#include "iostat.h"
62#include "internal.h" 62#include "internal.h"
63#include "fscache.h"
63 64
64#define NFSDBG_FACILITY NFSDBG_VFS 65#define NFSDBG_FACILITY NFSDBG_VFS
65 66
@@ -76,6 +77,7 @@ enum {
76 Opt_rdirplus, Opt_nordirplus, 77 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 78 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport, 79 Opt_resvport, Opt_noresvport,
80 Opt_fscache, Opt_nofscache,
79 81
80 /* Mount options that take integer arguments */ 82 /* Mount options that take integer arguments */
81 Opt_port, 83 Opt_port,
@@ -93,6 +95,7 @@ enum {
93 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 95 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
94 Opt_addr, Opt_mountaddr, Opt_clientaddr, 96 Opt_addr, Opt_mountaddr, Opt_clientaddr,
95 Opt_lookupcache, 97 Opt_lookupcache,
98 Opt_fscache_uniq,
96 99
97 /* Special mount options */ 100 /* Special mount options */
98 Opt_userspace, Opt_deprecated, Opt_sloppy, 101 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
132 { Opt_nosharecache, "nosharecache" }, 135 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" }, 136 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" }, 137 { Opt_noresvport, "noresvport" },
138 { Opt_fscache, "fsc" },
139 { Opt_fscache_uniq, "fsc=%s" },
140 { Opt_nofscache, "nofsc" },
135 141
136 { Opt_port, "port=%u" }, 142 { Opt_port, "port=%u" },
137 { Opt_rsize, "rsize=%u" }, 143 { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
563 if (clp->rpc_ops->version == 4) 569 if (clp->rpc_ops->version == 4)
564 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); 570 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
565#endif 571#endif
572 if (nfss->options & NFS_OPTION_FSCACHE)
573 seq_printf(m, ",fsc");
566} 574}
567 575
568/* 576/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
641 totals.events[i] += stats->events[i]; 649 totals.events[i] += stats->events[i];
642 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 650 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
643 totals.bytes[i] += stats->bytes[i]; 651 totals.bytes[i] += stats->bytes[i];
652#ifdef CONFIG_NFS_FSCACHE
653 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
654 totals.fscache[i] += stats->fscache[i];
655#endif
644 656
645 preempt_enable(); 657 preempt_enable();
646 } 658 }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
651 seq_printf(m, "\n\tbytes:\t"); 663 seq_printf(m, "\n\tbytes:\t");
652 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 664 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
653 seq_printf(m, "%Lu ", totals.bytes[i]); 665 seq_printf(m, "%Lu ", totals.bytes[i]);
666#ifdef CONFIG_NFS_FSCACHE
667 if (nfss->options & NFS_OPTION_FSCACHE) {
668 seq_printf(m, "\n\tfsc:\t");
669 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
670 seq_printf(m, "%Lu ", totals.fscache[i]);
671 }
672#endif
654 seq_printf(m, "\n"); 673 seq_printf(m, "\n");
655 674
656 rpc_print_iostats(m, nfss->client); 675 rpc_print_iostats(m, nfss->client);
@@ -1044,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
1044 case Opt_noresvport: 1063 case Opt_noresvport:
1045 mnt->flags |= NFS_MOUNT_NORESVPORT; 1064 mnt->flags |= NFS_MOUNT_NORESVPORT;
1046 break; 1065 break;
1066 case Opt_fscache:
1067 mnt->options |= NFS_OPTION_FSCACHE;
1068 kfree(mnt->fscache_uniq);
1069 mnt->fscache_uniq = NULL;
1070 break;
1071 case Opt_nofscache:
1072 mnt->options &= ~NFS_OPTION_FSCACHE;
1073 kfree(mnt->fscache_uniq);
1074 mnt->fscache_uniq = NULL;
1075 break;
1076 case Opt_fscache_uniq:
1077 string = match_strdup(args);
1078 if (!string)
1079 goto out_nomem;
1080 kfree(mnt->fscache_uniq);
1081 mnt->fscache_uniq = string;
1082 mnt->options |= NFS_OPTION_FSCACHE;
1083 break;
1047 1084
1048 /* 1085 /*
1049 * options that take numeric values 1086 * options that take numeric values
@@ -1870,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
1870 nfs_initialise_sb(sb); 1907 nfs_initialise_sb(sb);
1871} 1908}
1872 1909
1873#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
1874
1875static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) 1910static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
1876{ 1911{
1877 const struct nfs_server *a = s->s_fs_info; 1912 const struct nfs_server *a = s->s_fs_info;
@@ -2036,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2036 if (!s->s_root) { 2071 if (!s->s_root) {
2037 /* initial superblock/root creation */ 2072 /* initial superblock/root creation */
2038 nfs_fill_super(s, data); 2073 nfs_fill_super(s, data);
2074 nfs_fscache_get_super_cookie(s, data);
2039 } 2075 }
2040 2076
2041 mntroot = nfs_get_root(s, mntfh); 2077 mntroot = nfs_get_root(s, mntfh);
@@ -2056,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2056out: 2092out:
2057 kfree(data->nfs_server.hostname); 2093 kfree(data->nfs_server.hostname);
2058 kfree(data->mount_server.hostname); 2094 kfree(data->mount_server.hostname);
2095 kfree(data->fscache_uniq);
2059 security_free_mnt_opts(&data->lsm_opts); 2096 security_free_mnt_opts(&data->lsm_opts);
2060out_free_fh: 2097out_free_fh:
2061 kfree(mntfh); 2098 kfree(mntfh);
@@ -2083,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
2083 2120
2084 bdi_unregister(&server->backing_dev_info); 2121 bdi_unregister(&server->backing_dev_info);
2085 kill_anon_super(s); 2122 kill_anon_super(s);
2123 nfs_fscache_release_super_cookie(s);
2086 nfs_free_server(server); 2124 nfs_free_server(server);
2087} 2125}
2088 2126
@@ -2390,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2390 if (!s->s_root) { 2428 if (!s->s_root) {
2391 /* initial superblock/root creation */ 2429 /* initial superblock/root creation */
2392 nfs4_fill_super(s); 2430 nfs4_fill_super(s);
2431 nfs_fscache_get_super_cookie(s, data);
2393 } 2432 }
2394 2433
2395 mntroot = nfs4_get_root(s, mntfh); 2434 mntroot = nfs4_get_root(s, mntfh);
@@ -2411,6 +2450,7 @@ out:
2411 kfree(data->client_address); 2450 kfree(data->client_address);
2412 kfree(data->nfs_server.export_path); 2451 kfree(data->nfs_server.export_path);
2413 kfree(data->nfs_server.hostname); 2452 kfree(data->nfs_server.hostname);
2453 kfree(data->fscache_uniq);
2414 security_free_mnt_opts(&data->lsm_opts); 2454 security_free_mnt_opts(&data->lsm_opts);
2415out_free_fh: 2455out_free_fh:
2416 kfree(mntfh); 2456 kfree(mntfh);
@@ -2437,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
2437 kill_anon_super(sb); 2477 kill_anon_super(sb);
2438 2478
2439 nfs4_renewd_prepare_shutdown(server); 2479 nfs4_renewd_prepare_shutdown(server);
2480 nfs_fscache_release_super_cookie(sb);
2440 nfs_free_server(server); 2481 nfs_free_server(server);
2441} 2482}
2442 2483
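
In use, the new options are driven from the mount command line:
"mount -t nfs -o fsc server:/export /mnt" turns caching on, "-o fsc=myid"
additionally supplies a uniquifier so that two otherwise-identical mounts get
distinct superblock cache keys (see nfs_fscache_get_super_cookie() in
fscache.c above), and "-o nofsc" turns caching back off. An active cache then
shows up as ",fsc" in /proc/mounts via nfs_show_mount_options().
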
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index bc3567bab8c4..7c09852be713 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -403,7 +403,6 @@ static int
403nfsd(void *vrqstp) 403nfsd(void *vrqstp)
404{ 404{
405 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 405 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
406 struct fs_struct *fsp;
407 int err, preverr = 0; 406 int err, preverr = 0;
408 407
409 /* Lock module and set up kernel thread */ 408 /* Lock module and set up kernel thread */
@@ -412,13 +411,11 @@ nfsd(void *vrqstp)
412 /* At this point, the thread shares current->fs 411 /* At this point, the thread shares current->fs
413 * with the init process. We need to create files with a 412 * with the init process. We need to create files with a
414 * umask of 0 instead of init's umask. */ 413 * umask of 0 instead of init's umask. */
415 fsp = copy_fs_struct(current->fs); 414 if (unshare_fs_struct() < 0) {
416 if (!fsp) {
417 printk("Unable to start nfsd thread: out of memory\n"); 415 printk("Unable to start nfsd thread: out of memory\n");
418 goto out; 416 goto out;
419 } 417 }
420 exit_fs(current); 418
421 current->fs = fsp;
422 current->fs->umask = 0; 419 current->fs->umask = 0;
423 420
424 /* 421 /*
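
unshare_fs_struct() comes from the fs_struct rework in this series (see
fs/fs_struct.c in the diffstat). Conceptually it folds the old
copy_fs_struct()/exit_fs() dance into one refcount-aware helper; a simplified
sketch of the idea, with the locking and reference dropping of the real helper
deliberately elided:

	int unshare_fs_struct(void)		/* sketch only */
	{
		struct fs_struct *new_fs = copy_fs_struct(current->fs);

		if (!new_fs)
			return -ENOMEM;

		task_lock(current);
		current->fs = new_fs;	/* the real helper also releases the
					 * old fs_struct's reference under its
					 * lock, freeing it on the last put */
		task_unlock(current);
		return 0;
	}
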
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
296 return PTR_ERR(acl); 296 return PTR_ERR(acl);
297 } 297 }
298 if (!acl) 298 if (!acl)
299 inode->i_mode &= ~current->fs->umask; 299 inode->i_mode &= ~current_umask();
300 } 300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 302 struct posix_acl *clone;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295}; 295};
296 296
297static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 u64 blkno)
299{
300 struct ocfs2_dx_root_block *dx_root = et->et_object;
301
302 dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303}
304
305static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306{
307 struct ocfs2_dx_root_block *dx_root = et->et_object;
308
309 return le64_to_cpu(dx_root->dr_last_eb_blk);
310}
311
312static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 struct ocfs2_extent_tree *et,
314 u32 clusters)
315{
316 struct ocfs2_dx_root_block *dx_root = et->et_object;
317
318 le32_add_cpu(&dx_root->dr_clusters, clusters);
319}
320
321static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 struct ocfs2_extent_tree *et)
323{
324 struct ocfs2_dx_root_block *dx_root = et->et_object;
325
326 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327
328 return 0;
329}
330
331static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332{
333 struct ocfs2_dx_root_block *dx_root = et->et_object;
334
335 et->et_root_el = &dx_root->dr_list;
336}
337
338static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
340 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
341 .eo_update_clusters = ocfs2_dx_root_update_clusters,
342 .eo_sanity_check = ocfs2_dx_root_sanity_check,
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344};
345
297static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298 struct inode *inode, 347 struct inode *inode,
299 struct buffer_head *bh, 348 struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 &ocfs2_xattr_value_et_ops); 388 &ocfs2_xattr_value_et_ops);
340} 389}
341 390
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode,
393 struct buffer_head *bh)
394{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops);
397}
398
342static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
343 u64 new_last_eb_blk) 400 u64 new_last_eb_blk)
344{ 401{
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 76 struct inode *inode,
77 struct ocfs2_xattr_value_buf *vb); 77 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode,
80 struct buffer_head *bh);
78 81
79/* 82/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be 83 * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1956} 1956}
1957 1957
1958const struct address_space_operations ocfs2_aops = { 1958const struct address_space_operations ocfs2_aops = {
1959 .readpage = ocfs2_readpage, 1959 .readpage = ocfs2_readpage,
1960 .readpages = ocfs2_readpages, 1960 .readpages = ocfs2_readpages,
1961 .writepage = ocfs2_writepage, 1961 .writepage = ocfs2_writepage,
1962 .write_begin = ocfs2_write_begin, 1962 .write_begin = ocfs2_write_begin,
1963 .write_end = ocfs2_write_end, 1963 .write_end = ocfs2_write_end,
1964 .bmap = ocfs2_bmap, 1964 .bmap = ocfs2_bmap,
1965 .sync_page = block_sync_page, 1965 .sync_page = block_sync_page,
1966 .direct_IO = ocfs2_direct_IO, 1966 .direct_IO = ocfs2_direct_IO,
1967 .invalidatepage = ocfs2_invalidatepage, 1967 .invalidatepage = ocfs2_invalidatepage,
1968 .releasepage = ocfs2_releasepage, 1968 .releasepage = ocfs2_releasepage,
1969 .migratepage = buffer_migrate_page, 1969 .migratepage = buffer_migrate_page,
1970 .is_partially_uptodate = block_is_partially_uptodate,
1970}; 1971};
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
33#include <linux/random.h> 33#include <linux/random.h>
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h>
36 37
37#include "heartbeat.h" 38#include "heartbeat.h"
38#include "tcp.h" 39#include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 61static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 62static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62 63
64#define O2HB_DEBUG_DIR "o2hb"
65#define O2HB_DEBUG_LIVENODES "livenodes"
66static struct dentry *o2hb_debug_dir;
67static struct dentry *o2hb_debug_livenodes;
68
63static LIST_HEAD(o2hb_all_regions); 69static LIST_HEAD(o2hb_all_regions);
64 70
65static struct o2hb_callback { 71static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
905 return 0; 911 return 0;
906} 912}
907 913
908void o2hb_init(void) 914#ifdef CONFIG_DEBUG_FS
915static int o2hb_debug_open(struct inode *inode, struct file *file)
916{
917 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
918 char *buf = NULL;
919 int i = -1;
920 int out = 0;
921
922 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
923 if (!buf)
924 goto bail;
925
926 o2hb_fill_node_map(map, sizeof(map));
927
928 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
929 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
930 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
931
932 i_size_write(inode, out);
933
934 file->private_data = buf;
935
936 return 0;
937bail:
938 return -ENOMEM;
939}
940
941static int o2hb_debug_release(struct inode *inode, struct file *file)
942{
943 kfree(file->private_data);
944 return 0;
945}
946
947static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
948 size_t nbytes, loff_t *ppos)
949{
950 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
951 i_size_read(file->f_mapping->host));
952}
953#else
954static int o2hb_debug_open(struct inode *inode, struct file *file)
955{
956 return 0;
957}
958static int o2hb_debug_release(struct inode *inode, struct file *file)
959{
960 return 0;
961}
962static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
963 size_t nbytes, loff_t *ppos)
964{
965 return 0;
966}
967#endif /* CONFIG_DEBUG_FS */
968
969static struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read,
973 .llseek = generic_file_llseek,
974};
975
976void o2hb_exit(void)
977{
978 if (o2hb_debug_livenodes)
979 debugfs_remove(o2hb_debug_livenodes);
980 if (o2hb_debug_dir)
981 debugfs_remove(o2hb_debug_dir);
982}
983
984int o2hb_init(void)
909{ 985{
910 int i; 986 int i;
911 987
@@ -918,6 +994,24 @@ void o2hb_init(void)
918 INIT_LIST_HEAD(&o2hb_node_events); 994 INIT_LIST_HEAD(&o2hb_node_events);
919 995
920 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 996 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
997
998 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
999 if (!o2hb_debug_dir) {
1000 mlog_errno(-ENOMEM);
1001 return -ENOMEM;
1002 }
1003
1004 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1005 S_IFREG|S_IRUSR,
1006 o2hb_debug_dir, NULL,
1007 &o2hb_debug_fops);
1008 if (!o2hb_debug_livenodes) {
1009 mlog_errno(-ENOMEM);
1010 debugfs_remove(o2hb_debug_dir);
1011 return -ENOMEM;
1012 }
1013
1014 return 0;
921} 1015}
922 1016
923/* if we're already in a callback then we're already serialized by the sem */ 1017/* if we're already in a callback then we're already serialized by the sem */
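
For reference, the debugfs additions above follow a common pattern: render the whole file into a buffer at open() time, serve it with simple_read_from_buffer(), and free it on release. A stripped-down sketch of that pattern with placeholder names (not part of the o2hb code):

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/slab.h>

static int example_debug_open(struct inode *inode, struct file *file)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* Render once; record the length as the inode size. */
	i_size_write(inode, snprintf(buf, PAGE_SIZE, "hello\n"));
	file->private_data = buf;
	return 0;
}

static ssize_t example_debug_read(struct file *file, char __user *ubuf,
				  size_t nbytes, loff_t *ppos)
{
	return simple_read_from_buffer(ubuf, nbytes, ppos,
				       file->private_data,
				       i_size_read(file->f_mapping->host));
}

static int example_debug_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}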
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc); 75 struct o2hb_callback_func *hc);
76void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
77 unsigned bytes); 77 unsigned bytes);
78void o2hb_init(void); 78void o2hb_exit(void);
79int o2hb_init(void);
79int o2hb_check_node_heartbeating(u8 node_num); 80int o2hb_check_node_heartbeating(u8 node_num);
80int o2hb_check_node_heartbeating_from_callback(u8 node_num); 81int o2hb_check_node_heartbeating_from_callback(u8 node_num);
81int o2hb_check_local_node_heartbeating(void); 82int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
881 o2cb_sys_shutdown(); 881 o2cb_sys_shutdown();
882 882
883 o2net_exit(); 883 o2net_exit();
884 o2hb_exit();
884} 885}
885 886
886static int __init init_o2nm(void) 887static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
889 890
890 cluster_print_version(); 891 cluster_print_version();
891 892
892 o2hb_init(); 893 ret = o2hb_init();
894 if (ret)
895 goto out;
893 896
894 ret = o2net_init(); 897 ret = o2net_init();
895 if (ret) 898 if (ret)
896 goto out; 899 goto out_o2hb;
897 900
898 ret = o2net_register_hb_callbacks(); 901 ret = o2net_register_hb_callbacks();
899 if (ret) 902 if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
916 o2net_unregister_hb_callbacks(); 919 o2net_unregister_hb_callbacks();
917out_o2net: 920out_o2net:
918 o2net_exit(); 921 o2net_exit();
922out_o2hb:
923 o2hb_exit();
919out: 924out:
920 return ret; 925 return ret;
921} 926}
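
The nodemanager changes convert o2hb_init() into a failable call and pair it with o2hb_exit() on both the module-exit path and the init error ladder. The pattern, reduced to a generic sketch with placeholder subsystem names:

static int __init init_example(void)
{
	int ret;

	ret = subsys_a_init();
	if (ret)
		goto out;
	ret = subsys_b_init();
	if (ret)
		goto out_a;
	return 0;

	/* Unwind in reverse order of initialization. */
out_a:
	subsys_a_exit();
out:
	return ret;
}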
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
83 struct ocfs2_alloc_context *data_ac, 80 struct ocfs2_alloc_context *data_ac,
84 struct ocfs2_alloc_context *meta_ac, 81 struct ocfs2_alloc_context *meta_ac,
85 struct buffer_head **new_bh); 82 struct buffer_head **new_bh);
83static int ocfs2_dir_indexed(struct inode *inode);
86 84
87/* 85/*
88 * These are distinct checks because future versions of the file system will 86 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing. 87 * want to have a trailing dirent structure independent of indexing.
90 */ 88 */
91static int ocfs2_dir_has_trailer(struct inode *dir) 89static int ocfs2_supports_dir_trailer(struct inode *dir)
92{ 90{
91 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
92
93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
94 return 0; 94 return 0;
95 95
96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); 96 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
97} 97}
98 98
99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) 99/*
100 * "new' here refers to the point at which we're creating a new
101 * directory via "mkdir()", but also when we're expanding an inline
102 * directory. In either case, we don't yet have the indexing bit set
103 * on the directory, so the standard checks will fail in when metaecc
104 * is turned off. Only directory-initialization type functions should
105 * use this then. Everything else wants ocfs2_supports_dir_trailer()
106 */
107static int ocfs2_new_dir_wants_trailer(struct inode *dir)
100{ 108{
101 return ocfs2_meta_ecc(osb); 109 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
110
111 return ocfs2_meta_ecc(osb) ||
112 ocfs2_supports_indexed_dirs(osb);
102} 113}
103 114
104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) 115static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
130{ 141{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 142 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
132 143
133 if (!ocfs2_dir_has_trailer(dir)) 144 if (!ocfs2_supports_dir_trailer(dir))
134 return 0; 145 return 0;
135 146
136 if (offset != toff) 147 if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
140} 151}
141 152
142static void ocfs2_init_dir_trailer(struct inode *inode, 153static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh) 154 struct buffer_head *bh, u16 rec_len)
144{ 155{
145 struct ocfs2_dir_block_trailer *trailer; 156 struct ocfs2_dir_block_trailer *trailer;
146 157
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 161 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 162 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 163 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
164 trailer->db_free_rec_len = cpu_to_le16(rec_len);
165}
166/*
167 * Link an unindexed block with a dir trailer structure into the index free
168 * list. This function will modify dirdata_bh, but assumes you've already
169 * passed it to the journal.
170 */
171static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
172 struct buffer_head *dx_root_bh,
173 struct buffer_head *dirdata_bh)
174{
175 int ret;
176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer;
178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) {
182 mlog_errno(ret);
183 goto out;
184 }
185 trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
186 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
187
188 trailer->db_free_next = dx_root->dr_free_blk;
189 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
190
191 ocfs2_journal_dirty(handle, dx_root_bh);
192
193out:
194 return ret;
195}
196
197static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
198{
199 return res->dl_prev_leaf_bh == NULL;
200}
201
202void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
203{
204 brelse(res->dl_dx_root_bh);
205 brelse(res->dl_leaf_bh);
206 brelse(res->dl_dx_leaf_bh);
207 brelse(res->dl_prev_leaf_bh);
208}
209
210static int ocfs2_dir_indexed(struct inode *inode)
211{
212 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
213 return 1;
214 return 0;
215}
216
217static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
218{
219 return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
220}
221
222/*
223 * Hashing code adapted from ext3
224 */
225#define DELTA 0x9E3779B9
226
227static void TEA_transform(__u32 buf[4], __u32 const in[])
228{
229 __u32 sum = 0;
230 __u32 b0 = buf[0], b1 = buf[1];
231 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
232 int n = 16;
233
234 do {
235 sum += DELTA;
236 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
237 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
238 } while (--n);
239
240 buf[0] += b0;
241 buf[1] += b1;
242}
243
244static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
245{
246 __u32 pad, val;
247 int i;
248
249 pad = (__u32)len | ((__u32)len << 8);
250 pad |= pad << 16;
251
252 val = pad;
253 if (len > num*4)
254 len = num * 4;
255 for (i = 0; i < len; i++) {
256 if ((i % 4) == 0)
257 val = pad;
258 val = msg[i] + (val << 8);
259 if ((i % 4) == 3) {
260 *buf++ = val;
261 val = pad;
262 num--;
263 }
264 }
265 if (--num >= 0)
266 *buf++ = val;
267 while (--num >= 0)
268 *buf++ = pad;
269}
270
271static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
272 struct ocfs2_dx_hinfo *hinfo)
273{
274 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
275 const char *p;
276 __u32 in[8], buf[4];
277
278 /*
279 * XXX: Is this really necessary, if the index is never looked
280 * at by readdir? Is a hash value of '0' a bad idea?
281 */
282 if ((len == 1 && !strncmp(".", name, 1)) ||
283 (len == 2 && !strncmp("..", name, 2))) {
284 buf[0] = buf[1] = 0;
285 goto out;
286 }
287
288#ifdef OCFS2_DEBUG_DX_DIRS
289 /*
290 * This makes it very easy to debug indexing problems. We
291 * should never allow this to be selected without hand editing
292 * this file though.
293 */
294 buf[0] = buf[1] = len;
295 goto out;
296#endif
297
298 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
299
300 p = name;
301 while (len > 0) {
302 str2hashbuf(p, len, in, 4);
303 TEA_transform(buf, in);
304 len -= 16;
305 p += 16;
306 }
307
308out:
309 hinfo->major_hash = buf[0];
310 hinfo->minor_hash = buf[1];
153} 311}
154 312
155/* 313/*
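
The hashing code above can be exercised outside the kernel. A self-contained userspace rendition follows, assuming a zero seed; the kernel seeds buf[] from osb->osb_dx_seed, so output here will not match hashes on a real volume:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DELTA 0x9E3779B9

static void tea_transform(uint32_t buf[4], const uint32_t in[4])
{
	uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
	int n = 16;

	do {
		sum += DELTA;
		b0 += ((b1 << 4) + in[0]) ^ (b1 + sum) ^ ((b1 >> 5) + in[1]);
		b1 += ((b0 << 4) + in[2]) ^ (b0 + sum) ^ ((b0 >> 5) + in[3]);
	} while (--n);

	buf[0] += b0;
	buf[1] += b1;
}

static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
{
	uint32_t pad, val;
	int i;

	pad = (uint32_t)len | ((uint32_t)len << 8);
	pad |= pad << 16;

	val = pad;
	if (len > num * 4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		val = msg[i] + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
			val = pad;
			num--;
		}
	}
	if (--num >= 0)
		*buf++ = val;
	while (--num >= 0)
		*buf++ = pad;
}

int main(int argc, char **argv)
{
	const char *name = argc > 1 ? argv[1] : "example";
	int len = (int)strlen(name);
	uint32_t in[8], buf[4] = { 0, 0, 0, 0 };	/* placeholder seed */

	while (len > 0) {
		str2hashbuf(name, len, in, 4);
		tea_transform(buf, in);
		len -= 16;
		name += 16;
	}
	printf("major: %u minor: 0x%x\n", buf[0], buf[1]);
	return 0;
}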
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
312} 470}
313 471
314/* 472/*
473 * Validate a directory trailer.
474 *
475 * We check the trailer here rather than in ocfs2_validate_dir_block()
476 * because that function doesn't have the inode to test.
477 */
478static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
479{
480 int rc = 0;
481 struct ocfs2_dir_block_trailer *trailer;
482
483 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
484 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
485 rc = -EINVAL;
486 ocfs2_error(dir->i_sb,
487 "Invalid dirblock #%llu: "
488 "signature = %.*s\n",
489 (unsigned long long)bh->b_blocknr, 7,
490 trailer->db_signature);
491 goto out;
492 }
493 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
494 rc = -EINVAL;
495 ocfs2_error(dir->i_sb,
496 "Directory block #%llu has an invalid "
497 "db_blkno of %llu",
498 (unsigned long long)bh->b_blocknr,
499 (unsigned long long)le64_to_cpu(trailer->db_blkno));
500 goto out;
501 }
502 if (le64_to_cpu(trailer->db_parent_dinode) !=
503 OCFS2_I(dir)->ip_blkno) {
504 rc = -EINVAL;
505 ocfs2_error(dir->i_sb,
506 "Directory block #%llu on dinode "
507 "#%llu has an invalid parent_dinode "
508 "of %llu",
509 (unsigned long long)bh->b_blocknr,
510 (unsigned long long)OCFS2_I(dir)->ip_blkno,
511 			(unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
512 goto out;
513 }
514out:
515 return rc;
516}
517
518/*
315 * This function forces all errors to -EIO for consistency with its 519 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the 520 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with 521 * real error codes would do to callers. We log the real codes with
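
All of these checks locate the trailer through ocfs2_trailer_from_bh(). The trailer occupies the final bytes of each directory block; a sketch of the helper pair consistent with how they are used here (the actual definitions live elsewhere in dir.c):

static inline unsigned int trailer_blk_off_sketch(struct super_block *sb)
{
	/* The trailer is the last sizeof(trailer) bytes of the block. */
	return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
}

static inline struct ocfs2_dir_block_trailer *
trailer_from_bh_sketch(struct buffer_head *bh, struct super_block *sb)
{
	return (struct ocfs2_dir_block_trailer *)
		(bh->b_data + trailer_blk_off_sketch(sb));
}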
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
322{ 526{
323 int rc = 0; 527 int rc = 0;
324 struct buffer_head *tmp = *bh; 528 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326 529
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 530 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block); 531 ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
331 goto out; 534 goto out;
332 } 535 }
333 536
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) && 537 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) { 538 ocfs2_supports_dir_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 539 rc = ocfs2_check_dir_trailer(inode, tmp);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 540 if (rc) {
343 rc = -EINVAL; 541 if (!*bh)
344 ocfs2_error(inode->i_sb, 542 brelse(tmp);
345 "Invalid dirblock #%llu: " 543 mlog_errno(rc);
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
369 (unsigned long long)le64_to_cpu(trailer->db_blkno));
370 goto out; 544 goto out;
371 } 545 }
372 } 546 }
@@ -379,6 +553,141 @@ out:
379 return rc ? -EIO : 0; 553 return rc ? -EIO : 0;
380} 554}
381 555
556/*
557 * Read the block at 'phys' which belongs to this directory
558 * inode. This function does no virtual->physical block translation -
559 * what's passed in is assumed to be a valid directory block.
560 */
561static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
562 struct buffer_head **bh)
563{
564 int ret;
565 struct buffer_head *tmp = *bh;
566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
568 if (ret) {
569 mlog_errno(ret);
570 goto out;
571 }
572
573 if (ocfs2_supports_dir_trailer(dir)) {
574 ret = ocfs2_check_dir_trailer(dir, tmp);
575 if (ret) {
576 if (!*bh)
577 brelse(tmp);
578 mlog_errno(ret);
579 goto out;
580 }
581 }
582
583 if (!ret && !*bh)
584 *bh = tmp;
585out:
586 return ret;
587}
588
589static int ocfs2_validate_dx_root(struct super_block *sb,
590 struct buffer_head *bh)
591{
592 int ret;
593 struct ocfs2_dx_root_block *dx_root;
594
595 BUG_ON(!buffer_uptodate(bh));
596
597 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
598
599 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
600 if (ret) {
601 mlog(ML_ERROR,
602 "Checksum failed for dir index root block %llu\n",
603 (unsigned long long)bh->b_blocknr);
604 return ret;
605 }
606
607 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
608 ocfs2_error(sb,
609 "Dir Index Root # %llu has bad signature %.*s",
610 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
611 7, dx_root->dr_signature);
612 return -EINVAL;
613 }
614
615 return 0;
616}
617
618static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
619 struct buffer_head **dx_root_bh)
620{
621 int ret;
622 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh;
624
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
626
627 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh)
629 *dx_root_bh = tmp;
630
631 return ret;
632}
633
634static int ocfs2_validate_dx_leaf(struct super_block *sb,
635 struct buffer_head *bh)
636{
637 int ret;
638 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
639
640 BUG_ON(!buffer_uptodate(bh));
641
642 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
643 if (ret) {
644 mlog(ML_ERROR,
645 "Checksum failed for dir index leaf block %llu\n",
646 (unsigned long long)bh->b_blocknr);
647 return ret;
648 }
649
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
652 7, dx_leaf->dl_signature);
653 return -EROFS;
654 }
655
656 return 0;
657}
658
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
660 struct buffer_head **dx_leaf_bh)
661{
662 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh;
664
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
666
667 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh)
669 *dx_leaf_bh = tmp;
670
671 return ret;
672}
673
674/*
675 * Read a series of dx_leaf blocks. This expects all buffer_head
676 * pointers to be NULL on function entry.
677 */
678static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
679 struct buffer_head **dx_leaf_bhs)
680{
681 int ret;
682
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf);
685 if (ret)
686 mlog_errno(ret);
687
688 return ret;
689}
690
382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 691static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
383 struct inode *dir, 692 struct inode *dir,
384 struct ocfs2_dir_entry **res_dir) 693 struct ocfs2_dir_entry **res_dir)
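
The dx_root and dx_leaf readers above share one shape: accept a caller-supplied bh (possibly NULL), read with a validation callback, and pass a freshly allocated bh back up only on success. A generic sketch of that shape, with a placeholder name:

static int read_validated_block_sketch(struct inode *inode, u64 blkno,
				       struct buffer_head **bh,
				       int (*validate)(struct super_block *,
						       struct buffer_head *))
{
	struct buffer_head *tmp = *bh;	/* NULL means "allocate for me" */
	int ret = ocfs2_read_block(inode, blkno, &tmp, validate);

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!ret && !*bh)
		*bh = tmp;
	return ret;
}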
@@ -480,39 +789,340 @@ cleanup_and_exit:
480 return ret; 789 return ret;
481} 790}
482 791
792static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
793 struct ocfs2_extent_list *el,
794 u32 major_hash,
795 u32 *ret_cpos,
796 u64 *ret_phys_blkno,
797 unsigned int *ret_clen)
798{
799 int ret = 0, i, found;
800 struct buffer_head *eb_bh = NULL;
801 struct ocfs2_extent_block *eb;
802 struct ocfs2_extent_rec *rec = NULL;
803
804 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
812 el = &eb->h_list;
813
814 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in "
817 "btree tree block %llu\n", inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out;
821 }
822 }
823
824 found = 0;
825 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
826 rec = &el->l_recs[i];
827
828 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
829 found = 1;
830 break;
831 }
832 }
833
834 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
836 "record (%u, %u, 0) in btree", inode->i_ino,
837 le32_to_cpu(rec->e_cpos),
838 ocfs2_rec_clusters(el, rec));
839 ret = -EROFS;
840 goto out;
841 }
842
843 if (ret_phys_blkno)
844 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
845 if (ret_cpos)
846 *ret_cpos = le32_to_cpu(rec->e_cpos);
847 if (ret_clen)
848 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
849
850out:
851 brelse(eb_bh);
852 return ret;
853}
854
855/*
856 * Returns the block index, from the start of the cluster, to which
857 * this hash belongs.
858 */
859static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
860 u32 minor_hash)
861{
862 return minor_hash & osb->osb_dx_mask;
863}
864
865static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
866 struct ocfs2_dx_hinfo *hinfo)
867{
868 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
869}
870
871static int ocfs2_dx_dir_lookup(struct inode *inode,
872 struct ocfs2_extent_list *el,
873 struct ocfs2_dx_hinfo *hinfo,
874 u32 *ret_cpos,
875 u64 *ret_phys_blkno)
876{
877 int ret = 0;
878 unsigned int cend, uninitialized_var(clen);
879 u32 uninitialized_var(cpos);
880 u64 uninitialized_var(blkno);
881 u32 name_hash = hinfo->major_hash;
882
883 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
884 &clen);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 cend = cpos + clen;
891 if (name_hash >= cend) {
892 /* We want the last cluster */
893 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
894 cpos += clen - 1;
895 } else {
896 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
897 name_hash - cpos);
898 cpos = name_hash;
899 }
900
901 /*
902 * We now have the cluster which should hold our entry. To
903 * find the exact block from the start of the cluster to
904 * search, we take the lower bits of the hash.
905 */
906 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
907
908 if (ret_phys_blkno)
909 *ret_phys_blkno = blkno;
910 if (ret_cpos)
911 *ret_cpos = cpos;
912
913out:
914
915 return ret;
916}
917
918static int ocfs2_dx_dir_search(const char *name, int namelen,
919 struct inode *dir,
920 struct ocfs2_dx_root_block *dx_root,
921 struct ocfs2_dir_lookup_result *res)
922{
923 int ret, i, found;
924 u64 uninitialized_var(phys);
925 struct buffer_head *dx_leaf_bh = NULL;
926 struct ocfs2_dx_leaf *dx_leaf;
927 struct ocfs2_dx_entry *dx_entry = NULL;
928 struct buffer_head *dir_ent_bh = NULL;
929 struct ocfs2_dir_entry *dir_ent = NULL;
930 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
931 struct ocfs2_extent_list *dr_el;
932 struct ocfs2_dx_entry_list *entry_list;
933
934 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
935
936 if (ocfs2_dx_root_inline(dx_root)) {
937 entry_list = &dx_root->dr_entries;
938 goto search;
939 }
940
941 dr_el = &dx_root->dr_list;
942
943 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
944 if (ret) {
945 mlog_errno(ret);
946 goto out;
947 }
948
949 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
950 "returns: %llu\n",
951 (unsigned long long)OCFS2_I(dir)->ip_blkno,
952 namelen, name, hinfo->major_hash, hinfo->minor_hash,
953 (unsigned long long)phys);
954
955 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
956 if (ret) {
957 mlog_errno(ret);
958 goto out;
959 }
960
961 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
962
963 mlog(0, "leaf info: num_used: %d, count: %d\n",
964 le16_to_cpu(dx_leaf->dl_list.de_num_used),
965 le16_to_cpu(dx_leaf->dl_list.de_count));
966
967 entry_list = &dx_leaf->dl_list;
968
969search:
970 /*
971 * Empty leaf is legal, so no need to check for that.
972 */
973 found = 0;
974 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
975 dx_entry = &entry_list->de_entries[i];
976
977 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
978 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
979 continue;
980
981 /*
982 * Search unindexed leaf block now. We're not
983 * guaranteed to find anything.
984 */
985 ret = ocfs2_read_dir_block_direct(dir,
986 le64_to_cpu(dx_entry->dx_dirent_blk),
987 &dir_ent_bh);
988 if (ret) {
989 mlog_errno(ret);
990 goto out;
991 }
992
993 /*
994 * XXX: We should check the unindexed block here,
995 * before using it.
996 */
997
998 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
999 0, dir_ent_bh->b_data,
1000 dir->i_sb->s_blocksize, &dir_ent);
1001 if (found == 1)
1002 break;
1003
1004 if (found == -1) {
1005 /* This means we found a bad directory entry. */
1006 ret = -EIO;
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 brelse(dir_ent_bh);
1012 dir_ent_bh = NULL;
1013 }
1014
1015 if (found <= 0) {
1016 ret = -ENOENT;
1017 goto out;
1018 }
1019
1020 res->dl_leaf_bh = dir_ent_bh;
1021 res->dl_entry = dir_ent;
1022 res->dl_dx_leaf_bh = dx_leaf_bh;
1023 res->dl_dx_entry = dx_entry;
1024
1025 ret = 0;
1026out:
1027 if (ret) {
1028 brelse(dx_leaf_bh);
1029 brelse(dir_ent_bh);
1030 }
1031 return ret;
1032}
1033
1034static int ocfs2_find_entry_dx(const char *name, int namelen,
1035 struct inode *dir,
1036 struct ocfs2_dir_lookup_result *lookup)
1037{
1038 int ret;
1039 struct buffer_head *di_bh = NULL;
1040 struct ocfs2_dinode *di;
1041 struct buffer_head *dx_root_bh = NULL;
1042 struct ocfs2_dx_root_block *dx_root;
1043
1044 ret = ocfs2_read_inode_block(dir, &di_bh);
1045 if (ret) {
1046 mlog_errno(ret);
1047 goto out;
1048 }
1049
1050 di = (struct ocfs2_dinode *)di_bh->b_data;
1051
1052 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1053 if (ret) {
1054 mlog_errno(ret);
1055 goto out;
1056 }
1057 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1058
1059 ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1060 if (ret) {
1061 if (ret != -ENOENT)
1062 mlog_errno(ret);
1063 goto out;
1064 }
1065
1066 lookup->dl_dx_root_bh = dx_root_bh;
1067 dx_root_bh = NULL;
1068out:
1069 brelse(di_bh);
1070 brelse(dx_root_bh);
1071 return ret;
1072}
1073
483/* 1074/*
484 * Try to find an entry of the provided name within 'dir'. 1075 * Try to find an entry of the provided name within 'dir'.
485 * 1076 *
486 * If nothing was found, NULL is returned. Otherwise, a buffer_head 1077 * If nothing was found, -ENOENT is returned. Otherwise, zero is
487 * and pointer to the dir entry are passed back. 1078 * returned and the struct 'res' will contain information useful to
1079 * other directory manipulation functions.
488 * 1080 *
489 * Caller can NOT assume anything about the contents of the 1081 * Caller can NOT assume anything about the contents of the
490 * buffer_head - it is passed back only so that it can be passed into 1082 * buffer_heads - they are passed back only so that it can be passed
491 * any one of the manipulation functions (add entry, delete entry, 1083 * into any one of the manipulation functions (add entry, delete
492 * etc). As an example, bh in the extent directory case is a data 1084 * entry, etc). As an example, bh in the extent directory case is a
493 * block, in the inline-data case it actually points to an inode. 1085 * data block, in the inline-data case it actually points to an inode,
1086 * and in the indexed directory case multiple buffers are involved.
494 */ 1087 */
495struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 1088int ocfs2_find_entry(const char *name, int namelen,
496 struct inode *dir, 1089 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
497 struct ocfs2_dir_entry **res_dir)
498{ 1090{
499 *res_dir = NULL; 1091 struct buffer_head *bh;
1092 struct ocfs2_dir_entry *res_dir = NULL;
500 1093
1094 if (ocfs2_dir_indexed(dir))
1095 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1096
1097 /*
1098 * The unindexed dir code only uses part of the lookup
1099 * structure, so there's no reason to push it down further
1100 * than this.
1101 */
501 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1102 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
502 return ocfs2_find_entry_id(name, namelen, dir, res_dir); 1103 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1104 else
1105 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1106
1107 if (bh == NULL)
1108 return -ENOENT;
503 1109
504 return ocfs2_find_entry_el(name, namelen, dir, res_dir); 1110 lookup->dl_leaf_bh = bh;
1111 lookup->dl_entry = res_dir;
1112 return 0;
505} 1113}
506 1114
507/* 1115/*
508 * Update inode number and type of a previously found directory entry. 1116 * Update inode number and type of a previously found directory entry.
509 */ 1117 */
510int ocfs2_update_entry(struct inode *dir, handle_t *handle, 1118int ocfs2_update_entry(struct inode *dir, handle_t *handle,
511 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 1119 struct ocfs2_dir_lookup_result *res,
512 struct inode *new_entry_inode) 1120 struct inode *new_entry_inode)
513{ 1121{
514 int ret; 1122 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1123 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1124 struct ocfs2_dir_entry *de = res->dl_entry;
1125 struct buffer_head *de_bh = res->dl_leaf_bh;
516 1126
517 /* 1127 /*
518 * The same code works fine for both inline-data and extent 1128 * The same code works fine for both inline-data and extent
@@ -538,6 +1148,10 @@ out:
538 return ret; 1148 return ret;
539} 1149}
540 1150
1151/*
1152 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1153 * previous entry
1154 */
541static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1155static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
542 struct ocfs2_dir_entry *de_del, 1156 struct ocfs2_dir_entry *de_del,
543 struct buffer_head *bh, char *first_de, 1157 struct buffer_head *bh, char *first_de,
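
With struct ocfs2_dir_lookup_result, call sites now follow a find/manipulate/free lifecycle: zero-initialize the result, fill it via ocfs2_find_entry(), hand it to a manipulation helper, then release the buffer_heads it pinned. A hypothetical caller (names invented for illustration):

static int rename_target_sketch(struct inode *dir, handle_t *handle,
				const char *name, int namelen,
				struct inode *new_inode)
{
	struct ocfs2_dir_lookup_result lookup = { NULL, };
	int ret;

	ret = ocfs2_find_entry(name, namelen, dir, &lookup);
	if (ret)
		goto out;

	ret = ocfs2_update_entry(dir, handle, &lookup, new_inode);
out:
	/* Drops every bh the lookup pinned; safe on partial fills. */
	ocfs2_free_dir_lookup_result(&lookup);
	return ret;
}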
@@ -587,6 +1201,181 @@ bail:
587 return status; 1201 return status;
588} 1202}
589 1203
1204static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1205{
1206 unsigned int hole;
1207
1208 if (le64_to_cpu(de->inode) == 0)
1209 hole = le16_to_cpu(de->rec_len);
1210 else
1211 hole = le16_to_cpu(de->rec_len) -
1212 OCFS2_DIR_REC_LEN(de->name_len);
1213
1214 return hole;
1215}
1216
1217static int ocfs2_find_max_rec_len(struct super_block *sb,
1218 struct buffer_head *dirblock_bh)
1219{
1220 int size, this_hole, largest_hole = 0;
1221 char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1222 struct ocfs2_dir_entry *de;
1223
1224 trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1225 size = ocfs2_dir_trailer_blk_off(sb);
1226 limit = start + size;
1227 de_buf = start;
1228 de = (struct ocfs2_dir_entry *)de_buf;
1229 do {
1230 if (de_buf != trailer) {
1231 this_hole = ocfs2_figure_dirent_hole(de);
1232 if (this_hole > largest_hole)
1233 largest_hole = this_hole;
1234 }
1235
1236 de_buf += le16_to_cpu(de->rec_len);
1237 de = (struct ocfs2_dir_entry *)de_buf;
1238 } while (de_buf < limit);
1239
1240 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1241 return largest_hole;
1242 return 0;
1243}
1244
1245static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1246 int index)
1247{
1248 int num_used = le16_to_cpu(entry_list->de_num_used);
1249
1250 if (num_used == 1 || index == (num_used - 1))
1251 goto clear;
1252
1253 memmove(&entry_list->de_entries[index],
1254 &entry_list->de_entries[index + 1],
1255 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1256clear:
1257 num_used--;
1258 memset(&entry_list->de_entries[num_used], 0,
1259 sizeof(struct ocfs2_dx_entry));
1260 entry_list->de_num_used = cpu_to_le16(num_used);
1261}
1262
1263static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1264 struct ocfs2_dir_lookup_result *lookup)
1265{
1266 int ret, index, max_rec_len, add_to_free_list = 0;
1267 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1268 struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1269 struct ocfs2_dx_leaf *dx_leaf;
1270 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1271 struct ocfs2_dir_block_trailer *trailer;
1272 struct ocfs2_dx_root_block *dx_root;
1273 struct ocfs2_dx_entry_list *entry_list;
1274
1275 /*
1276 * This function gets a bit messy because we might have to
1277 * modify the root block, regardless of whether the indexed
1278 * entries are stored inline.
1279 */
1280
1281 /*
1282 * *Only* set 'entry_list' here, based on where we're looking
1283 * for the indexed entries. Later, we might still want to
1284 * journal both blocks, based on free list state.
1285 */
1286 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
1287 if (ocfs2_dx_root_inline(dx_root)) {
1288 entry_list = &dx_root->dr_entries;
1289 } else {
1290 dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1291 entry_list = &dx_leaf->dl_list;
1292 }
1293
1294 /* Neither of these are a disk corruption - that should have
1295 * been caught by lookup, before we got here. */
1296 BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
1297 BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
1298
1299 index = (char *)dx_entry - (char *)entry_list->de_entries;
1300 index /= sizeof(*dx_entry);
1301
1302 if (index >= le16_to_cpu(entry_list->de_num_used)) {
1303 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1304 (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
1305 entry_list, dx_entry);
1306 return -EIO;
1307 }
1308
1309 /*
1310 * We know that removal of this dirent will leave enough room
1311 * for a new one, so add this block to the free list if it
1312 * isn't already there.
1313 */
1314 trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
1315 if (trailer->db_free_rec_len == 0)
1316 add_to_free_list = 1;
1317
1318 /*
1319 * Add the block holding our index into the journal before
1320 * removing the unindexed entry. If we get an error return
1321 * from __ocfs2_delete_entry(), then it hasn't removed the
1322 * entry yet. Likewise, successful return means we *must*
1323 * remove the indexed entry.
1324 *
1325 * We're also careful to journal the root tree block here as
1326 * the entry count needs to be updated. Also, we might be
1327 * adding to the start of the free list.
1328 */
1329 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (ret) {
1332 mlog_errno(ret);
1333 goto out;
1334 }
1335
1336 if (!ocfs2_dx_root_inline(dx_root)) {
1337 ret = ocfs2_journal_access_dl(handle, dir,
1338 lookup->dl_dx_leaf_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) {
1341 mlog_errno(ret);
1342 goto out;
1343 }
1344 }
1345
1346 mlog(0, "Dir %llu: delete entry at index: %d\n",
1347 (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
1348
1349 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1350 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1351 if (ret) {
1352 mlog_errno(ret);
1353 goto out;
1354 }
1355
1356 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
1357 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1358 if (add_to_free_list) {
1359 trailer->db_free_next = dx_root->dr_free_blk;
1360 dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
1361 ocfs2_journal_dirty(handle, dx_root_bh);
1362 }
1363
1364 /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
1365 ocfs2_journal_dirty(handle, leaf_bh);
1366
1367 le32_add_cpu(&dx_root->dr_num_entries, -1);
1368 ocfs2_journal_dirty(handle, dx_root_bh);
1369
1370 ocfs2_dx_list_remove_entry(entry_list, index);
1371
1372 if (!ocfs2_dx_root_inline(dx_root))
1373 ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1374
1375out:
1376 return ret;
1377}
1378
590static inline int ocfs2_delete_entry_id(handle_t *handle, 1379static inline int ocfs2_delete_entry_id(handle_t *handle,
591 struct inode *dir, 1380 struct inode *dir,
592 struct ocfs2_dir_entry *de_del, 1381 struct ocfs2_dir_entry *de_del,
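
ocfs2_dx_list_remove_entry() above compacts the in-block entry array with a memmove. Its effect, demonstrated in plain userspace C with ints standing in for struct ocfs2_dx_entry:

#include <stdio.h>
#include <string.h>

int main(void)
{
	int entries[4] = { 10, 11, 12, 13 };
	int num_used = 4, index = 1;

	/* Slide entries 2..3 down over the removed slot... */
	memmove(&entries[index], &entries[index + 1],
		(num_used - index - 1) * sizeof(entries[0]));
	/* ...then clear the now-unused tail slot. */
	num_used--;
	memset(&entries[num_used], 0, sizeof(entries[0]));

	for (int i = 0; i < 4; i++)
		printf("%d ", entries[i]);	/* prints: 10 12 13 0 */
	printf("(num_used=%d)\n", num_used);
	return 0;
}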
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
624} 1413}
625 1414
626/* 1415/*
627 * ocfs2_delete_entry deletes a directory entry by merging it with the 1416 * Delete a directory entry. Hide the details of directory
628 * previous entry 1417 * implementation from the caller.
629 */ 1418 */
630int ocfs2_delete_entry(handle_t *handle, 1419int ocfs2_delete_entry(handle_t *handle,
631 struct inode *dir, 1420 struct inode *dir,
632 struct ocfs2_dir_entry *de_del, 1421 struct ocfs2_dir_lookup_result *res)
633 struct buffer_head *bh)
634{ 1422{
1423 if (ocfs2_dir_indexed(dir))
1424 return ocfs2_delete_entry_dx(handle, dir, res);
1425
635 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1426 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
636 return ocfs2_delete_entry_id(handle, dir, de_del, bh); 1427 return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
1428 res->dl_leaf_bh);
637 1429
638 return ocfs2_delete_entry_el(handle, dir, de_del, bh); 1430 return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
1431 res->dl_leaf_bh);
639} 1432}
640 1433
641/* 1434/*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
663 return 0; 1456 return 0;
664} 1457}
665 1458
1459static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1460 struct ocfs2_dx_entry *dx_new_entry)
1461{
1462 int i;
1463
1464 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1465 dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1466
1467 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1468}
1469
1470static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
1471 struct ocfs2_dx_hinfo *hinfo,
1472 u64 dirent_blk)
1473{
1474 int i;
1475 struct ocfs2_dx_entry *dx_entry;
1476
1477 i = le16_to_cpu(entry_list->de_num_used);
1478 dx_entry = &entry_list->de_entries[i];
1479
1480 memset(dx_entry, 0, sizeof(*dx_entry));
1481 dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1482 dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1483 dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1484
1485 le16_add_cpu(&entry_list->de_num_used, 1);
1486}
1487
1488static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1489 struct ocfs2_dx_hinfo *hinfo,
1490 u64 dirent_blk,
1491 struct buffer_head *dx_leaf_bh)
1492{
1493 int ret;
1494 struct ocfs2_dx_leaf *dx_leaf;
1495
1496 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
1497 OCFS2_JOURNAL_ACCESS_WRITE);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out;
1501 }
1502
1503 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1504 ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
1505 ocfs2_journal_dirty(handle, dx_leaf_bh);
1506
1507out:
1508 return ret;
1509}
1510
1511static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
1512 struct ocfs2_dx_hinfo *hinfo,
1513 u64 dirent_blk,
1514 struct ocfs2_dx_root_block *dx_root)
1515{
1516 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
1517}
1518
1519static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1520 struct ocfs2_dir_lookup_result *lookup)
1521{
1522 int ret = 0;
1523 struct ocfs2_dx_root_block *dx_root;
1524 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1525
1526 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
1527 OCFS2_JOURNAL_ACCESS_WRITE);
1528 if (ret) {
1529 mlog_errno(ret);
1530 goto out;
1531 }
1532
1533 dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
1534 if (ocfs2_dx_root_inline(dx_root)) {
1535 ocfs2_dx_inline_root_insert(dir, handle,
1536 &lookup->dl_hinfo,
1537 lookup->dl_leaf_bh->b_blocknr,
1538 dx_root);
1539 } else {
1540 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1541 lookup->dl_leaf_bh->b_blocknr,
1542 lookup->dl_dx_leaf_bh);
1543 if (ret)
1544 goto out;
1545 }
1546
1547 le32_add_cpu(&dx_root->dr_num_entries, 1);
1548 ocfs2_journal_dirty(handle, dx_root_bh);
1549
1550out:
1551 return ret;
1552}
1553
1554static void ocfs2_remove_block_from_free_list(struct inode *dir,
1555 handle_t *handle,
1556 struct ocfs2_dir_lookup_result *lookup)
1557{
1558 struct ocfs2_dir_block_trailer *trailer, *prev;
1559 struct ocfs2_dx_root_block *dx_root;
1560 struct buffer_head *bh;
1561
1562 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1563
1564 if (ocfs2_free_list_at_root(lookup)) {
1565 bh = lookup->dl_dx_root_bh;
1566 dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
1567 dx_root->dr_free_blk = trailer->db_free_next;
1568 } else {
1569 bh = lookup->dl_prev_leaf_bh;
1570 prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
1571 prev->db_free_next = trailer->db_free_next;
1572 }
1573
1574 trailer->db_free_rec_len = cpu_to_le16(0);
1575 trailer->db_free_next = cpu_to_le64(0);
1576
1577 ocfs2_journal_dirty(handle, bh);
1578 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1579}
1580
1581/*
1582 * This expects that a journal write has been reserved on
1583 * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
1584 */
1585static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
1586 struct ocfs2_dir_lookup_result *lookup)
1587{
1588 int max_rec_len;
1589 struct ocfs2_dir_block_trailer *trailer;
1590
1591 /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
1592 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
1593 if (max_rec_len) {
1594 /*
1595 * There's still room in this block, so no need to remove it
1596 * from the free list. In this case, we just want to update
1597 * the rec len accounting.
1598 */
1599 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1600 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1601 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1602 } else {
1603 ocfs2_remove_block_from_free_list(dir, handle, lookup);
1604 }
1605}
1606
666/* we don't always have a dentry for what we want to add, so people 1607/* we don't always have a dentry for what we want to add, so people
667 * like orphan dir can call this instead. 1608 * like orphan dir can call this instead.
668 * 1609 *
669 * If you pass me insert_bh, I'll skip the search of the other dir 1610 * The lookup context must have been filled from
670 * blocks and put the record in there. 1611 * ocfs2_prepare_dir_for_insert.
671 */ 1612 */
672int __ocfs2_add_entry(handle_t *handle, 1613int __ocfs2_add_entry(handle_t *handle,
673 struct inode *dir, 1614 struct inode *dir,
674 const char *name, int namelen, 1615 const char *name, int namelen,
675 struct inode *inode, u64 blkno, 1616 struct inode *inode, u64 blkno,
676 struct buffer_head *parent_fe_bh, 1617 struct buffer_head *parent_fe_bh,
677 struct buffer_head *insert_bh) 1618 struct ocfs2_dir_lookup_result *lookup)
678{ 1619{
679 unsigned long offset; 1620 unsigned long offset;
680 unsigned short rec_len; 1621 unsigned short rec_len;
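
The free list these helpers maintain is a singly linked list threaded through the block trailers: dr_free_blk names the first unindexed block with reusable space, and each trailer's db_free_next points at the next. A hedged sketch of a traversal (error handling elided; not actual dir.c code):

static void walk_free_list_sketch(struct inode *dir,
				  struct ocfs2_dx_root_block *dx_root)
{
	u64 blkno = le64_to_cpu(dx_root->dr_free_blk);
	struct ocfs2_dir_block_trailer *trailer;
	struct buffer_head *bh;

	while (blkno) {
		bh = NULL;
		if (ocfs2_read_dir_block_direct(dir, blkno, &bh))
			break;
		trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
		mlog(0, "free block %llu, largest hole %u\n",
		     (unsigned long long)blkno,
		     le16_to_cpu(trailer->db_free_rec_len));
		blkno = le64_to_cpu(trailer->db_free_next);
		brelse(bh);
	}
}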
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
683 struct super_block *sb = dir->i_sb; 1624 struct super_block *sb = dir->i_sb;
684 int retval, status; 1625 int retval, status;
685 unsigned int size = sb->s_blocksize; 1626 unsigned int size = sb->s_blocksize;
1627 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
686 char *data_start = insert_bh->b_data; 1628 char *data_start = insert_bh->b_data;
687 1629
688 mlog_entry_void(); 1630 mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
690 if (!namelen) 1632 if (!namelen)
691 return -EINVAL; 1633 return -EINVAL;
692 1634
693 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1635 if (ocfs2_dir_indexed(dir)) {
1636 struct buffer_head *bh;
1637
1638 /*
1639 * An indexed dir may require that we update the free space
1640 * list. Reserve a write to the previous node in the list so
1641 * that we don't fail later.
1642 *
1643 * XXX: This can be either a dx_root_block, or an unindexed
1644 * directory tree leaf block.
1645 */
1646 if (ocfs2_free_list_at_root(lookup)) {
1647 bh = lookup->dl_dx_root_bh;
1648 retval = ocfs2_journal_access_dr(handle, dir, bh,
1649 OCFS2_JOURNAL_ACCESS_WRITE);
1650 } else {
1651 bh = lookup->dl_prev_leaf_bh;
1652 retval = ocfs2_journal_access_db(handle, dir, bh,
1653 OCFS2_JOURNAL_ACCESS_WRITE);
1654 }
1655 if (retval) {
1656 mlog_errno(retval);
1657 return retval;
1658 }
1659 } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
694 data_start = di->id2.i_data.id_data; 1660 data_start = di->id2.i_data.id_data;
695 size = i_size_read(dir); 1661 size = i_size_read(dir);
696 1662
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
737 status = ocfs2_journal_access_di(handle, dir, 1703 status = ocfs2_journal_access_di(handle, dir,
738 insert_bh, 1704 insert_bh,
739 OCFS2_JOURNAL_ACCESS_WRITE); 1705 OCFS2_JOURNAL_ACCESS_WRITE);
740 else 1706 else {
741 status = ocfs2_journal_access_db(handle, dir, 1707 status = ocfs2_journal_access_db(handle, dir,
742 insert_bh, 1708 insert_bh,
743 OCFS2_JOURNAL_ACCESS_WRITE); 1709 OCFS2_JOURNAL_ACCESS_WRITE);
1710
1711 if (ocfs2_dir_indexed(dir)) {
1712 status = ocfs2_dx_dir_insert(dir,
1713 handle,
1714 lookup);
1715 if (status) {
1716 mlog_errno(status);
1717 goto bail;
1718 }
1719 }
1720 }
1721
744 /* By now the buffer is marked for journaling */ 1722 /* By now the buffer is marked for journaling */
745 offset += le16_to_cpu(de->rec_len); 1723 offset += le16_to_cpu(de->rec_len);
746 if (le64_to_cpu(de->inode)) { 1724 if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
761 de->name_len = namelen; 1739 de->name_len = namelen;
762 memcpy(de->name, name, namelen); 1740 memcpy(de->name, name, namelen);
763 1741
1742 if (ocfs2_dir_indexed(dir))
1743 ocfs2_recalc_free_list(dir, handle, lookup);
1744
764 dir->i_version++; 1745 dir->i_version++;
765 status = ocfs2_journal_dirty(handle, insert_bh); 1746 status = ocfs2_journal_dirty(handle, insert_bh);
766 retval = 0; 1747 retval = 0;
@@ -870,6 +1851,10 @@ out:
870 return 0; 1851 return 0;
871} 1852}
872 1853
1854/*
1855 * NOTE: This function can be called against both unindexed and
1856 * indexed directories.
1857 */
873static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1858static int ocfs2_dir_foreach_blk_el(struct inode *inode,
874 u64 *f_version, 1859 u64 *f_version,
875 loff_t *f_pos, void *priv, 1860 loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
1071 int namelen, 2056 int namelen,
1072 u64 *blkno, 2057 u64 *blkno,
1073 struct inode *inode, 2058 struct inode *inode,
1074 struct buffer_head **dirent_bh, 2059 struct ocfs2_dir_lookup_result *lookup)
1075 struct ocfs2_dir_entry **dirent)
1076{ 2060{
1077 int status = -ENOENT; 2061 int status = -ENOENT;
1078 2062
1079 mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", 2063 mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
1080 namelen, name, blkno, inode, dirent_bh, dirent); 2064 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1081 2065
1082 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); 2066 status = ocfs2_find_entry(name, namelen, inode, lookup);
1083 if (!*dirent_bh || !*dirent) { 2067 if (status)
1084 status = -ENOENT;
1085 goto leave; 2068 goto leave;
1086 }
1087 2069
1088 *blkno = le64_to_cpu((*dirent)->inode); 2070 *blkno = le64_to_cpu(lookup->dl_entry->inode);
1089 2071
1090 status = 0; 2072 status = 0;
1091leave: 2073leave:
1092 if (status < 0) {
1093 *dirent = NULL;
1094 brelse(*dirent_bh);
1095 *dirent_bh = NULL;
1096 }
1097 2074
1098 mlog_exit(status);
1099 return status; 2075 return status;
1100} 2076}
1101 2077
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
1107 int namelen, u64 *blkno) 2083 int namelen, u64 *blkno)
1108{ 2084{
1109 int ret; 2085 int ret;
1110 struct buffer_head *bh = NULL; 2086 struct ocfs2_dir_lookup_result lookup = { NULL, };
1111 struct ocfs2_dir_entry *dirent = NULL;
1112 2087
1113 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent); 2088 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
1114 brelse(bh); 2089 ocfs2_free_dir_lookup_result(&lookup);
1115 2090
1116 return ret; 2091 return ret;
1117} 2092}
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
1128 int namelen) 2103 int namelen)
1129{ 2104{
1130 int ret; 2105 int ret;
1131 struct buffer_head *dirent_bh = NULL; 2106 struct ocfs2_dir_lookup_result lookup = { NULL, };
1132 struct ocfs2_dir_entry *dirent = NULL;
1133 2107
1134 mlog_entry("dir %llu, name '%.*s'\n", 2108 mlog_entry("dir %llu, name '%.*s'\n",
1135 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2109 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
1136 2110
1137 ret = -EEXIST; 2111 ret = -EEXIST;
1138 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); 2112 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
1139 if (dirent_bh)
1140 goto bail; 2113 goto bail;
1141 2114
1142 ret = 0; 2115 ret = 0;
1143bail: 2116bail:
1144 brelse(dirent_bh); 2117 ocfs2_free_dir_lookup_result(&lookup);
1145 2118
1146 mlog_exit(ret); 2119 mlog_exit(ret);
1147 return ret; 2120 return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
1151 unsigned seen_dot; 2124 unsigned seen_dot;
1152 unsigned seen_dot_dot; 2125 unsigned seen_dot_dot;
1153 unsigned seen_other; 2126 unsigned seen_other;
2127 unsigned dx_dir;
1154}; 2128};
1155static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, 2129static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1156 loff_t pos, u64 ino, unsigned type) 2130 loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1160 /* 2134 /*
1161 * Check the positions of "." and ".." records to be sure 2135 * Check the positions of "." and ".." records to be sure
1162 * they're in the correct place. 2136 * they're in the correct place.
2137 *
2138 * Indexed directories don't need to proceed past the first
2139 * two entries, so we end the scan after seeing '..'. Despite
2140 * that, we allow the scan to proceed in the event that we
2141 * have a corrupted indexed directory (no dot or dot dot
2142 * entries). This allows us to double-check for existing
2143 * entries which might not have been found in the index.
1163 */ 2144 */
1164 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { 2145 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
1165 p->seen_dot = 1; 2146 p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1169 if (name_len == 2 && !strncmp("..", name, 2) && 2150 if (name_len == 2 && !strncmp("..", name, 2) &&
1170 pos == OCFS2_DIR_REC_LEN(1)) { 2151 pos == OCFS2_DIR_REC_LEN(1)) {
1171 p->seen_dot_dot = 1; 2152 p->seen_dot_dot = 1;
2153
2154 if (p->dx_dir && p->seen_dot)
2155 return 1;
2156
1172 return 0; 2157 return 0;
1173 } 2158 }
1174 2159
1175 p->seen_other = 1; 2160 p->seen_other = 1;
1176 return 1; 2161 return 1;
1177} 2162}
2163
2164static int ocfs2_empty_dir_dx(struct inode *inode,
2165 struct ocfs2_empty_dir_priv *priv)
2166{
2167 int ret;
2168 struct buffer_head *di_bh = NULL;
2169 struct buffer_head *dx_root_bh = NULL;
2170 struct ocfs2_dinode *di;
2171 struct ocfs2_dx_root_block *dx_root;
2172
2173 priv->dx_dir = 1;
2174
2175 ret = ocfs2_read_inode_block(inode, &di_bh);
2176 if (ret) {
2177 mlog_errno(ret);
2178 goto out;
2179 }
2180 di = (struct ocfs2_dinode *)di_bh->b_data;
2181
2182 ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
2183 if (ret) {
2184 mlog_errno(ret);
2185 goto out;
2186 }
2187 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2188
2189 if (le32_to_cpu(dx_root->dr_num_entries) != 2)
2190 priv->seen_other = 1;
2191
2192out:
2193 brelse(di_bh);
2194 brelse(dx_root_bh);
2195 return ret;
2196}
2197
1178/* 2198/*
1179 * routine to check that the specified directory is empty (for rmdir) 2199 * routine to check that the specified directory is empty (for rmdir)
1180 * 2200 *
1181 * Returns 1 if dir is empty, zero otherwise. 2201 * Returns 1 if dir is empty, zero otherwise.
2202 *
2203 * XXX: This is a performance problem for unindexed directories.
1182 */ 2204 */
1183int ocfs2_empty_dir(struct inode *inode) 2205int ocfs2_empty_dir(struct inode *inode)
1184{ 2206{
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
1188 2210
1189 memset(&priv, 0, sizeof(priv)); 2211 memset(&priv, 0, sizeof(priv));
1190 2212
2213 if (ocfs2_dir_indexed(inode)) {
2214 ret = ocfs2_empty_dir_dx(inode, &priv);
2215 if (ret)
2216 mlog_errno(ret);
2217 /*
2218 * We still run ocfs2_dir_foreach to get the checks
2219 * for "." and "..".
2220 */
2221 }
2222
1191 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2223 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1192 if (ret) 2224 if (ret)
1193 mlog_errno(ret); 2225 mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1280 struct inode *parent, 2312 struct inode *parent,
1281 struct inode *inode, 2313 struct inode *inode,
1282 struct buffer_head *fe_bh, 2314 struct buffer_head *fe_bh,
1283 struct ocfs2_alloc_context *data_ac) 2315 struct ocfs2_alloc_context *data_ac,
2316 struct buffer_head **ret_new_bh)
1284{ 2317{
1285 int status; 2318 int status;
1286 unsigned int size = osb->sb->s_blocksize; 2319 unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1289 2322
1290 mlog_entry_void(); 2323 mlog_entry_void();
1291 2324
1292 if (ocfs2_supports_dir_trailer(osb)) 2325 if (ocfs2_new_dir_wants_trailer(inode))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2326 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294 2327
1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 2328 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2343 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1311 2344
1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2345 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1313 if (ocfs2_supports_dir_trailer(osb)) 2346 if (ocfs2_new_dir_wants_trailer(inode)) {
1314 ocfs2_init_dir_trailer(inode, new_bh); 2347 int size = le16_to_cpu(de->rec_len);
2348
2349 /*
2350 * Figure out the size of the hole left over after
2351 * insertion of '.' and '..'. The trailer wants this
2352 * information.
2353 */
2354 size -= OCFS2_DIR_REC_LEN(2);
2355 size -= sizeof(struct ocfs2_dir_block_trailer);
2356
2357 ocfs2_init_dir_trailer(inode, new_bh, size);
2358 }
1315 2359
1316 status = ocfs2_journal_dirty(handle, new_bh); 2360 status = ocfs2_journal_dirty(handle, new_bh);
1317 if (status < 0) { 2361 if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1329 } 2373 }
1330 2374
1331 status = 0; 2375 status = 0;
2376 if (ret_new_bh) {
2377 *ret_new_bh = new_bh;
2378 new_bh = NULL;
2379 }
1332bail: 2380bail:
1333 brelse(new_bh); 2381 brelse(new_bh);
1334 2382
@@ -1336,20 +2384,427 @@ bail:
1336 return status; 2384 return status;
1337} 2385}
1338 2386
2387static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2388 handle_t *handle, struct inode *dir,
2389 struct buffer_head *di_bh,
2390 struct buffer_head *dirdata_bh,
2391 struct ocfs2_alloc_context *meta_ac,
2392 int dx_inline, u32 num_entries,
2393 struct buffer_head **ret_dx_root_bh)
2394{
2395 int ret;
2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2397 u16 dr_suballoc_bit;
2398 u64 dr_blkno;
2399 unsigned int num_bits;
2400 struct buffer_head *dx_root_bh = NULL;
2401 struct ocfs2_dx_root_block *dx_root;
2402 struct ocfs2_dir_block_trailer *trailer =
2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2404
2405 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2406 &num_bits, &dr_blkno);
2407 if (ret) {
2408 mlog_errno(ret);
2409 goto out;
2410 }
2411
2412 mlog(0, "Dir %llu, attach new index block: %llu\n",
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno);
2415
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) {
2418 ret = -EIO;
2419 goto out;
2420 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2422
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429
2430 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2431 memset(dx_root, 0, osb->sb->s_blocksize);
2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2433 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2434 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2435 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2436 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2437 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2438 dx_root->dr_num_entries = cpu_to_le32(num_entries);
2439 if (le16_to_cpu(trailer->db_free_rec_len))
2440 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2441 else
2442 dx_root->dr_free_blk = cpu_to_le64(0);
2443
2444 if (dx_inline) {
2445 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2446 dx_root->dr_entries.de_count =
2447 cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2448 } else {
2449 dx_root->dr_list.l_count =
2450 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2451 }
2452
2453 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2454 if (ret)
2455 mlog_errno(ret);
2456
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) {
2460 mlog_errno(ret);
2461 goto out;
2462 }
2463
2464 di->i_dx_root = cpu_to_le64(dr_blkno);
2465
2466 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2467 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2468
2469 ret = ocfs2_journal_dirty(handle, di_bh);
2470 if (ret)
2471 mlog_errno(ret);
2472
2473 *ret_dx_root_bh = dx_root_bh;
2474 dx_root_bh = NULL;
2475
2476out:
2477 brelse(dx_root_bh);
2478 return ret;
2479}
2480
2481static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2482 handle_t *handle, struct inode *dir,
2483 struct buffer_head **dx_leaves,
2484 int num_dx_leaves, u64 start_blk)
2485{
2486 int ret, i;
2487 struct ocfs2_dx_leaf *dx_leaf;
2488 struct buffer_head *bh;
2489
2490 for (i = 0; i < num_dx_leaves; i++) {
2491 bh = sb_getblk(osb->sb, start_blk + i);
2492 if (bh == NULL) {
2493 ret = -EIO;
2494 goto out;
2495 }
2496 dx_leaves[i] = bh;
2497
2498 ocfs2_set_new_buffer_uptodate(dir, bh);
2499
2500 ret = ocfs2_journal_access_dl(handle, dir, bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) {
2503 mlog_errno(ret);
2504 goto out;
2505 }
2506
2507 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2508
2509 memset(dx_leaf, 0, osb->sb->s_blocksize);
2510 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2511 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2512 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2513 dx_leaf->dl_list.de_count =
2514 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2515
2516 mlog(0,
2517 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2518 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2519 (unsigned long long)bh->b_blocknr,
2520 le16_to_cpu(dx_leaf->dl_list.de_count));
2521
2522 ocfs2_journal_dirty(handle, bh);
2523 }
2524
2525 ret = 0;
2526out:
2527 return ret;
2528}
2529
2530/*
2531 * Allocates and formats a new cluster for use in an indexed dir
2532 * leaf. This version will not do the extent insert, so that it can be
2533 * used by operations which need careful ordering.
2534 */
2535static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2536 u32 cpos, handle_t *handle,
2537 struct ocfs2_alloc_context *data_ac,
2538 struct buffer_head **dx_leaves,
2539 int num_dx_leaves, u64 *ret_phys_blkno)
2540{
2541 int ret;
2542 u32 phys, num;
2543 u64 phys_blkno;
2544 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2545
2546 /*
2547 * XXX: For create, this should claim cluster for the index
2548 * *before* the unindexed insert so that we have a better
2549 * chance of contiguousness as the directory grows in number
2550 * of entries.
2551 */
2552 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2553 if (ret) {
2554 mlog_errno(ret);
2555 goto out;
2556 }
2557
2558 /*
2559 * Format the new cluster first. That way, we're inserting
2560 * valid data.
2561 */
2562 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2563 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2564 num_dx_leaves, phys_blkno);
2565 if (ret) {
2566 mlog_errno(ret);
2567 goto out;
2568 }
2569
2570 *ret_phys_blkno = phys_blkno;
2571out:
2572 return ret;
2573}
2574
2575static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2576 struct ocfs2_extent_tree *et,
2577 u32 cpos, handle_t *handle,
2578 struct ocfs2_alloc_context *data_ac,
2579 struct ocfs2_alloc_context *meta_ac,
2580 struct buffer_head **dx_leaves,
2581 int num_dx_leaves)
2582{
2583 int ret;
2584 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno);
2589 if (ret) {
2590 mlog_errno(ret);
2591 goto out;
2592 }
2593
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2595 meta_ac);
2596 if (ret)
2597 mlog_errno(ret);
2598out:
2599 return ret;
2600}
2601
2602static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2603 int *ret_num_leaves)
2604{
2605 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2606 struct buffer_head **dx_leaves;
2607
2608 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2609 GFP_NOFS);
2610 if (dx_leaves && ret_num_leaves)
2611 *ret_num_leaves = num_dx_leaves;
2612
2613 return dx_leaves;
2614}
2615
2616static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2617 handle_t *handle,
2618 struct inode *parent,
2619 struct inode *inode,
2620 struct buffer_head *di_bh,
2621 struct ocfs2_alloc_context *data_ac,
2622 struct ocfs2_alloc_context *meta_ac)
2623{
2624 int ret;
2625 struct buffer_head *leaf_bh = NULL;
2626 struct buffer_head *dx_root_bh = NULL;
2627 struct ocfs2_dx_hinfo hinfo;
2628 struct ocfs2_dx_root_block *dx_root;
2629 struct ocfs2_dx_entry_list *entry_list;
2630
2631 /*
2632 * Our strategy is to create the directory as though it were
2633 * unindexed, then add the index block. This works with very
2634 * little complication since the state of a new directory is a
2635 * very well known quantity.
2636 *
2637 * Essentially, we have two dirents ("." and "..") in the 1st
2638 * block, which need indexing. These are easily inserted into
2639 * the index block.
2640 */
2641
2642 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2643 data_ac, &leaf_bh);
2644 if (ret) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648
2649 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2650 meta_ac, 1, 2, &dx_root_bh);
2651 if (ret) {
2652 mlog_errno(ret);
2653 goto out;
2654 }
2655 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2656 entry_list = &dx_root->dr_entries;
2657
2658 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2659 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2660 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2661
2662 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2663 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2664
2665out:
2666 brelse(dx_root_bh);
2667 brelse(leaf_bh);
2668 return ret;
2669}
2670
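A tiny user-space model may help here: the dx entry list that "." and ".." land in is just a bounded array plus a used-count, appended in arrival order, which is what ocfs2_dx_entry_list_insert() amounts to. The capacity, hash values, and block number below are invented for illustration; the real hashes come from ocfs2_dx_dir_name_hash().

#include <stdint.h>
#include <stdio.h>

#define DE_COUNT 8	/* capacity; dr_entries.de_count plays this role on disk */

struct entry_list {
	uint16_t de_num_used;
	struct { uint32_t major, minor; uint64_t blk; } ent[DE_COUNT];
};

static int list_insert(struct entry_list *el, uint32_t major,
		       uint32_t minor, uint64_t blk)
{
	if (el->de_num_used >= DE_COUNT)
		return -1;	/* caller must expand or rebalance first */
	el->ent[el->de_num_used].major = major;
	el->ent[el->de_num_used].minor = minor;
	el->ent[el->de_num_used].blk = blk;
	el->de_num_used++;
	return 0;
}

int main(void)
{
	struct entry_list el = { 0 };

	/* A brand new directory indexes exactly two names. The hash
	 * values are made up; both entries point at the same unindexed
	 * leaf block. */
	list_insert(&el, 0x2a8058e1, 0x1f77, 2048);	/* "."  */
	list_insert(&el, 0x61c8864f, 0x09b3, 2048);	/* ".." */
	printf("%u entries indexed\n", el.de_num_used);
	return 0;
}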
1339int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2671int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1340 handle_t *handle, 2672 handle_t *handle,
1341 struct inode *parent, 2673 struct inode *parent,
1342 struct inode *inode, 2674 struct inode *inode,
1343 struct buffer_head *fe_bh, 2675 struct buffer_head *fe_bh,
1344 struct ocfs2_alloc_context *data_ac) 2676 struct ocfs2_alloc_context *data_ac,
2677 struct ocfs2_alloc_context *meta_ac)
2678
1345{ 2679{
1346 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2680 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1347 2681
1348 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2682 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1349 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2683 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1350 2684
2685 if (ocfs2_supports_indexed_dirs(osb))
2686 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2687 data_ac, meta_ac);
2688
1351 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2689 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1352 data_ac); 2690 data_ac, NULL);
2691}
2692
2693static int ocfs2_dx_dir_index_block(struct inode *dir,
2694 handle_t *handle,
2695 struct buffer_head **dx_leaves,
2696 int num_dx_leaves,
2697 u32 *num_dx_entries,
2698 struct buffer_head *dirent_bh)
2699{
2700	int ret = 0, namelen, i;
2701 char *de_buf, *limit;
2702 struct ocfs2_dir_entry *de;
2703 struct buffer_head *dx_leaf_bh;
2704 struct ocfs2_dx_hinfo hinfo;
2705 u64 dirent_blk = dirent_bh->b_blocknr;
2706
2707 de_buf = dirent_bh->b_data;
2708 limit = de_buf + dir->i_sb->s_blocksize;
2709
2710 while (de_buf < limit) {
2711 de = (struct ocfs2_dir_entry *)de_buf;
2712
2713 namelen = de->name_len;
2714 if (!namelen || !de->inode)
2715 goto inc;
2716
2717 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2718
2719 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2720 dx_leaf_bh = dx_leaves[i];
2721
2722 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2723 dirent_blk, dx_leaf_bh);
2724 if (ret) {
2725 mlog_errno(ret);
2726 goto out;
2727 }
2728
2729 *num_dx_entries = *num_dx_entries + 1;
2730
2731inc:
2732 de_buf += le16_to_cpu(de->rec_len);
2733 }
2734
2735out:
2736 return ret;
2737}
2738
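The walk above is driven purely by rec_len chaining, skipping unused records the way the goto inc path does. A self-contained sketch of the same loop, with a toy hash standing in for ocfs2_dx_dir_name_hash() and a plain modulo standing in for the minor-hash mask that ocfs2_dx_dir_hash_idx() applies:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent_rec {		/* simplified ocfs2_dir_entry layout */
	uint64_t inode;
	uint16_t rec_len;	/* bytes from this record to the next */
	uint8_t  name_len;
	char     name[];
};

/* Stand-in hash; not the kernel's algorithm. */
static uint32_t toy_hash(const char *name, int len)
{
	uint32_t h = 5381;

	while (len--)
		h = h * 33 + (unsigned char)*name++;
	return h;
}

static void index_block(const char *blk, size_t blocksize, int num_leaves)
{
	const char *p = blk, *limit = blk + blocksize;

	while (p < limit) {
		const struct dirent_rec *de = (const void *)p;

		/* skip unused records, as the 'goto inc' path does */
		if (de->name_len && de->inode)
			printf("'%.*s' -> leaf %u\n", de->name_len, de->name,
			       toy_hash(de->name, de->name_len) % num_leaves);
		p += de->rec_len;
	}
}

int main(void)
{
	_Alignas(8) char blk[64] = { 0 };
	struct dirent_rec *de = (void *)blk;

	de->inode = 12; de->rec_len = 16; de->name_len = 1;
	memcpy(de->name, ".", 1);
	de = (void *)(blk + 16);
	de->inode = 11; de->rec_len = 48; de->name_len = 2;	/* runs to block end */
	memcpy(de->name, "..", 2);

	index_block(blk, sizeof(blk), 4);
	return 0;
}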
2739/*
2740 * XXX: This expects dx_root_bh to already be part of the transaction.
2741 */
2742static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2743 struct buffer_head *dx_root_bh,
2744 struct buffer_head *dirent_bh)
2745{
2746 char *de_buf, *limit;
2747 struct ocfs2_dx_root_block *dx_root;
2748 struct ocfs2_dir_entry *de;
2749 struct ocfs2_dx_hinfo hinfo;
2750 u64 dirent_blk = dirent_bh->b_blocknr;
2751
2752 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2753
2754 de_buf = dirent_bh->b_data;
2755 limit = de_buf + dir->i_sb->s_blocksize;
2756
2757 while (de_buf < limit) {
2758 de = (struct ocfs2_dir_entry *)de_buf;
2759
2760 if (!de->name_len || !de->inode)
2761 goto inc;
2762
2763 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2764
2765 mlog(0,
2766 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
2767 (unsigned long long)dir->i_ino, hinfo.major_hash,
2768 hinfo.minor_hash,
2769 le16_to_cpu(dx_root->dr_entries.de_num_used),
2770 de->name_len, de->name);
2771
2772 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2773 dirent_blk);
2774
2775 le32_add_cpu(&dx_root->dr_num_entries, 1);
2776inc:
2777 de_buf += le16_to_cpu(de->rec_len);
2778 }
2779}
2780
2781/*
2782 * Count the number of inline directory entries in di_bh and compare
2783 * them against the number of entries we can hold in an inline dx root
2784 * block.
2785 */
2786static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2787 struct buffer_head *di_bh)
2788{
2789 int dirent_count = 0;
2790 char *de_buf, *limit;
2791 struct ocfs2_dir_entry *de;
2792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2793
2794 de_buf = di->id2.i_data.id_data;
2795 limit = de_buf + i_size_read(dir);
2796
2797 while (de_buf < limit) {
2798 de = (struct ocfs2_dir_entry *)de_buf;
2799
2800 if (de->name_len && de->inode)
2801 dirent_count++;
2802
2803 de_buf += le16_to_cpu(de->rec_len);
2804 }
2805
2806 /* We are careful to leave room for one extra record. */
2807 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
1353} 2808}
1354 2809
1355/* 2810/*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1358 * expansion from an inline directory to one with extents. The first dir block 2813 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block. 2814 * in that case is taken from the inline data portion of the inode block.
1360 * 2815 *
2816 * This will also return the largest amount of contiguous space for a dirent
2817 * in the block. That value is *not* necessarily the last dirent, even after
2818 * expansion. The directory indexing code wants this value for free space
2819 * accounting. We do this here since we're already walking the entire dir
2820 * block.
2821 *
1361 * We add the dir trailer if this filesystem wants it. 2822 * We add the dir trailer if this filesystem wants it.
1362 */ 2823 */
1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2824static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1364 struct super_block *sb) 2825 struct inode *dir)
1365{ 2826{
2827 struct super_block *sb = dir->i_sb;
1366 struct ocfs2_dir_entry *de; 2828 struct ocfs2_dir_entry *de;
1367 struct ocfs2_dir_entry *prev_de; 2829 struct ocfs2_dir_entry *prev_de;
1368 char *de_buf, *limit; 2830 char *de_buf, *limit;
1369 unsigned int new_size = sb->s_blocksize; 2831 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes; 2832 unsigned int bytes, this_hole;
2833 unsigned int largest_hole = 0;
1371 2834
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 2835 if (ocfs2_new_dir_wants_trailer(dir))
1373 new_size = ocfs2_dir_trailer_blk_off(sb); 2836 new_size = ocfs2_dir_trailer_blk_off(sb);
1374 2837
1375 bytes = new_size - old_size; 2838 bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1378 de_buf = start; 2841 de_buf = start;
1379 de = (struct ocfs2_dir_entry *)de_buf; 2842 de = (struct ocfs2_dir_entry *)de_buf;
1380 do { 2843 do {
2844 this_hole = ocfs2_figure_dirent_hole(de);
2845 if (this_hole > largest_hole)
2846 largest_hole = this_hole;
2847
1381 prev_de = de; 2848 prev_de = de;
1382 de_buf += le16_to_cpu(de->rec_len); 2849 de_buf += le16_to_cpu(de->rec_len);
1383 de = (struct ocfs2_dir_entry *)de_buf; 2850 de = (struct ocfs2_dir_entry *)de_buf;
1384 } while (de_buf < limit); 2851 } while (de_buf < limit);
1385 2852
1386 le16_add_cpu(&prev_de->rec_len, bytes); 2853 le16_add_cpu(&prev_de->rec_len, bytes);
2854
2855 /* We need to double check this after modification of the final
2856 * dirent. */
2857 this_hole = ocfs2_figure_dirent_hole(prev_de);
2858 if (this_hole > largest_hole)
2859 largest_hole = this_hole;
2860
2861 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2862 return largest_hole;
2863 return 0;
1387} 2864}
1388 2865
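ocfs2_figure_dirent_hole() itself is not part of this hunk; the idea is only that rec_len may exceed the space a name actually needs, and the slack is reusable. A sketch of that accounting, with the record-length rounding modeled on OCFS2_DIR_REC_LEN() and the sample numbers invented:

#include <stdint.h>
#include <stdio.h>

/* Modeled on OCFS2_DIR_REC_LEN(): 12 header bytes plus the name,
 * rounded up to a 4-byte boundary. */
#define REC_LEN(name_len) (((name_len) + 12 + 3) & ~3u)

static unsigned int dirent_hole(uint16_t rec_len, uint8_t name_len,
				uint64_t inode)
{
	if (!inode || !name_len)
		return rec_len;			/* unused record: all hole */
	return rec_len - REC_LEN(name_len);	/* slack after the name */
}

int main(void)
{
	/* ".." as the final dirent of a freshly expanded block, after
	 * "." and the trailer are accounted for: nearly the entire
	 * rec_len is reusable slack. */
	printf("hole = %u bytes\n", dirent_hole(4016, 2, 11));
	return 0;
}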
1389/* 2866/*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1396 */ 2873 */
1397static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2874static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1398 unsigned int blocks_wanted, 2875 unsigned int blocks_wanted,
2876 struct ocfs2_dir_lookup_result *lookup,
1399 struct buffer_head **first_block_bh) 2877 struct buffer_head **first_block_bh)
1400{ 2878{
1401 u32 alloc, bit_off, len; 2879 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
1402 struct super_block *sb = dir->i_sb; 2880 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2881 int ret, i, num_dx_leaves = 0, dx_inline = 0,
1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2882 credits = ocfs2_inline_to_extents_credits(sb);
2883 u64 dx_insert_blkno, blkno,
2884 bytes = blocks_wanted << sb->s_blocksize_bits;
1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2885 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1406 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2886 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1407 struct ocfs2_alloc_context *data_ac; 2887 struct ocfs2_alloc_context *data_ac;
2888 struct ocfs2_alloc_context *meta_ac = NULL;
1408 struct buffer_head *dirdata_bh = NULL; 2889 struct buffer_head *dirdata_bh = NULL;
2890 struct buffer_head *dx_root_bh = NULL;
2891 struct buffer_head **dx_leaves = NULL;
1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2892 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1410 handle_t *handle; 2893 handle_t *handle;
1411 struct ocfs2_extent_tree et; 2894 struct ocfs2_extent_tree et;
1412 int did_quota = 0; 2895 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0;
1413 2897
1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1415 2899
1416 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2900 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0;
2902
2903 if (ocfs2_supports_indexed_dirs(osb)) {
2904 credits += ocfs2_add_dir_index_credits(sb);
2905
2906 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2907 if (!dx_inline) {
2908 /* Add one more cluster for an index leaf */
2909 dx_alloc++;
2910 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2911 &num_dx_leaves);
2912 if (!dx_leaves) {
2913 ret = -ENOMEM;
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917 }
2918
2919 /* This gets us the dx_root */
2920 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2921 if (ret) {
2922 mlog_errno(ret);
2923 goto out;
2924 }
2925 }
1417 2926
1418 /* 2927 /*
1419 * We should never need more than 2 clusters for this - 2928 * We should never need more than 2 clusters for the unindexed
1420 * maximum dirent size is far less than one block. In fact, 2929 * tree - maximum dirent size is far less than one block. In
1421 * the only time we'd need more than one cluster is if 2930 * fact, the only time we'd need more than one cluster is if
1422 * blocksize == clustersize and the dirent won't fit in the 2931 * blocksize == clustersize and the dirent won't fit in the
1423 * extra space that the expansion to a single block gives. As 2932 * extra space that the expansion to a single block gives. As
1424 * of today, that only happens on 4k/4k file systems. 2933 * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1435 2944
1436 /* 2945 /*
1437 * Prepare for worst case allocation scenario of two separate 2946 * Prepare for worst case allocation scenario of two separate
1438 * extents. 2947 * extents in the unindexed tree.
1439 */ 2948 */
1440 if (alloc == 2) 2949 if (alloc == 2)
1441 credits += OCFS2_SUBALLOC_ALLOC; 2950 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1448 } 2957 }
1449 2958
1450 if (vfs_dq_alloc_space_nodirty(dir, 2959 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2960 ocfs2_clusters_to_bytes(osb->sb,
2961 alloc + dx_alloc))) {
1452 ret = -EDQUOT; 2962 ret = -EDQUOT;
1453 goto out_commit; 2963 goto out_commit;
1454 } 2964 }
1455 did_quota = 1; 2965 did_quota = 1;
2966
2967 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2968 /*
2969 * Allocate our index cluster first, to maximize the
2970 * possibility that unindexed leaves grow
2971 * contiguously.
2972 */
2973 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2974 dx_leaves, num_dx_leaves,
2975 &dx_insert_blkno);
2976 if (ret) {
2977 mlog_errno(ret);
2978 goto out_commit;
2979 }
2980 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2981 }
2982
1456 /* 2983 /*
1457 * Try to claim as many clusters as the bitmap can give though 2984 * Try to claim as many clusters as the bitmap can give though
1458 * if we only get one now, that's enough to continue. The rest 2985 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1463 mlog_errno(ret); 2990 mlog_errno(ret);
1464 goto out_commit; 2991 goto out_commit;
1465 } 2992 }
2993 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1466 2994
1467 /* 2995 /*
1468 * Operations are carefully ordered so that we set up the new 2996 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 3017 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1490 memset(dirdata_bh->b_data + i_size_read(dir), 0, 3018 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1491 sb->s_blocksize - i_size_read(dir)); 3019 sb->s_blocksize - i_size_read(dir));
1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); 3020 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
1493 if (ocfs2_supports_dir_trailer(osb)) 3021 if (ocfs2_new_dir_wants_trailer(dir)) {
1494 ocfs2_init_dir_trailer(dir, dirdata_bh); 3022 /*
3023 * Prepare the dir trailer up front. It will otherwise look
3024 * like a valid dirent. Even if inserting the index fails
3025 * (unlikely), all we'll have done is give the first dir
3026 * block a small amount of fragmentation.
3027 */
3028 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3029 }
1495 3030
1496 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3031 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1497 if (ret) { 3032 if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1499 goto out_commit; 3034 goto out_commit;
1500 } 3035 }
1501 3036
3037 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3038 /*
3039 * Dx dirs with an external cluster need to do this up
3040 * front. Inline dx roots get handled later, after
3041 * we've allocated our root block. We get passed back
3042 * a total number of items so that dr_num_entries can
3043 * be correctly set once the dx_root has been
3044 * allocated.
3045 */
3046 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
3047 num_dx_leaves, &num_dx_entries,
3048 dirdata_bh);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out_commit;
3052 }
3053 }
3054
1502 /* 3055 /*
1503 * Set extent, i_size, etc on the directory. After this, the 3056 * Set extent, i_size, etc on the directory. After this, the
1504 * inode should contain the same exact dirents as before and 3057 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1551 goto out_commit; 3104 goto out_commit;
1552 } 3105 }
1553 3106
3107 if (ocfs2_supports_indexed_dirs(osb)) {
3108 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3109 dirdata_bh, meta_ac, dx_inline,
3110 num_dx_entries, &dx_root_bh);
3111 if (ret) {
3112 mlog_errno(ret);
3113 goto out_commit;
3114 }
3115
3116 if (dx_inline) {
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh);
3119 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL);
3123 if (ret)
3124 mlog_errno(ret);
3125 }
3126 }
3127
1554 /* 3128 /*
1555 * We asked for two clusters, but only got one in the 1st 3129 * We asked for two clusters, but only got one in the 1st
1556 * pass. Claim the 2nd cluster as a separate extent. 3130 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1570 mlog_errno(ret); 3144 mlog_errno(ret);
1571 goto out_commit; 3145 goto out_commit;
1572 } 3146 }
3147 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1573 } 3148 }
1574 3149
1575 *first_block_bh = dirdata_bh; 3150 *first_block_bh = dirdata_bh;
1576 dirdata_bh = NULL; 3151 dirdata_bh = NULL;
3152 if (ocfs2_supports_indexed_dirs(osb)) {
3153 unsigned int off;
3154
3155 if (!dx_inline) {
3156 /*
3157 * We need to return the correct block within the
3158 * cluster which should hold our entry.
3159 */
3160 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
3161 &lookup->dl_hinfo);
3162 get_bh(dx_leaves[off]);
3163 lookup->dl_dx_leaf_bh = dx_leaves[off];
3164 }
3165 lookup->dl_dx_root_bh = dx_root_bh;
3166 dx_root_bh = NULL;
3167 }
1577 3168
1578out_commit: 3169out_commit:
1579 if (ret < 0 && did_quota) 3170 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir, 3171 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1581 ocfs2_clusters_to_bytes(osb->sb, 2)); 3172
1582 ocfs2_commit_trans(osb, handle); 3173 ocfs2_commit_trans(osb, handle);
1583 3174
1584out_sem: 3175out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
1587out: 3178out:
1588 if (data_ac) 3179 if (data_ac)
1589 ocfs2_free_alloc_context(data_ac); 3180 ocfs2_free_alloc_context(data_ac);
3181 if (meta_ac)
3182 ocfs2_free_alloc_context(meta_ac);
3183
3184 if (dx_leaves) {
3185 for (i = 0; i < num_dx_leaves; i++)
3186 brelse(dx_leaves[i]);
3187 kfree(dx_leaves);
3188 }
1590 3189
1591 brelse(dirdata_bh); 3190 brelse(dirdata_bh);
3191 brelse(dx_root_bh);
1592 3192
1593 return ret; 3193 return ret;
1594} 3194}
@@ -1658,11 +3258,14 @@ bail:
1658 * is to be turned into an extent based one. The size of the dirent to 3258 * is to be turned into an extent based one. The size of the dirent to
1659 * insert might be larger than the space gained by growing to just one 3259 * insert might be larger than the space gained by growing to just one
1660 * block, so we may have to grow the inode by two blocks in that case. 3260 * block, so we may have to grow the inode by two blocks in that case.
3261 *
3262 * If the directory is already indexed, dx_root_bh must be provided.
1661 */ 3263 */
1662static int ocfs2_extend_dir(struct ocfs2_super *osb, 3264static int ocfs2_extend_dir(struct ocfs2_super *osb,
1663 struct inode *dir, 3265 struct inode *dir,
1664 struct buffer_head *parent_fe_bh, 3266 struct buffer_head *parent_fe_bh,
1665 unsigned int blocks_wanted, 3267 unsigned int blocks_wanted,
3268 struct ocfs2_dir_lookup_result *lookup,
1666 struct buffer_head **new_de_bh) 3269 struct buffer_head **new_de_bh)
1667{ 3270{
1668 int status = 0; 3271 int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1677 struct ocfs2_dir_entry * de; 3280 struct ocfs2_dir_entry * de;
1678 struct super_block *sb = osb->sb; 3281 struct super_block *sb = osb->sb;
1679 struct ocfs2_extent_tree et; 3282 struct ocfs2_extent_tree et;
3283 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1680 3284
1681 mlog_entry_void(); 3285 mlog_entry_void();
1682 3286
1683 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3287 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3288 /*
3289 * This would be a code error as an inline directory should
3290 * never have an index root.
3291 */
3292 BUG_ON(dx_root_bh);
3293
1684 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3294 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1685 blocks_wanted, &new_bh); 3295 blocks_wanted, lookup,
3296 &new_bh);
1686 if (status) { 3297 if (status) {
1687 mlog_errno(status); 3298 mlog_errno(status);
1688 goto bail; 3299 goto bail;
1689 } 3300 }
1690 3301
3302 /* Expansion from inline to an indexed directory will
3303 * have given us this. */
3304 dx_root_bh = lookup->dl_dx_root_bh;
3305
1691 if (blocks_wanted == 1) { 3306 if (blocks_wanted == 1) {
1692 /* 3307 /*
1693 * If the new dirent will fit inside the space 3308 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1751 } 3366 }
1752 3367
1753do_extend: 3368do_extend:
3369 if (ocfs2_dir_indexed(dir))
3370 credits++; /* For attaching the new dirent block to the
3371 * dx_root */
3372
1754 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3373 down_write(&OCFS2_I(dir)->ip_alloc_sem);
1755 drop_alloc_sem = 1; 3374 drop_alloc_sem = 1;
1756 3375
@@ -1781,9 +3400,19 @@ do_extend:
1781 3400
1782 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3401 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1783 de->inode = 0; 3402 de->inode = 0;
1784 if (ocfs2_dir_has_trailer(dir)) { 3403 if (ocfs2_supports_dir_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3404 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh); 3405
3406 ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3407
3408 if (ocfs2_dir_indexed(dir)) {
3409 status = ocfs2_dx_dir_link_trailer(dir, handle,
3410 dx_root_bh, new_bh);
3411 if (status) {
3412 mlog_errno(status);
3413 goto bail;
3414 }
3415 }
1787 } else { 3416 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize); 3417 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 } 3418 }
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1839 * This calculates how many free bytes we'd have in block zero, should 3468 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree. 3469 * this function force expansion to an extent tree.
1841 */ 3470 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 3471 if (ocfs2_new_dir_wants_trailer(dir))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3472 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else 3473 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir); 3474 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
1970 return status; 3599 return status;
1971} 3600}
1972 3601
3602static int dx_leaf_sort_cmp(const void *a, const void *b)
3603{
3604 const struct ocfs2_dx_entry *entry1 = a;
3605 const struct ocfs2_dx_entry *entry2 = b;
3606 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3607 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3608 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3609 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3610
3611 if (major_hash1 > major_hash2)
3612 return 1;
3613 if (major_hash1 < major_hash2)
3614 return -1;
3615
3616 /*
3617 * It is not strictly necessary to sort by minor hash.
3618 */
3619 if (minor_hash1 > minor_hash2)
3620 return 1;
3621 if (minor_hash1 < minor_hash2)
3622 return -1;
3623 return 0;
3624}
3625
3626static void dx_leaf_sort_swap(void *a, void *b, int size)
3627{
3628 struct ocfs2_dx_entry *entry1 = a;
3629 struct ocfs2_dx_entry *entry2 = b;
3630 struct ocfs2_dx_entry tmp;
3631
3632 BUG_ON(size != sizeof(*entry1));
3633
3634 tmp = *entry1;
3635 *entry1 = *entry2;
3636 *entry2 = tmp;
3637}
3638
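Outside the kernel, the same ordering falls out of qsort(). This sketch works on host-order values, whereas dx_leaf_sort_cmp() must first convert the on-disk little-endian fields with le32_to_cpu():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dx_entry {
	uint32_t major_hash;
	uint32_t minor_hash;
	uint64_t dirent_blk;
};

/* Order by major hash, then minor hash, as the kernel comparator does. */
static int dx_cmp(const void *a, const void *b)
{
	const struct dx_entry *e1 = a, *e2 = b;

	if (e1->major_hash != e2->major_hash)
		return e1->major_hash > e2->major_hash ? 1 : -1;
	if (e1->minor_hash != e2->minor_hash)
		return e1->minor_hash > e2->minor_hash ? 1 : -1;
	return 0;
}

int main(void)
{
	struct dx_entry leaf[] = {
		{ 7, 2, 100 }, { 3, 9, 101 }, { 7, 1, 102 }, { 3, 4, 103 },
	};
	size_t n = sizeof(leaf) / sizeof(leaf[0]);

	qsort(leaf, n, sizeof(leaf[0]), dx_cmp);
	for (size_t i = 0; i < n; i++)
		printf("major=%u minor=%u blk=%llu\n", leaf[i].major_hash,
		       leaf[i].minor_hash,
		       (unsigned long long)leaf[i].dirent_blk);
	return 0;
}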
3639static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3640{
3641 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3642 int i, num = le16_to_cpu(dl_list->de_num_used);
3643
3644 for (i = 0; i < (num - 1); i++) {
3645 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3646 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3647 return 0;
3648 }
3649
3650 return 1;
3651}
3652
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
3672 * There are a couple of rare but nasty corner cases we have to
3673 * check for here. All of them involve a leaf where all values
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
3713 * val can not be the same as insert hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guaranteed that not all records in this block are the same,
3730 * we simply travel forward from the median and pick the 1st
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
3742
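Once the all-same-hash corner cases are out of the way, the split choice reduces to "first value past the median that exceeds leaf_cpos". A runnable model under the kernel's own assumptions (entries already sorted by major hash, every hash >= leaf_cpos); -1 stands in for -ENOSPC:

#include <stdint.h>
#include <stdio.h>

static int find_leaf_split(const uint32_t *major, int num_used,
			   uint32_t leaf_cpos, uint32_t insert_hash,
			   uint32_t *split_hash)
{
	int i, allsame = 1;

	for (i = 0; i < num_used - 1; i++)
		if (major[i] != major[i + 1])
			allsame = 0;

	if (allsame) {
		uint32_t val = major[0];

		if (val == insert_hash)
			return -1;	/* the split can't make room */
		if (val == leaf_cpos)
			*split_hash = leaf_cpos + 1;
		else if (val > insert_hash)
			*split_hash = val;
		else
			*split_hash = insert_hash;
		return 0;
	}

	/* Walk forward from the median; split at the first hash
	 * larger than leaf_cpos. */
	for (i = num_used / 2; i < num_used; i++)
		if (major[i] > leaf_cpos)
			break;
	if (i == num_used)
		return -1;	/* can't happen for valid input */
	*split_hash = major[i];
	return 0;
}

int main(void)
{
	uint32_t major[] = { 4, 4, 5, 9, 9, 12 };
	uint32_t split;

	if (find_leaf_split(major, 6, 4, 10, &split) == 0)
		printf("split at major hash %u\n", split);
	return 0;
}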
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
3748 * Since the block offset inside a leaf (cluster) is a constant mask
3749 * of minor_hash, we can optimize - an item at block offset X within
3750 * the original cluster will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
3795
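Stripped of buffer heads and journaling, the per-leaf transfer is a stable partition through a scratch buffer. A sketch with fixed-size arrays standing in for dx entry lists:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CAP 8

struct leaf { int used; uint32_t major[CAP]; };

static void transfer(struct leaf *orig, struct leaf *newl,
		     uint32_t split_hash, struct leaf *tmp)
{
	memset(tmp, 0, sizeof(*tmp));
	for (int j = 0; j < orig->used; j++) {
		/* entries >= split_hash migrate; the rest repack */
		struct leaf *dst =
			orig->major[j] >= split_hash ? newl : tmp;
		dst->major[dst->used++] = orig->major[j];
	}
	*orig = *tmp;	/* repacked copy replaces the original */
}

int main(void)
{
	struct leaf orig = { 5, { 3, 4, 9, 9, 12 } }, newl = { 0 }, tmp;

	transfer(&orig, &newl, 9, &tmp);
	printf("kept %d, moved %d\n", orig.used, newl.used);
	return 0;
}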
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3845 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928 * won't happen if there's an error before that. Once the
3929 * insert is done, we can then transfer from one leaf into the
3930 * other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
4121
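The free-space search is a plain singly-linked-list walk that must also remember the previous node, so the winning block can later be unlinked without re-walking. In user-space terms, with pointers standing in for the db_free_next block numbers:

#include <stdint.h>
#include <stdio.h>

struct dir_blk {
	uint64_t blkno;
	uint16_t free_rec_len;		/* largest hole in this block */
	struct dir_blk *free_next;	/* db_free_next on disk */
};

/* First fit wins; NULL return models -ENOSPC. */
static struct dir_blk *search_free_list(struct dir_blk *head, int rec_len,
					struct dir_blk **prev)
{
	struct dir_blk *p = NULL, *b;

	for (b = head; b; p = b, b = b->free_next)
		if (rec_len <= b->free_rec_len)
			break;
	*prev = b ? p : NULL;
	return b;
}

int main(void)
{
	struct dir_blk b2 = { 300, 64, NULL };
	struct dir_blk b1 = { 200, 12, &b2 };
	struct dir_blk *prev, *hit = search_free_list(&b1, 40, &prev);

	if (hit)
		printf("block %llu fits (prev %llu)\n",
		       (unsigned long long)hit->blkno,
		       prev ? (unsigned long long)prev->blkno : 0ULL);
	return 0;
}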
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166 * failure to add the dx_root_bh to the journal won't result
4167 * in us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real
4302 * error and we can't continue.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. Success returns zero, and enough context in the dir
4353 * lookup result that ocfs2_add_entry() will be able to complete the task
4354 * with minimal performance impact.
4355 */
1973int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 4356int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1974 struct inode *dir, 4357 struct inode *dir,
1975 struct buffer_head *parent_fe_bh, 4358 struct buffer_head *parent_fe_bh,
1976 const char *name, 4359 const char *name,
1977 int namelen, 4360 int namelen,
1978 struct buffer_head **ret_de_bh) 4361 struct ocfs2_dir_lookup_result *lookup)
1979{ 4362{
1980 int ret; 4363 int ret;
1981 unsigned int blocks_wanted = 1; 4364 unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1984 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 4367 mlog(0, "getting ready to insert namelen %d into dir %llu\n",
1985 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 4368 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
1986 4369
1987 *ret_de_bh = NULL;
1988
1989 if (!namelen) { 4370 if (!namelen) {
1990 ret = -EINVAL; 4371 ret = -EINVAL;
1991 mlog_errno(ret); 4372 mlog_errno(ret);
1992 goto out; 4373 goto out;
1993 } 4374 }
1994 4375
4376 /*
4377 * Do this up front to reduce confusion.
4378 *
4379 * The directory might start inline, then be turned into an
4380 * indexed one, in which case we'd need to hash deep inside
4381 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
 					      namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	BUG_ON(bh);

 	ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-			       &bh);
+			       lookup, &bh);
 	if (ret) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(!bh);
 	}

-	*ret_de_bh = bh;
+	lookup->dl_leaf_bh = bh;
 	bh = NULL;
 out:
 	brelse(bh);
 	return ret;
 }
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dx_root_bh)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+	struct inode *dx_alloc_inode = NULL;
+	struct buffer_head *dx_alloc_bh = NULL;
+	handle_t *handle;
+	u64 blk;
+	u16 bit;
+	u64 bg_blkno;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(dx_root->dr_suballoc_slot));
+	if (!dx_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&dx_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+	di->i_dx_root = cpu_to_le64(0ULL);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	blk = le64_to_cpu(dx_root->dr_blkno);
+	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+	mutex_unlock(&dx_alloc_inode->i_mutex);
+	brelse(dx_alloc_bh);
+out:
+	iput(dx_alloc_inode);
+	return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+	int ret;
+	unsigned int uninitialized_var(clen);
+	u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!ocfs2_dir_indexed(dir))
+		return 0;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (ocfs2_dx_root_inline(dx_root))
+		goto remove_index;
+
+	ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+	/* XXX: What if dr_clusters is too large? */
+	while (le32_to_cpu(dx_root->dr_clusters)) {
+		ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+					      major_hash, &cpos, &blkno, &clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
+					       &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (cpos == 0)
+			break;
+
+		major_hash = cpos - 1;
+	}
+
+remove_index:
+	ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(dir, dx_root_bh);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	brelse(dx_root_bh);
+	return ret;
+}
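The truncate above walks the indexed tree from the top of the hash range downward: each pass looks up the extent covering the current major_hash, removes it, and resumes at cpos - 1 until the record starting at hash 0 is gone. The following is a minimal userspace sketch of that walk; extent_rec and lookup_rec are illustrative stand-ins, not ocfs2 structures.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for an ocfs2 dx extent record: a run of
 * clusters covering the hash range [cpos, cpos + clusters). */
struct extent_rec {
	uint32_t cpos;		/* first hash value covered */
	uint32_t clusters;	/* length of the run */
	int	 present;
};

/* Find the record covering 'major_hash' (largest cpos <= major_hash). */
static struct extent_rec *lookup_rec(struct extent_rec *recs, int nr,
				     uint32_t major_hash)
{
	struct extent_rec *best = NULL;
	for (int i = 0; i < nr; i++)
		if (recs[i].present && recs[i].cpos <= major_hash &&
		    (!best || recs[i].cpos > best->cpos))
			best = &recs[i];
	return best;
}

int main(void)
{
	struct extent_rec recs[] = { { 0, 4, 1 }, { 4, 8, 1 }, { 12, 20, 1 } };
	uint32_t major_hash = UINT32_MAX;

	/* Walk from the highest hash downward, rightmost extent first. */
	for (;;) {
		struct extent_rec *rec = lookup_rec(recs, 3, major_hash);
		if (!rec)
			break;
		printf("remove [%u, %u)\n", rec->cpos,
		       rec->cpos + rec->clusters);
		rec->present = 0;
		if (rec->cpos == 0)
			break;		/* leftmost record gone; done */
		major_hash = rec->cpos - 1;
	}
	return 0;
}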
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H

-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head	*dl_leaf_bh;	/* Unindexed leaf
+						 * block */
+	struct ocfs2_dir_entry	*dl_entry;	/* Target dirent in
+						 * unindexed leaf */
+
+	struct buffer_head	*dl_dx_root_bh;	/* Root of indexed
+						 * tree */
+
+	struct buffer_head	*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry	*dl_dx_entry;	/* Target dx_entry in
+						 * indexed leaf */
+	struct ocfs2_dx_hinfo	dl_hinfo;	/* Name hash results */
+
+	struct buffer_head	*dl_prev_leaf_bh;/* Previous entry in
+						  * dir free space
+						  * list. NULL if
+						  * previous entry is
+						  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh);
+		       struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh);
+		      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
+				  struct ocfs2_dir_lookup_result *lookup)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
 				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
+				 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode);

 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent);
+			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh);
+				 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac);
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);

 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 							    void *data);
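The new ocfs2_dir_lookup_result bundles every buffer and dirent pointer a lookup may produce, so a single free routine can release whichever subset got filled in, and callers stay oblivious to whether the directory is indexed. A plain-C sketch of the same pattern, with hypothetical members standing in for the buffer heads:

#include <stdlib.h>
#include <string.h>

/* Userspace analogue of the lookup-result pattern: every reference
 * found during lookup lives in one struct, and one free routine
 * releases whatever subset of members got filled in. */
struct lookup_result {
	char *leaf;	/* stands in for dl_leaf_bh */
	char *dx_root;	/* stands in for dl_dx_root_bh */
	char *dx_leaf;	/* stands in for dl_dx_leaf_bh */
};

static void free_lookup_result(struct lookup_result *res)
{
	free(res->leaf);
	free(res->dx_root);
	free(res->dx_leaf);
	memset(res, 0, sizeof(*res));	/* safe to free twice */
}

static int find_entry(const char *name, struct lookup_result *res)
{
	/* A real lookup fills only the members relevant to the
	 * directory's format; the caller need not care which. */
	res->leaf = strdup(name);
	return res->leaf ? 0 : -1;
}

int main(void)
{
	struct lookup_result res = { 0 };

	if (find_entry("foo", &res) == 0) {
		/* ... insert or delete using res ... */
		free_lookup_result(&res);
	}
	return 0;
}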
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
 	DLM_MLE_BLOCK,
 	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
+	DLM_MLE_MIGRATION,
+	DLM_MLE_NUM_TYPES
 };

 struct dlm_master_list_entry {
-	struct list_head list;
+	struct hlist_node master_hash_node;
 	struct list_head hb_events;
 	struct dlm_ctxt *dlm;
 	spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
 	enum dlm_mle_type type;
 	struct o2hb_callback_func mle_hb_up;
 	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
 };

 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
-	struct list_head master_list;
+	struct hlist_head **master_hash;
 	struct list_head mle_hb_events;

 	/* these give a really vague idea of the system load */
-	atomic_t local_resources;
-	atomic_t remote_resources;
-	atomic_t unknown_resources;
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;

 	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
 	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }

+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len);

 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res,
-			      u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 						 const char *lockid,
 						 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 					  DLM_LOCK_RES_MIGRATING));
 }

+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
 	return bit;
 }

+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}

+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}

 #endif /* DLMCOMMON_H */
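dlm_master_hash() uses the same paged-bucket arithmetic as the existing dlm_lockres_hash(): the bucket array is split across page-sized chunks, so bucket i lives in page (i / BUCKETS_PER_PAGE) % PAGES at offset i % BUCKETS_PER_PAGE, and the raw name hash can be passed in directly. A runnable sketch with made-up sizes (the kernel derives them from PAGE_SIZE and sizeof(struct hlist_head)):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical sizes standing in for DLM_HASH_PAGES and
 * DLM_BUCKETS_PER_PAGE. */
#define HASH_PAGES		4
#define BUCKETS_PER_PAGE	128

struct bucket {
	void *first;	/* stands in for struct hlist_head */
};

/* Allocate the two-level "pagevec" of buckets. */
static struct bucket **alloc_pagevec(void)
{
	struct bucket **pv = calloc(HASH_PAGES, sizeof(*pv));
	if (!pv)
		return NULL;
	for (int i = 0; i < HASH_PAGES; i++) {
		pv[i] = calloc(BUCKETS_PER_PAGE, sizeof(**pv));
		if (!pv[i])
			return NULL;	/* unwind elided in this sketch */
	}
	return pv;
}

/* Same arithmetic as dlm_master_hash(): pick the page, then the
 * offset within it; the % HASH_PAGES wraps any 32-bit hash. */
static struct bucket *master_hash(struct bucket **pv, unsigned i)
{
	return pv[(i / BUCKETS_PER_PAGE) % HASH_PAGES] +
		(i % BUCKETS_PER_PAGE);
}

int main(void)
{
	struct bucket **pv = alloc_pagevec();
	unsigned hash = 0xdeadbeef;

	printf("hash %#x -> bucket %p\n", hash,
	       (void *)master_hash(pv, hash));
	return 0;
}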
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
 	int out = 0;
-	unsigned int namelen;
-	const char *name;
 	char *mle_type;

-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
 	if (mle->type == DLM_MLE_BLOCK)
 		mle_type = "BLK";
 	else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 	else
 		mle_type = "MIG";

-	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
 	out += snprintf(buf + out, len - out,
 			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
 			mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
-	int out = 0;
-	unsigned long total = 0;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bktcnt;

 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);

 	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list) {
-		++total;
-		if (db->len - out < 200)
-			continue;
-		out += dump_mle(mle, db->buf + out, db->len - out);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			++total;
+			++bktcnt;
+			if (db->len - out < 200)
+				continue;
+			out += dump_mle(mle, db->buf + out, db->len - out);
+		}
+		longest = max(longest, bktcnt);
+		bktcnt = 0;
 	}
 	spin_unlock(&dlm->master_lock);

 	out += snprintf(db->buf + out, db->len - out,
-			"Total on list: %ld\n", total);
+			"Total: %ld, Longest: %ld\n", total, longest);
 	return out;
 }

@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	int out = 0;
 	struct dlm_reco_node_data *node;
 	char *state;
-	int lres, rres, ures, tres;
-
-	lres = atomic_read(&dlm->local_resources);
-	rres = atomic_read(&dlm->remote_resources);
-	ures = atomic_read(&dlm->unknown_resources);
-	tres = lres + rres + ures;
+	int cur_mles = 0, tot_mles = 0;
+	int i;

 	spin_lock(&dlm->spinlock);

@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 			db->buf + out, db->len - out);
 	out += snprintf(db->buf + out, db->len - out, "\n");

-	/* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/* Blocking: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/* Mastery: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/* Migration: xxx (xxx) */
 	out += snprintf(db->buf + out, db->len - out,
-			"Mastered Resources Total: %d Locally: %d "
-			"Remotely: %d Unknown: %d\n",
-			tres, lres, rres, ures);
+			"  Migration: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));

 	/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
 	out += snprintf(db->buf + out, db->len - out,
 			"Lists: Dirty=%s Purge=%s PendingASTs=%s "
-			"PendingBASTs=%s Master=%s\n",
+			"PendingBASTs=%s\n",
 			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));

 	/* Purge Count: xxx Refs: xxx */
 	out += snprintf(db->buf + out, db->len - out,
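The debugfs output now reports each object class as "current (total)", backed by paired atomics: both counters are incremented at creation, only the current count is decremented at release, so churn and leaks are distinguishable at a glance. A C11 sketch of the same bookkeeping:

#include <stdatomic.h>
#include <stdio.h>

/* Mirror of the mle_cur_count/mle_tot_count idea: 'tot' only ever
 * grows, 'cur' tracks live objects. */
enum mle_type { MLE_BLOCK, MLE_MASTER, MLE_MIGRATION, MLE_NUM_TYPES };

static atomic_int cur_count[MLE_NUM_TYPES];
static atomic_int tot_count[MLE_NUM_TYPES];

static void mle_create(enum mle_type t)
{
	atomic_fetch_add(&tot_count[t], 1);
	atomic_fetch_add(&cur_count[t], 1);
}

static void mle_destroy(enum mle_type t)
{
	atomic_fetch_sub(&cur_count[t], 1);
}

int main(void)
{
	mle_create(MLE_BLOCK);
	mle_create(MLE_MASTER);
	mle_destroy(MLE_BLOCK);

	/* prints "cur (tot)" per type, the format the patch adopts */
	for (int t = 0; t < MLE_NUM_TYPES; t++)
		printf("type %d: %d (%d)\n", t,
		       atomic_load(&cur_count[t]),
		       atomic_load(&tot_count[t]));
	return 0;
}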
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);

+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
 	if (dlm->name)
 		kfree(dlm->name);

@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));

+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		mlog_errno(-ENOMEM);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
 	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();

 	ret = dlm_create_debugfs_subroot(dlm);
 	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 		kfree(dlm->name);
 		kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	init_waitqueue_head(&dlm->reco.event);
 	init_waitqueue_head(&dlm->ast_wq);
 	init_waitqueue_head(&dlm->migration_wq);
-	INIT_LIST_HEAD(&dlm->master_list);
 	INIT_LIST_HEAD(&dlm->mle_hb_events);

 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,

 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	atomic_set(&dlm->local_resources, 0);
-	atomic_set(&dlm->remote_resources, 0);
-	atomic_set(&dlm->unknown_resources, 0);
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}

 	spin_lock_init(&dlm->work_lock);
 	INIT_LIST_HEAD(&dlm->work_list);
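Note how each later allocation failure in dlm_alloc_ctxt() unwinds everything acquired before it, in reverse order. A sketch of that failure-unwind pattern as a goto ladder (all names illustrative, not the kernel's):

#include <stdlib.h>

struct ctxt {
	void *lockres_hash;
	void *master_hash;
	char *name;
};

/* Each failure point releases exactly what was acquired before it. */
static struct ctxt *alloc_ctxt(void)
{
	struct ctxt *c = calloc(1, sizeof(*c));
	if (!c)
		return NULL;

	c->lockres_hash = malloc(4096);
	if (!c->lockres_hash)
		goto free_ctxt;

	c->master_hash = malloc(4096);
	if (!c->master_hash)
		goto free_lockres;

	c->name = malloc(64);
	if (!c->name)
		goto free_master;

	return c;

free_master:
	free(c->master_hash);
free_lockres:
	free(c->lockres_hash);
free_ctxt:
	free(c);
	return NULL;
}

int main(void)
{
	struct ctxt *c = alloc_ctxt();
	/* member cleanup elided; the sketch only shows the unwind */
	free(c);
	return 0;
}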
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 				const char *name,
 				unsigned int namelen)
 {
-	struct dlm_lock_resource *res;
-
 	if (dlm != mle->dlm)
 		return 0;

-	if (mle->type == DLM_MLE_BLOCK ||
-	    mle->type == DLM_MLE_MIGRATION) {
-		if (namelen != mle->u.name.len ||
-		    memcmp(name, mle->u.name.name, namelen)!=0)
-			return 0;
-	} else {
-		res = mle->u.res;
-		if (namelen != res->lockname.len ||
-		    memcmp(res->lockname.name, name, namelen) != 0)
-			return 0;
-	}
+	if (namelen != mle->mnamelen ||
+	    memcmp(name, mle->mname, namelen) != 0)
+		return 0;
+
 	return 1;
 }

@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,

 	mle->dlm = dlm;
 	mle->type = type;
-	INIT_LIST_HEAD(&mle->list);
+	INIT_HLIST_NODE(&mle->master_hash_node);
 	INIT_LIST_HEAD(&mle->hb_events);
 	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
 	spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	mle->new_master = O2NM_MAX_NODES;
 	mle->inuse = 0;

+	BUG_ON(mle->type != DLM_MLE_BLOCK &&
+	       mle->type != DLM_MLE_MASTER &&
+	       mle->type != DLM_MLE_MIGRATION);
+
 	if (mle->type == DLM_MLE_MASTER) {
 		BUG_ON(!res);
-		mle->u.res = res;
-	} else if (mle->type == DLM_MLE_BLOCK) {
-		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
-	} else /* DLM_MLE_MIGRATION */ {
+		mle->mleres = res;
+		memcpy(mle->mname, res->lockname.name, res->lockname.len);
+		mle->mnamelen = res->lockname.len;
+		mle->mnamehash = res->lockname.hash;
+	} else {
 		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
+		mle->mleres = NULL;
+		memcpy(mle->mname, name, namelen);
+		mle->mnamelen = namelen;
+		mle->mnamehash = dlm_lockid_hash(name, namelen);
 	}

+	atomic_inc(&dlm->mle_tot_count[mle->type]);
+	atomic_inc(&dlm->mle_cur_count[mle->type]);
+
 	/* copy off the node_map and register hb callbacks on our copy */
 	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
 	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	__dlm_mle_attach_hb_events(dlm, mle);
 }

+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	if (!hlist_unhashed(&mle->master_hash_node))
+		hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	struct hlist_head *bucket;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	bucket = dlm_master_hash(dlm, mle->mnamehash);
+	hlist_add_head(&mle->master_hash_node, bucket);
+}

 /* returns 1 if found, 0 if not */
 static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 			char *name, unsigned int namelen)
 {
 	struct dlm_master_list_entry *tmpmle;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int hash;

 	assert_spin_locked(&dlm->master_lock);

-	list_for_each_entry(tmpmle, &dlm->master_list, list) {
+	hash = dlm_lockid_hash(name, namelen);
+	bucket = dlm_master_hash(dlm, hash);
+	hlist_for_each(list, bucket) {
+		tmpmle = hlist_entry(list, struct dlm_master_list_entry,
+				     master_hash_node);
 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 			continue;
 		dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 	dlm = mle->dlm;

-	if (mle->type != DLM_MLE_MASTER) {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.name.len, mle->u.name.name, mle->type);
-	} else {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.res->lockname.len,
-		     mle->u.res->lockname.name, mle->type);
-	}
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);

+	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+	     mle->type);
+
 	/* remove from list if not already */
-	if (!list_empty(&mle->list))
-		list_del_init(&mle->list);
+	__dlm_unlink_mle(dlm, mle);

 	/* detach the mle from the domain node up/down events */
 	__dlm_mle_detach_hb_events(dlm, mle);

+	atomic_dec(&dlm->mle_cur_count[mle->type]);
+
 	/* NOTE: kfree under spinlock here.
 	 * if this is bad, we can move this to a freelist. */
 	kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
 	kmem_cache_destroy(dlm_lockres_cache);
 }

-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
-				  struct dlm_lock_resource *res,
-				  u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
-
-	if (owner == dlm->node_num)
-		atomic_inc(&dlm->local_resources);
-	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_inc(&dlm->unknown_resources);
-	else
-		atomic_inc(&dlm->remote_resources);
-
-	res->owner = owner;
-}
-
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res, u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	if (owner == res->owner)
-		return;
-
-	if (res->owner == dlm->node_num)
-		atomic_dec(&dlm->local_resources);
-	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_dec(&dlm->unknown_resources);
-	else
-		atomic_dec(&dlm->remote_resources);
-
-	dlm_set_lockres_owner(dlm, res, owner);
-}
-
-
 static void dlm_lockres_release(struct kref *kref)
 {
 	struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
 	}
 	spin_unlock(&dlm->track_lock);

+	atomic_dec(&dlm->res_cur_count);
+
 	dlm_put(dlm);

 	if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,

 	kref_init(&res->refs);

+	atomic_inc(&dlm->res_tot_count);
+	atomic_inc(&dlm->res_cur_count);
+
 	/* just for consistency */
 	spin_lock(&res->spinlock);
 	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
 		alloc_mle = NULL;
 		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
 		set_bit(dlm->node_num, mle->maybe_map);
-		list_add(&mle->list, &dlm->master_list);
+		__dlm_insert_mle(dlm, mle);

 		/* still holding the dlm spinlock, check the recovery map
 		 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 					res->lockname.len,
 					res->lockname.name);
 				mle->type = DLM_MLE_MASTER;
-				mle->u.res = res;
+				mle->mleres = res;
 			}
 		}
 	}
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,

 	BUG_ON(mle->type == DLM_MLE_MIGRATION);

-	if (mle->type != DLM_MLE_MASTER) {
-		request.namelen = mle->u.name.len;
-		memcpy(request.name, mle->u.name.name, request.namelen);
-	} else {
-		request.namelen = mle->u.res->lockname.len;
-		memcpy(request.name, mle->u.res->lockname.name,
-		       request.namelen);
-	}
+	request.namelen = (u8)mle->mnamelen;
+	memcpy(request.name, mle->mname, request.namelen);

 again:
 	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
 			// "add the block.\n");
 			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
 			set_bit(request->node_idx, mle->maybe_map);
-			list_add(&mle->list, &dlm->master_list);
+			__dlm_insert_mle(dlm, mle);
 			response = DLM_MASTER_RESP_NO;
 		} else {
 			// mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
 			     assert->node_idx, rr, extra_ref, mle->inuse);
 			dlm_print_one_mle(mle);
 		}
-		list_del_init(&mle->list);
+		__dlm_unlink_mle(dlm, mle);
 		__dlm_mle_detach_hb_events(dlm, mle);
 		__dlm_put_mle(mle);
 		if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 			tmp->master = master;
 			atomic_set(&tmp->woken, 1);
 			wake_up(&tmp->wq);
-			/* remove it from the list so that only one
-			 * mle will be found */
-			list_del_init(&tmp->list);
-			/* this was obviously WRONG.  mle is uninited here.  should be tmp. */
+			/* remove it so that only one mle will be found */
+			__dlm_unlink_mle(dlm, tmp);
 			__dlm_mle_detach_hb_events(dlm, tmp);
 			ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
 			mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 	mle->master = master;
 	/* do this for consistency with other mle types */
 	set_bit(new_master, mle->maybe_map);
-	list_add(&mle->list, &dlm->master_list);
+	__dlm_insert_mle(dlm, mle);

 	return ret;
 }

-
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+/*
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+					struct dlm_master_list_entry *mle)
 {
-	struct dlm_master_list_entry *mle, *next;
 	struct dlm_lock_resource *res;
-	unsigned int hash;

-	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-top:
-	assert_spin_locked(&dlm->spinlock);
+	/* Find the lockres associated to the mle and set its owner to UNK */
+	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+				   mle->mnamehash);
+	if (res) {
+		spin_unlock(&dlm->master_lock);

-	/* clean the master list */
-	spin_lock(&dlm->master_lock);
-	list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
-		BUG_ON(mle->type != DLM_MLE_BLOCK &&
-		       mle->type != DLM_MLE_MASTER &&
-		       mle->type != DLM_MLE_MIGRATION);
-
-		/* MASTER mles are initiated locally. the waiting
-		 * process will notice the node map change
-		 * shortly.  let that happen as normal. */
-		if (mle->type == DLM_MLE_MASTER)
-			continue;
+		/* move lockres onto recovery list */
+		spin_lock(&res->spinlock);
+		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+		dlm_move_lockres_to_recovery_list(dlm, res);
+		spin_unlock(&res->spinlock);
+		dlm_lockres_put(res);

+		/* about to get rid of mle, detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);

-		/* BLOCK mles are initiated by other nodes.
-		 * need to clean up if the dead node would have
-		 * been the master. */
-		if (mle->type == DLM_MLE_BLOCK) {
-			int bit;
-
-			spin_lock(&mle->spinlock);
-			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
-			if (bit != dead_node) {
-				mlog(0, "mle found, but dead node %u would "
-				     "not have been master\n", dead_node);
-				spin_unlock(&mle->spinlock);
-			} else {
-				/* must drop the refcount by one since the
-				 * assert_master will never arrive. this
-				 * may result in the mle being unlinked and
-				 * freed, but there may still be a process
-				 * waiting in the dlmlock path which is fine. */
-				mlog(0, "node %u was expected master\n",
-				     dead_node);
-				atomic_set(&mle->woken, 1);
-				spin_unlock(&mle->spinlock);
-				wake_up(&mle->wq);
-				/* do not need events any longer, so detach
-				 * from heartbeat */
-				__dlm_mle_detach_hb_events(dlm, mle);
-				__dlm_put_mle(mle);
-			}
-			continue;
-		}
+		/* dump the mle */
+		spin_lock(&dlm->master_lock);
+		__dlm_put_mle(mle);
+		spin_unlock(&dlm->master_lock);
+	}

-		/* everything else is a MIGRATION mle */
-
-		/* the rule for MIGRATION mles is that the master
-		 * becomes UNKNOWN if *either* the original or
-		 * the new master dies. all UNKNOWN lockreses
-		 * are sent to whichever node becomes the recovery
-		 * master. the new master is responsible for
-		 * determining if there is still a master for
-		 * this lockres, or if he needs to take over
-		 * mastery. either way, this node should expect
-		 * another message to resolve this. */
-		if (mle->master != dead_node &&
-		    mle->new_master != dead_node)
-			continue;
+	return res;
+}

-		/* if we have reached this point, this mle needs to
-		 * be removed from the list and freed. */
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+				    struct dlm_master_list_entry *mle)
+{
+	__dlm_mle_detach_hb_events(dlm, mle);

-		/* remove from the list early. NOTE: unlinking
-		 * list_head while in list_for_each_safe */
-		__dlm_mle_detach_hb_events(dlm, mle);
-		spin_lock(&mle->spinlock);
-		list_del_init(&mle->list);
+	spin_lock(&mle->spinlock);
+	__dlm_unlink_mle(dlm, mle);
+	atomic_set(&mle->woken, 1);
+	spin_unlock(&mle->spinlock);

+	wake_up(&mle->wq);
+}
+
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle, u8 dead_node)
+{
+	int bit;
+
+	BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+	spin_lock(&mle->spinlock);
+	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+	if (bit != dead_node) {
+		mlog(0, "mle found, but dead node %u would not have been "
+		     "master\n", dead_node);
+		spin_unlock(&mle->spinlock);
+	} else {
+		/* Must drop the refcount by one since the assert_master will
+		 * never arrive. This may result in the mle being unlinked and
+		 * freed, but there may still be a process waiting in the
+		 * dlmlock path which is fine. */
+		mlog(0, "node %u was expected master\n", dead_node);
 		atomic_set(&mle->woken, 1);
 		spin_unlock(&mle->spinlock);
 		wake_up(&mle->wq);

-		mlog(0, "%s: node %u died during migration from "
-		     "%u to %u!\n", dlm->name, dead_node,
-		     mle->master, mle->new_master);
-		/* if there is a lockres associated with this
-		 * mle, find it and set its owner to UNKNOWN */
-		hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
-		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-					   mle->u.name.len, hash);
-		if (res) {
-			/* unfortunately if we hit this rare case, our
-			 * lock ordering is messed.  we need to drop
-			 * the master lock so that we can take the
-			 * lockres lock, meaning that we will have to
-			 * restart from the head of list. */
-			spin_unlock(&dlm->master_lock);
+		/* Do not need events any longer, so detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
+	}
+}

-			/* move lockres onto recovery list */
-			spin_lock(&res->spinlock);
-			dlm_set_lockres_owner(dlm, res,
-					      DLM_LOCK_RES_OWNER_UNKNOWN);
-			dlm_move_lockres_to_recovery_list(dlm, res);
-			spin_unlock(&res->spinlock);
-			dlm_lockres_put(res);
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int i;

-			/* about to get rid of mle, detach from heartbeat */
-			__dlm_mle_detach_hb_events(dlm, mle);
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);

-			/* dump the mle */
-			spin_lock(&dlm->master_lock);
-			__dlm_put_mle(mle);
-			spin_unlock(&dlm->master_lock);
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+
+			BUG_ON(mle->type != DLM_MLE_BLOCK &&
+			       mle->type != DLM_MLE_MASTER &&
+			       mle->type != DLM_MLE_MIGRATION);
+
+			/* MASTER mles are initiated locally. The waiting
+			 * process will notice the node map change shortly.
+			 * Let that happen as normal. */
+			if (mle->type == DLM_MLE_MASTER)
+				continue;
+
+			/* BLOCK mles are initiated by other nodes. Need to
+			 * clean up if the dead node would have been the
+			 * master. */
+			if (mle->type == DLM_MLE_BLOCK) {
+				dlm_clean_block_mle(dlm, mle, dead_node);
+				continue;
+			}

-			/* restart */
-			goto top;
-		}
+			/* Everything else is a MIGRATION mle */
+
+			/* The rule for MIGRATION mles is that the master
+			 * becomes UNKNOWN if *either* the original or the new
+			 * master dies. All UNKNOWN lockres' are sent to
+			 * whichever node becomes the recovery master. The new
+			 * master is responsible for determining if there is
+			 * still a master for this lockres, or if he needs to
+			 * take over mastery. Either way, this node should
+			 * expect another message to resolve this. */
+
+			if (mle->master != dead_node &&
+			    mle->new_master != dead_node)
+				continue;
+
+			/* If we have reached this point, this mle needs to be
+			 * removed from the list and freed. */
+			dlm_clean_migration_mle(dlm, mle);
+
+			mlog(0, "%s: node %u died during migration from "
+			     "%u to %u!\n", dlm->name, dead_node, mle->master,
+			     mle->new_master);
+
+			/* If we find a lockres associated with the mle, we've
+			 * hit this rare case that messes up our lock ordering.
+			 * If so, we need to drop the master lock so that we can
+			 * take the lockres lock, meaning that we will have to
+			 * restart from the head of list. */
+			res = dlm_reset_mleres_owner(dlm, mle);
+			if (res)
+				/* restart */
+				goto top;

-		/* this may be the last reference */
-		__dlm_put_mle(mle);
+			/* This may be the last reference */
+			__dlm_put_mle(mle);
+		}
 	}
 	spin_unlock(&dlm->master_lock);
 }

-
 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 u8 old_master)
 {
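dlm_reset_mleres_owner() must drop dlm->master_lock before taking the lockres spinlock, which invalidates the bucket iteration, so dlm_clean_master_list() jumps back to top: and rescans. A small pthread sketch of that drop-and-restart idiom, assuming a fixed inner-before-outer lock order (names and the two-mutex model are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;

static int table[8] = { 0, 0, 1, 0, 1, 0, 0, 0 };	/* 1 = needs fixup */

static void clean_table(void)
{
restart:
	pthread_mutex_lock(&outer);
	for (int i = 0; i < 8; i++) {
		if (!table[i])
			continue;

		/* Taking 'inner' while holding 'outer' would invert
		 * the lock order, so drop 'outer' first; any iterator
		 * state is now stale. */
		pthread_mutex_unlock(&outer);

		pthread_mutex_lock(&inner);
		table[i] = 0;			/* the fixup */
		pthread_mutex_unlock(&inner);

		printf("fixed slot %d, restarting scan\n", i);
		goto restart;			/* rescan from the top */
	}
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	clean_table();
	return 0;
}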
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,

 	spin_lock(&res->spinlock);
 	if (!__dlm_lockres_unused(res)) {
-		spin_unlock(&res->spinlock);
 		mlog(0, "%s:%.*s: tried to purge but not unused\n",
 		     dlm->name, res->lockname.len, res->lockname.name);
-		return -ENOTEMPTY;
+		__dlm_print_one_lock_resource(res);
+		spin_unlock(&res->spinlock);
+		BUG();
 	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
+		     "being remastered\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		/* Re-add the lockres to the end of the purge list */
+		if (!list_empty(&res->purge)) {
+			list_del_init(&res->purge);
+			list_add_tail(&res->purge, &dlm->purge_list);
+		}
+		spin_unlock(&res->spinlock);
+		return 0;
+	}
+
 	master = (res->owner == dlm->node_num);
+
 	if (!master)
 		res->state |= DLM_LOCK_RES_DROPPING_REF;
 	spin_unlock(&res->spinlock);
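Rather than dropping the ref on a lockres that is mid-migration, the purge path now re-queues it at the tail of the purge list so it is retried after the remaster completes. A self-contained sketch of the intrusive-list requeue, using a minimal list_head reimplementation (not the kernel's):

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	list_init(e);
}

static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

struct res {
	struct list_head purge;	/* first member, so the cast below is safe */
	int id;
	int migrating;
};

int main(void)
{
	struct list_head purge_list;
	struct res a = { .id = 1, .migrating = 1 }, b = { .id = 2 };

	list_init(&purge_list);
	list_init(&a.purge);
	list_init(&b.purge);
	list_add_tail(&a.purge, &purge_list);
	list_add_tail(&b.purge, &purge_list);

	/* 'a' is being remastered: requeue it behind 'b'. */
	if (a.migrating) {
		list_del_init(&a.purge);
		list_add_tail(&a.purge, &purge_list);
	}

	for (struct list_head *p = purge_list.next; p != &purge_list;
	     p = p->next)
		printf("res %d\n", ((struct res *)p)->id);
	return 0;
}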
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
 	.flags		= 0,
 };

+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+	.flags		= 0,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
 				   &ocfs2_rename_lops, osb);
 }

+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+					 struct ocfs2_super *osb)
+{
+	/* nfs_sync lockres doesn't come from a slab so we call init
+	 * once on it manually. */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+				   &ocfs2_nfs_sync_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
 	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }

+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+				    0, 0);
+	if (status < 0)
+		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+	return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres,
+				     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
 	int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);

 	osb->cconn = conn;

@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,

 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);

 	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
 	osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
 	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
 	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
 }

 int ocfs2_drop_inode_locks(struct inode *inode)
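The nfs_sync lock is taken EX by the NFS handle decoder and PR by every deleter, so deletes stay concurrent with each other while a decode excludes them all. A local pthread rwlock models the same PR/EX semantics; this sketch deliberately ignores the cluster-wide aspect:

#include <pthread.h>
#include <stdio.h>

/* rdlock models PR (many concurrent deleters); wrlock models EX
 * (the handle decoder excludes all deleters while it checks the
 * suballocator bit). */
static pthread_rwlock_t nfs_sync = PTHREAD_RWLOCK_INITIALIZER;

static void delete_inode(int ino)
{
	pthread_rwlock_rdlock(&nfs_sync);	/* PR: shared */
	printf("deleting inode %d\n", ino);
	pthread_rwlock_unlock(&nfs_sync);
}

static void nfs_get_dentry(int ino)
{
	pthread_rwlock_wrlock(&nfs_sync);	/* EX: exclusive */
	printf("validating handle for inode %d\n", ino);
	pthread_rwlock_unlock(&nfs_sync);
}

int main(void)
{
	delete_inode(42);
	nfs_get_dentry(42);
	return 0;
}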
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
 			int ex);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@

 #include "ocfs2.h"

+#include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dcache.h"
@@ -38,6 +39,7 @@
 #include "inode.h"

 #include "buffer_head_io.h"
+#include "suballoc.h"

 struct ocfs2_inode_handle
 {
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 				       struct ocfs2_inode_handle *handle)
 {
 	struct inode *inode;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 blkno = handle->ih_blkno;
+	int status, set;
 	struct dentry *result;

 	mlog_entry("(0x%p, 0x%p)\n", sb, handle);

-	if (handle->ih_blkno == 0) {
-		mlog_errno(-ESTALE);
-		return ERR_PTR(-ESTALE);
+	if (blkno == 0) {
+		mlog(0, "nfs wants inode with blkno: 0\n");
+		result = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	inode = ocfs2_ilookup(sb, blkno);
+	/*
+	 * If the inode exists in memory, we only need to check it's
+	 * generation number
+	 */
+	if (inode)
+		goto check_gen;
+
+	/*
+	 * This will synchronize us against ocfs2_delete_inode() on
+	 * all nodes
+	 */
+	status = ocfs2_nfs_sync_lock(osb, 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		goto check_err;
+	}
+
+	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL) {
+			/*
+			 * The blkno NFS gave us doesn't even show up
+			 * as an inode, we return -ESTALE to be
+			 * nice
+			 */
+			mlog(0, "test inode bit failed %d\n", status);
+			status = -ESTALE;
+		} else {
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		}
+		goto unlock_nfs_sync;
+	}
+
+	/* If the inode allocator bit is clear, this inode must be stale */
+	if (!set) {
+		mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+		status = -ESTALE;
+		goto unlock_nfs_sync;
 	}

-	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+	inode = ocfs2_iget(osb, blkno, 0, 0);

-	if (IS_ERR(inode))
-		return (void *)inode;
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(osb, 1);

+check_err:
+	if (status < 0) {
+		if (status == -ESTALE) {
+			mlog(0, "stale inode ino: %llu generation: %u\n",
+			     blkno, handle->ih_generation);
+		}
+		result = ERR_PTR(status);
+		goto bail;
+	}
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		result = (void *)inode;
+		goto bail;
+	}
+
+check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		return ERR_PTR(-ESTALE);
+		mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+		     handle->ih_generation);
+		result = ERR_PTR(-ESTALE);
+		goto bail;
 	}

 	result = d_obtain_alias(inode);
 	if (!IS_ERR(result))
 		result->d_op = &ocfs2_dentry_ops;
+	else
+		mlog_errno(PTR_ERR(result));

+bail:
 	mlog_exit_ptr(result);
 	return result;
 }
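Put together, the rewritten decode path is: an in-memory inode hit goes straight to the generation check; otherwise take the sync lock, ask the allocator whether the block is still an inode, iget, unlock, then check the generation. A stubbed sketch of that decision flow (every helper here is a placeholder, not the ocfs2 call):

#include <stdio.h>

#define ESTALE 116

static int cached(unsigned long blkno)        { (void)blkno; return 0; }
static int alloc_bit_set(unsigned long blkno) { (void)blkno; return 1; }
static int generation_of(unsigned long blkno) { (void)blkno; return 7; }

static int get_dentry(unsigned long blkno, int gen)
{
	if (blkno == 0)
		return -ESTALE;

	if (cached(blkno))
		goto check_gen;	/* fast path: no cluster lock needed */

	/* lock out concurrent delete_inode on every node (EX) */
	printf("nfs_sync_lock(EX)\n");
	if (!alloc_bit_set(blkno)) {
		printf("nfs_sync_unlock(EX)\n");
		return -ESTALE;	/* allocator says the inode is gone */
	}
	/* ... iget(blkno) would happen here ... */
	printf("nfs_sync_unlock(EX)\n");

check_gen:
	if (generation_of(blkno) != gen)
		return -ESTALE;	/* inode slot was reused since the handle */
	return 0;
}

int main(void)
{
	printf("decode: %d\n", get_dentry(1234, 7));
	return 0;
}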
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "alloc.h" 40#include "alloc.h"
41#include "dir.h"
41#include "blockcheck.h" 42#include "blockcheck.h"
42#include "dlmglue.h" 43#include "dlmglue.h"
43#include "extent_map.h" 44#include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
112 oi->ip_attr |= OCFS2_DIRSYNC_FL; 113 oi->ip_attr |= OCFS2_DIRSYNC_FL;
113} 114}
114 115
116struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
117{
118 struct ocfs2_find_inode_args args;
119
120 args.fi_blkno = blkno;
121 args.fi_flags = 0;
122 args.fi_ino = ino_from_blkno(sb, blkno);
123 args.fi_sysfile_type = 0;
124
125 return ilookup5(sb, blkno, ocfs2_find_actor, &args);
126}
115struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
116 int sysfile_type) 128 int sysfile_type)
117{ 129{
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
275 (unsigned long long)OCFS2_I(inode)->ip_blkno, 287 (unsigned long long)OCFS2_I(inode)->ip_blkno,
276 (unsigned long long)le64_to_cpu(fe->i_blkno)); 288 (unsigned long long)le64_to_cpu(fe->i_blkno));
277 289
278 inode->i_nlink = le16_to_cpu(fe->i_links_count); 290 inode->i_nlink = ocfs2_read_links_count(fe);
279 291
280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 292 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 293 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
351 363
352 ocfs2_set_inode_flags(inode); 364 ocfs2_set_inode_flags(inode);
353 365
366 OCFS2_I(inode)->ip_last_used_slot = 0;
367 OCFS2_I(inode)->ip_last_used_group = 0;
354 mlog_exit_void(); 368 mlog_exit_void();
355} 369}
356 370
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
606 } 620 }
607 621
608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 622 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb)); 623 ocfs2_quota_trans_credits(inode->i_sb));
610 if (IS_ERR(handle)) { 624 if (IS_ERR(handle)) {
611 status = PTR_ERR(handle); 625 status = PTR_ERR(handle);
612 mlog_errno(status); 626 mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
740 goto bail_unlock_dir; 754 goto bail_unlock_dir;
741 } 755 }
742 756
757 /* Remove any dir index tree */
758 if (S_ISDIR(inode->i_mode)) {
759 status = ocfs2_dx_dir_truncate(inode, di_bh);
760 if (status) {
761 mlog_errno(status);
762 goto bail_unlock_dir;
763 }
764 }
765
743 /*Free extended attribute resources associated with this inode.*/ 766 /*Free extended attribute resources associated with this inode.*/
744 status = ocfs2_xattr_remove(inode, di_bh); 767 status = ocfs2_xattr_remove(inode, di_bh);
745 if (status < 0) { 768 if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	/*
+	 * Synchronize us against ocfs2_get_dentry. We take this in
+	 * shared mode so that all nodes can still concurrently
+	 * process deletes.
+	 */
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
 	/* Lock down the inode. This gives us an up to date view of
 	 * it's metadata (for verification), and allows us to
 	 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
 		if (status != -ENOENT)
 			mlog_errno(status);
 		ocfs2_cleanup_delete_inode(inode, 0);
-		goto bail_unblock;
+		goto bail_unlock_nfs_sync;
 	}
 
 	/* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
 bail_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
+
+bail_unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
 	if (status < 0)
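The new bail_unlock_nfs_sync label slots between bail_unlock_inode and bail_unblock so that each failure point jumps to the label that releases exactly what has been taken so far, and the success path falls through the full unwind. A standalone sketch of the idiom, with lock_a/lock_b as hypothetical stand-ins for the ocfs2 cluster locks:

/* Minimal sketch of ordered-label unwinding (not ocfs2 code). */
#include <stdio.h>

static int lock_a(void) { puts("lock a"); return 0; }
static void unlock_a(void) { puts("unlock a"); }
static int lock_b(void) { puts("lock b"); return 0; }
static void unlock_b(void) { puts("unlock b"); }

static int do_work(void)
{
	int status;

	status = lock_a();		/* first lock taken */
	if (status < 0)
		goto bail;

	status = lock_b();		/* second lock taken */
	if (status < 0)
		goto bail_unlock_a;	/* only undo what we hold */

	/* ... work requiring both locks ... */

	unlock_b();			/* success path starts the unwind */
bail_unlock_a:
	unlock_a();
bail:
	return status;
}

int main(void) { return do_work(); }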
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	fe->i_size = cpu_to_le64(i_size_read(inode));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_uid = cpu_to_le32(inode->i_uid);
 	fe->i_gid = cpu_to_le32(inode->i_gid);
 	fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
 	ocfs2_set_inode_flags(inode);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
 	inode->i_mode = le16_to_cpu(fe->i_mode);
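Every le16 access of i_links_count above is funneled through ocfs2_read_links_count()/ocfs2_set_links_count()/ocfs2_add_links_count() so the link count can outgrow 16 bits on volumes with indexed directories. Those helpers live in ocfs2.h, outside this diff; the sketch below only illustrates the split low/high-word idea, and the field names, the missing endianness conversions, and the feature gating are assumptions:

/* Hedged sketch of a 16+16-bit split link count (not the ocfs2.h code). */
#include <stdint.h>

struct sketch_dinode {
	uint16_t i_links_count_lo;	/* assumed name for the legacy field */
	uint16_t i_links_count_hi;	/* assumed overflow word */
};

static inline uint32_t sketch_read_links_count(const struct sketch_dinode *di)
{
	return (uint32_t)di->i_links_count_lo |
	       ((uint32_t)di->i_links_count_hi << 16);
}

static inline void sketch_set_links_count(struct sketch_dinode *di, uint32_t n)
{
	di->i_links_count_lo = (uint16_t)(n & 0xffff);
	di->i_links_count_hi = (uint16_t)(n >> 16);
}

static inline void sketch_add_links_count(struct sketch_dinode *di, int n)
{
	sketch_set_links_count(di, sketch_read_links_count(di) + n);
}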
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
 
 	struct inode			vfs_inode;
 	struct jbd2_inode		ip_jinode;
+
+	/* Only valid if the inode is the dir. */
+	u32				ip_last_used_slot;
+	u64				ip_last_used_group;
 };
 
 /*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
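ocfs2_ilookup() is a thin wrapper around ilookup5(), which only returns an inode that is already in the inode cache and never schedules a disk read; that makes it safe to call where starting fresh I/O would be wrong, and it pairs with the nfs sync lock added to ocfs2_delete_inode() above. A hedged caller sketch (the fallback policy shown is illustrative, not ocfs2_get_dentry()'s real logic):

/* Hedged sketch: cache-only lookup first, disk-backed ocfs2_iget() second. */
static struct inode *sketch_find_inode(struct ocfs2_super *osb,
				       struct super_block *sb, u64 blkno)
{
	struct inode *inode;

	inode = ocfs2_ilookup(sb, blkno);	/* NULL if not cached */
	if (!inode)
		inode = ocfs2_iget(osb, blkno, 0, 0);	/* may read from disk */

	return inode;	/* ocfs2_iget() returns ERR_PTR() on failure */
}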
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
 	return __ocfs2_wait_on_mount(osb, 1);
 }
 
-
-
 /*
- * The recovery_list is a simple linked list of node numbers to recover.
- * It is protected by the recovery_lock.
+ * This replay_map is to track online/offline slots, so we could recover
+ * offline slots during recovery and mount
  */
 
-struct ocfs2_recovery_map {
-	unsigned int rm_used;
-	unsigned int *rm_entries;
+enum ocfs2_replay_state {
+	REPLAY_UNNEEDED = 0,	/* Replay is not needed, so ignore this map */
+	REPLAY_NEEDED,		/* Replay slots marked in rm_replay_slots */
+	REPLAY_DONE		/* Replay was already queued */
 };
 
+struct ocfs2_replay_map {
+	unsigned int rm_slots;
+	enum ocfs2_replay_state rm_state;
+	unsigned char rm_replay_slots[0];
+};
+
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+	if (!osb->replay_map)
+		return;
+
+	/* If we've already queued the replay, we don't have any more to do */
+	if (osb->replay_map->rm_state == REPLAY_DONE)
+		return;
+
+	osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map;
+	int i, node_num;
+
+	/* If replay map is already set, we don't do it again */
+	if (osb->replay_map)
+		return 0;
+
+	replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+			     (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+	if (!replay_map) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	spin_lock(&osb->osb_lock);
+
+	replay_map->rm_slots = osb->max_slots;
+	replay_map->rm_state = REPLAY_UNNEEDED;
+
+	/* set rm_replay_slots for offline slot(s) */
+	for (i = 0; i < replay_map->rm_slots; i++) {
+		if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+			replay_map->rm_replay_slots[i] = 1;
+	}
+
+	osb->replay_map = replay_map;
+	spin_unlock(&osb->osb_lock);
+	return 0;
+}
+
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+	int i;
+
+	if (!replay_map)
+		return;
+
+	if (replay_map->rm_state != REPLAY_NEEDED)
+		return;
+
+	for (i = 0; i < replay_map->rm_slots; i++)
+		if (replay_map->rm_replay_slots[i])
+			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+							NULL, NULL);
+	replay_map->rm_state = REPLAY_DONE;
+}
+
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+
+	if (!osb->replay_map)
+		return;
+
+	kfree(replay_map);
+	osb->replay_map = NULL;
+}
+
 int ocfs2_recovery_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_recovery_map *rm;
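struct ocfs2_replay_map ends in the zero-length array rm_replay_slots[0], so ocfs2_compute_replay_slots() can size one kzalloc() as header plus one byte per slot and get both in a single allocation. A standalone sketch of the same trailing-array idiom, using the C99 flexible array member spelling:

/* Standalone sketch of the header-plus-array allocation pattern. */
#include <stdlib.h>

struct replay_map_sketch {
	unsigned int rm_slots;
	unsigned char rm_replay_slots[];	/* flexible array member */
};

static struct replay_map_sketch *sketch_alloc_map(unsigned int slots)
{
	/* one zeroed allocation covers the header and the per-slot bytes */
	struct replay_map_sketch *map =
		calloc(1, sizeof(*map) + slots * sizeof(unsigned char));

	if (map)
		map->rm_slots = slots;
	return map;
}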
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
 	},
 };
 
+static struct ocfs2_triggers dr_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
 static int __ocfs2_journal_access(handle_t *handle,
 				  struct inode *inode,
 				  struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 				      type);
 }
 
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
+				      type);
+}
+
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type)
 {
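The two new accessors route ocfs2_dx_root_block and ocfs2_dx_leaf buffers through the same trigger machinery as every other metadata type: ot_offset records where the block's check structure (dr_check/dl_check) sits so the commit trigger can stamp the checksum. Callers keep the usual declare-then-dirty shape; a hedged sketch, with dx_root_bh and the modification step as placeholders:

/* Hedged caller sketch for the new dx root accessor. */
status = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
				 OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
	mlog_errno(status);
	goto bail;
}

/* ... modify the struct ocfs2_dx_root_block in dx_root_bh->b_data ... */

status = ocfs2_journal_dirty(handle, dx_root_bh);
if (status < 0)
	mlog_errno(status);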
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 }
 
 /* Called by the mount code to queue recovery the last part of
- * recovery for it's own slot. */
+ * recovery for it's own and offline slot(s). */
 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 {
 	struct ocfs2_journal *journal = osb->journal;
 
-	if (osb->dirty) {
-		/* No need to queue up our truncate_log as regular
-		 * cleanup will catch that. */
-		ocfs2_queue_recovery_completion(journal,
-						osb->slot_num,
-						osb->local_alloc_copy,
-						NULL,
-						NULL);
-		ocfs2_schedule_truncate_log_flush(osb, 0);
+	/* No need to queue up our truncate_log as regular cleanup will catch
+	 * that */
+	ocfs2_queue_recovery_completion(journal, osb->slot_num,
+					osb->local_alloc_copy, NULL, NULL);
+	ocfs2_schedule_truncate_log_flush(osb, 0);
 
-		osb->local_alloc_copy = NULL;
-		osb->dirty = 0;
-	}
+	osb->local_alloc_copy = NULL;
+	osb->dirty = 0;
+
+	/* queue to recover orphan slots for all offline slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+	ocfs2_queue_replay_slots(osb);
+	ocfs2_free_replay_slots(osb);
 }
 
 void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
 		goto bail;
 	}
 
+	status = ocfs2_compute_replay_slots(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* queue recovery for our own slot */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL, NULL);
+
 	spin_lock(&osb->osb_lock);
 	while (rm->rm_used) {
 		/* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
 
 	ocfs2_super_unlock(osb, 1);
 
-	/* We always run recovery on our own orphan dir - the dead
-	 * node(s) may have disallowd a previos inode delete. Re-processing
-	 * is therefore required. */
-	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num,
-					NULL, NULL);
+	/* queue recovery for offline slots */
+	ocfs2_queue_replay_slots(osb);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
 		goto restart;
 	}
 
+	ocfs2_free_replay_slots(osb);
 	osb->recovery_thread_task = NULL;
 	mb(); /* sync with ocfs2_recovery_thread_running */
 	wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		goto done;
 	}
 
+	/* we need to run complete recovery for offline orphan slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
 	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
 	     node_num, slot_num,
 	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
 struct ocfs2_super;
 struct ocfs2_dinode;
 
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	unsigned int rm_used;
+	unsigned int *rm_entries;
+};
+
+
 struct ocfs2_journal {
 	enum ocfs2_journal_state   j_state;	/* Journals current state   */
 
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 int ocfs2_recovery_init(struct ocfs2_super *osb);
 void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
 /*
  * Journal Control:
  * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 /* dirblock */
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
 			    struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
 /* Anything that has no ecc */
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 }
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+ * bitmap block for the new bit) dx_root update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+	/* 1 block for index, 2 allocs (data, metadata), 1 clusters
+	 * worth of blocks for initial extent. */
+	return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+		ocfs2_clusters_to_blocks(sb, 1);
+}
 
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks + quota update */
-static inline int ocfs2_mknod_credits(struct super_block *sb)
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+				      int xattr_credits)
 {
-	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+	if (is_dir)
+		dir_credits += ocfs2_add_dir_index_credits(sb);
+
+	return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
 		ocfs2_quota_trans_credits(sb);
 }
 
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir */
+ * update on dir + index leaf + dx root update for free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
 	       ocfs2_quota_trans_credits(sb);
 }
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
+ * dir inode link + dir inode index leaf + dir index root */
 static inline int ocfs2_unlink_credits(struct super_block *sb)
 {
 	/* The quota update from ocfs2_link_credits is unused here... */
-	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
 }
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+ * inode alloc group descriptor + orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
 
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
+ * directory + target unlink + 3 x dir index leaves */
 static inline int ocfs2_rename_credits(struct super_block *sb)
 {
-	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
 }
 
 /* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
 					  + OCFS2_INODE_UPDATE_CREDITS \
 					  + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
 
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +	\
+				      OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+	int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+	credits += ocfs2_clusters_to_blocks(sb, 1);
+	credits += ocfs2_quota_trans_credits(sb);
+
+	return credits;
+}
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-	int blocks = ocfs2_mknod_credits(sb);
+	int blocks = ocfs2_mknod_credits(sb, 0, 0);
 
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
-#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-#ifdef CONFIG_OCFS2_FS_STATS
-
-static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
-#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
-#define LA_DEBUG_VER	1
-static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
-				   size_t count, loff_t *ppos)
-{
-	static DEFINE_MUTEX(la_debug_mutex);
-	struct ocfs2_super *osb = file->private_data;
-	int written, ret;
-	char *buf = osb->local_alloc_debug_buf;
-
-	mutex_lock(&la_debug_mutex);
-	memset(buf, 0, LA_DEBUG_BUF_SZ);
-
-	written = snprintf(buf, LA_DEBUG_BUF_SZ,
-			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
-			   LA_DEBUG_VER,
-			   (unsigned long long)osb->la_last_gd,
-			   osb->local_alloc_default_bits,
-			   osb->local_alloc_bits, osb->local_alloc_state);
-
-	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
-
-	mutex_unlock(&la_debug_mutex);
-	return ret;
-}
-
-static const struct file_operations ocfs2_la_debug_fops = {
-	.open = ocfs2_la_debug_open,
-	.read = ocfs2_la_debug_read,
-};
-
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
-	if (!osb->local_alloc_debug_buf)
-		return;
-
-	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
-						     S_IFREG|S_IRUSR,
-						     osb->osb_debug_root,
-						     osb,
-						     &ocfs2_la_debug_fops);
-	if (!osb->local_alloc_debug) {
-		kfree(osb->local_alloc_debug_buf);
-		osb->local_alloc_debug_buf = NULL;
-	}
-}
-
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	if (osb->local_alloc_debug)
-		debugfs_remove(osb->local_alloc_debug);
-
-	if (osb->local_alloc_debug_buf)
-		kfree(osb->local_alloc_debug_buf);
-
-	osb->local_alloc_debug_buf = NULL;
-	osb->local_alloc_debug = NULL;
-}
-#else	/* CONFIG_OCFS2_FS_STATS */
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-#endif
-
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	ocfs2_init_la_debug(osb);
-
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -299,9 +218,6 @@ bail:
 	if (inode)
 		iput(inode);
 
-	if (status < 0)
-		ocfs2_shutdown_la_debug(osb);
-
 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
-	ocfs2_shutdown_la_debug(osb);
-
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
-				    struct buffer_head **de_bh);
+				    struct ocfs2_dir_lookup_result *lookup);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
-			    struct buffer_head *de_bh,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dirfe;
 	struct buffer_head *new_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *xattr_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	int want_clusters = 0;
+	int want_meta = 0;
 	int xattr_credits = 0;
 	struct ocfs2_security_xattr_info si = {
 		.enable = 1,
 	};
 	int did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
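From here on, every name lookup and insert-preparation call in this file trades a bare buffer_head for a struct ocfs2_dir_lookup_result, which bundles whatever buffers an indexed or unindexed directory walk had to pin; results start life as { NULL, } and are released through ocfs2_free_dir_lookup_result() on every exit path. The struct itself is defined in fs/ocfs2/dir.h and is not part of this diff; the sketch below is only a plausible shape, and every field name in it is an assumption:

/* Hedged sketch of the lookup-result bundle (real definition in dir.h). */
struct dir_lookup_result_sketch {
	struct buffer_head	*dl_leaf_bh;	/* unindexed dirent block */
	struct ocfs2_dir_entry	*dl_entry;	/* points into dl_leaf_bh */
	struct buffer_head	*dl_dx_leaf_bh;	/* indexed-tree leaf, if any */
};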
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
 		return status;
 	}
 
-	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+	if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
 		status = -EMLINK;
 		goto leave;
 	}
 
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	if (!dirfe->i_links_count) {
+	if (!ocfs2_read_links_count(dirfe)) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
 	/* get a spot inside the dir. */
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
-					      dentry->d_name.len, &de_bh);
+					      dentry->d_name.len, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
 
 	/* calculate meta data/clusters for setting security and acl xattr */
 	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
 				       &si, &want_clusters,
-				       &xattr_credits, &xattr_ac);
+				       &xattr_credits, &want_meta);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
 		want_clusters += 1;
 
+		/* Dir indexing requires extra space as well */
+		if (ocfs2_supports_indexed_dirs(osb))
+			want_meta++;
+	}
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
 	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
 			goto leave;
 	}
 
-	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
-				   xattr_credits);
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+							    S_ISDIR(mode),
+							    xattr_credits));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
-					    new_fe_bh, data_ac);
+					    new_fe_bh, data_ac, meta_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		le16_add_cpu(&dirfe->i_links_count, 1);
+		ocfs2_add_links_count(dirfe, 1);
 		status = ocfs2_journal_dirty(handle, parent_fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 
 	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-				xattr_ac, data_ac);
+				meta_ac, data_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (si.enable) {
 		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
-						 xattr_ac, data_ac);
+						 meta_ac, data_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-				 de_bh);
+				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -423,11 +437,12 @@ leave:
 		mlog(0, "Disk is full\n");
 
 	brelse(new_fe_bh);
-	brelse(de_bh);
 	brelse(parent_fe_bh);
 	kfree(si.name);
 	kfree(si.value);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
@@ -439,8 +454,8 @@ leave:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
-	if (xattr_ac)
-		ocfs2_free_alloc_context(xattr_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
 
 	mlog_exit(status);
 
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
+	u16 feat;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
 		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 	*new_fe_bh = NULL;
 
-	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
-				       &fe_blkno);
+	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_bit, &fe_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_mode = cpu_to_le16(inode->i_mode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_dtime = 0;
 
 	/*
-	 * If supported, directories start with inline data.
+	 * If supported, directories start with inline data. If inline
+	 * isn't supported, but indexing is, we start them as indexed.
 	 */
+	feat = le16_to_cpu(fe->i_dyn_features);
 	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
-		u16 feat = le16_to_cpu(fe->i_dyn_features);
-
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 
 		fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 	int err;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					   dentry->d_name.name,
-					   dentry->d_name.len, &de_bh);
+					   dentry->d_name.len, &lookup);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	}
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+	if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
 		err = -EMLINK;
 		goto out_unlock_inode;
 	}
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_add_entry(handle, dentry, inode,
 			      OCFS2_I(inode)->ip_blkno,
-			      parent_fe_bh, de_bh);
+			      parent_fe_bh, &lookup);
 	if (err) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	brelse(de_bh);
 	brelse(fe_bh);
 	brelse(parent_fe_bh);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	mlog_exit(err);
 
 	return err;
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_node_bh = NULL;
 	handle_t *handle = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
-	struct buffer_head *dirent_bh = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-	struct buffer_head *orphan_entry_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	status = ocfs2_find_files_on_disk(dentry->d_name.name,
-					  dentry->d_name.len, &blkno,
-					  dir, &dirent_bh, &dirent);
+					  dentry->d_name.len, &blkno, dir,
+					  &lookup);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
 	child_locked = 1;
 
 	if (S_ISDIR(inode->i_mode)) {
-		if (!ocfs2_empty_dir(inode)) {
-			status = -ENOTEMPTY;
-			goto leave;
-		} else if (inode->i_nlink != 2) {
+		if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
 			status = -ENOTEMPTY;
 			goto leave;
 		}
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
-						  orphan_name,
-						  &orphan_entry_bh);
+						  orphan_name, &orphan_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
-					  orphan_entry_bh, orphan_dir);
+					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	/* delete the name from the parent dir */
-	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	status = ocfs2_delete_entry(handle, dir, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(inode);
 	drop_nlink(inode);
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
@@ -916,9 +929,10 @@ leave:
 	}
 
 	brelse(fe_bh);
-	brelse(dirent_bh);
 	brelse(parent_node_bh);
-	brelse(orphan_entry_bh);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(status);
 
@@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
 			struct inode *new_dir,
 			struct dentry *new_dentry)
 {
-	int status = 0, rename_lock = 0, parents_locked = 0;
-	int old_child_locked = 0, new_child_locked = 0;
+	int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+	int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *orphan_dir = NULL;
@@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
-	struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
-		*new_de = NULL;
-	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
-	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
-						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink;
 	struct ocfs2_dinode *old_di;
+	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+	struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct ocfs2_dir_lookup_result target_insert = { NULL, };
 
 	/* At some point it might be nice to break this function up a
 	 * bit. */
@@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
 	if (S_ISDIR(old_inode->i_mode)) {
 		u64 old_inode_parent;
 
+		update_dot_dot = 1;
 		status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
-						  old_inode, &old_inode_de_bh,
-						  &old_inode_dot_dot_de);
+						  old_inode,
+						  &old_inode_dot_dot_res);
 		if (status) {
 			status = -EIO;
 			goto bail;
@@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	}
 
 	if (!new_inode && new_dir != old_dir &&
-	    new_dir->i_nlink >= OCFS2_LINK_MAX) {
+	    new_dir->i_nlink >= ocfs2_link_max(osb)) {
 		status = -EMLINK;
 		goto bail;
 	}
@@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * to delete it */
 	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
 					  new_dentry->d_name.len,
-					  &newfe_blkno, new_dir, &new_de_bh,
-					  &new_de);
+					  &newfe_blkno, new_dir,
+					  &target_lookup_res);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		mlog_errno(status);
 		goto bail;
 	}
+	if (status == 0)
+		target_exists = 1;
 
-	if (!new_de && new_inode) {
+	if (!target_exists && new_inode) {
 		/*
 		 * Target was unlinked by another node while we were
 		 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
-	if (new_de) {
+	if (target_exists) {
 		/* VFS didn't think there existed an inode here, but
 		 * someone else in the cluster must have raced our
 		 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-		mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
-		     "newfebh=%p bhblocknr=%llu\n", new_de,
+		mlog(0, "aha rename over existing... new_blkno=%llu "
+		     "newfebh=%p bhblocknr=%llu\n",
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 							  new_inode,
 							  orphan_name,
-							  &orphan_entry_bh);
+							  &orphan_insert);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
 						      new_dentry->d_name.name,
 						      new_dentry->d_name.len,
-						      &insert_entry_bh);
+						      &target_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		goto bail;
 	}
 
-	if (new_de) {
+	if (target_exists) {
 		if (S_ISDIR(new_inode->i_mode)) {
-			if (!ocfs2_empty_dir(new_inode) ||
-			    new_inode->i_nlink != 2) {
+			if (new_inode->i_nlink != 2 ||
+			    !ocfs2_empty_dir(new_inode)) {
 				status = -ENOTEMPTY;
 				goto bail;
 			}
@@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		if (S_ISDIR(new_inode->i_mode) ||
-		    (newfe->i_links_count == cpu_to_le16(1))){
+		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
 						  newfe, orphan_name,
-						  orphan_entry_bh, orphan_dir);
+						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		/* change the dirent to point to the correct inode */
-		status = ocfs2_update_entry(new_dir, handle, new_de_bh,
-					    new_de, old_inode);
+		status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+					    old_inode);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_dir->i_version++;
 
 		if (S_ISDIR(new_inode->i_mode))
-			newfe->i_links_count = 0;
+			ocfs2_set_links_count(newfe, 0);
 		else
-			le16_add_cpu(&newfe->i_links_count, -1);
+			ocfs2_add_links_count(newfe, -1);
 
 		status = ocfs2_journal_dirty(handle, newfe_bh);
 		if (status < 0) {
@@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs2_add_entry(handle, new_dentry, old_inode,
 					 OCFS2_I(old_inode)->ip_blkno,
-					 new_dir_bh, insert_entry_bh);
+					 new_dir_bh, &target_insert);
 	}
 
 	old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * because the insert might have changed the type of directory
 	 * we're dealing with.
 	 */
-	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
-				     old_dentry->d_name.len,
-				     old_dir, &old_de);
-	if (!old_de_bh) {
-		status = -EIO;
+	status = ocfs2_find_entry(old_dentry->d_name.name,
+				  old_dentry->d_name.len, old_dir,
+				  &old_entry_lookup);
+	if (status)
 		goto bail;
-	}
 
-	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_inode->i_ctime = CURRENT_TIME;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-	if (old_inode_de_bh) {
-		status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
-					    old_inode_dot_dot_de, new_dir);
+
+	if (update_dot_dot) {
+		status = ocfs2_update_entry(old_inode, handle,
+					    &old_inode_dot_dot_res, new_dir);
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
@@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
 		} else {
 			struct ocfs2_dinode *fe;
 			status = ocfs2_journal_access_di(handle, old_dir,
 							 old_dir_bh,
 							 OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
-			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+			ocfs2_set_links_count(fe, old_dir->i_nlink);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
 		}
 	}
-
 	ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
 	status = 0;
 bail:
@@ -1429,13 +1444,17 @@ bail:
 
 	if (new_inode)
 		iput(new_inode);
+
+	ocfs2_free_dir_lookup_result(&target_lookup_res);
+	ocfs2_free_dir_lookup_result(&old_entry_lookup);
+	ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&target_insert);
+
 	brelse(newfe_bh);
 	brelse(old_inode_bh);
 	brelse(old_dir_bh);
 	brelse(new_dir_bh);
-	brelse(new_de_bh);
-	brelse(old_de_bh);
-	brelse(old_inode_de_bh);
 	brelse(orphan_entry_bh);
 	brelse(insert_entry_bh);
 
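ocfs2_rename() now carries five independent lookup results, and the bail path frees all of them unconditionally; that only works because a result that was never filled in still holds its { NULL, } initializer and the free helper must tolerate that. A hedged sketch of the contract, reusing the illustrative struct from the earlier note (brelse(NULL) is a no-op in the kernel):

/* Hedged sketch: a free helper that is safe on never-filled results. */
static void sketch_free_dir_lookup_result(struct dir_lookup_result_sketch *res)
{
	if (!res)
		return;

	brelse(res->dl_leaf_bh);	/* tolerates NULL */
	brelse(res->dl_dx_leaf_bh);
	memset(res, 0, sizeof(*res));	/* safe to free again later */
}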
@@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
 	struct inode *inode = NULL;
 	struct super_block *sb;
 	struct buffer_head *new_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_dinode *dirfe;
@@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
 		.enable = 1,
 	};
 	int did_quota = 0, did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
 	}
 
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	if (!dirfe->i_links_count) {
+	if (!ocfs2_read_links_count(dirfe)) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto bail;
@@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
1605 1624
1606 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1625 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1607 dentry->d_name.name, 1626 dentry->d_name.name,
1608 dentry->d_name.len, &de_bh); 1627 dentry->d_name.len, &lookup);
1609 if (status < 0) { 1628 if (status < 0) {
1610 mlog_errno(status); 1629 mlog_errno(status);
1611 goto bail; 1630 goto bail;
@@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
1744 1763
1745 status = ocfs2_add_entry(handle, dentry, inode, 1764 status = ocfs2_add_entry(handle, dentry, inode,
1746 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1765 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1747 de_bh); 1766 &lookup);
1748 if (status < 0) { 1767 if (status < 0) {
1749 mlog_errno(status); 1768 mlog_errno(status);
1750 goto bail; 1769 goto bail;
@@ -1772,9 +1791,9 @@ bail:
1772 1791
1773 brelse(new_fe_bh); 1792 brelse(new_fe_bh);
1774 brelse(parent_fe_bh); 1793 brelse(parent_fe_bh);
1775 brelse(de_bh);
1776 kfree(si.name); 1794 kfree(si.name);
1777 kfree(si.value); 1795 kfree(si.value);
1796 ocfs2_free_dir_lookup_result(&lookup);
1778 if (inode_ac) 1797 if (inode_ac)
1779 ocfs2_free_alloc_context(inode_ac); 1798 ocfs2_free_alloc_context(inode_ac);
1780 if (data_ac) 1799 if (data_ac)
@@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1826 struct inode **ret_orphan_dir, 1845 struct inode **ret_orphan_dir,
1827 struct inode *inode, 1846 struct inode *inode,
1828 char *name, 1847 char *name,
1829 struct buffer_head **de_bh) 1848 struct ocfs2_dir_lookup_result *lookup)
1830{ 1849{
1831 struct inode *orphan_dir_inode; 1850 struct inode *orphan_dir_inode;
1832 struct buffer_head *orphan_dir_bh = NULL; 1851 struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1857 1876
1858 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1877 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1859 orphan_dir_bh, name, 1878 orphan_dir_bh, name,
1860 OCFS2_ORPHAN_NAMELEN, de_bh); 1879 OCFS2_ORPHAN_NAMELEN, lookup);
1861 if (status < 0) { 1880 if (status < 0) {
1862 ocfs2_inode_unlock(orphan_dir_inode, 1); 1881 ocfs2_inode_unlock(orphan_dir_inode, 1);
1863 1882
@@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1884 struct inode *inode, 1903 struct inode *inode,
1885 struct ocfs2_dinode *fe, 1904 struct ocfs2_dinode *fe,
1886 char *name, 1905 char *name,
1887 struct buffer_head *de_bh, 1906 struct ocfs2_dir_lookup_result *lookup,
1888 struct inode *orphan_dir_inode) 1907 struct inode *orphan_dir_inode)
1889{ 1908{
1890 struct buffer_head *orphan_dir_bh = NULL; 1909 struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 * underneath us... */ 1929 * underneath us... */
1911 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1930 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1912 if (S_ISDIR(inode->i_mode)) 1931 if (S_ISDIR(inode->i_mode))
1913 le16_add_cpu(&orphan_fe->i_links_count, 1); 1932 ocfs2_add_links_count(orphan_fe, 1);
1914 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1933 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1915 1934
1916 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1935 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1917 if (status < 0) { 1936 if (status < 0) {
@@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1922 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1941 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1923 OCFS2_ORPHAN_NAMELEN, inode, 1942 OCFS2_ORPHAN_NAMELEN, inode,
1924 OCFS2_I(inode)->ip_blkno, 1943 OCFS2_I(inode)->ip_blkno,
1925 orphan_dir_bh, de_bh); 1944 orphan_dir_bh, lookup);
1926 if (status < 0) { 1945 if (status < 0) {
1927 mlog_errno(status); 1946 mlog_errno(status);
1928 goto leave; 1947 goto leave;
@@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1955 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1974 char name[OCFS2_ORPHAN_NAMELEN + 1];
1956 struct ocfs2_dinode *orphan_fe; 1975 struct ocfs2_dinode *orphan_fe;
1957 int status = 0; 1976 int status = 0;
1958 struct buffer_head *target_de_bh = NULL; 1977 struct ocfs2_dir_lookup_result lookup = { NULL, };
1959 struct ocfs2_dir_entry *target_de = NULL;
1960 1978
1961 mlog_entry_void(); 1979 mlog_entry_void();
1962 1980
@@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1971 OCFS2_ORPHAN_NAMELEN); 1989 OCFS2_ORPHAN_NAMELEN);
1972 1990
1973 /* find its spot in the orphan directory */ 1991
1974 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1992 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1975 orphan_dir_inode, &target_de); 1993 &lookup);
1976 if (!target_de_bh) { 1994 if (status) {
1977 status = -ENOENT;
1978 mlog_errno(status); 1995 mlog_errno(status);
1979 goto leave; 1996 goto leave;
1980 } 1997 }
1981 1998
1982 /* remove it from the orphan directory */ 1999 /* remove it from the orphan directory */
1983 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 2000 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1984 target_de_bh);
1985 if (status < 0) { 2001 if (status < 0) {
1986 mlog_errno(status); 2002 mlog_errno(status);
1987 goto leave; 2003 goto leave;
@@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1997 /* do the i_nlink dance! :) */ 2013 /* do the i_nlink dance! :) */
1998 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2014 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1999 if (S_ISDIR(inode->i_mode)) 2015 if (S_ISDIR(inode->i_mode))
2000 le16_add_cpu(&orphan_fe->i_links_count, -1); 2016 ocfs2_add_links_count(orphan_fe, -1);
2001 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2017 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2002 2018
2003 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2019 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2004 if (status < 0) { 2020 if (status < 0) {
@@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2007 } 2023 }
2008 2024
2009leave: 2025leave:
2010 brelse(target_de_bh); 2026 ocfs2_free_dir_lookup_result(&lookup);
2011 2027
2012 mlog_exit(status); 2028 mlog_exit(status);
2013 return status; 2029 return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
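
A minimal stand-alone model of the 16/16-bit split these helpers introduce (userspace C; the struct name, the le16 conversions, and the OCFS2_INDEXED_DIR_FL gate are all simplified away, so this only sketches the arithmetic):

/* Minimal model of the split link count: low 16 bits in i_links_count,
 * high 16 bits in i_links_count_hi. Endianness handling and the
 * dyn_features gate are deliberately omitted. */
#include <assert.h>
#include <stdint.h>

#define LINKS_HI_SHIFT 16

struct model_dinode {
	uint16_t links_count;     /* stands in for i_links_count */
	uint16_t links_count_hi;  /* stands in for i_links_count_hi */
};

static uint32_t model_read_links(const struct model_dinode *di)
{
	return di->links_count | ((uint32_t)di->links_count_hi << LINKS_HI_SHIFT);
}

static void model_set_links(struct model_dinode *di, uint32_t nlink)
{
	di->links_count = (uint16_t)nlink;
	di->links_count_hi = (uint16_t)(nlink >> LINKS_HI_SHIFT);
}

int main(void)
{
	struct model_dinode di;

	model_set_links(&di, 100000);	/* past the old OCFS2_LINK_MAX of 32000 */
	assert(model_read_links(&di) == 100000);
	return 0;
}

This split is what lets indexed directories raise the limit to OCFS2_DX_LINK_MAX ((1U << 31) - 1) while i_links_count alone capped it at 32000.
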
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
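
A quick stand-alone check of the rounding performed by the new ocfs2_block_to_cluster_start(), assuming an example geometry of 4K blocks and 32K clusters (neither value comes from this patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int blocksize_bits = 12;	/* assumed: 4096-byte blocks */
	int clustersize_bits = 15;	/* assumed: 32768-byte clusters */
	int bits = clustersize_bits - blocksize_bits;	/* 3 */
	uint64_t block = 21;		/* some block inside cluster 2 */
	uint64_t cluster = block >> bits;	/* blocks_to_clusters: 21 >> 3 == 2 */
	uint64_t start = cluster << bits;	/* first block of that cluster */

	assert(start == 16);		/* 2 << 3 */
	return 0;
}
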
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
800 * We also store name_len here so as to reduce the number of leaf blocks we
801 * need to search in case of collisions.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
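
A hedged sketch of the two-level lookup that dx_major_hash and dx_minor_hash are commented to support. The hash below is a throwaway placeholder, not the real ocfs2 name hash (which is seeded from s_dx_seed and s_uuid_hash), and the seed values are invented:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dx_hinfo {
	uint32_t major_hash;	/* picks the logical cluster in the index */
	uint32_t minor_hash;	/* picks the leaf block within that cluster */
};

/* Placeholder mix, NOT the on-disk ocfs2 hash. */
static struct dx_hinfo dx_hash_name(const char *name, size_t len,
				    const uint32_t seed[4])
{
	uint32_t h0 = seed[0] ^ seed[2], h1 = seed[1] ^ seed[3];

	for (size_t i = 0; i < len; i++) {
		h0 = h0 * 33 + (unsigned char)name[i];
		h1 = (h1 << 7) ^ (h1 >> 25) ^ h0;
	}
	return (struct dx_hinfo){ .major_hash = h0, .minor_hash = h1 };
}

int main(void)
{
	const uint32_t seed[4] = { 1, 2, 3, 4 };	/* stands in for osb_dx_seed */
	struct dx_hinfo h = dx_hash_name("foo.txt", strlen("foo.txt"), seed);

	/* Lookup outline: major_hash -> cluster of the index tree,
	 * minor_hash -> dx leaf block in that cluster, then scan
	 * de_entries and follow dx_dirent_blk into the unindexed
	 * tree for the full dirent. */
	printf("major=%08x minor=%08x\n", (unsigned)h.major_hash,
	       (unsigned)h.minor_hash);
	return 0;
}

The lookup ends by following dx_dirent_blk into the unindexed tree, which is why the full name never needs to live in the index itself.
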
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1113} 1209}
1114 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1222{
1117 int size; 1223 int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1133} 1239}
1134 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1262{
1137 u16 size; 1263 u16 size;
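
Because the entry lists end in a flexible array, the capacities returned by ocfs2_dx_entries_per_leaf() and ocfs2_dx_entries_per_root() are plain layout arithmetic. A userspace model for a 4K block follows; the mirrored layouts (including the 8-byte block check) are assumptions read off the definitions shown earlier, so treat the printed count as illustrative:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct model_block_check { uint32_t crc32e; uint16_t ecc; uint16_t rsvd; };

struct model_dx_entry {		/* mirrors ocfs2_dx_entry: 16 bytes */
	uint32_t major_hash;
	uint32_t minor_hash;
	uint64_t dirent_blk;
};

struct model_dx_entry_list {	/* mirrors ocfs2_dx_entry_list */
	uint32_t de_reserved;
	uint16_t de_count;
	uint16_t de_num_used;
	struct model_dx_entry de_entries[];
};

struct model_dx_leaf_hdr {	/* everything in ocfs2_dx_leaf before dl_list */
	uint8_t  dl_signature[8];
	struct model_block_check dl_check;
	uint64_t dl_blkno;
	uint32_t dl_fs_generation;
	uint32_t dl_reserved0;
	uint64_t dl_reserved1;
};

int main(void)
{
	size_t blocksize = 4096;	/* assumed example geometry */
	size_t off = sizeof(struct model_dx_leaf_hdr) +
		     offsetof(struct model_dx_entry_list, de_entries);
	size_t per_leaf = (blocksize - off) / sizeof(struct model_dx_entry);

	printf("%zu-byte header, %zu dx entries per leaf\n", off, per_leaf);
	return 0;
}

With these assumptions the header works out to 48 bytes and a 4K leaf holds 253 entries.
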
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
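
The change from 1 to 0x1 is not cosmetic: ALLOC_NEW_GROUP now lives in a bit mask alongside ALLOC_GROUPS_FROM_GLOBAL, and the hunks below test it with flags & ALLOC_NEW_GROUP instead of comparing for equality. A trivial stand-alone illustration:

#include <assert.h>

#define NOT_ALLOC_NEW_GROUP      0
#define ALLOC_NEW_GROUP          0x1
#define ALLOC_GROUPS_FROM_GLOBAL 0x2

int main(void)
{
	int flags = ALLOC_NEW_GROUP | ALLOC_GROUPS_FROM_GLOBAL;

	assert(flags & ALLOC_NEW_GROUP);          /* may allocate a new group */
	assert(flags & ALLOC_GROUPS_FROM_GLOBAL); /* draw from the global bitmap */
	return 0;
}
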
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
1662 * If the parent dir has recorded the last group used in allocation,
1663 * cool, use it. Otherwise, if the new inode is being allocated from the
1664 * same slot the parent dir belongs to, use the same chunk.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", blkno);
2201
2202 /* raw read from disk; no cluster lock is held, so the data may be stale */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
2206 goto bail;
2207 }
2208
2209 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2210 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2211 mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
2212 status = -EINVAL;
2213 goto bail;
2214 }
2215
2216 if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
2217 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2218 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2219 blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2220 status = -EINVAL;
2221 goto bail;
2222 }
2223
2224 if (suballoc_slot)
2225 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2226 if (suballoc_bit)
2227 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2228
2229bail:
2230 brelse(inode_bh);
2231
2232 mlog_exit(status);
2233 return status;
2234}
2235
2236/*
2237 * Test whether the bit is set in the allocator bitmap. On success, 0
2238 * is returned and *res is 1 if the bit is set, 0 otherwise. On failure,
2239 * a negative errno is returned and *res is meaningless. Call this with
2240 * the suballocator cluster locked, or you may get a result based on
2241 * out-of-date contents.
2242 */
2243static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2244 struct inode *suballoc,
2245 struct buffer_head *alloc_bh, u64 blkno,
2246 u16 bit, int *res)
2247{
2248 struct ocfs2_dinode *alloc_fe;
2249 struct ocfs2_group_desc *group;
2250 struct buffer_head *group_bh = NULL;
2251 u64 bg_blkno;
2252 int status;
2253
2254 mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
2255
2256 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2257 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2258 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2259 (unsigned int)bit,
2260 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2261 status = -EINVAL;
2262 goto bail;
2263 }
2264
2265 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2266 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2267 &group_bh);
2268 if (status < 0) {
2269 mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
2270 goto bail;
2271 }
2272
2273 group = (struct ocfs2_group_desc *) group_bh->b_data;
2274 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2275
2276bail:
2277 brelse(group_bh);
2278
2279 mlog_exit(status);
2280 return status;
2281}
2282
2283/*
2284 * Test if the bit representing this inode (blkno) is set in the
2285 * suballocator.
2286 *
2287 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2288 *
2289 * In the event of failure, a negative value is returned and *res is
2290 * meaningless.
2291 *
2292 * Callers must make sure to hold nfs_sync_lock to prevent
2293 * ocfs2_delete_inode() on another node from accessing the same
2294 * suballocator concurrently.
2295 */
2296int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2297{
2298 int status;
2299 u16 suballoc_bit = 0, suballoc_slot = 0;
2300 struct inode *inode_alloc_inode;
2301 struct buffer_head *alloc_bh = NULL;
2302
2303 mlog_entry("blkno: %llu\n", blkno);
2304
2305 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2306 &suballoc_bit);
2307 if (status < 0) {
2308 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2309 goto bail;
2310 }
2311
2312 inode_alloc_inode =
2313 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2314 suballoc_slot);
2315 if (!inode_alloc_inode) {
2316 /* the error code may be inaccurate, but we have no way to
2317 * determine the correct one. */
2318 status = -EINVAL;
2319 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2320 (u32)suballoc_slot);
2321 goto bail;
2322 }
2323
2324 mutex_lock(&inode_alloc_inode->i_mutex);
2325 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2326 if (status < 0) {
2327 mutex_unlock(&inode_alloc_inode->i_mutex);
2328 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2329 (u32)suballoc_slot, status);
2330 goto bail;
2331 }
2332
2333 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2334 blkno, suballoc_bit, res);
2335 if (status < 0)
2336 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2337
2338 ocfs2_inode_unlock(inode_alloc_inode, 0);
2339 mutex_unlock(&inode_alloc_inode->i_mutex);
2340
2341 iput(inode_alloc_inode);
2342 brelse(alloc_bh);
2343bail:
2344 mlog_exit(status);
2345 return status;
2346}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
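
ocfs2_osb_dump() uses the incremental-snprintf idiom, and ocfs2_osb_debug_open() publishes the final length through i_size_write() so that ocfs2_debug_read() can defer to simple_read_from_buffer(). A stand-alone reduction of the accumulation idiom (field values invented; it relies on the buffer being large enough, since len - out going negative would break the pattern):

#include <stdio.h>

int main(void)
{
	char buf[256];
	int out = 0, len = sizeof(buf);

	/* Each snprintf returns the length it wrote, advancing 'out'. */
	out += snprintf(buf + out, len - out, "%10s => Id: %s\n",
			"Device", "sda1");		/* invented values */
	out += snprintf(buf + out, len - out, "%10s => State: %d\n",
			"Volume", 0);
	fwrite(buf, 1, out, stdout);
	return 0;
}

In the kernel code the buffer is a single page, which bounds 'out' in practice.
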
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
262{ 262{
263 struct super_block *s = dentry->d_sb; 263 struct super_block *s = dentry->d_sb;
264 struct omfs_sb_info *sbi = OMFS_SB(s); 264 struct omfs_sb_info *sbi = OMFS_SB(s);
265 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
266
265 buf->f_type = OMFS_MAGIC; 267 buf->f_type = OMFS_MAGIC;
266 buf->f_bsize = sbi->s_blocksize; 268 buf->f_bsize = sbi->s_blocksize;
267 buf->f_blocks = sbi->s_num_blocks; 269 buf->f_blocks = sbi->s_num_blocks;
268 buf->f_files = sbi->s_num_blocks; 270 buf->f_files = sbi->s_num_blocks;
269 buf->f_namelen = OMFS_NAMELEN; 271 buf->f_namelen = OMFS_NAMELEN;
272 buf->f_fsid.val[0] = (u32)id;
273 buf->f_fsid.val[1] = (u32)(id >> 32);
270 274
271 buf->f_bfree = buf->f_bavail = buf->f_ffree = 275 buf->f_bfree = buf->f_bavail = buf->f_ffree =
272 omfs_count_free(s); 276 omfs_count_free(s);
277
273 return 0; 278 return 0;
274} 279}
275 280
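
This hunk, like the qnx4 one further down, publishes an fsid by splitting the 64-bit encoded device number across the two 32-bit halves of f_fsid.val. A stand-alone check of that split (the id value is an arbitrary stand-in, not a real huge_encode_dev() result):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t id = 0x0000000800000021ULL;	/* arbitrary encoded device */
	uint32_t lo = (uint32_t)id;		/* -> f_fsid.val[0] */
	uint32_t hi = (uint32_t)(id >> 32);	/* -> f_fsid.val[1] */

	assert((((uint64_t)hi << 32) | lo) == id);	/* lossless */
	return 0;
}
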
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
421 426
422 sbi->s_uid = current_uid(); 427 sbi->s_uid = current_uid();
423 sbi->s_gid = current_gid(); 428 sbi->s_gid = current_gid();
424 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 429 sbi->s_dmask = sbi->s_fmask = current_umask();
425 430
426 if (!parse_options((char *) data, sbi)) 431 if (!parse_options((char *) data, sbi))
427 goto end; 432 goto end;
diff --git a/fs/open.c b/fs/open.c
index 75b61677daaf..377eb25b6abf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/audit.h> 30#include <linux/audit.h>
31#include <linux/falloc.h> 31#include <linux/falloc.h>
32#include <linux/fs_struct.h>
32 33
33int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
34{ 35{
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b688..f71559784bfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
80#include <linux/oom.h> 80#include <linux/oom.h>
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h>
83#include "internal.h" 84#include "internal.h"
84 85
85/* NOTE: 86/* NOTE:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..74ea974f5ca6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
120 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
121#endif 121#endif
122#ifndef CONFIG_MMU 122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)), 123 K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
124#endif 124#endif
125 K(i.totalswap), 125 K(i.totalswap),
126 K(i.freeswap), 126 K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
76 76
77/* 77/*
78 * display a list of all the REGIONs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_region_list_show(struct seq_file *m, void *_p) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..863464d5519c 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/fdtable.h> 4#include <linux/fdtable.h>
5#include <linux/fs_struct.h>
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/ptrace.h> 7#include <linux/ptrace.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
49 else 50 else
50 bytes += kobjsize(mm); 51 bytes += kobjsize(mm);
51 52
52 if (current->fs && atomic_read(&current->fs->count) > 1) 53 if (current->fs && current->fs->users > 1)
53 sbytes += kobjsize(current->fs); 54 sbytes += kobjsize(current->fs);
54 else 55 else
55 bytes += kobjsize(current->fs); 56 bytes += kobjsize(current->fs);
@@ -136,14 +137,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
136 } 137 }
137 138
138 seq_printf(m, 139 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 140 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
140 vma->vm_start, 141 vma->vm_start,
141 vma->vm_end, 142 vma->vm_end,
142 flags & VM_READ ? 'r' : '-', 143 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-', 144 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-', 145 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 146 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT, 147 (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
147 MAJOR(dev), MINOR(dev), ino, &len); 148 MAJOR(dev), MINOR(dev), ino, &len);
148 149
149 if (file) { 150 if (file) {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) 282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
283{ 283{
284 struct super_block *sb = dentry->d_sb; 284 struct super_block *sb = dentry->d_sb;
285 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
285 286
286 lock_kernel(); 287 lock_kernel();
287 288
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
291 buf->f_bfree = qnx4_count_free_blocks(sb); 292 buf->f_bfree = qnx4_count_free_blocks(sb);
292 buf->f_bavail = buf->f_bfree; 293 buf->f_bavail = buf->f_bfree;
293 buf->f_namelen = QNX4_NAME_MAX; 294 buf->f_namelen = QNX4_NAME_MAX;
295 buf->f_fsid.val[0] = (u32)id;
296 buf->f_fsid.val[1] = (u32)(id >> 32);
294 297
295 unlock_kernel(); 298 unlock_kernel();
296 299
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2ca967a5ef77..607c579e5eca 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
823 823
824 spin_lock(&inode_lock); 824 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 827 continue;
828 if (!atomic_read(&inode->i_writecount)) 828 if (!atomic_read(&inode->i_writecount))
829 continue; 829 continue;
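
add_dquot_ref() widens its skip set with I_CLEAR: an inode that has already passed through clear_inode() must not gain a new dquot reference, any more than one that is I_FREEING or I_WILL_FREE. The guard idiom, with the mask pulled out into a named constant for clarity (the constant name is illustrative only):

	/* States in which an inode must not be given dquot references. */
	#define DQUOT_SKIP_STATES (I_FREEING | I_CLEAR | I_WILL_FREE | I_NEW)

		if (inode->i_state & DQUOT_SKIP_STATES)
			continue;
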
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
731 return ret; 731 return ret;
732} 732}
733 733
734static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
735{
736#define HALF_LONG_BITS (BITS_PER_LONG / 2)
737 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
738}
739
740SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
741 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
742{
743 loff_t pos = pos_from_hilo(pos_h, pos_l);
744 struct file *file;
745 ssize_t ret = -EBADF;
746 int fput_needed;
747
748 if (pos < 0)
749 return -EINVAL;
750
751 file = fget_light(fd, &fput_needed);
752 if (file) {
753 ret = -ESPIPE;
754 if (file->f_mode & FMODE_PREAD)
755 ret = vfs_readv(file, vec, vlen, &pos);
756 fput_light(file, fput_needed);
757 }
758
759 if (ret > 0)
760 add_rchar(current, ret);
761 inc_syscr(current);
762 return ret;
763}
764
765SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
766 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
767{
768 loff_t pos = pos_from_hilo(pos_h, pos_l);
769 struct file *file;
770 ssize_t ret = -EBADF;
771 int fput_needed;
772
773 if (pos < 0)
774 return -EINVAL;
775
776 file = fget_light(fd, &fput_needed);
777 if (file) {
778 ret = -ESPIPE;
779 if (file->f_mode & FMODE_PWRITE)
780 ret = vfs_writev(file, vec, vlen, &pos);
781 fput_light(file, fput_needed);
782 }
783
784 if (ret > 0)
785 add_wchar(current, ret);
786 inc_syscw(current);
787 return ret;
788}
789
734static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 790static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
735 size_t count, loff_t max) 791 size_t count, loff_t max)
736{ 792{
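
The new preadv/pwritev syscalls take the file offset as two unsigned long halves so one ABI serves 32-bit and 64-bit tasks: a 32-bit libc passes the low and high words separately, while a 64-bit libc passes the whole offset in pos_l and zero in pos_h. pos_from_hilo() deliberately shifts by HALF_LONG_BITS twice, because a single shift by BITS_PER_LONG (64 on a 64-bit build) would be undefined behavior in C. A self-contained demo of the recombination on a 64-bit build (the trailing underscores mark local stand-ins, not kernel identifiers):

	#include <stdint.h>
	#include <stdio.h>

	#define BITS_PER_LONG_ 64		/* assume a 64-bit build */
	#define HALF_LONG_BITS_ (BITS_PER_LONG_ / 2)

	/* Mirrors pos_from_hilo(): two half-width shifts keep the code
	 * defined even when the full shift width equals the type width. */
	static int64_t pos_from_hilo_(unsigned long high, unsigned long low)
	{
		return (((int64_t)high << HALF_LONG_BITS_) << HALF_LONG_BITS_) | low;
	}

	int main(void)
	{
		int64_t pos = 0x1122334455667788LL;
		unsigned long lo = (unsigned long)pos;	/* 64-bit caller: full offset */
		unsigned long hi = 0;			/* 64-bit caller: no high part */

		printf("%d\n", pos_from_hilo_(hi, lo) == pos);	/* prints 1 */
		return 0;
	}
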
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
1config REISERFS_FS 1config REISERFS_FS
2 tristate "Reiserfs support" 2 tristate "Reiserfs support"
3 select CRC32
3 help 4 help
4 Stores not just filenames but the files themselves in a balanced 5 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling. 6 tree. Uses journalling.
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 972250c62896..0ae6486d9046 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h>
30 31
31struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
32 33
@@ -1904,6 +1905,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1904 buf->f_bsize = dentry->d_sb->s_blocksize; 1905 buf->f_bsize = dentry->d_sb->s_blocksize;
1905 /* changed to accommodate gcc folks. */ 1906 /* changed to accommodate gcc folks. */
1906 buf->f_type = REISERFS_SUPER_MAGIC; 1907 buf->f_type = REISERFS_SUPER_MAGIC;
1908 buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
1909 buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
1910 sizeof(rs->s_uuid)/2);
1911
1907 return 0; 1912 return 0;
1908} 1913}
1909 1914
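
Unlike the block-device-derived ids above, reiserfs builds f_fsid from the superblock's on-disk UUID, hashing each 8-byte half of rs->s_uuid with crc32_le() (hence the new CRC32 select and crc32.h include). A userspace sketch of the computation, assuming crc32_le()'s usual semantics (reflected polynomial 0xEDB88320, caller-supplied seed, no final inversion):

	#include <stdint.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Bitwise stand-in for the kernel's crc32_le(seed, buf, len). */
	static uint32_t crc32_le_(uint32_t crc, const uint8_t *p, size_t len)
	{
		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
		}
		return crc;
	}

	int main(void)
	{
		uint8_t uuid[16] = { 0xde, 0xad, 0xbe, 0xef };	/* sample UUID */
		uint32_t fsid0 = crc32_le_(0, uuid, sizeof(uuid) / 2);
		uint32_t fsid1 = crc32_le_(0, uuid + sizeof(uuid) / 2,
					   sizeof(uuid) / 2);

		printf("f_fsid = %08x %08x\n", fsid0, fsid1);
		return 0;
	}
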
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d1..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
428 } else { 428 } else {
429 apply_umask: 429 apply_umask:
430 /* no ACL, apply umask */ 430 /* no ACL, apply umask */
431 inode->i_mode &= ~current->fs->umask; 431 inode->i_mode &= ~current_umask();
432 } 432 }
433 433
434 return err; 434 return err;
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..dd727d43e5b7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
59 */ 59 */
60 wait_on_page_writeback(page); 60 wait_on_page_writeback(page);
61 61
62 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 62 if (page_has_private(page) &&
63 !try_to_release_page(page, GFP_KERNEL))
63 goto out_unlock; 64 goto out_unlock;
64 65
65 /* 66 /*
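
The splice steal path switches from PagePrivate() to page_has_private() because, with FS-Cache in the tree, a page may carry private state on PG_private_2 (used for cache-backed pages) as well as the classic PG_private (buffer heads); both must block page stealing. A sketch of the helper this hunk adopts, assuming its 2.6.30 definition in include/linux/page-flags.h:

	#define PAGE_FLAGS_PRIVATE				\
		((1UL << PG_private) | (1UL << PG_private_2))

	static inline int page_has_private(struct page *page)
	{
		return !!(page->flags & PAGE_FLAGS_PRIVATE);
	}
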
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 681ec0d83799..ffa6edcd2d0c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -301,6 +301,7 @@ failure:
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) 301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{ 302{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; 303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
304 u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
304 305
305 TRACE("Entered squashfs_statfs\n"); 306 TRACE("Entered squashfs_statfs\n");
306 307
@@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
311 buf->f_files = msblk->inodes; 312 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0; 313 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN; 314 buf->f_namelen = SQUASHFS_NAME_LEN;
315 buf->f_fsid.val[0] = (u32)id;
316 buf->f_fsid.val[1] = (u32)(id >> 32);
314 317
315 return 0; 318 return 0;
316} 319}
diff --git a/fs/super.c b/fs/super.c
index 2ba481518ba7..77cb4ec919b9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb)
287 __fsync_super(sb); 287 __fsync_super(sb);
288 return sync_blockdev(sb->s_bdev); 288 return sync_blockdev(sb->s_bdev);
289} 289}
290EXPORT_SYMBOL_GPL(fsync_super);
290 291
291/** 292/**
292 * generic_shutdown_super - common helper for ->kill_sb() 293 * generic_shutdown_super - common helper for ->kill_sb()
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
90{ 90{
91 struct super_block *sb = dentry->d_sb; 91 struct super_block *sb = dentry->d_sb;
92 struct sysv_sb_info *sbi = SYSV_SB(sb); 92 struct sysv_sb_info *sbi = SYSV_SB(sb);
93 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
93 94
94 buf->f_type = sb->s_magic; 95 buf->f_type = sb->s_magic;
95 buf->f_bsize = sb->s_blocksize; 96 buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
98 buf->f_files = sbi->s_ninodes; 99 buf->f_files = sbi->s_ninodes;
99 buf->f_ffree = sysv_count_free_inodes(sb); 100 buf->f_ffree = sysv_count_free_inodes(sb);
100 buf->f_namelen = SYSV_NAMELEN; 101 buf->f_namelen = SYSV_NAMELEN;
102 buf->f_fsid.val[0] = (u32)id;
103 buf->f_fsid.val[1] = (u32)(id >> 32);
101 return 0; 104 return 0;
102} 105}
103 106
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
22 depends on UBIFS_FS 22 depends on UBIFS_FS
23 help 23 help
24 This option allows to explicitly choose which compressions, if any, 24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inbility to read 25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems. 26 existing file systems.
27 27
28 If unsure, say 'N'. 28 If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
32 depends on UBIFS_FS 32 depends on UBIFS_FS
33 default y 33 default y
34 help 34 help
35 LZO compressor is generally faster then zlib but compresses worse. 35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure. 36 Say 'Y' if unsure.
37 37
38config UBIFS_FS_ZLIB 38config UBIFS_FS_ZLIB
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 2bb788a2acb1..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
87{ 87{
88 struct buffer_head *bh = NULL; 88 struct buffer_head *bh = NULL;
89 int retval = 0; 89 int retval = 0;
90 kernel_lb_addr loc; 90 struct kernel_lb_addr loc;
91 91
92 loc.logicalBlockNum = bitmap->s_extPosition; 92 loc.logicalBlockNum = bitmap->s_extPosition;
93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
94 94
95 bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); 95 bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
96 if (!bh) 96 if (!bh)
97 retval = -EIO; 97 retval = -EIO;
98 98
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
140 return slot; 140 return slot;
141} 141}
142 142
143static bool udf_add_free_space(struct udf_sb_info *sbi, 143static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
144 u16 partition, u32 cnt)
145{ 144{
145 struct udf_sb_info *sbi = UDF_SB(sb);
146 struct logicalVolIntegrityDesc *lvid; 146 struct logicalVolIntegrityDesc *lvid;
147 147
148 if (sbi->s_lvid_bh == NULL) 148 if (!sbi->s_lvid_bh)
149 return false; 149 return;
150 150
151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); 152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
153 return true; 153 udf_updated_lvid(sb);
154} 154}
155 155
156static void udf_bitmap_free_blocks(struct super_block *sb, 156static void udf_bitmap_free_blocks(struct super_block *sb,
157 struct inode *inode, 157 struct inode *inode,
158 struct udf_bitmap *bitmap, 158 struct udf_bitmap *bitmap,
159 kernel_lb_addr bloc, uint32_t offset, 159 struct kernel_lb_addr *bloc,
160 uint32_t offset,
160 uint32_t count) 161 uint32_t count)
161{ 162{
162 struct udf_sb_info *sbi = UDF_SB(sb); 163 struct udf_sb_info *sbi = UDF_SB(sb);
163 struct buffer_head *bh = NULL; 164 struct buffer_head *bh = NULL;
165 struct udf_part_map *partmap;
164 unsigned long block; 166 unsigned long block;
165 unsigned long block_group; 167 unsigned long block_group;
166 unsigned long bit; 168 unsigned long bit;
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
169 unsigned long overflow; 171 unsigned long overflow;
170 172
171 mutex_lock(&sbi->s_alloc_mutex); 173 mutex_lock(&sbi->s_alloc_mutex);
172 if (bloc.logicalBlockNum < 0 || 174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
173 (bloc.logicalBlockNum + count) > 175 if (bloc->logicalBlockNum < 0 ||
174 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 176 (bloc->logicalBlockNum + count) >
177 partmap->s_partition_len) {
175 udf_debug("%d < %d || %d + %d > %d\n", 178 udf_debug("%d < %d || %d + %d > %d\n",
176 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
177 sbi->s_partmaps[bloc.partitionReferenceNum]. 180 count, partmap->s_partition_len);
178 s_partition_len);
179 goto error_return; 181 goto error_return;
180 } 182 }
181 183
182 block = bloc.logicalBlockNum + offset + 184 block = bloc->logicalBlockNum + offset +
183 (sizeof(struct spaceBitmapDesc) << 3); 185 (sizeof(struct spaceBitmapDesc) << 3);
184 186
185 do { 187 do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
207 } else { 209 } else {
208 if (inode) 210 if (inode)
209 vfs_dq_free_block(inode, 1); 211 vfs_dq_free_block(inode, 1);
210 udf_add_free_space(sbi, sbi->s_partition, 1); 212 udf_add_free_space(sb, sbi->s_partition, 1);
211 } 213 }
212 } 214 }
213 mark_buffer_dirty(bh); 215 mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
218 } while (overflow); 220 } while (overflow);
219 221
220error_return: 222error_return:
221 sb->s_dirt = 1;
222 if (sbi->s_lvid_bh)
223 mark_buffer_dirty(sbi->s_lvid_bh);
224 mutex_unlock(&sbi->s_alloc_mutex); 223 mutex_unlock(&sbi->s_alloc_mutex);
225} 224}
226 225
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
277 } while (block_count > 0); 276 } while (block_count > 0);
278 277
279out: 278out:
280 if (udf_add_free_space(sbi, partition, -alloc_count)) 279 udf_add_free_space(sb, partition, -alloc_count);
281 mark_buffer_dirty(sbi->s_lvid_bh);
282 sb->s_dirt = 1;
283 mutex_unlock(&sbi->s_alloc_mutex); 280 mutex_unlock(&sbi->s_alloc_mutex);
284 return alloc_count; 281 return alloc_count;
285} 282}
@@ -409,9 +406,7 @@ got_block:
409 406
410 mark_buffer_dirty(bh); 407 mark_buffer_dirty(bh);
411 408
412 if (udf_add_free_space(sbi, partition, -1)) 409 udf_add_free_space(sb, partition, -1);
413 mark_buffer_dirty(sbi->s_lvid_bh);
414 sb->s_dirt = 1;
415 mutex_unlock(&sbi->s_alloc_mutex); 410 mutex_unlock(&sbi->s_alloc_mutex);
416 *err = 0; 411 *err = 0;
417 return newblock; 412 return newblock;
@@ -425,26 +420,28 @@ error_return:
425static void udf_table_free_blocks(struct super_block *sb, 420static void udf_table_free_blocks(struct super_block *sb,
426 struct inode *inode, 421 struct inode *inode,
427 struct inode *table, 422 struct inode *table,
428 kernel_lb_addr bloc, uint32_t offset, 423 struct kernel_lb_addr *bloc,
424 uint32_t offset,
429 uint32_t count) 425 uint32_t count)
430{ 426{
431 struct udf_sb_info *sbi = UDF_SB(sb); 427 struct udf_sb_info *sbi = UDF_SB(sb);
428 struct udf_part_map *partmap;
432 uint32_t start, end; 429 uint32_t start, end;
433 uint32_t elen; 430 uint32_t elen;
434 kernel_lb_addr eloc; 431 struct kernel_lb_addr eloc;
435 struct extent_position oepos, epos; 432 struct extent_position oepos, epos;
436 int8_t etype; 433 int8_t etype;
437 int i; 434 int i;
438 struct udf_inode_info *iinfo; 435 struct udf_inode_info *iinfo;
439 436
440 mutex_lock(&sbi->s_alloc_mutex); 437 mutex_lock(&sbi->s_alloc_mutex);
441 if (bloc.logicalBlockNum < 0 || 438 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
442 (bloc.logicalBlockNum + count) > 439 if (bloc->logicalBlockNum < 0 ||
443 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 440 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) {
444 udf_debug("%d < %d || %d + %d > %d\n", 442 udf_debug("%d < %d || %d + %d > %d\n",
445 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 443 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
446 sbi->s_partmaps[bloc.partitionReferenceNum]. 444 partmap->s_partition_len);
447 s_partition_len);
448 goto error_return; 445 goto error_return;
449 } 446 }
450 447
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
453 could occure, but.. oh well */ 450 could occure, but.. oh well */
454 if (inode) 451 if (inode)
455 vfs_dq_free_block(inode, count); 452 vfs_dq_free_block(inode, count);
456 if (udf_add_free_space(sbi, sbi->s_partition, count)) 453 udf_add_free_space(sb, sbi->s_partition, count);
457 mark_buffer_dirty(sbi->s_lvid_bh);
458 454
459 start = bloc.logicalBlockNum + offset; 455 start = bloc->logicalBlockNum + offset;
460 end = bloc.logicalBlockNum + offset + count - 1; 456 end = bloc->logicalBlockNum + offset + count - 1;
461 457
462 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); 458 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
463 elen = 0; 459 elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
483 start += count; 479 start += count;
484 count = 0; 480 count = 0;
485 } 481 }
486 udf_write_aext(table, &oepos, eloc, elen, 1); 482 udf_write_aext(table, &oepos, &eloc, elen, 1);
487 } else if (eloc.logicalBlockNum == (end + 1)) { 483 } else if (eloc.logicalBlockNum == (end + 1)) {
488 if ((0x3FFFFFFF - elen) < 484 if ((0x3FFFFFFF - elen) <
489 (count << sb->s_blocksize_bits)) { 485 (count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
502 end -= count; 498 end -= count;
503 count = 0; 499 count = 0;
504 } 500 }
505 udf_write_aext(table, &oepos, eloc, elen, 1); 501 udf_write_aext(table, &oepos, &eloc, elen, 1);
506 } 502 }
507 503
508 if (epos.bh != oepos.bh) { 504 if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
532 */ 528 */
533 529
534 int adsize; 530 int adsize;
535 short_ad *sad = NULL; 531 struct short_ad *sad = NULL;
536 long_ad *lad = NULL; 532 struct long_ad *lad = NULL;
537 struct allocExtDesc *aed; 533 struct allocExtDesc *aed;
538 534
539 eloc.logicalBlockNum = start; 535 eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
541 (count << sb->s_blocksize_bits); 537 (count << sb->s_blocksize_bits);
542 538
543 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 539 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
544 adsize = sizeof(short_ad); 540 adsize = sizeof(struct short_ad);
545 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 541 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
546 adsize = sizeof(long_ad); 542 adsize = sizeof(struct long_ad);
547 else { 543 else {
548 brelse(oepos.bh); 544 brelse(oepos.bh);
549 brelse(epos.bh); 545 brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
563 elen -= sb->s_blocksize; 559 elen -= sb->s_blocksize;
564 560
565 epos.bh = udf_tread(sb, 561 epos.bh = udf_tread(sb,
566 udf_get_lb_pblock(sb, epos.block, 0)); 562 udf_get_lb_pblock(sb, &epos.block, 0));
567 if (!epos.bh) { 563 if (!epos.bh) {
568 brelse(oepos.bh); 564 brelse(oepos.bh);
569 goto error_return; 565 goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
601 if (sbi->s_udfrev >= 0x0200) 597 if (sbi->s_udfrev >= 0x0200)
602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 598 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
603 3, 1, epos.block.logicalBlockNum, 599 3, 1, epos.block.logicalBlockNum,
604 sizeof(tag)); 600 sizeof(struct tag));
605 else 601 else
606 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
607 2, 1, epos.block.logicalBlockNum, 603 2, 1, epos.block.logicalBlockNum,
608 sizeof(tag)); 604 sizeof(struct tag));
609 605
610 switch (iinfo->i_alloc_type) { 606 switch (iinfo->i_alloc_type) {
611 case ICBTAG_FLAG_AD_SHORT: 607 case ICBTAG_FLAG_AD_SHORT:
612 sad = (short_ad *)sptr; 608 sad = (struct short_ad *)sptr;
613 sad->extLength = cpu_to_le32( 609 sad->extLength = cpu_to_le32(
614 EXT_NEXT_EXTENT_ALLOCDECS | 610 EXT_NEXT_EXTENT_ALLOCDECS |
615 sb->s_blocksize); 611 sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
617 cpu_to_le32(epos.block.logicalBlockNum); 613 cpu_to_le32(epos.block.logicalBlockNum);
618 break; 614 break;
619 case ICBTAG_FLAG_AD_LONG: 615 case ICBTAG_FLAG_AD_LONG:
620 lad = (long_ad *)sptr; 616 lad = (struct long_ad *)sptr;
621 lad->extLength = cpu_to_le32( 617 lad->extLength = cpu_to_le32(
622 EXT_NEXT_EXTENT_ALLOCDECS | 618 EXT_NEXT_EXTENT_ALLOCDECS |
623 sb->s_blocksize); 619 sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
635 631
636 /* It's possible that stealing the block emptied the extent */ 632 /* It's possible that stealing the block emptied the extent */
637 if (elen) { 633 if (elen) {
638 udf_write_aext(table, &epos, eloc, elen, 1); 634 udf_write_aext(table, &epos, &eloc, elen, 1);
639 635
640 if (!epos.bh) { 636 if (!epos.bh) {
641 iinfo->i_lenAlloc += adsize; 637 iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
653 brelse(oepos.bh); 649 brelse(oepos.bh);
654 650
655error_return: 651error_return:
656 sb->s_dirt = 1;
657 mutex_unlock(&sbi->s_alloc_mutex); 652 mutex_unlock(&sbi->s_alloc_mutex);
658 return; 653 return;
659} 654}
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 struct udf_sb_info *sbi = UDF_SB(sb); 661 struct udf_sb_info *sbi = UDF_SB(sb);
667 int alloc_count = 0; 662 int alloc_count = 0;
668 uint32_t elen, adsize; 663 uint32_t elen, adsize;
669 kernel_lb_addr eloc; 664 struct kernel_lb_addr eloc;
670 struct extent_position epos; 665 struct extent_position epos;
671 int8_t etype = -1; 666 int8_t etype = -1;
672 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
677 672
678 iinfo = UDF_I(table); 673 iinfo = UDF_I(table);
679 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 674 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
680 adsize = sizeof(short_ad); 675 adsize = sizeof(struct short_ad);
681 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 676 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
682 adsize = sizeof(long_ad); 677 adsize = sizeof(struct long_ad);
683 else 678 else
684 return 0; 679 return 0;
685 680
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
707 alloc_count = block_count; 702 alloc_count = block_count;
708 eloc.logicalBlockNum += alloc_count; 703 eloc.logicalBlockNum += alloc_count;
709 elen -= (alloc_count << sb->s_blocksize_bits); 704 elen -= (alloc_count << sb->s_blocksize_bits);
710 udf_write_aext(table, &epos, eloc, 705 udf_write_aext(table, &epos, &eloc,
711 (etype << 30) | elen, 1); 706 (etype << 30) | elen, 1);
712 } else 707 } else
713 udf_delete_aext(table, epos, eloc, 708 udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
718 713
719 brelse(epos.bh); 714 brelse(epos.bh);
720 715
721 if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { 716 if (alloc_count)
722 mark_buffer_dirty(sbi->s_lvid_bh); 717 udf_add_free_space(sb, partition, -alloc_count);
723 sb->s_dirt = 1;
724 }
725 mutex_unlock(&sbi->s_alloc_mutex); 718 mutex_unlock(&sbi->s_alloc_mutex);
726 return alloc_count; 719 return alloc_count;
727} 720}
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
735 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; 728 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
736 uint32_t newblock = 0, adsize; 729 uint32_t newblock = 0, adsize;
737 uint32_t elen, goal_elen = 0; 730 uint32_t elen, goal_elen = 0;
738 kernel_lb_addr eloc, uninitialized_var(goal_eloc); 731 struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
739 struct extent_position epos, goal_epos; 732 struct extent_position epos, goal_epos;
740 int8_t etype; 733 int8_t etype;
741 struct udf_inode_info *iinfo = UDF_I(table); 734 struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
743 *err = -ENOSPC; 736 *err = -ENOSPC;
744 737
745 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 738 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
746 adsize = sizeof(short_ad); 739 adsize = sizeof(struct short_ad);
747 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 740 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
748 adsize = sizeof(long_ad); 741 adsize = sizeof(struct long_ad);
749 else 742 else
750 return newblock; 743 return newblock;
751 744
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
814 } 807 }
815 808
816 if (goal_elen) 809 if (goal_elen)
817 udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); 810 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
818 else 811 else
819 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); 812 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
820 brelse(goal_epos.bh); 813 brelse(goal_epos.bh);
821 814
822 if (udf_add_free_space(sbi, partition, -1)) 815 udf_add_free_space(sb, partition, -1);
823 mark_buffer_dirty(sbi->s_lvid_bh);
824 816
825 sb->s_dirt = 1;
826 mutex_unlock(&sbi->s_alloc_mutex); 817 mutex_unlock(&sbi->s_alloc_mutex);
827 *err = 0; 818 *err = 0;
828 return newblock; 819 return newblock;
829} 820}
830 821
831inline void udf_free_blocks(struct super_block *sb, 822void udf_free_blocks(struct super_block *sb, struct inode *inode,
832 struct inode *inode, 823 struct kernel_lb_addr *bloc, uint32_t offset,
833 kernel_lb_addr bloc, uint32_t offset, 824 uint32_t count)
834 uint32_t count)
835{ 825{
836 uint16_t partition = bloc.partitionReferenceNum; 826 uint16_t partition = bloc->partitionReferenceNum;
837 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 827 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
838 828
839 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 829 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
840 return udf_bitmap_free_blocks(sb, inode, 830 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
841 map->s_uspace.s_bitmap, 831 bloc, offset, count);
842 bloc, offset, count);
843 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 832 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
844 return udf_table_free_blocks(sb, inode, 833 udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
845 map->s_uspace.s_table, 834 bloc, offset, count);
846 bloc, offset, count);
847 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 835 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
848 return udf_bitmap_free_blocks(sb, inode, 836 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
849 map->s_fspace.s_bitmap, 837 bloc, offset, count);
850 bloc, offset, count);
851 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 838 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
852 return udf_table_free_blocks(sb, inode, 839 udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
853 map->s_fspace.s_table, 840 bloc, offset, count);
854 bloc, offset, count);
855 } else {
856 return;
857 } 841 }
858} 842}
859 843
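
The recurring pattern in these balloc.c hunks: udf_add_free_space() now takes the super_block, returns void, and owns all LVID bookkeeping, so each caller's mark_buffer_dirty(sbi->s_lvid_bh) / sb->s_dirt = 1 pair disappears in favor of a single udf_updated_lvid() call inside the helper. The helper's assumed shape (udf_updated_lvid() is added elsewhere in this series; this is a sketch, not its verbatim definition):

	/* Assumed: centralizes "the LVID changed" bookkeeping. */
	static inline void udf_updated_lvid(struct super_block *sb)
	{
		mark_buffer_dirty(UDF_SB(sb)->s_lvid_bh);
		sb->s_dirt = 1;
	}
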
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
53 struct buffer_head *tmp, *bha[16]; 53 struct buffer_head *tmp, *bha[16];
54 kernel_lb_addr eloc; 54 struct kernel_lb_addr eloc;
55 uint32_t elen; 55 uint32_t elen;
56 sector_t offset; 56 sector_t offset;
57 int i, num, ret = 0; 57 int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
80 ret = -ENOENT; 80 ret = -ENOENT;
81 goto out; 81 goto out;
82 } 82 }
83 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 83 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 epos.offset -= sizeof(short_ad); 86 epos.offset -= sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == 87 else if (iinfo->i_alloc_type ==
88 ICBTAG_FLAG_AD_LONG) 88 ICBTAG_FLAG_AD_LONG)
89 epos.offset -= sizeof(long_ad); 89 epos.offset -= sizeof(struct long_ad);
90 } else { 90 } else {
91 offset = 0; 91 offset = 0;
92 } 92 }
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
103 for (num = 0; i > 0; i--) { 103 for (num = 0; i > 0; i--) {
104 block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); 104 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
105 tmp = udf_tgetblk(dir->i_sb, block); 105 tmp = udf_tgetblk(dir->i_sb, block);
106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
107 bha[num++] = tmp; 107 bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
161 memcpy(fname, "..", flen); 161 memcpy(fname, "..", flen);
162 dt_type = DT_DIR; 162 dt_type = DT_DIR;
163 } else { 163 } else {
164 kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); 164 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
165 165
166 iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); 166 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
168 dt_type = DT_UNKNOWN; 168 dt_type = DT_UNKNOWN;
169 } 169 }
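
The tloc conversion above shows the usual UDF pattern: on-disk addresses (struct lb_addr, little-endian, packed) are converted to the in-core analog (struct kernel_lb_addr, host-endian) before use. A sketch of the converter, assuming its shape in fs/udf/udfend.h:

	static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
	{
		struct kernel_lb_addr out;

		out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
		out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
		return out;
	}
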
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
20 20
21#if 0 21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, 22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, kernel_lb_addr fe_loc, 23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh, 24 int *pos, int *offset, struct buffer_head **bh,
25 int *error) 25 int *error)
26{ 26{
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 75 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 76 struct fileIdentDesc *cfi,
77 struct extent_position *epos, 77 struct extent_position *epos,
78 kernel_lb_addr *eloc, uint32_t *elen, 78 struct kernel_lb_addr *eloc, uint32_t *elen,
79 sector_t *offset) 79 sector_t *offset)
80{ 80{
81 struct fileIdentDesc *fi; 81 struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
111 (EXT_RECORDED_ALLOCATED >> 30)) 111 (EXT_RECORDED_ALLOCATED >> 30))
112 return NULL; 112 return NULL;
113 113
114 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 114 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
115 115
116 (*offset)++; 116 (*offset)++;
117 117
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
131 if (i + *offset > (*elen >> blocksize_bits)) 131 if (i + *offset > (*elen >> blocksize_bits))
132 i = (*elen >> blocksize_bits)-*offset; 132 i = (*elen >> blocksize_bits)-*offset;
133 for (num = 0; i > 0; i--) { 133 for (num = 0; i > 0; i--) {
134 block = udf_get_lb_pblock(dir->i_sb, *eloc, 134 block = udf_get_lb_pblock(dir->i_sb, eloc,
135 *offset + i); 135 *offset + i);
136 tmp = udf_tgetblk(dir->i_sb, block); 136 tmp = udf_tgetblk(dir->i_sb, block);
137 if (tmp && !buffer_uptodate(tmp) && 137 if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
169 (EXT_RECORDED_ALLOCATED >> 30)) 169 (EXT_RECORDED_ALLOCATED >> 30))
170 return NULL; 170 return NULL;
171 171
172 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 172 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
173 173
174 (*offset)++; 174 (*offset)++;
175 175
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
249} 249}
250 250
251#if 0 251#if 0
252static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) 252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{ 253{
254 extent_ad *ext; 254 struct extent_ad *ext;
255 struct fileEntry *fe; 255 struct fileEntry *fe;
256 uint8_t *ptr; 256 uint8_t *ptr;
257 257
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) 274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset; 275 ptr += *offset;
276 276
277 ext = (extent_ad *)ptr; 277 ext = (struct extent_ad *)ptr;
278 278
279 *offset = *offset + sizeof(extent_ad); 279 *offset = *offset + sizeof(struct extent_ad);
280 return ext; 280 return ext;
281} 281}
282#endif 282#endif
283 283
284short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 285 int inc)
286{ 286{
287 short_ad *sa; 287 struct short_ad *sa;
288 288
289 if ((!ptr) || (!offset)) { 289 if ((!ptr) || (!offset)) {
290 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
291 return NULL; 291 return NULL;
292 } 292 }
293 293
294 if ((*offset + sizeof(short_ad)) > maxoffset) 294 if ((*offset + sizeof(struct short_ad)) > maxoffset)
295 return NULL; 295 return NULL;
296 else { 296 else {
297 sa = (short_ad *)ptr; 297 sa = (struct short_ad *)ptr;
298 if (sa->extLength == 0) 298 if (sa->extLength == 0)
299 return NULL; 299 return NULL;
300 } 300 }
301 301
302 if (inc) 302 if (inc)
303 *offset += sizeof(short_ad); 303 *offset += sizeof(struct short_ad);
304 return sa; 304 return sa;
305} 305}
306 306
307long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) 307struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
308{ 308{
309 long_ad *la; 309 struct long_ad *la;
310 310
311 if ((!ptr) || (!offset)) { 311 if ((!ptr) || (!offset)) {
312 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); 312 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
313 return NULL; 313 return NULL;
314 } 314 }
315 315
316 if ((*offset + sizeof(long_ad)) > maxoffset) 316 if ((*offset + sizeof(struct long_ad)) > maxoffset)
317 return NULL; 317 return NULL;
318 else { 318 else {
319 la = (long_ad *)ptr; 319 la = (struct long_ad *)ptr;
320 if (la->extLength == 0) 320 if (la->extLength == 0)
321 return NULL; 321 return NULL;
322 } 322 }
323 323
324 if (inc) 324 if (inc)
325 *offset += sizeof(long_ad); 325 *offset += sizeof(struct long_ad);
326 return la; 326 return la;
327} 327}
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
38#define _ECMA_167_H 1 38#define _ECMA_167_H 1
39 39
40/* Character set specification (ECMA 167r3 1/7.2.1) */ 40/* Character set specification (ECMA 167r3 1/7.2.1) */
41typedef struct { 41struct charspec {
42 uint8_t charSetType; 42 uint8_t charSetType;
43 uint8_t charSetInfo[63]; 43 uint8_t charSetInfo[63];
44} __attribute__ ((packed)) charspec; 44} __attribute__ ((packed));
45 45
46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ 46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ 47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
57typedef uint8_t dstring; 57typedef uint8_t dstring;
58 58
59/* Timestamp (ECMA 167r3 1/7.3) */ 59/* Timestamp (ECMA 167r3 1/7.3) */
60typedef struct { 60struct timestamp {
61 __le16 typeAndTimezone; 61 __le16 typeAndTimezone;
62 __le16 year; 62 __le16 year;
63 uint8_t month; 63 uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
68 uint8_t centiseconds; 68 uint8_t centiseconds;
69 uint8_t hundredsOfMicroseconds; 69 uint8_t hundredsOfMicroseconds;
70 uint8_t microseconds; 70 uint8_t microseconds;
71} __attribute__ ((packed)) timestamp; 71} __attribute__ ((packed));
72 72
73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ 73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
74#define TIMESTAMP_TYPE_MASK 0xF000 74#define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF 78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF
79 79
80/* Entity identifier (ECMA 167r3 1/7.4) */ 80/* Entity identifier (ECMA 167r3 1/7.4) */
81typedef struct { 81struct regid {
82 uint8_t flags; 82 uint8_t flags;
83 uint8_t ident[23]; 83 uint8_t ident[23];
84 uint8_t identSuffix[8]; 84 uint8_t identSuffix[8];
85} __attribute__ ((packed)) regid; 85} __attribute__ ((packed));
86 86
87/* Flags (ECMA 167r3 1/7.4.1) */ 87/* Flags (ECMA 167r3 1/7.4.1) */
88#define ENTITYID_FLAGS_DIRTY 0x00 88#define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
126 126
127/* Boot Descriptor (ECMA 167r3 2/9.4) */ 127/* Boot Descriptor (ECMA 167r3 2/9.4) */
128struct bootDesc { 128struct bootDesc {
129 uint8_t structType; 129 uint8_t structType;
130 uint8_t stdIdent[VSD_STD_ID_LEN]; 130 uint8_t stdIdent[VSD_STD_ID_LEN];
131 uint8_t structVersion; 131 uint8_t structVersion;
132 uint8_t reserved1; 132 uint8_t reserved1;
133 regid archType; 133 struct regid archType;
134 regid bootIdent; 134 struct regid bootIdent;
135 __le32 bootExtLocation; 135 __le32 bootExtLocation;
136 __le32 bootExtLength; 136 __le32 bootExtLength;
137 __le64 loadAddress; 137 __le64 loadAddress;
138 __le64 startAddress; 138 __le64 startAddress;
139 timestamp descCreationDateAndTime; 139 struct timestamp descCreationDateAndTime;
140 __le16 flags; 140 __le16 flags;
141 uint8_t reserved2[32]; 141 uint8_t reserved2[32];
142 uint8_t bootUse[1906]; 142 uint8_t bootUse[1906];
143} __attribute__ ((packed)); 143} __attribute__ ((packed));
144 144
145/* Flags (ECMA 167r3 2/9.4.12) */ 145/* Flags (ECMA 167r3 2/9.4.12) */
146#define BOOT_FLAGS_ERASE 0x01 146#define BOOT_FLAGS_ERASE 0x01
147 147
148/* Extent Descriptor (ECMA 167r3 3/7.1) */ 148/* Extent Descriptor (ECMA 167r3 3/7.1) */
149typedef struct { 149struct extent_ad {
150 __le32 extLength; 150 __le32 extLength;
151 __le32 extLocation; 151 __le32 extLocation;
152} __attribute__ ((packed)) extent_ad; 152} __attribute__ ((packed));
153 153
154typedef struct { 154struct kernel_extent_ad {
155 uint32_t extLength; 155 uint32_t extLength;
156 uint32_t extLocation; 156 uint32_t extLocation;
157} kernel_extent_ad; 157};
158 158
159/* Descriptor Tag (ECMA 167r3 3/7.2) */ 159/* Descriptor Tag (ECMA 167r3 3/7.2) */
160typedef struct { 160struct tag {
161 __le16 tagIdent; 161 __le16 tagIdent;
162 __le16 descVersion; 162 __le16 descVersion;
163 uint8_t tagChecksum; 163 uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
166 __le16 descCRC; 166 __le16 descCRC;
167 __le16 descCRCLength; 167 __le16 descCRCLength;
168 __le32 tagLocation; 168 __le32 tagLocation;
169} __attribute__ ((packed)) tag; 169} __attribute__ ((packed));
170 170
171/* Tag Identifier (ECMA 167r3 3/7.2.1) */ 171/* Tag Identifier (ECMA 167r3 3/7.2.1) */
172#define TAG_IDENT_PVD 0x0001 172#define TAG_IDENT_PVD 0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
190 190
191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ 191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
192struct primaryVolDesc { 192struct primaryVolDesc {
193 tag descTag; 193 struct tag descTag;
194 __le32 volDescSeqNum; 194 __le32 volDescSeqNum;
195 __le32 primaryVolDescNum; 195 __le32 primaryVolDescNum;
196 dstring volIdent[32]; 196 dstring volIdent[32];
197 __le16 volSeqNum; 197 __le16 volSeqNum;
198 __le16 maxVolSeqNum; 198 __le16 maxVolSeqNum;
199 __le16 interchangeLvl; 199 __le16 interchangeLvl;
200 __le16 maxInterchangeLvl; 200 __le16 maxInterchangeLvl;
201 __le32 charSetList; 201 __le32 charSetList;
202 __le32 maxCharSetList; 202 __le32 maxCharSetList;
203 dstring volSetIdent[128]; 203 dstring volSetIdent[128];
204 charspec descCharSet; 204 struct charspec descCharSet;
205 charspec explanatoryCharSet; 205 struct charspec explanatoryCharSet;
206 extent_ad volAbstract; 206 struct extent_ad volAbstract;
207 extent_ad volCopyright; 207 struct extent_ad volCopyright;
208 regid appIdent; 208 struct regid appIdent;
209 timestamp recordingDateAndTime; 209 struct timestamp recordingDateAndTime;
210 regid impIdent; 210 struct regid impIdent;
211 uint8_t impUse[64]; 211 uint8_t impUse[64];
212 __le32 predecessorVolDescSeqLocation; 212 __le32 predecessorVolDescSeqLocation;
213 __le16 flags; 213 __le16 flags;
214 uint8_t reserved[22]; 214 uint8_t reserved[22];
215} __attribute__ ((packed)); 215} __attribute__ ((packed));
216 216
217/* Flags (ECMA 167r3 3/10.1.21) */ 217/* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
219 219
220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ 220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
221struct anchorVolDescPtr { 221struct anchorVolDescPtr {
222 tag descTag; 222 struct tag descTag;
223 extent_ad mainVolDescSeqExt; 223 struct extent_ad mainVolDescSeqExt;
224 extent_ad reserveVolDescSeqExt; 224 struct extent_ad reserveVolDescSeqExt;
225 uint8_t reserved[480]; 225 uint8_t reserved[480];
226} __attribute__ ((packed)); 226} __attribute__ ((packed));
227 227
228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ 228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
229struct volDescPtr { 229struct volDescPtr {
230 tag descTag; 230 struct tag descTag;
231 __le32 volDescSeqNum; 231 __le32 volDescSeqNum;
232 extent_ad nextVolDescSeqExt; 232 struct extent_ad nextVolDescSeqExt;
233 uint8_t reserved[484]; 233 uint8_t reserved[484];
234} __attribute__ ((packed)); 234} __attribute__ ((packed));
235 235
236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ 236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
237struct impUseVolDesc { 237struct impUseVolDesc {
238 tag descTag; 238 struct tag descTag;
239 __le32 volDescSeqNum; 239 __le32 volDescSeqNum;
240 regid impIdent; 240 struct regid impIdent;
241 uint8_t impUse[460]; 241 uint8_t impUse[460];
242} __attribute__ ((packed)); 242} __attribute__ ((packed));
243 243
244/* Partition Descriptor (ECMA 167r3 3/10.5) */ 244/* Partition Descriptor (ECMA 167r3 3/10.5) */
245struct partitionDesc { 245struct partitionDesc {
246 tag descTag; 246 struct tag descTag;
247 __le32 volDescSeqNum; 247 __le32 volDescSeqNum;
248 __le16 partitionFlags; 248 __le16 partitionFlags;
249 __le16 partitionNumber; 249 __le16 partitionNumber;
250 regid partitionContents; 250 struct regid partitionContents;
251 uint8_t partitionContentsUse[128]; 251 uint8_t partitionContentsUse[128];
252 __le32 accessType; 252 __le32 accessType;
253 __le32 partitionStartingLocation; 253 __le32 partitionStartingLocation;
254 __le32 partitionLength; 254 __le32 partitionLength;
255 regid impIdent; 255 struct regid impIdent;
256 uint8_t impUse[128]; 256 uint8_t impUse[128];
257 uint8_t reserved[156]; 257 uint8_t reserved[156];
258} __attribute__ ((packed)); 258} __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
278 278
279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ 279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
280struct logicalVolDesc { 280struct logicalVolDesc {
281 tag descTag; 281 struct tag descTag;
282 __le32 volDescSeqNum; 282 __le32 volDescSeqNum;
283 charspec descCharSet; 283 struct charspec descCharSet;
284 dstring logicalVolIdent[128]; 284 dstring logicalVolIdent[128];
285 __le32 logicalBlockSize; 285 __le32 logicalBlockSize;
286 regid domainIdent; 286 struct regid domainIdent;
287 uint8_t logicalVolContentsUse[16]; 287 uint8_t logicalVolContentsUse[16];
288 __le32 mapTableLength; 288 __le32 mapTableLength;
289 __le32 numPartitionMaps; 289 __le32 numPartitionMaps;
290 regid impIdent; 290 struct regid impIdent;
291 uint8_t impUse[128]; 291 uint8_t impUse[128];
292 extent_ad integritySeqExt; 292 struct extent_ad integritySeqExt;
293 uint8_t partitionMaps[0]; 293 uint8_t partitionMaps[0];
294} __attribute__ ((packed)); 294} __attribute__ ((packed));
295 295
296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ 296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
322 322
323/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ 323/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
324struct unallocSpaceDesc { 324struct unallocSpaceDesc {
325 tag descTag; 325 struct tag descTag;
326 __le32 volDescSeqNum; 326 __le32 volDescSeqNum;
327 __le32 numAllocDescs; 327 __le32 numAllocDescs;
328 extent_ad allocDescs[0]; 328 struct extent_ad allocDescs[0];
329} __attribute__ ((packed)); 329} __attribute__ ((packed));
330 330
331/* Terminating Descriptor (ECMA 167r3 3/10.9) */ 331/* Terminating Descriptor (ECMA 167r3 3/10.9) */
332struct terminatingDesc { 332struct terminatingDesc {
333 tag descTag; 333 struct tag descTag;
334 uint8_t reserved[496]; 334 uint8_t reserved[496];
335} __attribute__ ((packed)); 335} __attribute__ ((packed));
336 336
337/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ 337/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
338struct logicalVolIntegrityDesc { 338struct logicalVolIntegrityDesc {
339 tag descTag; 339 struct tag descTag;
340 timestamp recordingDateAndTime; 340 struct timestamp recordingDateAndTime;
341 __le32 integrityType; 341 __le32 integrityType;
342 extent_ad nextIntegrityExt; 342 struct extent_ad nextIntegrityExt;
343 uint8_t logicalVolContentsUse[32]; 343 uint8_t logicalVolContentsUse[32];
344 __le32 numOfPartitions; 344 __le32 numOfPartitions;
345 __le32 lengthOfImpUse; 345 __le32 lengthOfImpUse;
346 __le32 freeSpaceTable[0]; 346 __le32 freeSpaceTable[0];
347 __le32 sizeTable[0]; 347 __le32 sizeTable[0];
348 uint8_t impUse[0]; 348 uint8_t impUse[0];
349} __attribute__ ((packed)); 349} __attribute__ ((packed));
350 350
351/* Integrity Type (ECMA 167r3 3/10.10.3) */ 351/* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
353#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 353#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001
354 354
355/* Recorded Address (ECMA 167r3 4/7.1) */ 355/* Recorded Address (ECMA 167r3 4/7.1) */
356typedef struct { 356struct lb_addr {
357 __le32 logicalBlockNum; 357 __le32 logicalBlockNum;
358 __le16 partitionReferenceNum; 358 __le16 partitionReferenceNum;
359} __attribute__ ((packed)) lb_addr; 359} __attribute__ ((packed));
360 360
361/* ... and its in-core analog */ 361/* ... and its in-core analog */
362typedef struct { 362struct kernel_lb_addr {
363 uint32_t logicalBlockNum; 363 uint32_t logicalBlockNum;
364 uint16_t partitionReferenceNum; 364 uint16_t partitionReferenceNum;
365} kernel_lb_addr; 365};
366 366
367/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ 367/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
368typedef struct { 368struct short_ad {
369 __le32 extLength; 369 __le32 extLength;
370 __le32 extPosition; 370 __le32 extPosition;
371} __attribute__ ((packed)) short_ad; 371} __attribute__ ((packed));
372 372
373/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ 373/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
374typedef struct { 374struct long_ad {
375 __le32 extLength; 375 __le32 extLength;
376 lb_addr extLocation; 376 struct lb_addr extLocation;
377 uint8_t impUse[6]; 377 uint8_t impUse[6];
378} __attribute__ ((packed)) long_ad; 378} __attribute__ ((packed));
379 379
380typedef struct { 380struct kernel_long_ad {
381 uint32_t extLength; 381 uint32_t extLength;
382 kernel_lb_addr extLocation; 382 struct kernel_lb_addr extLocation;
383 uint8_t impUse[6]; 383 uint8_t impUse[6];
384} kernel_long_ad; 384};
385 385
386/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ 386/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
387typedef struct { 387struct ext_ad {
388 __le32 extLength; 388 __le32 extLength;
389 __le32 recordedLength; 389 __le32 recordedLength;
390 __le32 informationLength; 390 __le32 informationLength;
391 lb_addr extLocation; 391 struct lb_addr extLocation;
392} __attribute__ ((packed)) ext_ad; 392} __attribute__ ((packed));
393 393
394typedef struct { 394struct kernel_ext_ad {
395 uint32_t extLength; 395 uint32_t extLength;
396 uint32_t recordedLength; 396 uint32_t recordedLength;
397 uint32_t informationLength; 397 uint32_t informationLength;
398 kernel_lb_addr extLocation; 398 struct kernel_lb_addr extLocation;
399} kernel_ext_ad; 399};
400 400
401/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ 401/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
402 402
@@ -415,44 +415,44 @@ typedef struct {
415 415
416/* File Set Descriptor (ECMA 167r3 4/14.1) */ 416/* File Set Descriptor (ECMA 167r3 4/14.1) */
417struct fileSetDesc { 417struct fileSetDesc {
418 tag descTag; 418 struct tag descTag;
419 timestamp recordingDateAndTime; 419 struct timestamp recordingDateAndTime;
420 __le16 interchangeLvl; 420 __le16 interchangeLvl;
421 __le16 maxInterchangeLvl; 421 __le16 maxInterchangeLvl;
422 __le32 charSetList; 422 __le32 charSetList;
423 __le32 maxCharSetList; 423 __le32 maxCharSetList;
424 __le32 fileSetNum; 424 __le32 fileSetNum;
425 __le32 fileSetDescNum; 425 __le32 fileSetDescNum;
426 charspec logicalVolIdentCharSet; 426 struct charspec logicalVolIdentCharSet;
427 dstring logicalVolIdent[128]; 427 dstring logicalVolIdent[128];
428 charspec fileSetCharSet; 428 struct charspec fileSetCharSet;
429 dstring fileSetIdent[32]; 429 dstring fileSetIdent[32];
430 dstring copyrightFileIdent[32]; 430 dstring copyrightFileIdent[32];
431 dstring abstractFileIdent[32]; 431 dstring abstractFileIdent[32];
432 long_ad rootDirectoryICB; 432 struct long_ad rootDirectoryICB;
433 regid domainIdent; 433 struct regid domainIdent;
434 long_ad nextExt; 434 struct long_ad nextExt;
435 long_ad streamDirectoryICB; 435 struct long_ad streamDirectoryICB;
436 uint8_t reserved[32]; 436 uint8_t reserved[32];
437} __attribute__ ((packed)); 437} __attribute__ ((packed));
438 438
439/* Partition Header Descriptor (ECMA 167r3 4/14.3) */ 439/* Partition Header Descriptor (ECMA 167r3 4/14.3) */
440struct partitionHeaderDesc { 440struct partitionHeaderDesc {
441 short_ad unallocSpaceTable; 441 struct short_ad unallocSpaceTable;
442 short_ad unallocSpaceBitmap; 442 struct short_ad unallocSpaceBitmap;
443 short_ad partitionIntegrityTable; 443 struct short_ad partitionIntegrityTable;
444 short_ad freedSpaceTable; 444 struct short_ad freedSpaceTable;
445 short_ad freedSpaceBitmap; 445 struct short_ad freedSpaceBitmap;
446 uint8_t reserved[88]; 446 uint8_t reserved[88];
447} __attribute__ ((packed)); 447} __attribute__ ((packed));
448 448
449/* File Identifier Descriptor (ECMA 167r3 4/14.4) */ 449/* File Identifier Descriptor (ECMA 167r3 4/14.4) */
450struct fileIdentDesc { 450struct fileIdentDesc {
451 tag descTag; 451 struct tag descTag;
452 __le16 fileVersionNum; 452 __le16 fileVersionNum;
453 uint8_t fileCharacteristics; 453 uint8_t fileCharacteristics;
454 uint8_t lengthFileIdent; 454 uint8_t lengthFileIdent;
455 long_ad icb; 455 struct long_ad icb;
456 __le16 lengthOfImpUse; 456 __le16 lengthOfImpUse;
457 uint8_t impUse[0]; 457 uint8_t impUse[0];
458 uint8_t fileIdent[0]; 458 uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
468 468
469/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ 469/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
470struct allocExtDesc { 470struct allocExtDesc {
471 tag descTag; 471 struct tag descTag;
472 __le32 previousAllocExtLocation; 472 __le32 previousAllocExtLocation;
473 __le32 lengthAllocDescs; 473 __le32 lengthAllocDescs;
474} __attribute__ ((packed)); 474} __attribute__ ((packed));
475 475
476/* ICB Tag (ECMA 167r3 4/14.6) */ 476/* ICB Tag (ECMA 167r3 4/14.6) */
477typedef struct { 477struct icbtag {
478 __le32 priorRecordedNumDirectEntries; 478 __le32 priorRecordedNumDirectEntries;
479 __le16 strategyType; 479 __le16 strategyType;
480 __le16 strategyParameter; 480 __le16 strategyParameter;
481 __le16 numEntries; 481 __le16 numEntries;
482 uint8_t reserved; 482 uint8_t reserved;
483 uint8_t fileType; 483 uint8_t fileType;
484 lb_addr parentICBLocation; 484 struct lb_addr parentICBLocation;
485 __le16 flags; 485 __le16 flags;
486} __attribute__ ((packed)) icbtag; 486} __attribute__ ((packed));
487 487
488/* Strategy Type (ECMA 167r3 4/14.6.2) */ 488/* Strategy Type (ECMA 167r3 4/14.6.2) */
489#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 489#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000
@@ -528,41 +528,41 @@ typedef struct {
528 528
529/* Indirect Entry (ECMA 167r3 4/14.7) */ 529/* Indirect Entry (ECMA 167r3 4/14.7) */
530struct indirectEntry { 530struct indirectEntry {
531 tag descTag; 531 struct tag descTag;
532 icbtag icbTag; 532 struct icbtag icbTag;
533 long_ad indirectICB; 533 struct long_ad indirectICB;
534} __attribute__ ((packed)); 534} __attribute__ ((packed));
535 535
536/* Terminal Entry (ECMA 167r3 4/14.8) */ 536/* Terminal Entry (ECMA 167r3 4/14.8) */
537struct terminalEntry { 537struct terminalEntry {
538 tag descTag; 538 struct tag descTag;
539 icbtag icbTag; 539 struct icbtag icbTag;
540} __attribute__ ((packed)); 540} __attribute__ ((packed));
541 541
542/* File Entry (ECMA 167r3 4/14.9) */ 542/* File Entry (ECMA 167r3 4/14.9) */
543struct fileEntry { 543struct fileEntry {
544 tag descTag; 544 struct tag descTag;
545 icbtag icbTag; 545 struct icbtag icbTag;
546 __le32 uid; 546 __le32 uid;
547 __le32 gid; 547 __le32 gid;
548 __le32 permissions; 548 __le32 permissions;
549 __le16 fileLinkCount; 549 __le16 fileLinkCount;
550 uint8_t recordFormat; 550 uint8_t recordFormat;
551 uint8_t recordDisplayAttr; 551 uint8_t recordDisplayAttr;
552 __le32 recordLength; 552 __le32 recordLength;
553 __le64 informationLength; 553 __le64 informationLength;
554 __le64 logicalBlocksRecorded; 554 __le64 logicalBlocksRecorded;
555 timestamp accessTime; 555 struct timestamp accessTime;
556 timestamp modificationTime; 556 struct timestamp modificationTime;
557 timestamp attrTime; 557 struct timestamp attrTime;
558 __le32 checkpoint; 558 __le32 checkpoint;
559 long_ad extendedAttrICB; 559 struct long_ad extendedAttrICB;
560 regid impIdent; 560 struct regid impIdent;
561 __le64 uniqueID; 561 __le64 uniqueID;
562 __le32 lengthExtendedAttr; 562 __le32 lengthExtendedAttr;
563 __le32 lengthAllocDescs; 563 __le32 lengthAllocDescs;
564 uint8_t extendedAttr[0]; 564 uint8_t extendedAttr[0];
565 uint8_t allocDescs[0]; 565 uint8_t allocDescs[0];
566} __attribute__ ((packed)); 566} __attribute__ ((packed));
567 567
568/* Permissions (ECMA 167r3 4/14.9.5) */ 568/* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
604 604
605/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */ 605/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
606struct extendedAttrHeaderDesc { 606struct extendedAttrHeaderDesc {
607 tag descTag; 607 struct tag descTag;
608 __le32 impAttrLocation; 608 __le32 impAttrLocation;
609 __le32 appAttrLocation; 609 __le32 appAttrLocation;
610} __attribute__ ((packed)); 610} __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
687 uint8_t reserved[3]; 687 uint8_t reserved[3];
688 __le32 attrLength; 688 __le32 attrLength;
689 __le32 impUseLength; 689 __le32 impUseLength;
690 regid impIdent; 690 struct regid impIdent;
691 uint8_t impUse[0]; 691 uint8_t impUse[0];
692} __attribute__ ((packed)); 692} __attribute__ ((packed));
693 693
@@ -698,7 +698,7 @@ struct appUseExtAttr {
698 uint8_t reserved[3]; 698 uint8_t reserved[3];
699 __le32 attrLength; 699 __le32 attrLength;
700 __le32 appUseLength; 700 __le32 appUseLength;
701 regid appIdent; 701 struct regid appIdent;
702 uint8_t appUse[0]; 702 uint8_t appUse[0];
703} __attribute__ ((packed)); 703} __attribute__ ((packed));
704 704
@@ -712,15 +712,15 @@ struct appUseExtAttr {
712 712
713/* Unallocated Space Entry (ECMA 167r3 4/14.11) */ 713/* Unallocated Space Entry (ECMA 167r3 4/14.11) */
714struct unallocSpaceEntry { 714struct unallocSpaceEntry {
715 tag descTag; 715 struct tag descTag;
716 icbtag icbTag; 716 struct icbtag icbTag;
717 __le32 lengthAllocDescs; 717 __le32 lengthAllocDescs;
718 uint8_t allocDescs[0]; 718 uint8_t allocDescs[0];
719} __attribute__ ((packed)); 719} __attribute__ ((packed));
720 720
721/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ 721/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
722struct spaceBitmapDesc { 722struct spaceBitmapDesc {
723 tag descTag; 723 struct tag descTag;
724 __le32 numOfBits; 724 __le32 numOfBits;
725 __le32 numOfBytes; 725 __le32 numOfBytes;
726 uint8_t bitmap[0]; 726 uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
728 728
729/* Partition Integrity Entry (ECMA 167r3 4/14.13) */ 729/* Partition Integrity Entry (ECMA 167r3 4/14.13) */
730struct partitionIntegrityEntry { 730struct partitionIntegrityEntry {
731 tag descTag; 731 struct tag descTag;
732 icbtag icbTag; 732 struct icbtag icbTag;
733 timestamp recordingDateAndTime; 733 struct timestamp recordingDateAndTime;
734 uint8_t integrityType; 734 uint8_t integrityType;
735 uint8_t reserved[175]; 735 uint8_t reserved[175];
736 regid impIdent; 736 struct regid impIdent;
737 uint8_t impUse[256]; 737 uint8_t impUse[256];
738} __attribute__ ((packed)); 738} __attribute__ ((packed));
739 739
740/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ 740/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
765 765
766/* Extended File Entry (ECMA 167r3 4/14.17) */ 766/* Extended File Entry (ECMA 167r3 4/14.17) */
767struct extendedFileEntry { 767struct extendedFileEntry {
768 tag descTag; 768 struct tag descTag;
769 icbtag icbTag; 769 struct icbtag icbTag;
770 __le32 uid; 770 __le32 uid;
771 __le32 gid; 771 __le32 gid;
772 __le32 permissions; 772 __le32 permissions;
773 __le16 fileLinkCount; 773 __le16 fileLinkCount;
774 uint8_t recordFormat; 774 uint8_t recordFormat;
775 uint8_t recordDisplayAttr; 775 uint8_t recordDisplayAttr;
776 __le32 recordLength; 776 __le32 recordLength;
777 __le64 informationLength; 777 __le64 informationLength;
778 __le64 objectSize; 778 __le64 objectSize;
779 __le64 logicalBlocksRecorded; 779 __le64 logicalBlocksRecorded;
780 timestamp accessTime; 780 struct timestamp accessTime;
781 timestamp modificationTime; 781 struct timestamp modificationTime;
782 timestamp createTime; 782 struct timestamp createTime;
783 timestamp attrTime; 783 struct timestamp attrTime;
784 __le32 checkpoint; 784 __le32 checkpoint;
785 __le32 reserved; 785 __le32 reserved;
786 long_ad extendedAttrICB; 786 struct long_ad extendedAttrICB;
787 long_ad streamDirectoryICB; 787 struct long_ad streamDirectoryICB;
788 regid impIdent; 788 struct regid impIdent;
789 __le64 uniqueID; 789 __le64 uniqueID;
790 __le32 lengthExtendedAttr; 790 __le32 lengthExtendedAttr;
791 __le32 lengthAllocDescs; 791 __le32 lengthAllocDescs;
792 uint8_t extendedAttr[0]; 792 uint8_t extendedAttr[0];
793 uint8_t allocDescs[0]; 793 uint8_t allocDescs[0];
794} __attribute__ ((packed)); 794} __attribute__ ((packed));
795 795
796#endif /* _ECMA_167_H */ 796#endif /* _ECMA_167_H */
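
The ecma_167.h hunks above replace typedef'd on-disk types (tag, timestamp, regid, long_ad, short_ad) with plain struct tags; the layouts themselves are unchanged. A minimal userspace sketch of the descriptor tag those structs embed, with uint16_t/uint32_t standing in for the kernel's __le16/__le32 (field layout taken from the kernel's struct tag):

#include <stdint.h>
#include <stdio.h>

struct tag_sketch {
	uint16_t tagIdent;
	uint16_t descVersion;
	uint8_t  tagChecksum;
	uint8_t  reserved;
	uint16_t tagSerialNum;
	uint16_t descCRC;
	uint16_t descCRCLength;
	uint32_t tagLocation;
} __attribute__((packed));

int main(void)
{
	/* ECMA 167 records exactly 16 bytes on disk; packing guarantees
	 * this layout regardless of the compiler's ABI padding rules. */
	printf("sizeof(struct tag_sketch) = %zu\n",
	       sizeof(struct tag_sketch));
	return 0;
}
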
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 47dbe5613f90..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
49 le32_add_cpu(&lvidiu->numDirs, -1); 49 le32_add_cpu(&lvidiu->numDirs, -1);
50 else 50 else
51 le32_add_cpu(&lvidiu->numFiles, -1); 51 le32_add_cpu(&lvidiu->numFiles, -1);
52 52 udf_updated_lvid(sb);
53 mark_buffer_dirty(sbi->s_lvid_bh);
54 } 53 }
55 mutex_unlock(&sbi->s_alloc_mutex); 54 mutex_unlock(&sbi->s_alloc_mutex);
56 55
57 udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1); 56 udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
58} 57}
59 58
60struct inode *udf_new_inode(struct inode *dir, int mode, int *err) 59struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
122 if (!(++uniqueID & 0x00000000FFFFFFFFUL)) 121 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
123 uniqueID += 16; 122 uniqueID += 16;
124 lvhd->uniqueID = cpu_to_le64(uniqueID); 123 lvhd->uniqueID = cpu_to_le64(uniqueID);
125 mark_buffer_dirty(sbi->s_lvid_bh); 124 udf_updated_lvid(sb);
126 } 125 }
127 mutex_unlock(&sbi->s_alloc_mutex); 126 mutex_unlock(&sbi->s_alloc_mutex);
128 inode->i_mode = mode; 127 inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
138 iinfo->i_location.logicalBlockNum = block; 137 iinfo->i_location.logicalBlockNum = block;
139 iinfo->i_location.partitionReferenceNum = 138 iinfo->i_location.partitionReferenceNum =
140 dinfo->i_location.partitionReferenceNum; 139 dinfo->i_location.partitionReferenceNum;
141 inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0); 140 inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
142 inode->i_blocks = 0; 141 inode->i_blocks = 0;
143 iinfo->i_lenEAttr = 0; 142 iinfo->i_lenEAttr = 0;
144 iinfo->i_lenAlloc = 0; 143 iinfo->i_lenAlloc = 0;
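
Besides routing LVID updates through udf_updated_lvid() (whose body is not part of this diff), the change that recurs through ialloc.c and the files below is that struct kernel_lb_addr arguments move from pass-by-value to pass-by-pointer, as in the udf_free_blocks() and udf_get_lb_pblock() calls above. A self-contained sketch of the pattern, using a hypothetical lb_addr stand-in:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the kernel's struct kernel_lb_addr. */
struct lb_addr {
	uint32_t logicalBlockNum;
	uint16_t partitionReferenceNum;
};

/* Old style: the address structure is copied on every call. */
static uint32_t pblock_by_value(struct lb_addr loc, uint32_t offset)
{
	return loc.logicalBlockNum + offset;
}

/* New style: callers pass a pointer, avoiding the per-call copy and
 * letting one signature serve both readers and writers. */
static uint32_t pblock_by_pointer(const struct lb_addr *loc, uint32_t offset)
{
	return loc->logicalBlockNum + offset;
}

int main(void)
{
	struct lb_addr loc = { .logicalBlockNum = 100,
			       .partitionReferenceNum = 0 };

	printf("%u %u\n", (unsigned)pblock_by_value(loc, 5),
	       (unsigned)pblock_by_pointer(&loc, 5));
	return 0;
}
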
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
57static int8_t udf_insert_aext(struct inode *, struct extent_position, 57static int8_t udf_insert_aext(struct inode *, struct extent_position,
58 kernel_lb_addr, uint32_t); 58 struct kernel_lb_addr, uint32_t);
59static void udf_split_extents(struct inode *, int *, int, int, 59static void udf_split_extents(struct inode *, int *, int, int,
60 kernel_long_ad[EXTENT_MERGE_SIZE], int *); 60 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
61static void udf_prealloc_extents(struct inode *, int, int, 61static void udf_prealloc_extents(struct inode *, int, int,
62 kernel_long_ad[EXTENT_MERGE_SIZE], int *); 62 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
63static void udf_merge_extents(struct inode *, 63static void udf_merge_extents(struct inode *,
64 kernel_long_ad[EXTENT_MERGE_SIZE], int *); 64 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
65static void udf_update_extents(struct inode *, 65static void udf_update_extents(struct inode *,
66 kernel_long_ad[EXTENT_MERGE_SIZE], int, int, 66 struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
67 struct extent_position *); 67 struct extent_position *);
68static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); 68static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
69 69
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
200{ 200{
201 int newblock; 201 int newblock;
202 struct buffer_head *dbh = NULL; 202 struct buffer_head *dbh = NULL;
203 kernel_lb_addr eloc; 203 struct kernel_lb_addr eloc;
204 uint32_t elen; 204 uint32_t elen;
205 uint8_t alloctype; 205 uint8_t alloctype;
206 struct extent_position epos; 206 struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
281 epos.bh = NULL; 281 epos.bh = NULL;
282 epos.block = iinfo->i_location; 282 epos.block = iinfo->i_location;
283 epos.offset = udf_file_entry_alloc_offset(inode); 283 epos.offset = udf_file_entry_alloc_offset(inode);
284 udf_add_aext(inode, &epos, eloc, elen, 0); 284 udf_add_aext(inode, &epos, &eloc, elen, 0);
285 /* UniqueID stuff */ 285 /* UniqueID stuff */
286 286
287 brelse(epos.bh); 287 brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
359 359
360/* Extend the file by 'blocks' blocks, return the number of extents added */ 360/* Extend the file by 'blocks' blocks, return the number of extents added */
361int udf_extend_file(struct inode *inode, struct extent_position *last_pos, 361int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
362 kernel_long_ad *last_ext, sector_t blocks) 362 struct kernel_long_ad *last_ext, sector_t blocks)
363{ 363{
364 sector_t add; 364 sector_t add;
365 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); 365 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
366 struct super_block *sb = inode->i_sb; 366 struct super_block *sb = inode->i_sb;
367 kernel_lb_addr prealloc_loc = {}; 367 struct kernel_lb_addr prealloc_loc = {};
368 int prealloc_len = 0; 368 int prealloc_len = 0;
369 struct udf_inode_info *iinfo; 369 struct udf_inode_info *iinfo;
370 370
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
411 } 411 }
412 412
413 if (fake) { 413 if (fake) {
414 udf_add_aext(inode, last_pos, last_ext->extLocation, 414 udf_add_aext(inode, last_pos, &last_ext->extLocation,
415 last_ext->extLength, 1); 415 last_ext->extLength, 1);
416 count++; 416 count++;
417 } else 417 } else
418 udf_write_aext(inode, last_pos, last_ext->extLocation, 418 udf_write_aext(inode, last_pos, &last_ext->extLocation,
419 last_ext->extLength, 1); 419 last_ext->extLength, 1);
420 420
421 /* Managed to do everything necessary? */ 421 /* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
432 /* Create enough extents to cover the whole hole */ 432 /* Create enough extents to cover the whole hole */
433 while (blocks > add) { 433 while (blocks > add) {
434 blocks -= add; 434 blocks -= add;
435 if (udf_add_aext(inode, last_pos, last_ext->extLocation, 435 if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
436 last_ext->extLength, 1) == -1) 436 last_ext->extLength, 1) == -1)
437 return -1; 437 return -1;
438 count++; 438 count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
440 if (blocks) { 440 if (blocks) {
441 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 441 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
442 (blocks << sb->s_blocksize_bits); 442 (blocks << sb->s_blocksize_bits);
443 if (udf_add_aext(inode, last_pos, last_ext->extLocation, 443 if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
444 last_ext->extLength, 1) == -1) 444 last_ext->extLength, 1) == -1)
445 return -1; 445 return -1;
446 count++; 446 count++;
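
The udf_extend_file() hunks above rely on the UDF extent-length encoding: the top two bits of extLength carry the extent type (hence the >> 30 comparisons) and the low 30 bits the byte length. A runnable restatement with the constant values from ecma_167.h:

#include <stdint.h>
#include <stdio.h>

#define UDF_EXTENT_LENGTH_MASK		0x3FFFFFFFu
#define EXT_RECORDED_ALLOCATED		0x00000000u
#define EXT_NOT_RECORDED_ALLOCATED	0x40000000u
#define EXT_NOT_RECORDED_NOT_ALLOCATED	0x80000000u

int main(void)
{
	/* A 4096-byte hole, built the same way udf_extend_file() above
	 * ORs EXT_NOT_RECORDED_NOT_ALLOCATED with a byte count. */
	uint32_t ext = EXT_NOT_RECORDED_NOT_ALLOCATED | 4096;

	printf("type=%u len=%u\n", (unsigned)(ext >> 30),
	       (unsigned)(ext & UDF_EXTENT_LENGTH_MASK));
	return 0;
}
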
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
449out: 449out:
450 /* Do we have some preallocated blocks saved? */ 450 /* Do we have some preallocated blocks saved? */
451 if (prealloc_len) { 451 if (prealloc_len) {
452 if (udf_add_aext(inode, last_pos, prealloc_loc, 452 if (udf_add_aext(inode, last_pos, &prealloc_loc,
453 prealloc_len, 1) == -1) 453 prealloc_len, 1) == -1)
454 return -1; 454 return -1;
455 last_ext->extLocation = prealloc_loc; 455 last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
459 459
460 /* last_pos should point to the last written extent... */ 460 /* last_pos should point to the last written extent... */
461 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 461 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
462 last_pos->offset -= sizeof(short_ad); 462 last_pos->offset -= sizeof(struct short_ad);
463 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 463 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
464 last_pos->offset -= sizeof(long_ad); 464 last_pos->offset -= sizeof(struct long_ad);
465 else 465 else
466 return -1; 466 return -1;
467 467
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
473{ 473{
474 static sector_t last_block; 474 static sector_t last_block;
475 struct buffer_head *result = NULL; 475 struct buffer_head *result = NULL;
476 kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 476 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
477 struct extent_position prev_epos, cur_epos, next_epos; 477 struct extent_position prev_epos, cur_epos, next_epos;
478 int count = 0, startnum = 0, endnum = 0; 478 int count = 0, startnum = 0, endnum = 0;
479 uint32_t elen = 0, tmpelen; 479 uint32_t elen = 0, tmpelen;
480 kernel_lb_addr eloc, tmpeloc; 480 struct kernel_lb_addr eloc, tmpeloc;
481 int c = 1; 481 int c = 1;
482 loff_t lbcount = 0, b_off = 0; 482 loff_t lbcount = 0, b_off = 0;
483 uint32_t newblocknum, newblock; 483 uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
550 elen = EXT_RECORDED_ALLOCATED | 550 elen = EXT_RECORDED_ALLOCATED |
551 ((elen + inode->i_sb->s_blocksize - 1) & 551 ((elen + inode->i_sb->s_blocksize - 1) &
552 ~(inode->i_sb->s_blocksize - 1)); 552 ~(inode->i_sb->s_blocksize - 1));
553 etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1); 553 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
554 } 554 }
555 brelse(prev_epos.bh); 555 brelse(prev_epos.bh);
556 brelse(cur_epos.bh); 556 brelse(cur_epos.bh);
557 brelse(next_epos.bh); 557 brelse(next_epos.bh);
558 newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset); 558 newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
559 *phys = newblock; 559 *phys = newblock;
560 return NULL; 560 return NULL;
561 } 561 }
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
572 } else { 572 } else {
573 /* Create a fake extent when there's not one */ 573 /* Create a fake extent when there's not one */
574 memset(&laarr[0].extLocation, 0x00, 574 memset(&laarr[0].extLocation, 0x00,
575 sizeof(kernel_lb_addr)); 575 sizeof(struct kernel_lb_addr));
576 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 576 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
577 /* Will udf_extend_file() create real extent from 577 /* Will udf_extend_file() create real extent from
578 a fake one? */ 578 a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
602 laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 602 laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
603 inode->i_sb->s_blocksize; 603 inode->i_sb->s_blocksize;
604 memset(&laarr[c].extLocation, 0x00, 604 memset(&laarr[c].extLocation, 0x00,
605 sizeof(kernel_lb_addr)); 605 sizeof(struct kernel_lb_addr));
606 count++; 606 count++;
607 endnum++; 607 endnum++;
608 } 608 }
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
699 699
700static void udf_split_extents(struct inode *inode, int *c, int offset, 700static void udf_split_extents(struct inode *inode, int *c, int offset,
701 int newblocknum, 701 int newblocknum,
702 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 702 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
703 int *endnum) 703 int *endnum)
704{ 704{
705 unsigned long blocksize = inode->i_sb->s_blocksize; 705 unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
726 if (offset) { 726 if (offset) {
727 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 727 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
728 udf_free_blocks(inode->i_sb, inode, 728 udf_free_blocks(inode->i_sb, inode,
729 laarr[curr].extLocation, 729 &laarr[curr].extLocation,
730 0, offset); 730 0, offset);
731 laarr[curr].extLength = 731 laarr[curr].extLength =
732 EXT_NOT_RECORDED_NOT_ALLOCATED | 732 EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
763} 763}
764 764
765static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, 765static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
766 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 766 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
767 int *endnum) 767 int *endnum)
768{ 768{
769 int start, length = 0, currlength = 0, i; 769 int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
817 inode->i_sb->s_blocksize_bits); 817 inode->i_sb->s_blocksize_bits);
818 else { 818 else {
819 memmove(&laarr[c + 2], &laarr[c + 1], 819 memmove(&laarr[c + 2], &laarr[c + 1],
820 sizeof(long_ad) * (*endnum - (c + 1))); 820 sizeof(struct long_ad) * (*endnum - (c + 1)));
821 (*endnum)++; 821 (*endnum)++;
822 laarr[c + 1].extLocation.logicalBlockNum = next; 822 laarr[c + 1].extLocation.logicalBlockNum = next;
823 laarr[c + 1].extLocation.partitionReferenceNum = 823 laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
846 if (*endnum > (i + 1)) 846 if (*endnum > (i + 1))
847 memmove(&laarr[i], 847 memmove(&laarr[i],
848 &laarr[i + 1], 848 &laarr[i + 1],
849 sizeof(long_ad) * 849 sizeof(struct long_ad) *
850 (*endnum - (i + 1))); 850 (*endnum - (i + 1)));
851 i--; 851 i--;
852 (*endnum)--; 852 (*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
859} 859}
860 860
861static void udf_merge_extents(struct inode *inode, 861static void udf_merge_extents(struct inode *inode,
862 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 862 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
863 int *endnum) 863 int *endnum)
864{ 864{
865 int i; 865 int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
867 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 867 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
868 868
869 for (i = 0; i < (*endnum - 1); i++) { 869 for (i = 0; i < (*endnum - 1); i++) {
870 kernel_long_ad *li /*l[i]*/ = &laarr[i]; 870 struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
871 kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; 871 struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
872 872
873 if (((li->extLength >> 30) == (lip1->extLength >> 30)) && 873 if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
874 (((li->extLength >> 30) == 874 (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
902 blocksize - 1) & ~(blocksize - 1)); 902 blocksize - 1) & ~(blocksize - 1));
903 if (*endnum > (i + 2)) 903 if (*endnum > (i + 2))
904 memmove(&laarr[i + 1], &laarr[i + 2], 904 memmove(&laarr[i + 1], &laarr[i + 2],
905 sizeof(long_ad) * 905 sizeof(struct long_ad) *
906 (*endnum - (i + 2))); 906 (*endnum - (i + 2)));
907 i--; 907 i--;
908 (*endnum)--; 908 (*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
911 (EXT_NOT_RECORDED_ALLOCATED >> 30)) && 911 (EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
912 ((lip1->extLength >> 30) == 912 ((lip1->extLength >> 30) ==
913 (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { 913 (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
914 udf_free_blocks(inode->i_sb, inode, li->extLocation, 0, 914 udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
915 ((li->extLength & 915 ((li->extLength &
916 UDF_EXTENT_LENGTH_MASK) + 916 UDF_EXTENT_LENGTH_MASK) +
917 blocksize - 1) >> blocksize_bits); 917 blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
937 blocksize - 1) & ~(blocksize - 1)); 937 blocksize - 1) & ~(blocksize - 1));
938 if (*endnum > (i + 2)) 938 if (*endnum > (i + 2))
939 memmove(&laarr[i + 1], &laarr[i + 2], 939 memmove(&laarr[i + 1], &laarr[i + 2],
940 sizeof(long_ad) * 940 sizeof(struct long_ad) *
941 (*endnum - (i + 2))); 941 (*endnum - (i + 2)));
942 i--; 942 i--;
943 (*endnum)--; 943 (*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
945 } else if ((li->extLength >> 30) == 945 } else if ((li->extLength >> 30) ==
946 (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 946 (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
947 udf_free_blocks(inode->i_sb, inode, 947 udf_free_blocks(inode->i_sb, inode,
948 li->extLocation, 0, 948 &li->extLocation, 0,
949 ((li->extLength & 949 ((li->extLength &
950 UDF_EXTENT_LENGTH_MASK) + 950 UDF_EXTENT_LENGTH_MASK) +
951 blocksize - 1) >> blocksize_bits); 951 blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
959} 959}
960 960
961static void udf_update_extents(struct inode *inode, 961static void udf_update_extents(struct inode *inode,
962 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 962 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
963 int startnum, int endnum, 963 int startnum, int endnum,
964 struct extent_position *epos) 964 struct extent_position *epos)
965{ 965{
966 int start = 0, i; 966 int start = 0, i;
967 kernel_lb_addr tmploc; 967 struct kernel_lb_addr tmploc;
968 uint32_t tmplen; 968 uint32_t tmplen;
969 969
970 if (startnum > endnum) { 970 if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
983 983
984 for (i = start; i < endnum; i++) { 984 for (i = start; i < endnum; i++) {
985 udf_next_aext(inode, epos, &tmploc, &tmplen, 0); 985 udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
986 udf_write_aext(inode, epos, laarr[i].extLocation, 986 udf_write_aext(inode, epos, &laarr[i].extLocation,
987 laarr[i].extLength, 1); 987 laarr[i].extLength, 1);
988 } 988 }
989} 989}
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
1076 * i_nlink = 1 1076 * i_nlink = 1
1077 * i_op = NULL; 1077 * i_op = NULL;
1078 */ 1078 */
1079 bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident); 1079 bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
1080 if (!bh) { 1080 if (!bh) {
1081 printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", 1081 printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
1082 inode->i_ino); 1082 inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
1098 if (fe->icbTag.strategyType == cpu_to_le16(4096)) { 1098 if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
1099 struct buffer_head *ibh; 1099 struct buffer_head *ibh;
1100 1100
1101 ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1, 1101 ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
1102 &ident); 1102 &ident);
1103 if (ident == TAG_IDENT_IE && ibh) { 1103 if (ident == TAG_IDENT_IE && ibh) {
1104 struct buffer_head *nbh = NULL; 1104 struct buffer_head *nbh = NULL;
1105 kernel_lb_addr loc; 1105 struct kernel_lb_addr loc;
1106 struct indirectEntry *ie; 1106 struct indirectEntry *ie;
1107 1107
1108 ie = (struct indirectEntry *)ibh->b_data; 1108 ie = (struct indirectEntry *)ibh->b_data;
1109 loc = lelb_to_cpu(ie->indirectICB.extLocation); 1109 loc = lelb_to_cpu(ie->indirectICB.extLocation);
1110 1110
1111 if (ie->indirectICB.extLength && 1111 if (ie->indirectICB.extLength &&
1112 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, 1112 (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
1113 &ident))) { 1113 &ident))) {
1114 if (ident == TAG_IDENT_FE || 1114 if (ident == TAG_IDENT_FE ||
1115 ident == TAG_IDENT_EFE) { 1115 ident == TAG_IDENT_EFE) {
1116 memcpy(&iinfo->i_location, 1116 memcpy(&iinfo->i_location,
1117 &loc, 1117 &loc,
1118 sizeof(kernel_lb_addr)); 1118 sizeof(struct kernel_lb_addr));
1119 brelse(bh); 1119 brelse(bh);
1120 brelse(ibh); 1120 brelse(ibh);
1121 brelse(nbh); 1121 brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1222 inode->i_size = le64_to_cpu(fe->informationLength); 1222 inode->i_size = le64_to_cpu(fe->informationLength);
1223 iinfo->i_lenExtents = inode->i_size; 1223 iinfo->i_lenExtents = inode->i_size;
1224 1224
1225 inode->i_mode = udf_convert_permissions(fe); 1225 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1226 inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask; 1226 sbi->s_fmode != UDF_INVALID_MODE)
1227 inode->i_mode = sbi->s_fmode;
1228 else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
1229 sbi->s_dmode != UDF_INVALID_MODE)
1230 inode->i_mode = sbi->s_dmode;
1231 else
1232 inode->i_mode = udf_convert_permissions(fe);
1233 inode->i_mode &= ~sbi->s_umask;
1227 1234
1228 if (iinfo->i_efe == 0) { 1235 if (iinfo->i_efe == 0) {
1229 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1236 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
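
The new udf_fill_inode() branch above gives the fmode=/dmode= mount options precedence over on-disk permissions, with s_umask applied in every case. The decision, restated as a standalone sketch (UDF_INVALID_MODE is the sentinel meaning the option was not set; the value used here is an assumption):

#include <stdint.h>
#include <stdio.h>

#define UDF_INVALID_MODE ((uint16_t)-1)	/* assumed sentinel for "option unset" */

static uint16_t udf_pick_mode(int is_dir, uint16_t s_fmode, uint16_t s_dmode,
			      uint16_t disk_mode, uint16_t s_umask)
{
	uint16_t mode;

	if (!is_dir && s_fmode != UDF_INVALID_MODE)
		mode = s_fmode;		/* fmode= wins for non-directories */
	else if (is_dir && s_dmode != UDF_INVALID_MODE)
		mode = s_dmode;		/* dmode= wins for directories */
	else
		mode = disk_mode;	/* fall back to on-disk permissions */

	return mode & ~s_umask;		/* umask= applies in every case */
}

int main(void)
{
	/* Regular file, fmode=0644 set, on-disk bits ignored, umask 022. */
	printf("%04o\n", udf_pick_mode(0, 0644, UDF_INVALID_MODE, 0777, 022));
	return 0;
}
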
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1396 1403
1397 bh = udf_tread(inode->i_sb, 1404 bh = udf_tread(inode->i_sb,
1398 udf_get_lb_pblock(inode->i_sb, 1405 udf_get_lb_pblock(inode->i_sb,
1399 iinfo->i_location, 0)); 1406 &iinfo->i_location, 0));
1400 if (!bh) { 1407 if (!bh) {
1401 udf_debug("bread failure\n"); 1408 udf_debug("bread failure\n");
1402 return -EIO; 1409 return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1416 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1423 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1417 sizeof(struct unallocSpaceEntry)); 1424 sizeof(struct unallocSpaceEntry));
1418 crclen = sizeof(struct unallocSpaceEntry) + 1425 crclen = sizeof(struct unallocSpaceEntry) +
1419 iinfo->i_lenAlloc - sizeof(tag); 1426 iinfo->i_lenAlloc - sizeof(struct tag);
1420 use->descTag.tagLocation = cpu_to_le32( 1427 use->descTag.tagLocation = cpu_to_le32(
1421 iinfo->i_location. 1428 iinfo->i_location.
1422 logicalBlockNum); 1429 logicalBlockNum);
1423 use->descTag.descCRCLength = cpu_to_le16(crclen); 1430 use->descTag.descCRCLength = cpu_to_le16(crclen);
1424 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1431 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1425 sizeof(tag), 1432 sizeof(struct tag),
1426 crclen)); 1433 crclen));
1427 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1434 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1428 1435
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1459 fe->informationLength = cpu_to_le64(inode->i_size); 1466 fe->informationLength = cpu_to_le64(inode->i_size);
1460 1467
1461 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1468 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1462 regid *eid; 1469 struct regid *eid;
1463 struct deviceSpec *dsea = 1470 struct deviceSpec *dsea =
1464 (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); 1471 (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
1465 if (!dsea) { 1472 if (!dsea) {
1466 dsea = (struct deviceSpec *) 1473 dsea = (struct deviceSpec *)
1467 udf_add_extendedattr(inode, 1474 udf_add_extendedattr(inode,
1468 sizeof(struct deviceSpec) + 1475 sizeof(struct deviceSpec) +
1469 sizeof(regid), 12, 0x3); 1476 sizeof(struct regid), 12, 0x3);
1470 dsea->attrType = cpu_to_le32(12); 1477 dsea->attrType = cpu_to_le32(12);
1471 dsea->attrSubtype = 1; 1478 dsea->attrSubtype = 1;
1472 dsea->attrLength = cpu_to_le32( 1479 dsea->attrLength = cpu_to_le32(
1473 sizeof(struct deviceSpec) + 1480 sizeof(struct deviceSpec) +
1474 sizeof(regid)); 1481 sizeof(struct regid));
1475 dsea->impUseLength = cpu_to_le32(sizeof(regid)); 1482 dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
1476 } 1483 }
1477 eid = (regid *)dsea->impUse; 1484 eid = (struct regid *)dsea->impUse;
1478 memset(eid, 0, sizeof(regid)); 1485 memset(eid, 0, sizeof(struct regid));
1479 strcpy(eid->ident, UDF_ID_DEVELOPER); 1486 strcpy(eid->ident, UDF_ID_DEVELOPER);
1480 eid->identSuffix[0] = UDF_OS_CLASS_UNIX; 1487 eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
1481 eid->identSuffix[1] = UDF_OS_ID_LINUX; 1488 eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1494 udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); 1501 udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
1495 udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); 1502 udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
1496 udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); 1503 udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
1497 memset(&(fe->impIdent), 0, sizeof(regid)); 1504 memset(&(fe->impIdent), 0, sizeof(struct regid));
1498 strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); 1505 strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
1499 fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1506 fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1500 fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1507 fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1533 udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); 1540 udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
1534 udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); 1541 udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
1535 1542
1536 memset(&(efe->impIdent), 0, sizeof(regid)); 1543 memset(&(efe->impIdent), 0, sizeof(struct regid));
1537 strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); 1544 strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
1538 efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1545 efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1539 efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1546 efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1584 fe->descTag.tagLocation = cpu_to_le32( 1591 fe->descTag.tagLocation = cpu_to_le32(
1585 iinfo->i_location.logicalBlockNum); 1592 iinfo->i_location.logicalBlockNum);
1586 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1593 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
1587 sizeof(tag); 1594 sizeof(struct tag);
1588 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1595 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1589 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag), 1596 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1590 crclen)); 1597 crclen));
1591 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1598 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1592 1599
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1606 return err; 1613 return err;
1607} 1614}
1608 1615
1609struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) 1616struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1610{ 1617{
1611 unsigned long block = udf_get_lb_pblock(sb, ino, 0); 1618 unsigned long block = udf_get_lb_pblock(sb, ino, 0);
1612 struct inode *inode = iget_locked(sb, block); 1619 struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
1615 return NULL; 1622 return NULL;
1616 1623
1617 if (inode->i_state & I_NEW) { 1624 if (inode->i_state & I_NEW) {
1618 memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr)); 1625 memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
1619 __udf_read_inode(inode); 1626 __udf_read_inode(inode);
1620 unlock_new_inode(inode); 1627 unlock_new_inode(inode);
1621 } 1628 }
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
1623 if (is_bad_inode(inode)) 1630 if (is_bad_inode(inode))
1624 goto out_iput; 1631 goto out_iput;
1625 1632
1626 if (ino.logicalBlockNum >= UDF_SB(sb)-> 1633 if (ino->logicalBlockNum >= UDF_SB(sb)->
1627 s_partmaps[ino.partitionReferenceNum].s_partition_len) { 1634 s_partmaps[ino->partitionReferenceNum].s_partition_len) {
1628 udf_debug("block=%d, partition=%d out of range\n", 1635 udf_debug("block=%d, partition=%d out of range\n",
1629 ino.logicalBlockNum, ino.partitionReferenceNum); 1636 ino->logicalBlockNum, ino->partitionReferenceNum);
1630 make_bad_inode(inode); 1637 make_bad_inode(inode);
1631 goto out_iput; 1638 goto out_iput;
1632 } 1639 }
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
1639} 1646}
1640 1647
1641int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1648int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1642 kernel_lb_addr eloc, uint32_t elen, int inc) 1649 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1643{ 1650{
1644 int adsize; 1651 int adsize;
1645 short_ad *sad = NULL; 1652 struct short_ad *sad = NULL;
1646 long_ad *lad = NULL; 1653 struct long_ad *lad = NULL;
1647 struct allocExtDesc *aed; 1654 struct allocExtDesc *aed;
1648 int8_t etype; 1655 int8_t etype;
1649 uint8_t *ptr; 1656 uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1657 ptr = epos->bh->b_data + epos->offset; 1664 ptr = epos->bh->b_data + epos->offset;
1658 1665
1659 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 1666 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
1660 adsize = sizeof(short_ad); 1667 adsize = sizeof(struct short_ad);
1661 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1668 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1662 adsize = sizeof(long_ad); 1669 adsize = sizeof(struct long_ad);
1663 else 1670 else
1664 return -1; 1671 return -1;
1665 1672
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1667 char *sptr, *dptr; 1674 char *sptr, *dptr;
1668 struct buffer_head *nbh; 1675 struct buffer_head *nbh;
1669 int err, loffset; 1676 int err, loffset;
1670 kernel_lb_addr obloc = epos->block; 1677 struct kernel_lb_addr obloc = epos->block;
1671 1678
1672 epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, 1679 epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
1673 obloc.partitionReferenceNum, 1680 obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1675 if (!epos->block.logicalBlockNum) 1682 if (!epos->block.logicalBlockNum)
1676 return -1; 1683 return -1;
1677 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1684 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1678 epos->block, 1685 &epos->block,
1679 0)); 1686 0));
1680 if (!nbh) 1687 if (!nbh)
1681 return -1; 1688 return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1712 } 1719 }
1713 if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) 1720 if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
1714 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, 1721 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
1715 epos->block.logicalBlockNum, sizeof(tag)); 1722 epos->block.logicalBlockNum, sizeof(struct tag));
1716 else 1723 else
1717 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, 1724 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
1718 epos->block.logicalBlockNum, sizeof(tag)); 1725 epos->block.logicalBlockNum, sizeof(struct tag));
1719 switch (iinfo->i_alloc_type) { 1726 switch (iinfo->i_alloc_type) {
1720 case ICBTAG_FLAG_AD_SHORT: 1727 case ICBTAG_FLAG_AD_SHORT:
1721 sad = (short_ad *)sptr; 1728 sad = (struct short_ad *)sptr;
1722 sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | 1729 sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
1723 inode->i_sb->s_blocksize); 1730 inode->i_sb->s_blocksize);
1724 sad->extPosition = 1731 sad->extPosition =
1725 cpu_to_le32(epos->block.logicalBlockNum); 1732 cpu_to_le32(epos->block.logicalBlockNum);
1726 break; 1733 break;
1727 case ICBTAG_FLAG_AD_LONG: 1734 case ICBTAG_FLAG_AD_LONG:
1728 lad = (long_ad *)sptr; 1735 lad = (struct long_ad *)sptr;
1729 lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | 1736 lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
1730 inode->i_sb->s_blocksize); 1737 inode->i_sb->s_blocksize);
1731 lad->extLocation = cpu_to_lelb(epos->block); 1738 lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1769} 1776}
1770 1777
1771int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1778int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1772 kernel_lb_addr eloc, uint32_t elen, int inc) 1779 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1773{ 1780{
1774 int adsize; 1781 int adsize;
1775 uint8_t *ptr; 1782 uint8_t *ptr;
1776 short_ad *sad; 1783 struct short_ad *sad;
1777 long_ad *lad; 1784 struct long_ad *lad;
1778 struct udf_inode_info *iinfo = UDF_I(inode); 1785 struct udf_inode_info *iinfo = UDF_I(inode);
1779 1786
1780 if (!epos->bh) 1787 if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1786 1793
1787 switch (iinfo->i_alloc_type) { 1794 switch (iinfo->i_alloc_type) {
1788 case ICBTAG_FLAG_AD_SHORT: 1795 case ICBTAG_FLAG_AD_SHORT:
1789 sad = (short_ad *)ptr; 1796 sad = (struct short_ad *)ptr;
1790 sad->extLength = cpu_to_le32(elen); 1797 sad->extLength = cpu_to_le32(elen);
1791 sad->extPosition = cpu_to_le32(eloc.logicalBlockNum); 1798 sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
1792 adsize = sizeof(short_ad); 1799 adsize = sizeof(struct short_ad);
1793 break; 1800 break;
1794 case ICBTAG_FLAG_AD_LONG: 1801 case ICBTAG_FLAG_AD_LONG:
1795 lad = (long_ad *)ptr; 1802 lad = (struct long_ad *)ptr;
1796 lad->extLength = cpu_to_le32(elen); 1803 lad->extLength = cpu_to_le32(elen);
1797 lad->extLocation = cpu_to_lelb(eloc); 1804 lad->extLocation = cpu_to_lelb(*eloc);
1798 memset(lad->impUse, 0x00, sizeof(lad->impUse)); 1805 memset(lad->impUse, 0x00, sizeof(lad->impUse));
1799 adsize = sizeof(long_ad); 1806 adsize = sizeof(struct long_ad);
1800 break; 1807 break;
1801 default: 1808 default:
1802 return -1; 1809 return -1;
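
The adsize computed in udf_add_aext()/udf_write_aext() above is the on-disk size of one allocation descriptor: 8 bytes for a short_ad, 16 for a long_ad. Packed stand-ins (with plain integer types substituted for the kernel's little-endian ones) confirming those sizes:

#include <stdint.h>
#include <stdio.h>

struct lb_addr_sk {			/* stand-in for struct lb_addr */
	uint32_t logicalBlockNum;
	uint16_t partitionReferenceNum;
} __attribute__((packed));

struct short_ad_sk {			/* extent in the current partition */
	uint32_t extLength;
	uint32_t extPosition;
} __attribute__((packed));

struct long_ad_sk {			/* extent with explicit partition */
	uint32_t extLength;
	struct lb_addr_sk extLocation;
	uint8_t  impUse[6];
} __attribute__((packed));

int main(void)
{
	/* Matches the adsize values chosen in the hunks above. */
	printf("short_ad=%zu long_ad=%zu\n",
	       sizeof(struct short_ad_sk), sizeof(struct long_ad_sk));
	return 0;
}
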
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1823} 1830}
1824 1831
1825int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1832int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
1826 kernel_lb_addr *eloc, uint32_t *elen, int inc) 1833 struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
1827{ 1834{
1828 int8_t etype; 1835 int8_t etype;
1829 1836
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
1833 epos->block = *eloc; 1840 epos->block = *eloc;
1834 epos->offset = sizeof(struct allocExtDesc); 1841 epos->offset = sizeof(struct allocExtDesc);
1835 brelse(epos->bh); 1842 brelse(epos->bh);
1836 block = udf_get_lb_pblock(inode->i_sb, epos->block, 0); 1843 block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
1837 epos->bh = udf_tread(inode->i_sb, block); 1844 epos->bh = udf_tread(inode->i_sb, block);
1838 if (!epos->bh) { 1845 if (!epos->bh) {
1839 udf_debug("reading block %d failed!\n", block); 1846 udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
1845} 1852}
1846 1853
1847int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, 1854int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
1848 kernel_lb_addr *eloc, uint32_t *elen, int inc) 1855 struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
1849{ 1856{
1850 int alen; 1857 int alen;
1851 int8_t etype; 1858 int8_t etype;
1852 uint8_t *ptr; 1859 uint8_t *ptr;
1853 short_ad *sad; 1860 struct short_ad *sad;
1854 long_ad *lad; 1861 struct long_ad *lad;
1855 struct udf_inode_info *iinfo = UDF_I(inode); 1862 struct udf_inode_info *iinfo = UDF_I(inode);
1856 1863
1857 if (!epos->bh) { 1864 if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
1900} 1907}
1901 1908
1902static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, 1909static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
1903 kernel_lb_addr neloc, uint32_t nelen) 1910 struct kernel_lb_addr neloc, uint32_t nelen)
1904{ 1911{
1905 kernel_lb_addr oeloc; 1912 struct kernel_lb_addr oeloc;
1906 uint32_t oelen; 1913 uint32_t oelen;
1907 int8_t etype; 1914 int8_t etype;
1908 1915
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
1910 get_bh(epos.bh); 1917 get_bh(epos.bh);
1911 1918
1912 while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { 1919 while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
1913 udf_write_aext(inode, &epos, neloc, nelen, 1); 1920 udf_write_aext(inode, &epos, &neloc, nelen, 1);
1914 neloc = oeloc; 1921 neloc = oeloc;
1915 nelen = (etype << 30) | oelen; 1922 nelen = (etype << 30) | oelen;
1916 } 1923 }
1917 udf_add_aext(inode, &epos, neloc, nelen, 1); 1924 udf_add_aext(inode, &epos, &neloc, nelen, 1);
1918 brelse(epos.bh); 1925 brelse(epos.bh);
1919 1926
1920 return (nelen >> 30); 1927 return (nelen >> 30);
1921} 1928}
1922 1929
1923int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, 1930int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1924 kernel_lb_addr eloc, uint32_t elen) 1931 struct kernel_lb_addr eloc, uint32_t elen)
1925{ 1932{
1926 struct extent_position oepos; 1933 struct extent_position oepos;
1927 int adsize; 1934 int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1936 1943
1937 iinfo = UDF_I(inode); 1944 iinfo = UDF_I(inode);
1938 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 1945 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
1939 adsize = sizeof(short_ad); 1946 adsize = sizeof(struct short_ad);
1940 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1947 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1941 adsize = sizeof(long_ad); 1948 adsize = sizeof(struct long_ad);
1942 else 1949 else
1943 adsize = 0; 1950 adsize = 0;
1944 1951
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1947 return -1; 1954 return -1;
1948 1955
1949 while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { 1956 while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
1950 udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1); 1957 udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
1951 if (oepos.bh != epos.bh) { 1958 if (oepos.bh != epos.bh) {
1952 oepos.block = epos.block; 1959 oepos.block = epos.block;
1953 brelse(oepos.bh); 1960 brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1956 oepos.offset = epos.offset - adsize; 1963 oepos.offset = epos.offset - adsize;
1957 } 1964 }
1958 } 1965 }
1959 memset(&eloc, 0x00, sizeof(kernel_lb_addr)); 1966 memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
1960 elen = 0; 1967 elen = 0;
1961 1968
1962 if (epos.bh != oepos.bh) { 1969 if (epos.bh != oepos.bh) {
1963 udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1); 1970 udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
1964 udf_write_aext(inode, &oepos, eloc, elen, 1); 1971 udf_write_aext(inode, &oepos, &eloc, elen, 1);
1965 udf_write_aext(inode, &oepos, eloc, elen, 1); 1972 udf_write_aext(inode, &oepos, &eloc, elen, 1);
1966 if (!oepos.bh) { 1973 if (!oepos.bh) {
1967 iinfo->i_lenAlloc -= (adsize * 2); 1974 iinfo->i_lenAlloc -= (adsize * 2);
1968 mark_inode_dirty(inode); 1975 mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1979 mark_buffer_dirty_inode(oepos.bh, inode); 1986 mark_buffer_dirty_inode(oepos.bh, inode);
1980 } 1987 }
1981 } else { 1988 } else {
1982 udf_write_aext(inode, &oepos, eloc, elen, 1); 1989 udf_write_aext(inode, &oepos, &eloc, elen, 1);
1983 if (!oepos.bh) { 1990 if (!oepos.bh) {
1984 iinfo->i_lenAlloc -= adsize; 1991 iinfo->i_lenAlloc -= adsize;
1985 mark_inode_dirty(inode); 1992 mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2004} 2011}
2005 2012
2006int8_t inode_bmap(struct inode *inode, sector_t block, 2013int8_t inode_bmap(struct inode *inode, sector_t block,
2007 struct extent_position *pos, kernel_lb_addr *eloc, 2014 struct extent_position *pos, struct kernel_lb_addr *eloc,
2008 uint32_t *elen, sector_t *offset) 2015 uint32_t *elen, sector_t *offset)
2009{ 2016{
2010 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 2017 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
2036 2043
2037long udf_block_map(struct inode *inode, sector_t block) 2044long udf_block_map(struct inode *inode, sector_t block)
2038{ 2045{
2039 kernel_lb_addr eloc; 2046 struct kernel_lb_addr eloc;
2040 uint32_t elen; 2047 uint32_t elen;
2041 sector_t offset; 2048 sector_t offset;
2042 struct extent_position epos = {}; 2049 struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2046 2053
2047 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2054 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2048 (EXT_RECORDED_ALLOCATED >> 30)) 2055 (EXT_RECORDED_ALLOCATED >> 30))
2049 ret = udf_get_lb_pblock(inode->i_sb, eloc, offset); 2056 ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
2050 else 2057 else
2051 ret = 0; 2058 ret = 0;
2052 2059
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
134 } 134 }
135 } 135 }
136 /* rewrite CRC + checksum of eahd */ 136 /* rewrite CRC + checksum of eahd */
137 crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag); 137 crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
138 eahd->descTag.descCRCLength = cpu_to_le16(crclen); 138 eahd->descTag.descCRCLength = cpu_to_le16(crclen);
139 eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + 139 eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
140 sizeof(tag), crclen)); 140 sizeof(struct tag), crclen));
141 eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); 141 eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
142 iinfo->i_lenEAttr += size; 142 iinfo->i_lenEAttr += size;
143 return (struct genericFormat *)&ea[offset]; 143 return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
202struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, 202struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
203 uint32_t location, uint16_t *ident) 203 uint32_t location, uint16_t *ident)
204{ 204{
205 tag *tag_p; 205 struct tag *tag_p;
206 struct buffer_head *bh = NULL; 206 struct buffer_head *bh = NULL;
207 207
208 /* Read the block */ 208 /* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
216 return NULL; 216 return NULL;
217 } 217 }
218 218
219 tag_p = (tag *)(bh->b_data); 219 tag_p = (struct tag *)(bh->b_data);
220 220
221 *ident = le16_to_cpu(tag_p->tagIdent); 221 *ident = le16_to_cpu(tag_p->tagIdent);
222 222
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
241 } 241 }
242 242
243 /* Verify the descriptor CRC */ 243 /* Verify the descriptor CRC */
244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || 244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, 245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
246 bh->b_data + sizeof(tag), 246 bh->b_data + sizeof(struct tag),
247 le16_to_cpu(tag_p->descCRCLength))) 247 le16_to_cpu(tag_p->descCRCLength)))
248 return bh; 248 return bh;
249 249
@@ -255,27 +255,28 @@ error_out:
255 return NULL; 255 return NULL;
256} 256}
257 257
258struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc, 258struct buffer_head *udf_read_ptagged(struct super_block *sb,
259 struct kernel_lb_addr *loc,
259 uint32_t offset, uint16_t *ident) 260 uint32_t offset, uint16_t *ident)
260{ 261{
261 return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), 262 return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
262 loc.logicalBlockNum + offset, ident); 263 loc->logicalBlockNum + offset, ident);
263} 264}
264 265
265void udf_update_tag(char *data, int length) 266void udf_update_tag(char *data, int length)
266{ 267{
267 tag *tptr = (tag *)data; 268 struct tag *tptr = (struct tag *)data;
268 length -= sizeof(tag); 269 length -= sizeof(struct tag);
269 270
270 tptr->descCRCLength = cpu_to_le16(length); 271 tptr->descCRCLength = cpu_to_le16(length);
271 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length)); 272 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
272 tptr->tagChecksum = udf_tag_checksum(tptr); 273 tptr->tagChecksum = udf_tag_checksum(tptr);
273} 274}
274 275
275void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, 276void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
276 uint32_t loc, int length) 277 uint32_t loc, int length)
277{ 278{
278 tag *tptr = (tag *)data; 279 struct tag *tptr = (struct tag *)data;
279 tptr->tagIdent = cpu_to_le16(ident); 280 tptr->tagIdent = cpu_to_le16(ident);
280 tptr->descVersion = cpu_to_le16(version); 281 tptr->descVersion = cpu_to_le16(version);
281 tptr->tagSerialNum = cpu_to_le16(snum); 282 tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
283 udf_update_tag(data, length); 284 udf_update_tag(data, length);
284} 285}
285 286
286u8 udf_tag_checksum(const tag *t) 287u8 udf_tag_checksum(const struct tag *t)
287{ 288{
288 u8 *data = (u8 *)t; 289 u8 *data = (u8 *)t;
289 u8 checksum = 0; 290 u8 checksum = 0;
290 int i; 291 int i;
291 for (i = 0; i < sizeof(tag); ++i) 292 for (i = 0; i < sizeof(struct tag); ++i)
292 if (i != 4) /* position of checksum */ 293 if (i != 4) /* position of checksum */
293 checksum += data[i]; 294 checksum += data[i];
294 return checksum; 295 return checksum;
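
udf_tag_checksum() above is small enough to restate verbatim in userspace: a byte-wise sum over the 16-byte descriptor tag that skips the checksum byte itself at offset 4.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same arithmetic as udf_tag_checksum() in the hunk above. */
static uint8_t tag_checksum(const uint8_t tag[16])
{
	uint8_t sum = 0;
	int i;

	for (i = 0; i < 16; i++)
		if (i != 4)		/* position of tagChecksum */
			sum += tag[i];
	return sum;
}

int main(void)
{
	uint8_t tag[16];

	memset(tag, 0x11, sizeof(tag));
	printf("checksum = 0x%02x\n", tag_checksum(tag));
	return 0;
}
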
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
47 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, 47 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
48 uint8_t *impuse, uint8_t *fileident) 48 uint8_t *impuse, uint8_t *fileident)
49{ 49{
50 uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag); 50 uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
51 uint16_t crc; 51 uint16_t crc;
52 int offset; 52 int offset;
53 uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); 53 uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
99 memset(fibh->ebh->b_data, 0x00, padlen + offset); 99 memset(fibh->ebh->b_data, 0x00, padlen + offset);
100 } 100 }
101 101
102 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag), 102 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
103 sizeof(struct fileIdentDesc) - sizeof(tag)); 103 sizeof(struct fileIdentDesc) - sizeof(struct tag));
104 104
105 if (fibh->sbh == fibh->ebh) { 105 if (fibh->sbh == fibh->ebh) {
106 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, 106 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
107 crclen + sizeof(tag) - 107 crclen + sizeof(struct tag) -
108 sizeof(struct fileIdentDesc)); 108 sizeof(struct fileIdentDesc));
109 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { 109 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
110 crc = crc_itu_t(crc, fibh->ebh->b_data + 110 crc = crc_itu_t(crc, fibh->ebh->b_data +
111 sizeof(struct fileIdentDesc) + 111 sizeof(struct fileIdentDesc) +
112 fibh->soffset, 112 fibh->soffset,
113 crclen + sizeof(tag) - 113 crclen + sizeof(struct tag) -
114 sizeof(struct fileIdentDesc)); 114 sizeof(struct fileIdentDesc));
115 } else { 115 } else {
116 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, 116 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
157 kernel_lb_addr eloc; 157 struct kernel_lb_addr eloc;
158 uint32_t elen; 158 uint32_t elen;
159 sector_t offset; 159 sector_t offset;
160 struct extent_position epos = {}; 160 struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
171 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 171 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
172 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) 172 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
173 goto out_err; 173 goto out_err;
174 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 174 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
175 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 175 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
176 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 176 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
177 epos.offset -= sizeof(short_ad); 177 epos.offset -= sizeof(struct short_ad);
178 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 178 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
179 epos.offset -= sizeof(long_ad); 179 epos.offset -= sizeof(struct long_ad);
180 } else 180 } else
181 offset = 0; 181 offset = 0;
182 182
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
268#ifdef UDF_RECOVERY 268#ifdef UDF_RECOVERY
269 /* temporary shorthand for specifying files by inode number */ 269 /* temporary shorthand for specifying files by inode number */
270 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 270 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
271 kernel_lb_addr lb = { 271 struct kernel_lb_addr lb = {
272 .logicalBlockNum = 0, 272 .logicalBlockNum = 0,
273 .partitionReferenceNum = 273 .partitionReferenceNum =
274 simple_strtoul(dentry->d_name.name + 3, 274 simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
283#endif /* UDF_RECOVERY */ 283#endif /* UDF_RECOVERY */
284 284
285 if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { 285 if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
286 struct kernel_lb_addr loc;
287
286 if (fibh.sbh != fibh.ebh) 288 if (fibh.sbh != fibh.ebh)
287 brelse(fibh.ebh); 289 brelse(fibh.ebh);
288 brelse(fibh.sbh); 290 brelse(fibh.sbh);
289 291
290 inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation)); 292 loc = lelb_to_cpu(cfi.icb.extLocation);
293 inode = udf_iget(dir->i_sb, &loc);
291 if (!inode) { 294 if (!inode) {
292 unlock_kernel(); 295 unlock_kernel();
293 return ERR_PTR(-EACCES); 296 return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
313 uint8_t lfi; 316 uint8_t lfi;
314 uint16_t liu; 317 uint16_t liu;
315 int block; 318 int block;
316 kernel_lb_addr eloc; 319 struct kernel_lb_addr eloc;
317 uint32_t elen = 0; 320 uint32_t elen = 0;
318 sector_t offset; 321 sector_t offset;
319 struct extent_position epos = {}; 322 struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
351 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 354 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
352 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { 355 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
353 block = udf_get_lb_pblock(dir->i_sb, 356 block = udf_get_lb_pblock(dir->i_sb,
354 dinfo->i_location, 0); 357 &dinfo->i_location, 0);
355 fibh->soffset = fibh->eoffset = sb->s_blocksize; 358 fibh->soffset = fibh->eoffset = sb->s_blocksize;
356 goto add; 359 goto add;
357 } 360 }
358 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 361 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
359 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 362 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
360 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 363 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
361 epos.offset -= sizeof(short_ad); 364 epos.offset -= sizeof(struct short_ad);
362 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 365 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
363 epos.offset -= sizeof(long_ad); 366 epos.offset -= sizeof(struct long_ad);
364 } else 367 } else
365 offset = 0; 368 offset = 0;
366 369
@@ -409,10 +412,10 @@ add:
409 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { 412 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
410 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); 413 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
411 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 414 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
412 epos.offset -= sizeof(short_ad); 415 epos.offset -= sizeof(struct short_ad);
413 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 416 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
414 epos.offset -= sizeof(long_ad); 417 epos.offset -= sizeof(struct long_ad);
415 udf_write_aext(dir, &epos, eloc, elen, 1); 418 udf_write_aext(dir, &epos, &eloc, elen, 1);
416 } 419 }
417 f_pos += nfidlen; 420 f_pos += nfidlen;
418 421
@@ -494,10 +497,10 @@ add:
494 memset(cfi, 0, sizeof(struct fileIdentDesc)); 497 memset(cfi, 0, sizeof(struct fileIdentDesc));
495 if (UDF_SB(sb)->s_udfrev >= 0x0200) 498 if (UDF_SB(sb)->s_udfrev >= 0x0200)
496 udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, 499 udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
497 sizeof(tag)); 500 sizeof(struct tag));
498 else 501 else
499 udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, 502 udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
500 sizeof(tag)); 503 sizeof(struct tag));
501 cfi->fileVersionNum = cpu_to_le16(1); 504 cfi->fileVersionNum = cpu_to_le16(1);
502 cfi->lengthFileIdent = namelen; 505 cfi->lengthFileIdent = namelen;
503 cfi->lengthOfImpUse = cpu_to_le16(0); 506 cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
530 cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; 533 cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
531 534
532 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) 535 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
533 memset(&(cfi->icb), 0x00, sizeof(long_ad)); 536 memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
534 537
535 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); 538 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
536} 539}
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
710 loff_t f_pos; 713 loff_t f_pos;
711 loff_t size = udf_ext0_offset(dir) + dir->i_size; 714 loff_t size = udf_ext0_offset(dir) + dir->i_size;
712 int block; 715 int block;
713 kernel_lb_addr eloc; 716 struct kernel_lb_addr eloc;
714 uint32_t elen; 717 uint32_t elen;
715 sector_t offset; 718 sector_t offset;
716 struct extent_position epos = {}; 719 struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
724 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, 727 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
725 &epos, &eloc, &elen, &offset) == 728 &epos, &eloc, &elen, &offset) ==
726 (EXT_RECORDED_ALLOCATED >> 30)) { 729 (EXT_RECORDED_ALLOCATED >> 30)) {
727 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 730 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
728 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 731 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
729 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 732 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
730 epos.offset -= sizeof(short_ad); 733 epos.offset -= sizeof(struct short_ad);
731 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 734 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
732 epos.offset -= sizeof(long_ad); 735 epos.offset -= sizeof(struct long_ad);
733 } else 736 } else
734 offset = 0; 737 offset = 0;
735 738
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
778 struct inode *inode = dentry->d_inode; 781 struct inode *inode = dentry->d_inode;
779 struct udf_fileident_bh fibh; 782 struct udf_fileident_bh fibh;
780 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
781 kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
782 785
783 retval = -ENOENT; 786 retval = -ENOENT;
784 lock_kernel(); 787 lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
788 791
789 retval = -EIO; 792 retval = -EIO;
790 tloc = lelb_to_cpu(cfi.icb.extLocation); 793 tloc = lelb_to_cpu(cfi.icb.extLocation);
791 if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) 794 if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
792 goto end_rmdir; 795 goto end_rmdir;
793 retval = -ENOTEMPTY; 796 retval = -ENOTEMPTY;
794 if (!empty_dir(inode)) 797 if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
824 struct udf_fileident_bh fibh; 827 struct udf_fileident_bh fibh;
825 struct fileIdentDesc *fi; 828 struct fileIdentDesc *fi;
826 struct fileIdentDesc cfi; 829 struct fileIdentDesc cfi;
827 kernel_lb_addr tloc; 830 struct kernel_lb_addr tloc;
828 831
829 retval = -ENOENT; 832 retval = -ENOENT;
830 lock_kernel(); 833 lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
834 837
835 retval = -EIO; 838 retval = -EIO;
836 tloc = lelb_to_cpu(cfi.icb.extLocation); 839 tloc = lelb_to_cpu(cfi.icb.extLocation);
837 if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) 840 if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
838 goto end_unlink; 841 goto end_unlink;
839 842
840 if (!inode->i_nlink) { 843 if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
897 inode->i_op = &page_symlink_inode_operations; 900 inode->i_op = &page_symlink_inode_operations;
898 901
899 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 902 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
900 kernel_lb_addr eloc; 903 struct kernel_lb_addr eloc;
901 uint32_t bsize; 904 uint32_t bsize;
902 905
903 block = udf_new_block(inode->i_sb, inode, 906 block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
913 iinfo->i_location.partitionReferenceNum; 916 iinfo->i_location.partitionReferenceNum;
914 bsize = inode->i_sb->s_blocksize; 917 bsize = inode->i_sb->s_blocksize;
915 iinfo->i_lenExtents = bsize; 918 iinfo->i_lenExtents = bsize;
916 udf_add_aext(inode, &epos, eloc, bsize, 0); 919 udf_add_aext(inode, &epos, &eloc, bsize, 0);
917 brelse(epos.bh); 920 brelse(epos.bh);
918 921
919 block = udf_get_pblock(inode->i_sb, block, 922 block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1108 struct fileIdentDesc ocfi, ncfi; 1111 struct fileIdentDesc ocfi, ncfi;
1109 struct buffer_head *dir_bh = NULL; 1112 struct buffer_head *dir_bh = NULL;
1110 int retval = -ENOENT; 1113 int retval = -ENOENT;
1111 kernel_lb_addr tloc; 1114 struct kernel_lb_addr tloc;
1112 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1115 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1113 1116
1114 lock_kernel(); 1117 lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1119 brelse(ofibh.sbh); 1122 brelse(ofibh.sbh);
1120 } 1123 }
1121 tloc = lelb_to_cpu(ocfi.icb.extLocation); 1124 tloc = lelb_to_cpu(ocfi.icb.extLocation);
1122 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) 1125 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
1123 != old_inode->i_ino) 1126 != old_inode->i_ino)
1124 goto end_rename; 1127 goto end_rename;
1125 1128
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1158 if (!dir_fi) 1161 if (!dir_fi)
1159 goto end_rename; 1162 goto end_rename;
1160 tloc = lelb_to_cpu(dir_fi->icb.extLocation); 1163 tloc = lelb_to_cpu(dir_fi->icb.extLocation);
1161 if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != 1164 if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
1162 old_dir->i_ino) 1165 old_dir->i_ino)
1163 goto end_rename; 1166 goto end_rename;
1164 1167
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1187 */ 1190 */
1188 ncfi.fileVersionNum = ocfi.fileVersionNum; 1191 ncfi.fileVersionNum = ocfi.fileVersionNum;
1189 ncfi.fileCharacteristics = ocfi.fileCharacteristics; 1192 ncfi.fileCharacteristics = ocfi.fileCharacteristics;
1190 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); 1193 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
1191 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); 1194 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
1192 1195
1193 /* The old fid may have moved - find it again */ 1196 /* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
1242 1245
1243static struct dentry *udf_get_parent(struct dentry *child) 1246static struct dentry *udf_get_parent(struct dentry *child)
1244{ 1247{
1248 struct kernel_lb_addr tloc;
1245 struct inode *inode = NULL; 1249 struct inode *inode = NULL;
1246 struct qstr dotdot = {.name = "..", .len = 2}; 1250 struct qstr dotdot = {.name = "..", .len = 2};
1247 struct fileIdentDesc cfi; 1251 struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
1255 brelse(fibh.ebh); 1259 brelse(fibh.ebh);
1256 brelse(fibh.sbh); 1260 brelse(fibh.sbh);
1257 1261
1258 inode = udf_iget(child->d_inode->i_sb, 1262 tloc = lelb_to_cpu(cfi.icb.extLocation);
1259 lelb_to_cpu(cfi.icb.extLocation)); 1263 inode = udf_iget(child->d_inode->i_sb, &tloc);
1260 if (!inode) 1264 if (!inode)
1261 goto out_unlock; 1265 goto out_unlock;
1262 unlock_kernel(); 1266 unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1272 u16 partref, __u32 generation) 1276 u16 partref, __u32 generation)
1273{ 1277{
1274 struct inode *inode; 1278 struct inode *inode;
1275 kernel_lb_addr loc; 1279 struct kernel_lb_addr loc;
1276 1280
1277 if (block == 0) 1281 if (block == 0)
1278 return ERR_PTR(-ESTALE); 1282 return ERR_PTR(-ESTALE);
1279 1283
1280 loc.logicalBlockNum = block; 1284 loc.logicalBlockNum = block;
1281 loc.partitionReferenceNum = partref; 1285 loc.partitionReferenceNum = partref;
1282 inode = udf_iget(sb, loc); 1286 inode = udf_iget(sb, &loc);
1283 1287
1284 if (inode == NULL) 1288 if (inode == NULL)
1285 return ERR_PTR(-ENOMEM); 1289 return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1318{ 1322{
1319 int len = *lenp; 1323 int len = *lenp;
1320 struct inode *inode = de->d_inode; 1324 struct inode *inode = de->d_inode;
1321 kernel_lb_addr location = UDF_I(inode)->i_location; 1325 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1322 struct fid *fid = (struct fid *)fh; 1326 struct fid *fid = (struct fid *)fh;
1323 int type = FILEID_UDF_WITHOUT_PARENT; 1327 int type = FILEID_UDF_WITHOUT_PARENT;
1324 1328
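
The namei.c hunks above all apply one mechanical change: udf_iget() and udf_get_lb_pblock() now take a struct kernel_lb_addr pointer instead of the structure by value, so each call site first decodes the on-disk little-endian address into a named local. A condensed before/after sketch of that call-site pattern (identifiers taken from the hunks; surrounding code elided):

    struct kernel_lb_addr loc;

    /* before: the address structure is copied by value on every call */
    inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));

    /* after: decode once into a local, then pass its address */
    loc = lelb_to_cpu(cfi.icb.extLocation);
    inode = udf_iget(dir->i_sb, &loc);
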
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ 85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
86/* Implementation Use (UDF 2.50 2.2.6.4) */ 86/* Implementation Use (UDF 2.50 2.2.6.4) */
87struct logicalVolIntegrityDescImpUse { 87struct logicalVolIntegrityDescImpUse {
88 regid impIdent; 88 struct regid impIdent;
89 __le32 numFiles; 89 __le32 numFiles;
90 __le32 numDirs; 90 __le32 numDirs;
91 __le16 minUDFReadRev; 91 __le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ 97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
98/* Implementation Use (UDF 2.50 2.2.7.2) */ 98/* Implementation Use (UDF 2.50 2.2.7.2) */
99struct impUseVolDescImpUse { 99struct impUseVolDescImpUse {
100 charspec LVICharset; 100 struct charspec LVICharset;
101 dstring logicalVolIdent[128]; 101 dstring logicalVolIdent[128];
102 dstring LVInfo1[36]; 102 dstring LVInfo1[36];
103 dstring LVInfo2[36]; 103 dstring LVInfo2[36];
104 dstring LVInfo3[36]; 104 dstring LVInfo3[36];
105 regid impIdent; 105 struct regid impIdent;
106 uint8_t impUse[128]; 106 uint8_t impUse[128];
107} __attribute__ ((packed)); 107} __attribute__ ((packed));
108 108
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
110 uint8_t partitionMapType; 110 uint8_t partitionMapType;
111 uint8_t partitionMapLength; 111 uint8_t partitionMapLength;
112 uint8_t reserved1[2]; 112 uint8_t reserved1[2];
113 regid partIdent; 113 struct regid partIdent;
114 __le16 volSeqNum; 114 __le16 volSeqNum;
115 __le16 partitionNum; 115 __le16 partitionNum;
116} __attribute__ ((packed)); 116} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
120 uint8_t partitionMapType; 120 uint8_t partitionMapType;
121 uint8_t partitionMapLength; 121 uint8_t partitionMapLength;
122 uint8_t reserved1[2]; 122 uint8_t reserved1[2];
123 regid partIdent; 123 struct regid partIdent;
124 __le16 volSeqNum; 124 __le16 volSeqNum;
125 __le16 partitionNum; 125 __le16 partitionNum;
126 uint8_t reserved2[24]; 126 uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
131 uint8_t partitionMapType; 131 uint8_t partitionMapType;
132 uint8_t partitionMapLength; 132 uint8_t partitionMapLength;
133 uint8_t reserved1[2]; 133 uint8_t reserved1[2];
134 regid partIdent; 134 struct regid partIdent;
135 __le16 volSeqNum; 135 __le16 volSeqNum;
136 __le16 partitionNum; 136 __le16 partitionNum;
137 __le16 packetLength; 137 __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
146 uint8_t partitionMapType; 146 uint8_t partitionMapType;
147 uint8_t partitionMapLength; 147 uint8_t partitionMapLength;
148 uint8_t reserved1[2]; 148 uint8_t reserved1[2];
149 regid partIdent; 149 struct regid partIdent;
150 __le16 volSeqNum; 150 __le16 volSeqNum;
151 __le16 partitionNum; 151 __le16 partitionNum;
152 __le32 metadataFileLoc; 152 __le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
161/* Virtual Allocation Table (UDF 1.5 2.2.10) */ 161/* Virtual Allocation Table (UDF 1.5 2.2.10) */
162struct virtualAllocationTable15 { 162struct virtualAllocationTable15 {
163 __le32 VirtualSector[0]; 163 __le32 VirtualSector[0];
164 regid vatIdent; 164 struct regid vatIdent;
165 __le32 previousVATICBLoc; 165 __le32 previousVATICBLoc;
166} __attribute__ ((packed)); 166} __attribute__ ((packed));
167 167
@@ -192,8 +192,8 @@ struct sparingEntry {
192} __attribute__ ((packed)); 192} __attribute__ ((packed));
193 193
194struct sparingTable { 194struct sparingTable {
195 tag descTag; 195 struct tag descTag;
196 regid sparingIdent; 196 struct regid sparingIdent;
197 __le16 reallocationTableLen; 197 __le16 reallocationTableLen;
198 __le16 reserved; 198 __le16 reserved;
199 __le32 sequenceNum; 199 __le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
206#define ICBTAG_FILE_TYPE_MIRROR 0xFB 206#define ICBTAG_FILE_TYPE_MIRROR 0xFB
207#define ICBTAG_FILE_TYPE_BITMAP 0xFC 207#define ICBTAG_FILE_TYPE_BITMAP 0xFC
208 208
209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ 209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
210struct allocDescImpUse { 210struct allocDescImpUse {
211 __le16 flags; 211 __le16 flags;
212 uint8_t impUse[4]; 212 uint8_t impUse[4];
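
The osta_udf.h hunks only spell out struct tags for the former typedefs (regid, charspec, tag); the on-disk layout is untouched because every descriptor keeps its __attribute__ ((packed)). A small standalone illustration of why the attribute matters for on-media formats like these (generic field types chosen for the example, not an actual UDF descriptor):

    #include <assert.h>
    #include <stdint.h>

    struct loose  { uint8_t type; uint32_t loc; };                 /* padded */
    struct ondisk { uint8_t type; uint32_t loc; } __attribute__ ((packed));

    int main(void)
    {
            assert(sizeof(struct ondisk) == 5);  /* matches the byte stream */
            assert(sizeof(struct loose) >= 5);   /* typically 8: 3 pad bytes */
            return 0;
    }
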
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
273{ 273{
274 struct super_block *sb = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map; 275 struct udf_part_map *map;
276 kernel_lb_addr eloc; 276 struct kernel_lb_addr eloc;
277 uint32_t elen; 277 uint32_t elen;
278 sector_t ext_offset; 278 sector_t ext_offset;
279 struct extent_position epos = {}; 279 struct extent_position epos = {};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
81/* These are the "meat" - everything else is stuffing */ 81/* These are the "meat" - everything else is stuffing */
82static int udf_fill_super(struct super_block *, void *, int); 82static int udf_fill_super(struct super_block *, void *, int);
83static void udf_put_super(struct super_block *); 83static void udf_put_super(struct super_block *);
84static void udf_write_super(struct super_block *); 84static int udf_sync_fs(struct super_block *, int);
85static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
86static int udf_check_valid(struct super_block *, int, int); 86static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
87static int udf_vrs(struct super_block *sb, int silent); 87static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88 struct kernel_lb_addr *);
89static void udf_find_anchor(struct super_block *);
90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
91 kernel_lb_addr *);
92static void udf_load_fileset(struct super_block *, struct buffer_head *, 89static void udf_load_fileset(struct super_block *, struct buffer_head *,
93 kernel_lb_addr *); 90 struct kernel_lb_addr *);
94static void udf_open_lvid(struct super_block *); 91static void udf_open_lvid(struct super_block *);
95static void udf_close_lvid(struct super_block *); 92static void udf_close_lvid(struct super_block *);
96static unsigned int udf_count_free(struct super_block *); 93static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
181 .delete_inode = udf_delete_inode, 178 .delete_inode = udf_delete_inode,
182 .clear_inode = udf_clear_inode, 179 .clear_inode = udf_clear_inode,
183 .put_super = udf_put_super, 180 .put_super = udf_put_super,
184 .write_super = udf_write_super, 181 .sync_fs = udf_sync_fs,
185 .statfs = udf_statfs, 182 .statfs = udf_statfs,
186 .remount_fs = udf_remount_fs, 183 .remount_fs = udf_remount_fs,
187 .show_options = udf_show_options, 184 .show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
201 mode_t umask; 198 mode_t umask;
202 gid_t gid; 199 gid_t gid;
203 uid_t uid; 200 uid_t uid;
201 mode_t fmode;
202 mode_t dmode;
204 struct nls_table *nls_map; 203 struct nls_table *nls_map;
205}; 204};
206 205
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
258 257
259 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) 258 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
260 seq_puts(seq, ",nostrict"); 259 seq_puts(seq, ",nostrict");
261 if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) 260 if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
262 seq_printf(seq, ",bs=%lu", sb->s_blocksize); 261 seq_printf(seq, ",bs=%lu", sb->s_blocksize);
263 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) 262 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
264 seq_puts(seq, ",unhide"); 263 seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
282 seq_printf(seq, ",gid=%u", sbi->s_gid); 281 seq_printf(seq, ",gid=%u", sbi->s_gid);
283 if (sbi->s_umask != 0) 282 if (sbi->s_umask != 0)
284 seq_printf(seq, ",umask=%o", sbi->s_umask); 283 seq_printf(seq, ",umask=%o", sbi->s_umask);
284 if (sbi->s_fmode != UDF_INVALID_MODE)
285 seq_printf(seq, ",mode=%o", sbi->s_fmode);
286 if (sbi->s_dmode != UDF_INVALID_MODE)
287 seq_printf(seq, ",dmode=%o", sbi->s_dmode);
285 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) 288 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
286 seq_printf(seq, ",session=%u", sbi->s_session); 289 seq_printf(seq, ",session=%u", sbi->s_session);
287 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) 290 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
288 seq_printf(seq, ",lastblock=%u", sbi->s_last_block); 291 seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
289 /* 292 if (sbi->s_anchor != 0)
290 * s_anchor[2] could be zeroed out in case there is no anchor 293 seq_printf(seq, ",anchor=%u", sbi->s_anchor);
291 * in the specified block, but then the "anchor=N" option
292 * originally given by the user wasn't effective, so it's OK
293 * if we don't show it.
294 */
295 if (sbi->s_anchor[2] != 0)
296 seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
297 /* 294 /*
298 * volume, partition, fileset and rootdir seem to be ignored 295 * volume, partition, fileset and rootdir seem to be ignored
299 * currently 296 * currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
317 * 314 *
318 * gid= Set the default group. 315 * gid= Set the default group.
319 * umask= Set the default umask. 316 * umask= Set the default umask.
317 * mode= Set the default file permissions.
318 * dmode= Set the default directory permissions.
320 * uid= Set the default user. 319 * uid= Set the default user.
321 * bs= Set the block size. 320 * bs= Set the block size.
322 * unhide Show otherwise hidden files. 321 * unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
366 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 365 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
367 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 366 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
368 Opt_rootdir, Opt_utf8, Opt_iocharset, 367 Opt_rootdir, Opt_utf8, Opt_iocharset,
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 368 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
369 Opt_fmode, Opt_dmode
370}; 370};
371 371
372static const match_table_t tokens = { 372static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
395 {Opt_rootdir, "rootdir=%u"}, 395 {Opt_rootdir, "rootdir=%u"},
396 {Opt_utf8, "utf8"}, 396 {Opt_utf8, "utf8"},
397 {Opt_iocharset, "iocharset=%s"}, 397 {Opt_iocharset, "iocharset=%s"},
398 {Opt_fmode, "mode=%o"},
399 {Opt_dmode, "dmode=%o"},
398 {Opt_err, NULL} 400 {Opt_err, NULL}
399}; 401};
400 402
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
405 int option; 407 int option;
406 408
407 uopt->novrs = 0; 409 uopt->novrs = 0;
408 uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
409 uopt->partition = 0xFFFF; 410 uopt->partition = 0xFFFF;
410 uopt->session = 0xFFFFFFFF; 411 uopt->session = 0xFFFFFFFF;
411 uopt->lastblock = 0; 412 uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
428 switch (token) { 429 switch (token) {
429 case Opt_novrs: 430 case Opt_novrs:
430 uopt->novrs = 1; 431 uopt->novrs = 1;
432 break;
431 case Opt_bs: 433 case Opt_bs:
432 if (match_int(&args[0], &option)) 434 if (match_int(&args[0], &option))
433 return 0; 435 return 0;
434 uopt->blocksize = option; 436 uopt->blocksize = option;
437 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
435 break; 438 break;
436 case Opt_unhide: 439 case Opt_unhide:
437 uopt->flags |= (1 << UDF_FLAG_UNHIDE); 440 uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
531 case Opt_gforget: 534 case Opt_gforget:
532 uopt->flags |= (1 << UDF_FLAG_GID_FORGET); 535 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
533 break; 536 break;
537 case Opt_fmode:
538 if (match_octal(args, &option))
539 return 0;
540 uopt->fmode = option & 0777;
541 break;
542 case Opt_dmode:
543 if (match_octal(args, &option))
544 return 0;
545 uopt->dmode = option & 0777;
546 break;
534 default: 547 default:
535 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 printk(KERN_ERR "udf: bad mount option \"%s\" "
536 "or missing value\n", p); 549 "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
540 return 1; 553 return 1;
541} 554}
542 555
543static void udf_write_super(struct super_block *sb)
544{
545 lock_kernel();
546
547 if (!(sb->s_flags & MS_RDONLY))
548 udf_open_lvid(sb);
549 sb->s_dirt = 0;
550
551 unlock_kernel();
552}
553
554static int udf_remount_fs(struct super_block *sb, int *flags, char *options) 556static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
555{ 557{
556 struct udf_options uopt; 558 struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
560 uopt.uid = sbi->s_uid; 562 uopt.uid = sbi->s_uid;
561 uopt.gid = sbi->s_gid; 563 uopt.gid = sbi->s_gid;
562 uopt.umask = sbi->s_umask; 564 uopt.umask = sbi->s_umask;
565 uopt.fmode = sbi->s_fmode;
566 uopt.dmode = sbi->s_dmode;
563 567
564 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
565 return -EINVAL; 569 return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 sbi->s_uid = uopt.uid; 572 sbi->s_uid = uopt.uid;
569 sbi->s_gid = uopt.gid; 573 sbi->s_gid = uopt.gid;
570 sbi->s_umask = uopt.umask; 574 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode;
571 577
572 if (sbi->s_lvid_bh) { 578 if (sbi->s_lvid_bh) {
573 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
585 return 0; 591 return 0;
586} 592}
587 593
588static int udf_vrs(struct super_block *sb, int silent) 594/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
595/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
596static loff_t udf_check_vsd(struct super_block *sb)
589{ 597{
590 struct volStructDesc *vsd = NULL; 598 struct volStructDesc *vsd = NULL;
591 loff_t sector = 32768; 599 loff_t sector = 32768;
592 int sectorsize; 600 int sectorsize;
593 struct buffer_head *bh = NULL; 601 struct buffer_head *bh = NULL;
594 int iso9660 = 0;
595 int nsr02 = 0; 602 int nsr02 = 0;
596 int nsr03 = 0; 603 int nsr03 = 0;
597 struct udf_sb_info *sbi; 604 struct udf_sb_info *sbi;
598 605
599 /* Block size must be a multiple of 512 */
600 if (sb->s_blocksize & 511)
601 return 0;
602 sbi = UDF_SB(sb); 606 sbi = UDF_SB(sb);
603
604 if (sb->s_blocksize < sizeof(struct volStructDesc)) 607 if (sb->s_blocksize < sizeof(struct volStructDesc))
605 sectorsize = sizeof(struct volStructDesc); 608 sectorsize = sizeof(struct volStructDesc);
606 else 609 else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
627 break; 630 break;
628 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, 631 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
629 VSD_STD_ID_LEN)) { 632 VSD_STD_ID_LEN)) {
630 iso9660 = sector;
631 switch (vsd->structType) { 633 switch (vsd->structType) {
632 case 0: 634 case 0:
633 udf_debug("ISO9660 Boot Record found\n"); 635 udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
679 return 0; 681 return 0;
680} 682}
681 683
682/*
683 * Check whether there is an anchor block in the given block
684 */
685static int udf_check_anchor_block(struct super_block *sb, sector_t block)
686{
687 struct buffer_head *bh;
688 uint16_t ident;
689
690 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
691 udf_fixed_to_variable(block) >=
692 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
693 return 0;
694
695 bh = udf_read_tagged(sb, block, block, &ident);
696 if (!bh)
697 return 0;
698 brelse(bh);
699
700 return ident == TAG_IDENT_AVDP;
701}
702
703/* Search for an anchor volume descriptor pointer */
704static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
705{
706 sector_t last[6];
707 int i;
708 struct udf_sb_info *sbi = UDF_SB(sb);
709
710 last[0] = lastblock;
711 last[1] = last[0] - 1;
712 last[2] = last[0] + 1;
713 last[3] = last[0] - 2;
714 last[4] = last[0] - 150;
715 last[5] = last[0] - 152;
716
717 /* according to spec, anchor is in either:
718 * block 256
719 * lastblock-256
720 * lastblock
721 * however, if the disc isn't closed, it could be 512 */
722
723 for (i = 0; i < ARRAY_SIZE(last); i++) {
724 if (last[i] < 0)
725 continue;
726 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
727 sb->s_blocksize_bits)
728 continue;
729
730 if (udf_check_anchor_block(sb, last[i])) {
731 sbi->s_anchor[0] = last[i];
732 sbi->s_anchor[1] = last[i] - 256;
733 return last[i];
734 }
735
736 if (last[i] < 256)
737 continue;
738
739 if (udf_check_anchor_block(sb, last[i] - 256)) {
740 sbi->s_anchor[1] = last[i] - 256;
741 return last[i];
742 }
743 }
744
745 if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
746 sbi->s_anchor[0] = sbi->s_session + 256;
747 return last[0];
748 }
749 if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
750 sbi->s_anchor[0] = sbi->s_session + 512;
751 return last[0];
752 }
753 return 0;
754}
755
756/*
757 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
758 * be the last block on the media.
759 *
760 * Return 1 if not found, 0 if ok
761 *
762 */
763static void udf_find_anchor(struct super_block *sb)
764{
765 sector_t lastblock;
766 struct buffer_head *bh = NULL;
767 uint16_t ident;
768 int i;
769 struct udf_sb_info *sbi = UDF_SB(sb);
770
771 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
772 if (lastblock)
773 goto check_anchor;
774
775 /* No anchor found? Try VARCONV conversion of block numbers */
776 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
777 /* Firstly, we try to not convert number of the last block */
778 lastblock = udf_scan_anchors(sb,
779 udf_variable_to_fixed(sbi->s_last_block));
780 if (lastblock)
781 goto check_anchor;
782
783 /* Secondly, we try with converted number of the last block */
784 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
785 if (!lastblock) {
786 /* VARCONV didn't help. Clear it. */
787 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
788 }
789
790check_anchor:
791 /*
792 * Check located anchors and the anchor block supplied via
793 * mount options
794 */
795 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
796 if (!sbi->s_anchor[i])
797 continue;
798 bh = udf_read_tagged(sb, sbi->s_anchor[i],
799 sbi->s_anchor[i], &ident);
800 if (!bh)
801 sbi->s_anchor[i] = 0;
802 else {
803 brelse(bh);
804 if (ident != TAG_IDENT_AVDP)
805 sbi->s_anchor[i] = 0;
806 }
807 }
808
809 sbi->s_last_block = lastblock;
810}
811
812static int udf_find_fileset(struct super_block *sb, 684static int udf_find_fileset(struct super_block *sb,
813 kernel_lb_addr *fileset, 685 struct kernel_lb_addr *fileset,
814 kernel_lb_addr *root) 686 struct kernel_lb_addr *root)
815{ 687{
816 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
817 long lastblock; 689 long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
820 692
821 if (fileset->logicalBlockNum != 0xFFFFFFFF || 693 if (fileset->logicalBlockNum != 0xFFFFFFFF ||
822 fileset->partitionReferenceNum != 0xFFFF) { 694 fileset->partitionReferenceNum != 0xFFFF) {
823 bh = udf_read_ptagged(sb, *fileset, 0, &ident); 695 bh = udf_read_ptagged(sb, fileset, 0, &ident);
824 696
825 if (!bh) { 697 if (!bh) {
826 return 1; 698 return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
834 sbi = UDF_SB(sb); 706 sbi = UDF_SB(sb);
835 if (!bh) { 707 if (!bh) {
836 /* Search backwards through the partitions */ 708 /* Search backwards through the partitions */
837 kernel_lb_addr newfileset; 709 struct kernel_lb_addr newfileset;
838 710
839/* --> cvg: FIXME - is it reasonable? */ 711/* --> cvg: FIXME - is it reasonable? */
840 return 1; 712 return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
850 newfileset.logicalBlockNum = 0; 722 newfileset.logicalBlockNum = 0;
851 723
852 do { 724 do {
853 bh = udf_read_ptagged(sb, newfileset, 0, 725 bh = udf_read_ptagged(sb, &newfileset, 0,
854 &ident); 726 &ident);
855 if (!bh) { 727 if (!bh) {
856 newfileset.logicalBlockNum++; 728 newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
902static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 774static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
903{ 775{
904 struct primaryVolDesc *pvoldesc; 776 struct primaryVolDesc *pvoldesc;
905 struct ustr instr; 777 struct ustr *instr, *outstr;
906 struct ustr outstr;
907 struct buffer_head *bh; 778 struct buffer_head *bh;
908 uint16_t ident; 779 uint16_t ident;
780 int ret = 1;
781
782 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
783 if (!instr)
784 return 1;
785
786 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
787 if (!outstr)
788 goto out1;
909 789
910 bh = udf_read_tagged(sb, block, block, &ident); 790 bh = udf_read_tagged(sb, block, block, &ident);
911 if (!bh) 791 if (!bh)
912 return 1; 792 goto out2;
793
913 BUG_ON(ident != TAG_IDENT_PVD); 794 BUG_ON(ident != TAG_IDENT_PVD);
914 795
915 pvoldesc = (struct primaryVolDesc *)bh->b_data; 796 pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
917 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, 798 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
918 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
919#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
920 timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
921 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u"
922 " %02u:%02u (%x)\n", 803 " %02u:%02u (%x)\n",
923 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 804 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
925#endif 806#endif
926 } 807 }
927 808
928 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 809 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
929 if (udf_CS0toUTF8(&outstr, &instr)) { 810 if (udf_CS0toUTF8(outstr, instr)) {
930 strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, 811 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
931 outstr.u_len > 31 ? 31 : outstr.u_len); 812 outstr->u_len > 31 ? 31 : outstr->u_len);
932 udf_debug("volIdent[] = '%s'\n", 813 udf_debug("volIdent[] = '%s'\n",
933 UDF_SB(sb)->s_volume_ident); 814 UDF_SB(sb)->s_volume_ident);
934 } 815 }
935 816
936 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 817 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
937 if (udf_CS0toUTF8(&outstr, &instr)) 818 if (udf_CS0toUTF8(outstr, instr))
938 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 819 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
939 820
940 brelse(bh); 821 brelse(bh);
941 return 0; 822 ret = 0;
823out2:
824 kfree(outstr);
825out1:
826 kfree(instr);
827 return ret;
942} 828}
943 829
944static int udf_load_metadata_files(struct super_block *sb, int partition) 830static int udf_load_metadata_files(struct super_block *sb, int partition)
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
946 struct udf_sb_info *sbi = UDF_SB(sb); 832 struct udf_sb_info *sbi = UDF_SB(sb);
947 struct udf_part_map *map; 833 struct udf_part_map *map;
948 struct udf_meta_data *mdata; 834 struct udf_meta_data *mdata;
949 kernel_lb_addr addr; 835 struct kernel_lb_addr addr;
950 int fe_error = 0; 836 int fe_error = 0;
951 837
952 map = &sbi->s_partmaps[partition]; 838 map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
959 udf_debug("Metadata file location: block = %d part = %d\n", 845 udf_debug("Metadata file location: block = %d part = %d\n",
960 addr.logicalBlockNum, addr.partitionReferenceNum); 846 addr.logicalBlockNum, addr.partitionReferenceNum);
961 847
962 mdata->s_metadata_fe = udf_iget(sb, addr); 848 mdata->s_metadata_fe = udf_iget(sb, &addr);
963 849
964 if (mdata->s_metadata_fe == NULL) { 850 if (mdata->s_metadata_fe == NULL) {
965 udf_warning(sb, __func__, "metadata inode efe not found, " 851 udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
981 udf_debug("Mirror metadata file location: block = %d part = %d\n", 867 udf_debug("Mirror metadata file location: block = %d part = %d\n",
982 addr.logicalBlockNum, addr.partitionReferenceNum); 868 addr.logicalBlockNum, addr.partitionReferenceNum);
983 869
984 mdata->s_mirror_fe = udf_iget(sb, addr); 870 mdata->s_mirror_fe = udf_iget(sb, &addr);
985 871
986 if (mdata->s_mirror_fe == NULL) { 872 if (mdata->s_mirror_fe == NULL) {
987 if (fe_error) { 873 if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1013 udf_debug("Bitmap file location: block = %d part = %d\n", 899 udf_debug("Bitmap file location: block = %d part = %d\n",
1014 addr.logicalBlockNum, addr.partitionReferenceNum); 900 addr.logicalBlockNum, addr.partitionReferenceNum);
1015 901
1016 mdata->s_bitmap_fe = udf_iget(sb, addr); 902 mdata->s_bitmap_fe = udf_iget(sb, &addr);
1017 903
1018 if (mdata->s_bitmap_fe == NULL) { 904 if (mdata->s_bitmap_fe == NULL) {
1019 if (sb->s_flags & MS_RDONLY) 905 if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
1037} 923}
1038 924
1039static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 925static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1040 kernel_lb_addr *root) 926 struct kernel_lb_addr *root)
1041{ 927{
1042 struct fileSetDesc *fset; 928 struct fileSetDesc *fset;
1043 929
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1119 1005
1120 phd = (struct partitionHeaderDesc *)p->partitionContentsUse; 1006 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1121 if (phd->unallocSpaceTable.extLength) { 1007 if (phd->unallocSpaceTable.extLength) {
1122 kernel_lb_addr loc = { 1008 struct kernel_lb_addr loc = {
1123 .logicalBlockNum = le32_to_cpu( 1009 .logicalBlockNum = le32_to_cpu(
1124 phd->unallocSpaceTable.extPosition), 1010 phd->unallocSpaceTable.extPosition),
1125 .partitionReferenceNum = p_index, 1011 .partitionReferenceNum = p_index,
1126 }; 1012 };
1127 1013
1128 map->s_uspace.s_table = udf_iget(sb, loc); 1014 map->s_uspace.s_table = udf_iget(sb, &loc);
1129 if (!map->s_uspace.s_table) { 1015 if (!map->s_uspace.s_table) {
1130 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1016 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1131 p_index); 1017 p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1154 udf_debug("partitionIntegrityTable (part %d)\n", p_index); 1040 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1155 1041
1156 if (phd->freedSpaceTable.extLength) { 1042 if (phd->freedSpaceTable.extLength) {
1157 kernel_lb_addr loc = { 1043 struct kernel_lb_addr loc = {
1158 .logicalBlockNum = le32_to_cpu( 1044 .logicalBlockNum = le32_to_cpu(
1159 phd->freedSpaceTable.extPosition), 1045 phd->freedSpaceTable.extPosition),
1160 .partitionReferenceNum = p_index, 1046 .partitionReferenceNum = p_index,
1161 }; 1047 };
1162 1048
1163 map->s_fspace.s_table = udf_iget(sb, loc); 1049 map->s_fspace.s_table = udf_iget(sb, &loc);
1164 if (!map->s_fspace.s_table) { 1050 if (!map->s_fspace.s_table) {
1165 udf_debug("cannot load freedSpaceTable (part %d)\n", 1051 udf_debug("cannot load freedSpaceTable (part %d)\n",
1166 p_index); 1052 p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1192{ 1078{
1193 struct udf_sb_info *sbi = UDF_SB(sb); 1079 struct udf_sb_info *sbi = UDF_SB(sb);
1194 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1080 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1195 kernel_lb_addr ino; 1081 struct kernel_lb_addr ino;
1196 struct buffer_head *bh = NULL; 1082 struct buffer_head *bh = NULL;
1197 struct udf_inode_info *vati; 1083 struct udf_inode_info *vati;
1198 uint32_t pos; 1084 uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1201 /* VAT file entry is in the last recorded block */ 1087 /* VAT file entry is in the last recorded block */
1202 ino.partitionReferenceNum = type1_index; 1088 ino.partitionReferenceNum = type1_index;
1203 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1204 sbi->s_vat_inode = udf_iget(sb, ino); 1090 sbi->s_vat_inode = udf_iget(sb, &ino);
1205 if (!sbi->s_vat_inode) 1091 if (!sbi->s_vat_inode)
1206 return 1; 1092 return 1;
1207 1093
@@ -1322,7 +1208,7 @@ out_bh:
1322} 1208}
1323 1209
1324static int udf_load_logicalvol(struct super_block *sb, sector_t block, 1210static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1325 kernel_lb_addr *fileset) 1211 struct kernel_lb_addr *fileset)
1326{ 1212{
1327 struct logicalVolDesc *lvd; 1213 struct logicalVolDesc *lvd;
1328 int i, j, offset; 1214 int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1471 } 1357 }
1472 1358
1473 if (fileset) { 1359 if (fileset) {
1474 long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); 1360 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1475 1361
1476 *fileset = lelb_to_cpu(la->extLocation); 1362 *fileset = lelb_to_cpu(la->extLocation);
1477 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1363 udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
1490 * udf_load_logicalvolint 1376 * udf_load_logicalvolint
1491 * 1377 *
1492 */ 1378 */
1493static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) 1379static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
1494{ 1380{
1495 struct buffer_head *bh = NULL; 1381 struct buffer_head *bh = NULL;
1496 uint16_t ident; 1382 uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1533 * Written, tested, and released. 1419 * Written, tested, and released.
1534 */ 1420 */
1535static noinline int udf_process_sequence(struct super_block *sb, long block, 1421static noinline int udf_process_sequence(struct super_block *sb, long block,
1536 long lastblock, kernel_lb_addr *fileset) 1422 long lastblock, struct kernel_lb_addr *fileset)
1537{ 1423{
1538 struct buffer_head *bh = NULL; 1424 struct buffer_head *bh = NULL;
1539 struct udf_vds_record vds[VDS_POS_LENGTH]; 1425 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1655 return 0; 1541 return 0;
1656} 1542}
1657 1543
1544static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1545 struct kernel_lb_addr *fileset)
1546{
1547 struct anchorVolDescPtr *anchor;
1548 long main_s, main_e, reserve_s, reserve_e;
1549 struct udf_sb_info *sbi;
1550
1551 sbi = UDF_SB(sb);
1552 anchor = (struct anchorVolDescPtr *)bh->b_data;
1553
1554 /* Locate the main sequence */
1555 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
1556 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
1557 main_e = main_e >> sb->s_blocksize_bits;
1558 main_e += main_s;
1559
1560 /* Locate the reserve sequence */
1561 reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
1562 reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
1563 reserve_e = reserve_e >> sb->s_blocksize_bits;
1564 reserve_e += reserve_s;
1565
1566 /* Process the main & reserve sequences */
1567 /* responsible for finding the PartitionDesc(s) */
1568 if (!udf_process_sequence(sb, main_s, main_e, fileset))
1569 return 1;
1570 return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1571}
1572
1658/* 1573/*
1659 * udf_check_valid() 1574 * Check whether there is an anchor block in the given block and
1575 * load Volume Descriptor Sequence if so.
1660 */ 1576 */
1661static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1577static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1578 struct kernel_lb_addr *fileset)
1662{ 1579{
1663 long block; 1580 struct buffer_head *bh;
1664 struct udf_sb_info *sbi = UDF_SB(sb); 1581 uint16_t ident;
1582 int ret;
1665 1583
1666 if (novrs) { 1584 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1667 udf_debug("Validity check skipped because of novrs option\n"); 1585 udf_fixed_to_variable(block) >=
1586 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1587 return 0;
1588
1589 bh = udf_read_tagged(sb, block, block, &ident);
1590 if (!bh)
1591 return 0;
1592 if (ident != TAG_IDENT_AVDP) {
1593 brelse(bh);
1668 return 0; 1594 return 0;
1669 } 1595 }
1670 /* Check that it is NSR02 compliant */ 1596 ret = udf_load_sequence(sb, bh, fileset);
1671 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1597 brelse(bh);
1672 block = udf_vrs(sb, silent); 1598 return ret;
1673 if (block == -1)
1674 udf_debug("Failed to read byte 32768. Assuming open "
1675 "disc. Skipping validity check\n");
1676 if (block && !sbi->s_last_block)
1677 sbi->s_last_block = udf_get_last_block(sb);
1678 return !block;
1679} 1599}
1680 1600
1681static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) 1601/* Search for an anchor volume descriptor pointer */
1602static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1603 struct kernel_lb_addr *fileset)
1682{ 1604{
1683 struct anchorVolDescPtr *anchor; 1605 sector_t last[6];
1684 uint16_t ident;
1685 struct buffer_head *bh;
1686 long main_s, main_e, reserve_s, reserve_e;
1687 int i; 1606 int i;
1688 struct udf_sb_info *sbi; 1607 struct udf_sb_info *sbi = UDF_SB(sb);
1689 1608 int last_count = 0;
1690 if (!sb)
1691 return 1;
1692 sbi = UDF_SB(sb);
1693 1609
1694 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1610 /* First try user provided anchor */
1695 if (!sbi->s_anchor[i]) 1611 if (sbi->s_anchor) {
1612 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
1613 return lastblock;
1614 }
1615 /*
1616 * according to spec, anchor is in either:
1617 * block 256
1618 * lastblock-256
1619 * lastblock
1620 * however, if the disc isn't closed, it could be 512.
1621 */
1622 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
1623 return lastblock;
1624 /*
1625 * The trouble is which block is the last one. Drives often misreport
1626 * this so we try various possibilities.
1627 */
1628 last[last_count++] = lastblock;
1629 if (lastblock >= 1)
1630 last[last_count++] = lastblock - 1;
1631 last[last_count++] = lastblock + 1;
1632 if (lastblock >= 2)
1633 last[last_count++] = lastblock - 2;
1634 if (lastblock >= 150)
1635 last[last_count++] = lastblock - 150;
1636 if (lastblock >= 152)
1637 last[last_count++] = lastblock - 152;
1638
1639 for (i = 0; i < last_count; i++) {
1640 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1641 sb->s_blocksize_bits)
1696 continue; 1642 continue;
1697 1643 if (udf_check_anchor_block(sb, last[i], fileset))
1698 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1644 return last[i];
1699 &ident); 1645 if (last[i] < 256)
1700 if (!bh)
1701 continue; 1646 continue;
1647 if (udf_check_anchor_block(sb, last[i] - 256, fileset))
1648 return last[i];
1649 }
1702 1650
1703 anchor = (struct anchorVolDescPtr *)bh->b_data; 1651 /* Finally try block 512 in case media is open */
1652 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
1653 return last[0];
1654 return 0;
1655}
1704 1656
1705 /* Locate the main sequence */ 1657/*
1706 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); 1658 * Find an anchor volume descriptor and load Volume Descriptor Sequence from
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_last_block to be the last
1708 main_e = main_e >> sb->s_blocksize_bits; 1660 * block on the media.
1709 main_e += main_s; 1661 *
1662 * Return 1 if ok, 0 if not found.
1663 *
1664 */
1665static int udf_find_anchor(struct super_block *sb,
1666 struct kernel_lb_addr *fileset)
1667{
1668 sector_t lastblock;
1669 struct udf_sb_info *sbi = UDF_SB(sb);
1710 1670
1711 /* Locate the reserve sequence */ 1671 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1712 reserve_s = le32_to_cpu( 1672 if (lastblock)
1713 anchor->reserveVolDescSeqExt.extLocation); 1673 goto out;
1714 reserve_e = le32_to_cpu(
1715 anchor->reserveVolDescSeqExt.extLength);
1716 reserve_e = reserve_e >> sb->s_blocksize_bits;
1717 reserve_e += reserve_s;
1718 1674
1719 brelse(bh); 1675 /* No anchor found? Try VARCONV conversion of block numbers */
1676 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1677 /* Firstly, we try to not convert number of the last block */
1678 lastblock = udf_scan_anchors(sb,
1679 udf_variable_to_fixed(sbi->s_last_block),
1680 fileset);
1681 if (lastblock)
1682 goto out;
1720 1683
1721 /* Process the main & reserve sequences */ 1684 /* Secondly, we try with converted number of the last block */
1722 /* responsible for finding the PartitionDesc(s) */ 1685 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1723 if (!(udf_process_sequence(sb, main_s, main_e, 1686 if (!lastblock) {
1724 fileset) && 1687 /* VARCONV didn't help. Clear it. */
1725 udf_process_sequence(sb, reserve_s, reserve_e, 1688 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1726 fileset))) 1689 return 0;
1727 break;
1728 } 1690 }
1691out:
1692 sbi->s_last_block = lastblock;
1693 return 1;
1694}
1729 1695
1730 if (i == ARRAY_SIZE(sbi->s_anchor)) { 1696/*
1731 udf_debug("No Anchor block found\n"); 1697 * Check Volume Structure Descriptor, find Anchor block and load Volume
1732 return 1; 1698 * Descriptor Sequence
1699 */
1700static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1701 int silent, struct kernel_lb_addr *fileset)
1702{
1703 struct udf_sb_info *sbi = UDF_SB(sb);
1704 loff_t nsr_off;
1705
1706 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1707 if (!silent)
1708 printk(KERN_WARNING "UDF-fs: Bad block size\n");
1709 return 0;
1710 }
1711 sbi->s_last_block = uopt->lastblock;
1712 if (!uopt->novrs) {
1713 /* Check that it is NSR02 compliant */
1714 nsr_off = udf_check_vsd(sb);
1715 if (!nsr_off) {
1716 if (!silent)
1717 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1718 return 0;
1719 }
1720 if (nsr_off == -1)
1721 udf_debug("Failed to read byte 32768. Assuming open "
1722 "disc. Skipping validity check\n");
1723 if (!sbi->s_last_block)
1724 sbi->s_last_block = udf_get_last_block(sb);
1725 } else {
1726 udf_debug("Validity check skipped because of novrs option\n");
1733 } 1727 }
1734 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1735 1728
1736 return 0; 1729 /* Look for anchor block and load Volume Descriptor Sequence */
1730 sbi->s_anchor = uopt->anchor;
1731 if (!udf_find_anchor(sb, fileset)) {
1732 if (!silent)
1733 printk(KERN_WARNING "UDF-fs: No anchor found\n");
1734 return 0;
1735 }
1736 return 1;
1737} 1737}
1738 1738
1739static void udf_open_lvid(struct super_block *sb) 1739static void udf_open_lvid(struct super_block *sb)
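
The rewritten udf_scan_anchors() probes a fixed list of candidates: the user-supplied anchor first, then block 256 of the session, then variants of the reported last block. The -150 and -152 offsets are commonly attributed to drives that fold a roughly 150-sector (two-second) lead-out or post-gap into the reported size; that rationale is inferred from the code, not stated in the hunk. Condensed candidate order (the bounds checks against negative and past-end blocks are omitted here):

    last[n++] = lastblock;          /* what the drive reported */
    last[n++] = lastblock - 1;      /* off-by-one firmware */
    last[n++] = lastblock + 1;
    last[n++] = lastblock - 2;
    last[n++] = lastblock - 150;    /* ~2 s run-out miscounted */
    last[n++] = lastblock - 152;

Each candidate is checked directly and, failing that, at candidate - 256, matching the spec's "lastblock - 256" placement; block 512 of the session is tried last in case the media is open.
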
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
1742 struct buffer_head *bh = sbi->s_lvid_bh; 1742 struct buffer_head *bh = sbi->s_lvid_bh;
1743 struct logicalVolIntegrityDesc *lvid; 1743 struct logicalVolIntegrityDesc *lvid;
1744 struct logicalVolIntegrityDescImpUse *lvidiu; 1744 struct logicalVolIntegrityDescImpUse *lvidiu;
1745
1745 if (!bh) 1746 if (!bh)
1746 return; 1747 return;
1747
1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1749 lvidiu = udf_sb_lvidiu(sbi); 1749 lvidiu = udf_sb_lvidiu(sbi);
1750 1750
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1754 CURRENT_TIME); 1754 CURRENT_TIME);
1755 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; 1755 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1756 1756
1757 lvid->descTag.descCRC = cpu_to_le16( 1757 lvid->descTag.descCRC = cpu_to_le16(
1758 crc_itu_t(0, (char *)lvid + sizeof(tag), 1758 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1759 le16_to_cpu(lvid->descTag.descCRCLength))); 1759 le16_to_cpu(lvid->descTag.descCRCLength)));
1760 1760
1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1762 mark_buffer_dirty(bh); 1762 mark_buffer_dirty(bh);
1763 sbi->s_lvid_dirty = 0;
1763} 1764}
1764 1765
1765static void udf_close_lvid(struct super_block *sb) 1766static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
1773 return; 1774 return;
1774 1775
1775 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1776 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1776
1777 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1778 return;
1779
1780 lvidiu = udf_sb_lvidiu(sbi); 1777 lvidiu = udf_sb_lvidiu(sbi);
1781 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1778 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1782 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1779 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
1790 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1787 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1791 1788
1792 lvid->descTag.descCRC = cpu_to_le16( 1789 lvid->descTag.descCRC = cpu_to_le16(
1793 crc_itu_t(0, (char *)lvid + sizeof(tag), 1790 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1794 le16_to_cpu(lvid->descTag.descCRCLength))); 1791 le16_to_cpu(lvid->descTag.descCRCLength)));
1795 1792
1796 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1793 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1797 mark_buffer_dirty(bh); 1794 mark_buffer_dirty(bh);
1795 sbi->s_lvid_dirty = 0;
1798} 1796}
1799 1797
1800static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1798static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
1846static int udf_fill_super(struct super_block *sb, void *options, int silent) 1844static int udf_fill_super(struct super_block *sb, void *options, int silent)
1847{ 1845{
1848 int i; 1846 int i;
1847 int ret;
1849 struct inode *inode = NULL; 1848 struct inode *inode = NULL;
1850 struct udf_options uopt; 1849 struct udf_options uopt;
1851 kernel_lb_addr rootdir, fileset; 1850 struct kernel_lb_addr rootdir, fileset;
1852 struct udf_sb_info *sbi; 1851 struct udf_sb_info *sbi;
1853 1852
1854 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1853 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1855 uopt.uid = -1; 1854 uopt.uid = -1;
1856 uopt.gid = -1; 1855 uopt.gid = -1;
1857 uopt.umask = 0; 1856 uopt.umask = 0;
1857 uopt.fmode = UDF_INVALID_MODE;
1858 uopt.dmode = UDF_INVALID_MODE;
1858 1859
1859 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1860 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1860 if (!sbi) 1861 if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1892 sbi->s_uid = uopt.uid; 1893 sbi->s_uid = uopt.uid;
1893 sbi->s_gid = uopt.gid; 1894 sbi->s_gid = uopt.gid;
1894 sbi->s_umask = uopt.umask; 1895 sbi->s_umask = uopt.umask;
1896 sbi->s_fmode = uopt.fmode;
1897 sbi->s_dmode = uopt.dmode;
1895 sbi->s_nls_map = uopt.nls_map; 1898 sbi->s_nls_map = uopt.nls_map;
1896 1899
1897 /* Set the block size for all transfers */
1898 if (!sb_min_blocksize(sb, uopt.blocksize)) {
1899 udf_debug("Bad block size (%d)\n", uopt.blocksize);
1900 printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
1901 goto error_out;
1902 }
1903
1904 if (uopt.session == 0xFFFFFFFF) 1900 if (uopt.session == 0xFFFFFFFF)
1905 sbi->s_session = udf_get_last_session(sb); 1901 sbi->s_session = udf_get_last_session(sb);
1906 else 1902 else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1908 1904
1909 udf_debug("Multi-session=%d\n", sbi->s_session); 1905 udf_debug("Multi-session=%d\n", sbi->s_session);
1910 1906
1911 sbi->s_last_block = uopt.lastblock;
1912 sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
1913 sbi->s_anchor[2] = uopt.anchor;
1914
1915 if (udf_check_valid(sb, uopt.novrs, silent)) {
1916 /* read volume recognition sequences */
1917 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1918 goto error_out;
1919 }
1920
1921 udf_find_anchor(sb);
1922
1923 /* Fill in the rest of the superblock */ 1907 /* Fill in the rest of the superblock */
1924 sb->s_op = &udf_sb_ops; 1908 sb->s_op = &udf_sb_ops;
1925 sb->s_export_op = &udf_export_ops; 1909 sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1928 sb->s_magic = UDF_SUPER_MAGIC; 1912 sb->s_magic = UDF_SUPER_MAGIC;
1929 sb->s_time_gran = 1000; 1913 sb->s_time_gran = 1000;
1930 1914
1931 if (udf_load_sequence(sb, &fileset)) { 1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent)
1922 printk(KERN_NOTICE
1923 "UDF-fs: Rescanning with blocksize "
1924 "%d\n", UDF_DEFAULT_BLOCKSIZE);
1925 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
1926 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1927 }
1928 }
1929 if (!ret) {
1932 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1930 printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
1933 goto error_out; 1931 goto error_out;
1934 } 1932 }
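
The mount path above now honours an explicit blocksize= option (UDF_FLAG_BLOCKSIZE_SET) and otherwise probes: it first tries the device's hardware sector size and, if udf_load_vrs() finds nothing and that size differs from the UDF default of 2048 bytes, it retries once at 2048. A minimal sketch of the same probe-and-retry shape, with load_vrs_at() standing in for udf_load_vrs() (names and return convention here are illustrative, not the kernel's exact API):

        #define UDF_DEFAULT_BLOCKSIZE 2048

        static int load_vrs_at(int blocksize);  /* assumed: nonzero on success */

        static int detect_blocksize(int explicit_bs, int hw_sector_size)
        {
                int ret;

                if (explicit_bs)                /* blocksize= was given */
                        return load_vrs_at(explicit_bs);

                ret = load_vrs_at(hw_sector_size);      /* first guess */
                if (!ret && hw_sector_size != UDF_DEFAULT_BLOCKSIZE)
                        ret = load_vrs_at(UDF_DEFAULT_BLOCKSIZE);
                return ret;
        }
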
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1978 } 1976 }
1979 1977
1980 if (!silent) { 1978 if (!silent) {
1981 timestamp ts; 1979 struct timestamp ts;
1982 udf_time_to_disk_stamp(&ts, sbi->s_record_time); 1980 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
1983 udf_info("UDF: Mounting volume '%s', " 1981 udf_info("UDF: Mounting volume '%s', "
1984 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 1982 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1991 /* Assign the root inode */ 1989 /* Assign the root inode */
1992 /* assign inodes by physical block number */ 1990 /* assign inodes by physical block number */
1993 /* perhaps it's not extensible enough, but for now ... */ 1991 /* perhaps it's not extensible enough, but for now ... */
1994 inode = udf_iget(sb, rootdir); 1992 inode = udf_iget(sb, &rootdir);
1995 if (!inode) { 1993 if (!inode) {
1996 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " 1994 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
1997 "partition=%d\n", 1995 "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
2081 sb->s_fs_info = NULL; 2079 sb->s_fs_info = NULL;
2082} 2080}
2083 2081
2082static int udf_sync_fs(struct super_block *sb, int wait)
2083{
2084 struct udf_sb_info *sbi = UDF_SB(sb);
2085
2086 mutex_lock(&sbi->s_alloc_mutex);
2087 if (sbi->s_lvid_dirty) {
2088 /*
2089 * Blockdevice will be synced later so we don't have to submit
2090 * the buffer for IO
2091 */
2092 mark_buffer_dirty(sbi->s_lvid_bh);
2093 sb->s_dirt = 0;
2094 sbi->s_lvid_dirty = 0;
2095 }
2096 mutex_unlock(&sbi->s_alloc_mutex);
2097
2098 return 0;
2099}
2100
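
udf_sync_fs() deliberately does no I/O of its own: under s_alloc_mutex it re-marks the LVID buffer dirty and clears both s_dirt and s_lvid_dirty, leaving the actual write to the block-device sync that follows. A generic sketch of this flag-now-write-later pattern (all names illustrative; the kernel code above is the real thing):

        #include <pthread.h>

        struct lazy_buf {
                pthread_mutex_t lock;
                int dirty;                      /* analogue of s_lvid_dirty */
        };

        static void queue_writeback(struct lazy_buf *b);  /* assumed helper */

        static void lazy_sync(struct lazy_buf *b)
        {
                pthread_mutex_lock(&b->lock);
                if (b->dirty) {
                        queue_writeback(b);     /* queue it, don't wait */
                        b->dirty = 0;
                }
                pthread_mutex_unlock(&b->lock);
        }
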
2084static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) 2101static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2085{ 2102{
2086 struct super_block *sb = dentry->d_sb; 2103 struct super_block *sb = dentry->d_sb;
2087 struct udf_sb_info *sbi = UDF_SB(sb); 2104 struct udf_sb_info *sbi = UDF_SB(sb);
2088 struct logicalVolIntegrityDescImpUse *lvidiu; 2105 struct logicalVolIntegrityDescImpUse *lvidiu;
2106 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
2089 2107
2090 if (sbi->s_lvid_bh != NULL) 2108 if (sbi->s_lvid_bh != NULL)
2091 lvidiu = udf_sb_lvidiu(sbi); 2109 lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2101 le32_to_cpu(lvidiu->numDirs)) : 0) 2119 le32_to_cpu(lvidiu->numDirs)) : 0)
2102 + buf->f_bfree; 2120 + buf->f_bfree;
2103 buf->f_ffree = buf->f_bfree; 2121 buf->f_ffree = buf->f_bfree;
2104 /* __kernel_fsid_t f_fsid */
2105 buf->f_namelen = UDF_NAME_LEN - 2; 2122 buf->f_namelen = UDF_NAME_LEN - 2;
2123 buf->f_fsid.val[0] = (u32)id;
2124 buf->f_fsid.val[1] = (u32)(id >> 32);
2106 2125
2107 return 0; 2126 return 0;
2108} 2127}
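
The new f_fsid value is derived from huge_encode_dev(), which packs the backing device's major and minor numbers into a 64-bit id; statfs then exposes the low and high halves as the two 32-bit f_fsid words. The split in isolation:

        #include <stdint.h>

        /* Userspace illustration of the assignment performed above. */
        static void fsid_from_dev(uint64_t id, uint32_t val[2])
        {
                val[0] = (uint32_t)id;          /* low 32 bits  */
                val[1] = (uint32_t)(id >> 32);  /* high 32 bits */
        }
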
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2114 unsigned int accum = 0; 2133 unsigned int accum = 0;
2115 int index; 2134 int index;
2116 int block = 0, newblock; 2135 int block = 0, newblock;
2117 kernel_lb_addr loc; 2136 struct kernel_lb_addr loc;
2118 uint32_t bytes; 2137 uint32_t bytes;
2119 uint8_t *ptr; 2138 uint8_t *ptr;
2120 uint16_t ident; 2139 uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2124 2143
2125 loc.logicalBlockNum = bitmap->s_extPosition; 2144 loc.logicalBlockNum = bitmap->s_extPosition;
2126 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2145 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2127 bh = udf_read_ptagged(sb, loc, 0, &ident); 2146 bh = udf_read_ptagged(sb, &loc, 0, &ident);
2128 2147
2129 if (!bh) { 2148 if (!bh) {
2130 printk(KERN_ERR "udf: udf_count_free failed\n"); 2149 printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2147 bytes -= cur_bytes; 2166 bytes -= cur_bytes;
2148 if (bytes) { 2167 if (bytes) {
2149 brelse(bh); 2168 brelse(bh);
2150 newblock = udf_get_lb_pblock(sb, loc, ++block); 2169 newblock = udf_get_lb_pblock(sb, &loc, ++block);
2151 bh = udf_tread(sb, newblock); 2170 bh = udf_tread(sb, newblock);
2152 if (!bh) { 2171 if (!bh) {
2153 udf_debug("read failed\n"); 2172 udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2170{ 2189{
2171 unsigned int accum = 0; 2190 unsigned int accum = 0;
2172 uint32_t elen; 2191 uint32_t elen;
2173 kernel_lb_addr eloc; 2192 struct kernel_lb_addr eloc;
2174 int8_t etype; 2193 int8_t etype;
2175 struct extent_position epos; 2194 struct extent_position epos;
2176 2195
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30static void extent_trunc(struct inode *inode, struct extent_position *epos, 30static void extent_trunc(struct inode *inode, struct extent_position *epos,
31 kernel_lb_addr eloc, int8_t etype, uint32_t elen, 31 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
32 uint32_t nelen) 32 uint32_t nelen)
33{ 33{
34 kernel_lb_addr neloc = {}; 34 struct kernel_lb_addr neloc = {};
35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >> 35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
36 inode->i_sb->s_blocksize_bits; 36 inode->i_sb->s_blocksize_bits;
37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> 37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
43 last_block); 43 last_block);
44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); 44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
45 } else 45 } else
46 neloc = eloc; 46 neloc = *eloc;
47 nelen = (etype << 30) | nelen; 47 nelen = (etype << 30) | nelen;
48 } 48 }
49 49
50 if (elen != nelen) { 50 if (elen != nelen) {
51 udf_write_aext(inode, epos, neloc, nelen, 0); 51 udf_write_aext(inode, epos, &neloc, nelen, 0);
52 if (last_block - first_block > 0) { 52 if (last_block - first_block > 0) {
53 if (etype == (EXT_RECORDED_ALLOCATED >> 30)) 53 if (etype == (EXT_RECORDED_ALLOCATED >> 30))
54 mark_inode_dirty(inode); 54 mark_inode_dirty(inode);
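
extent_trunc() and udf_write_aext() now take the logical block address by pointer rather than by value, avoiding a structure copy at every call and matching the converted prototypes in udfdecl.h. The difference, reduced to a toy (struct layout assumed, not copied from the headers):

        #include <stdint.h>

        struct lb_demo {
                uint32_t logicalBlockNum;
                uint16_t partitionReferenceNum;
        };

        /* By value: the struct is copied into the callee on each call. */
        static uint32_t block_of_val(struct lb_demo loc)
        {
                return loc.logicalBlockNum;
        }

        /* By pointer: no copy; const documents read-only access. */
        static uint32_t block_of_ptr(const struct lb_demo *loc)
        {
                return loc->logicalBlockNum;
        }
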
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
68void udf_truncate_tail_extent(struct inode *inode) 68void udf_truncate_tail_extent(struct inode *inode)
69{ 69{
70 struct extent_position epos = {}; 70 struct extent_position epos = {};
71 kernel_lb_addr eloc; 71 struct kernel_lb_addr eloc;
72 uint32_t elen, nelen; 72 uint32_t elen, nelen;
73 uint64_t lbcount = 0; 73 uint64_t lbcount = 0;
74 int8_t etype = -1, netype; 74 int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
83 return; 83 return;
84 84
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 adsize = sizeof(short_ad); 86 adsize = sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
88 adsize = sizeof(long_ad); 88 adsize = sizeof(struct long_ad);
89 else 89 else
90 BUG(); 90 BUG();
91 91
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
106 (unsigned)elen); 106 (unsigned)elen);
107 nelen = elen - (lbcount - inode->i_size); 107 nelen = elen - (lbcount - inode->i_size);
108 epos.offset -= adsize; 108 epos.offset -= adsize;
109 extent_trunc(inode, &epos, eloc, etype, elen, nelen); 109 extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
110 epos.offset += adsize; 110 epos.offset += adsize;
111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) 111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
112 printk(KERN_ERR "udf_truncate_tail_extent(): " 112 printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
124void udf_discard_prealloc(struct inode *inode) 124void udf_discard_prealloc(struct inode *inode)
125{ 125{
126 struct extent_position epos = { NULL, 0, {0, 0} }; 126 struct extent_position epos = { NULL, 0, {0, 0} };
127 kernel_lb_addr eloc; 127 struct kernel_lb_addr eloc;
128 uint32_t elen; 128 uint32_t elen;
129 uint64_t lbcount = 0; 129 uint64_t lbcount = 0;
130 int8_t etype = -1, netype; 130 int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
136 return; 136 return;
137 137
138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
139 adsize = sizeof(short_ad); 139 adsize = sizeof(struct short_ad);
140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
141 adsize = sizeof(long_ad); 141 adsize = sizeof(struct long_ad);
142 else 142 else
143 adsize = 0; 143 adsize = 0;
144 144
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
153 epos.offset -= adsize; 153 epos.offset -= adsize;
154 lbcount -= elen; 154 lbcount -= elen;
155 extent_trunc(inode, &epos, eloc, etype, elen, 0); 155 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
156 if (!epos.bh) { 156 if (!epos.bh) {
157 iinfo->i_lenAlloc = 157 iinfo->i_lenAlloc =
158 epos.offset - 158 epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
200void udf_truncate_extents(struct inode *inode) 200void udf_truncate_extents(struct inode *inode)
201{ 201{
202 struct extent_position epos; 202 struct extent_position epos;
203 kernel_lb_addr eloc, neloc = {}; 203 struct kernel_lb_addr eloc, neloc = {};
204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; 204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
205 int8_t etype; 205 int8_t etype;
206 struct super_block *sb = inode->i_sb; 206 struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
210 struct udf_inode_info *iinfo = UDF_I(inode); 210 struct udf_inode_info *iinfo = UDF_I(inode);
211 211
212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
213 adsize = sizeof(short_ad); 213 adsize = sizeof(struct short_ad);
214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
215 adsize = sizeof(long_ad); 215 adsize = sizeof(struct long_ad);
216 else 216 else
217 BUG(); 217 BUG();
218 218
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
221 (inode->i_size & (sb->s_blocksize - 1)); 221 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 222 if (etype != -1) {
223 epos.offset -= adsize; 223 epos.offset -= adsize;
224 extent_trunc(inode, &epos, eloc, etype, elen, byte_offset); 224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
225 epos.offset += adsize; 225 epos.offset += adsize;
226 if (byte_offset) 226 if (byte_offset)
227 lenalloc = epos.offset; 227 lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
236 while ((etype = udf_current_aext(inode, &epos, &eloc, 236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) { 237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { 238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, neloc, nelen, 0); 239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) { 240 if (indirect_ext_len) {
241 /* We managed to free all extents in the 241 /* We managed to free all extents in the
242 * indirect extent - free it too */ 242 * indirect extent - free it too */
243 BUG_ON(!epos.bh); 243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, epos.block, 244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len); 245 0, indirect_ext_len);
246 } else if (!epos.bh) { 246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc; 247 iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
253 epos.offset = sizeof(struct allocExtDesc); 253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc; 254 epos.block = eloc;
255 epos.bh = udf_tread(sb, 255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, eloc, 0)); 256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen) 257 if (elen)
258 indirect_ext_len = 258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >> 259 (elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
261 else 261 else
262 indirect_ext_len = 1; 262 indirect_ext_len = 1;
263 } else { 263 } else {
264 extent_trunc(inode, &epos, eloc, etype, 264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0); 265 elen, 0);
266 epos.offset += adsize; 266 epos.offset += adsize;
267 } 267 }
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
269 269
270 if (indirect_ext_len) { 270 if (indirect_ext_len) {
271 BUG_ON(!epos.bh); 271 BUG_ON(!epos.bh);
272 udf_free_blocks(sb, inode, epos.block, 0, 272 udf_free_blocks(sb, inode, &epos.block, 0,
273 indirect_ext_len); 273 indirect_ext_len);
274 } else if (!epos.bh) { 274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc; 275 iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc); 278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) { 279 } else if (inode->i_size) {
280 if (byte_offset) { 280 if (byte_offset) {
281 kernel_long_ad extent; 281 struct kernel_long_ad extent;
282 282
283 /* 283 /*
284 * OK, there is no extent covering inode->i_size and 284 * OK, there is no extent covering inode->i_size and
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
4struct udf_inode_info { 4struct udf_inode_info {
5 struct timespec i_crtime; 5 struct timespec i_crtime;
6 /* Physical address of inode */ 6 /* Physical address of inode */
7 kernel_lb_addr i_location; 7 struct kernel_lb_addr i_location;
8 __u64 i_unique; 8 __u64 i_unique;
9 __u32 i_lenEAttr; 9 __u32 i_lenEAttr;
10 __u32 i_lenAlloc; 10 __u32 i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
17 unsigned i_strat4096 : 1; 17 unsigned i_strat4096 : 1;
18 unsigned reserved : 26; 18 unsigned reserved : 26;
19 union { 19 union {
20 short_ad *i_sad; 20 struct short_ad *i_sad;
21 long_ad *i_lad; 21 struct long_ad *i_lad;
22 __u8 *i_data; 22 __u8 *i_data;
23 } i_ext; 23 } i_ext;
24 struct inode vfs_inode; 24 struct inode vfs_inode;
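
As in udf_i.h here, the series replaces the UDF typedefs (kernel_lb_addr, short_ad, long_ad, tag, timestamp) with explicit struct tags, in line with kernel coding style. The shape of the change, with member types assumed rather than copied from the headers:

        #include <stdint.h>

        /* Before: the typedef hides that this is an aggregate. */
        typedef struct {
                uint32_t logicalBlockNum;
                uint16_t partitionReferenceNum;
        } kernel_lb_addr_old;

        /* After: "struct" is visible at every declaration and use site. */
        struct kernel_lb_addr_demo {
                uint32_t logicalBlockNum;
                uint16_t partitionReferenceNum;
        };
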
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
30#define UDF_FLAG_GID_SET 16 30#define UDF_FLAG_GID_SET 16
31#define UDF_FLAG_SESSION_SET 17 31#define UDF_FLAG_SESSION_SET 17
32#define UDF_FLAG_LASTBLOCK_SET 18 32#define UDF_FLAG_LASTBLOCK_SET 18
33#define UDF_FLAG_BLOCKSIZE_SET 19
33 34
34#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 35#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
35#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 36#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
@@ -48,6 +49,8 @@
48#define UDF_SPARABLE_MAP15 0x1522U 49#define UDF_SPARABLE_MAP15 0x1522U
49#define UDF_METADATA_MAP25 0x2511U 50#define UDF_METADATA_MAP25 0x2511U
50 51
52#define UDF_INVALID_MODE ((mode_t)-1)
53
51#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ 54#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
52 55
53struct udf_meta_data { 56struct udf_meta_data {
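
UDF_INVALID_MODE is (mode_t)-1, a value no valid permission mask can equal, so it doubles as a "mount option not given" sentinel for the new s_fmode/s_dmode fields. A hedged sketch of how such a sentinel is consumed (the kernel's exact check may differ):

        #include <sys/types.h>

        #define UDF_INVALID_MODE ((mode_t)-1)

        static mode_t effective_mode(mode_t on_disk, mode_t forced)
        {
                return forced != UDF_INVALID_MODE ? forced : on_disk;
        }
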
@@ -114,7 +117,7 @@ struct udf_sb_info {
114 117
115 /* Sector headers */ 118 /* Sector headers */
116 __s32 s_session; 119 __s32 s_session;
117 __u32 s_anchor[3]; 120 __u32 s_anchor;
118 __u32 s_last_block; 121 __u32 s_last_block;
119 122
120 struct buffer_head *s_lvid_bh; 123 struct buffer_head *s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
123 mode_t s_umask; 126 mode_t s_umask;
124 gid_t s_gid; 127 gid_t s_gid;
125 uid_t s_uid; 128 uid_t s_uid;
129 mode_t s_fmode;
130 mode_t s_dmode;
126 131
127 /* Root Info */ 132 /* Root Info */
128 struct timespec s_record_time; 133 struct timespec s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
143 struct inode *s_vat_inode; 148 struct inode *s_vat_inode;
144 149
145 struct mutex s_alloc_mutex; 150 struct mutex s_alloc_mutex;
151 /* Protected by s_alloc_mutex */
152 unsigned int s_lvid_dirty;
146}; 153};
147 154
148static inline struct udf_sb_info *UDF_SB(struct super_block *sb) 155static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
62 return 0; 62 return 0;
63} 63}
64 64
65#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
66
67/* computes tag checksum */ 65/* computes tag checksum */
68u8 udf_tag_checksum(const tag *t); 66u8 udf_tag_checksum(const struct tag *t);
69 67
70struct dentry; 68struct dentry;
71struct inode; 69struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
95}; 93};
96 94
97struct generic_desc { 95struct generic_desc {
98 tag descTag; 96 struct tag descTag;
99 __le32 volDescSeqNum; 97 __le32 volDescSeqNum;
100}; 98};
101 99
@@ -108,11 +106,22 @@ struct ustr {
108struct extent_position { 106struct extent_position {
109 struct buffer_head *bh; 107 struct buffer_head *bh;
110 uint32_t offset; 108 uint32_t offset;
111 kernel_lb_addr block; 109 struct kernel_lb_addr block;
112}; 110};
113 111
114/* super.c */ 112/* super.c */
115extern void udf_warning(struct super_block *, const char *, const char *, ...); 113extern void udf_warning(struct super_block *, const char *, const char *, ...);
114static inline void udf_updated_lvid(struct super_block *sb)
115{
116 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
117
118 BUG_ON(!bh);
119 WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
120 bh->b_data)->integrityType !=
121 cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
122 sb->s_dirt = 1;
123 UDF_SB(sb)->s_lvid_dirty = 1;
124}
116 125
117/* namei.c */ 126/* namei.c */
118extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 127extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
124 unsigned long); 133 unsigned long);
125 134
126/* inode.c */ 135/* inode.c */
127extern struct inode *udf_iget(struct super_block *, kernel_lb_addr); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
128extern int udf_sync_inode(struct inode *); 137extern int udf_sync_inode(struct inode *);
129extern void udf_expand_file_adinicb(struct inode *, int, int *); 138extern void udf_expand_file_adinicb(struct inode *, int, int *);
130extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 139extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
136extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, int);
137extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
138extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
139 kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
140extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
141 kernel_lb_addr *, uint32_t *, sector_t *); 150 struct kernel_lb_addr *, uint32_t *, sector_t *);
142extern int8_t udf_add_aext(struct inode *, struct extent_position *, 151extern int8_t udf_add_aext(struct inode *, struct extent_position *,
143 kernel_lb_addr, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
144extern int8_t udf_write_aext(struct inode *, struct extent_position *, 153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
145 kernel_lb_addr, uint32_t, int); 154 struct kernel_lb_addr *, uint32_t, int);
146extern int8_t udf_delete_aext(struct inode *, struct extent_position, 155extern int8_t udf_delete_aext(struct inode *, struct extent_position,
147 kernel_lb_addr, uint32_t); 156 struct kernel_lb_addr, uint32_t);
148extern int8_t udf_next_aext(struct inode *, struct extent_position *, 157extern int8_t udf_next_aext(struct inode *, struct extent_position *,
149 kernel_lb_addr *, uint32_t *, int); 158 struct kernel_lb_addr *, uint32_t *, int);
150extern int8_t udf_current_aext(struct inode *, struct extent_position *, 159extern int8_t udf_current_aext(struct inode *, struct extent_position *,
151 kernel_lb_addr *, uint32_t *, int); 160 struct kernel_lb_addr *, uint32_t *, int);
152 161
153/* misc.c */ 162/* misc.c */
154extern struct buffer_head *udf_tgetblk(struct super_block *, int); 163extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
160extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, 169extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
161 uint32_t, uint16_t *); 170 uint32_t, uint16_t *);
162extern struct buffer_head *udf_read_ptagged(struct super_block *, 171extern struct buffer_head *udf_read_ptagged(struct super_block *,
163 kernel_lb_addr, uint32_t, 172 struct kernel_lb_addr *, uint32_t,
164 uint16_t *); 173 uint16_t *);
165extern void udf_update_tag(char *, int); 174extern void udf_update_tag(char *, int);
166extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); 175extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
182 uint32_t); 191 uint32_t);
183extern int udf_relocate_blocks(struct super_block *, long, long *); 192extern int udf_relocate_blocks(struct super_block *, long, long *);
184 193
194static inline uint32_t
195udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
196 uint32_t offset)
197{
198 return udf_get_pblock(sb, loc->logicalBlockNum,
199 loc->partitionReferenceNum, offset);
200}
201
185/* unicode.c */ 202/* unicode.c */
186extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); 203extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
187extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 204extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
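
Replacing the udf_get_lb_pblock() macro with the static inline above buys argument type checking and single evaluation; the old macro expanded (loc) twice, once per structure member. The classic hazard this avoids, in miniature:

        #define BAD_SQUARE(x) ((x) * (x))       /* expands x twice */

        static inline int good_square(int x)    /* evaluates x once */
        {
                return x * x;
        }

        /* BAD_SQUARE(i++) increments i twice; good_square(i++) once. */
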
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
200 217
201/* balloc.c */ 218/* balloc.c */
202extern void udf_free_blocks(struct super_block *, struct inode *, 219extern void udf_free_blocks(struct super_block *, struct inode *,
203 kernel_lb_addr, uint32_t, uint32_t); 220 struct kernel_lb_addr *, uint32_t, uint32_t);
204extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, 221extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
205 uint32_t, uint32_t); 222 uint32_t, uint32_t);
206extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
214 struct udf_fileident_bh *, 231 struct udf_fileident_bh *,
215 struct fileIdentDesc *, 232 struct fileIdentDesc *,
216 struct extent_position *, 233 struct extent_position *,
217 kernel_lb_addr *, uint32_t *, 234 struct kernel_lb_addr *, uint32_t *,
218 sector_t *); 235 sector_t *);
219extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, 236extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
220 int *offset); 237 int *offset);
221extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); 238extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
222extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); 239extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
223 240
224/* udftime.c */ 241/* udftime.c */
225extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, 242extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
226 timestamp src); 243 struct timestamp src);
227extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); 244extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
228 245
229#endif /* __UDF_DECL_H */ 246#endif /* __UDF_DECL_H */
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
4#include <asm/byteorder.h> 4#include <asm/byteorder.h>
5#include <linux/string.h> 5#include <linux/string.h>
6 6
7static inline kernel_lb_addr lelb_to_cpu(lb_addr in) 7static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
8{ 8{
9 kernel_lb_addr out; 9 struct kernel_lb_addr out;
10 10
11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); 11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); 12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
14 return out; 14 return out;
15} 15}
16 16
17static inline lb_addr cpu_to_lelb(kernel_lb_addr in) 17static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
18{ 18{
19 lb_addr out; 19 struct lb_addr out;
20 20
21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); 21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); 22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
24 return out; 24 return out;
25} 25}
26 26
27static inline short_ad lesa_to_cpu(short_ad in) 27static inline struct short_ad lesa_to_cpu(struct short_ad in)
28{ 28{
29 short_ad out; 29 struct short_ad out;
30 30
31 out.extLength = le32_to_cpu(in.extLength); 31 out.extLength = le32_to_cpu(in.extLength);
32 out.extPosition = le32_to_cpu(in.extPosition); 32 out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
34 return out; 34 return out;
35} 35}
36 36
37static inline short_ad cpu_to_lesa(short_ad in) 37static inline struct short_ad cpu_to_lesa(struct short_ad in)
38{ 38{
39 short_ad out; 39 struct short_ad out;
40 40
41 out.extLength = cpu_to_le32(in.extLength); 41 out.extLength = cpu_to_le32(in.extLength);
42 out.extPosition = cpu_to_le32(in.extPosition); 42 out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
44 return out; 44 return out;
45} 45}
46 46
47static inline kernel_long_ad lela_to_cpu(long_ad in) 47static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
48{ 48{
49 kernel_long_ad out; 49 struct kernel_long_ad out;
50 50
51 out.extLength = le32_to_cpu(in.extLength); 51 out.extLength = le32_to_cpu(in.extLength);
52 out.extLocation = lelb_to_cpu(in.extLocation); 52 out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
54 return out; 54 return out;
55} 55}
56 56
57static inline long_ad cpu_to_lela(kernel_long_ad in) 57static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
58{ 58{
59 long_ad out; 59 struct long_ad out;
60 60
61 out.extLength = cpu_to_le32(in.extLength); 61 out.extLength = cpu_to_le32(in.extLength);
62 out.extLocation = cpu_to_lelb(in.extLocation); 62 out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
64 return out; 64 return out;
65} 65}
66 66
67static inline kernel_extent_ad leea_to_cpu(extent_ad in) 67static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
68{ 68{
69 kernel_extent_ad out; 69 struct kernel_extent_ad out;
70 70
71 out.extLength = le32_to_cpu(in.extLength); 71 out.extLength = le32_to_cpu(in.extLength);
72 out.extLocation = le32_to_cpu(in.extLocation); 72 out.extLocation = le32_to_cpu(in.extLocation);
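
All of these helpers convert the on-disk little-endian descriptors to CPU byte order before the rest of the code does arithmetic on them. A self-contained userspace analogue of the 32-bit decode (the kernel uses le32_to_cpu on properly typed fields):

        #include <stdint.h>

        static uint32_t le32_decode(const uint8_t *p)
        {
                return (uint32_t)p[0]
                     | (uint32_t)p[1] << 8
                     | (uint32_t)p[2] << 16
                     | (uint32_t)p[3] << 24;
        }
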
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
85#define SECS_PER_HOUR (60 * 60) 85#define SECS_PER_HOUR (60 * 60)
86#define SECS_PER_DAY (SECS_PER_HOUR * 24) 86#define SECS_PER_DAY (SECS_PER_HOUR * 24)
87 87
88struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) 88struct timespec *
89udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
89{ 90{
90 int yday; 91 int yday;
91 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); 92 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
116 return dest; 117 return dest;
117} 118}
118 119
119timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) 120struct timestamp *
121udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
120{ 122{
121 long int days, rem, y; 123 long int days, rem, y;
122 const unsigned short int *ip; 124 const unsigned short int *ip;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
254{ 254{
255 const uint8_t *ocu; 255 const uint8_t *ocu;
256 uint8_t cmp_id, ocu_len; 256 uint8_t cmp_id, ocu_len;
257 int i; 257 int i, len;
258 258
259 259
260 ocu_len = ocu_i->u_len; 260 ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
279 if (cmp_id == 16) 279 if (cmp_id == 16)
280 c = (c << 8) | ocu[i++]; 280 c = (c << 8) | ocu[i++];
281 281
282 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 282 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
283 UDF_NAME_LEN - utf_o->u_len); 283 UDF_NAME_LEN - utf_o->u_len);
284 /* Valid character? */
285 if (len >= 0)
286 utf_o->u_len += len;
287 else
288 utf_o->u_name[utf_o->u_len++] = '?';
284 } 289 }
285 utf_o->u_cmpID = 8; 290 utf_o->u_cmpID = 8;
286 291
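
nls->uni2char() returns a negative errno for characters the target NLS table cannot represent; previously that negative value was added straight into u_len, corrupting the length. The fix consumes unrepresentable characters as '?'. The pattern, with convert() as a stand-in for uni2char():

        /* Stand-in: returns bytes written, or a negative value when the
         * character is unrepresentable (mirroring uni2char's contract). */
        static int convert(unsigned int c, char *out, int room);

        static int append_char(unsigned int c, char *out, int pos, int room)
        {
                int len = convert(c, out + pos, room - pos);

                if (len >= 0)
                        return pos + len;       /* converted normally */
                out[pos] = '?';                 /* substitute and move on */
                return pos + 1;
        }
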
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
290static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 295static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
291 int length) 296 int length)
292{ 297{
293 unsigned len, i, max_val; 298 int len;
299 unsigned i, max_val;
294 uint16_t uni_char; 300 uint16_t uni_char;
295 int u_len; 301 int u_len;
296 302
@@ -302,8 +308,13 @@ try_again:
302 u_len = 0U; 308 u_len = 0U;
303 for (i = 0U; i < uni->u_len; i++) { 309 for (i = 0U; i < uni->u_len; i++) {
304 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 310 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
305 if (len <= 0) 311 if (!len)
306 continue; 312 continue;
313 /* Invalid character, deal with it */
314 if (len < 0) {
315 len = 1;
316 uni_char = '?';
317 }
307 318
308 if (uni_char > max_val) { 319 if (uni_char > max_val) {
309 max_val = 0xffffU; 320 max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
324int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, 335int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
325 int flen) 336 int flen)
326{ 337{
327 struct ustr filename, unifilename; 338 struct ustr *filename, *unifilename;
328 int len; 339 int len = 0;
329 340
330 if (udf_build_ustr_exact(&unifilename, sname, flen)) 341 filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
342 if (!filename)
331 return 0; 343 return 0;
332 344
345 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
346 if (!unifilename)
347 goto out1;
348
349 if (udf_build_ustr_exact(unifilename, sname, flen))
350 goto out2;
351
333 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 352 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
334 if (!udf_CS0toUTF8(&filename, &unifilename)) { 353 if (!udf_CS0toUTF8(filename, unifilename)) {
335 udf_debug("Failed in udf_get_filename: sname = %s\n", 354 udf_debug("Failed in udf_get_filename: sname = %s\n",
336 sname); 355 sname);
337 return 0; 356 goto out2;
338 } 357 }
339 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 358 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
340 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, 359 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
341 &unifilename)) { 360 unifilename)) {
342 udf_debug("Failed in udf_get_filename: sname = %s\n", 361 udf_debug("Failed in udf_get_filename: sname = %s\n",
343 sname); 362 sname);
344 return 0; 363 goto out2;
345 } 364 }
346 } else 365 } else
347 return 0; 366 goto out2;
348 367
349 len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 368 len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
350 unifilename.u_name, unifilename.u_len); 369 unifilename->u_name, unifilename->u_len);
351 if (len) 370out2:
352 return len; 371 kfree(unifilename);
353 372out1:
354 return 0; 373 kfree(filename);
374 return len;
355} 375}
356 376
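
udf_get_filename() previously kept two struct ustr objects, roughly 256 bytes each, on the kernel stack; they are now kmalloc'ed with GFP_NOFS and freed through the out labels. The conversion reduced to its essentials (field sizes are assumptions based on UDF's 255-byte name limit):

        #include <linux/slab.h>
        #include <linux/types.h>

        struct ustr_demo {
                uint8_t u_cmpID;
                uint8_t u_name[254];
                uint8_t u_len;
        };

        static int get_name_demo(void)
        {
                struct ustr_demo *u = kmalloc(sizeof(*u), GFP_NOFS);
                int len = 0;

                if (!u)
                        return 0;       /* callers treat 0 as failure */
                /* ... build and translate the name in *u ... */
                kfree(u);
                return len;
        }
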
357int udf_put_filename(struct super_block *sb, const uint8_t *sname, 377int udf_put_filename(struct super_block *sb, const uint8_t *sname,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e1c1fc5ee239..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1268 struct ufs_super_block_first *usb1; 1268 struct ufs_super_block_first *usb1;
1269 struct ufs_super_block_second *usb2; 1269 struct ufs_super_block_second *usb2;
1270 struct ufs_super_block_third *usb3; 1270 struct ufs_super_block_third *usb3;
1271 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1271 1272
1272 lock_kernel(); 1273 lock_kernel();
1273 1274
@@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1290 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; 1291 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
1291 buf->f_files = uspi->s_ncg * uspi->s_ipg; 1292 buf->f_files = uspi->s_ncg * uspi->s_ipg;
1292 buf->f_namelen = UFS_MAXNAMLEN; 1293 buf->f_namelen = UFS_MAXNAMLEN;
1294 buf->f_fsid.val[0] = (u32)id;
1295 buf->f_fsid.val[1] = (u32)(id >> 32);
1293 1296
1294 unlock_kernel(); 1297 unlock_kernel();
1295 1298
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c3dc491fff89..60f107e47fe9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
33 xfs_qm_syscalls.o \ 33 xfs_qm_syscalls.o \
34 xfs_qm_bhv.o \ 34 xfs_qm_bhv.o \
35 xfs_qm.o) 35 xfs_qm.o)
36xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36 37
37ifeq ($(CONFIG_XFS_QUOTA),y) 38ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o 39xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644
index 2a88d56c4dc2..000000000000
--- a/fs/xfs/linux-2.6/mutex.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_MUTEX_H__
19#define __XFS_SUPPORT_MUTEX_H__
20
21#include <linux/mutex.h>
22
23typedef struct mutex mutex_t;
24
25#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198f771e..c13f67300fe7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
1623 .bmap = xfs_vm_bmap, 1623 .bmap = xfs_vm_bmap,
1624 .direct_IO = xfs_vm_direct_IO, 1624 .direct_IO = xfs_vm_direct_IO,
1625 .migratepage = buffer_migrate_page, 1625 .migratepage = buffer_migrate_page,
1626 .is_partially_uptodate = block_is_partially_uptodate,
1626}; 1627};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4bd112313f33..d0b499418a7d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -34,6 +34,7 @@
34#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_ioctl.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_rtalloc.h" 40#include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
78 int hsize; 79 int hsize;
79 xfs_handle_t handle; 80 xfs_handle_t handle;
80 struct inode *inode; 81 struct inode *inode;
82 struct file *file = NULL;
83 struct path path;
84 int error;
85 struct xfs_inode *ip;
81 86
82 memset((char *)&handle, 0, sizeof(handle)); 87 if (cmd == XFS_IOC_FD_TO_HANDLE) {
83 88 file = fget(hreq->fd);
84 switch (cmd) { 89 if (!file)
85 case XFS_IOC_PATH_TO_FSHANDLE: 90 return -EBADF;
86 case XFS_IOC_PATH_TO_HANDLE: { 91 inode = file->f_path.dentry->d_inode;
87 struct path path; 92 } else {
88 int error = user_lpath((const char __user *)hreq->path, &path); 93 error = user_lpath((const char __user *)hreq->path, &path);
89 if (error) 94 if (error)
90 return error; 95 return error;
91 96 inode = path.dentry->d_inode;
92 ASSERT(path.dentry);
93 ASSERT(path.dentry->d_inode);
94 inode = igrab(path.dentry->d_inode);
95 path_put(&path);
96 break;
97 } 97 }
98 ip = XFS_I(inode);
98 99
99 case XFS_IOC_FD_TO_HANDLE: { 100 /*
100 struct file *file; 101 * We can only generate handles for inodes residing on an XFS filesystem,
101 102 * and only for regular files, directories or symbolic links.
102 file = fget(hreq->fd); 103 */
103 if (!file) 104 error = -EINVAL;
104 return -EBADF; 105 if (inode->i_sb->s_magic != XFS_SB_MAGIC)
106 goto out_put;
105 107
106 ASSERT(file->f_path.dentry); 108 error = -EBADF;
107 ASSERT(file->f_path.dentry->d_inode); 109 if (!S_ISREG(inode->i_mode) &&
108 inode = igrab(file->f_path.dentry->d_inode); 110 !S_ISDIR(inode->i_mode) &&
109 fput(file); 111 !S_ISLNK(inode->i_mode))
110 break; 112 goto out_put;
111 }
112 113
113 default:
114 ASSERT(0);
115 return -XFS_ERROR(EINVAL);
116 }
117 114
118 if (inode->i_sb->s_magic != XFS_SB_MAGIC) { 115 memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
119 /* we're not in XFS anymore, Toto */
120 iput(inode);
121 return -XFS_ERROR(EINVAL);
122 }
123 116
124 switch (inode->i_mode & S_IFMT) { 117 if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
125 case S_IFREG: 118 /*
126 case S_IFDIR: 119 * This handle only contains an fsid, zero the rest.
127 case S_IFLNK: 120 */
128 break; 121 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
129 default: 122 hsize = sizeof(xfs_fsid_t);
130 iput(inode); 123 } else {
131 return -XFS_ERROR(EBADF);
132 }
133
134 /* now we can grab the fsid */
135 memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
136 sizeof(xfs_fsid_t));
137 hsize = sizeof(xfs_fsid_t);
138
139 if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
140 xfs_inode_t *ip = XFS_I(inode);
141 int lock_mode; 124 int lock_mode;
142 125
143 /* need to get access to the xfs_inode to read the generation */
144 lock_mode = xfs_ilock_map_shared(ip); 126 lock_mode = xfs_ilock_map_shared(ip);
145
146 /* fill in fid section of handle from inode */
147 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 127 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
148 sizeof(handle.ha_fid.fid_len); 128 sizeof(handle.ha_fid.fid_len);
149 handle.ha_fid.fid_pad = 0; 129 handle.ha_fid.fid_pad = 0;
150 handle.ha_fid.fid_gen = ip->i_d.di_gen; 130 handle.ha_fid.fid_gen = ip->i_d.di_gen;
151 handle.ha_fid.fid_ino = ip->i_ino; 131 handle.ha_fid.fid_ino = ip->i_ino;
152
153 xfs_iunlock_map_shared(ip, lock_mode); 132 xfs_iunlock_map_shared(ip, lock_mode);
154 133
155 hsize = XFS_HSIZE(handle); 134 hsize = XFS_HSIZE(handle);
156 } 135 }
157 136
158 /* now copy our handle into the user buffer & write out the size */ 137 error = -EFAULT;
159 if (copy_to_user(hreq->ohandle, &handle, hsize) || 138 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
160 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { 139 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
161 iput(inode); 140 goto out_put;
162 return -XFS_ERROR(EFAULT);
163 }
164 141
165 iput(inode); 142 error = 0;
166 return 0; 143
144 out_put:
145 if (cmd == XFS_IOC_FD_TO_HANDLE)
146 fput(file);
147 else
148 path_put(&path);
149 return error;
167} 150}
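
The xfs_find_handle() rewrite folds four separate error exits into one out_put label that drops whichever reference (file or path) was taken up front. The idiom, stripped to its core with hypothetical helpers:

        #include <errno.h>

        struct res_demo;
        struct res_demo *acquire(void);
        int validate(struct res_demo *);
        int emit(struct res_demo *);            /* 0 on success */
        void release(struct res_demo *);

        static int handle_op(void)
        {
                int error;
                struct res_demo *res = acquire();

                if (!res)
                        return -EBADF;          /* nothing to release yet */

                error = -EINVAL;
                if (!validate(res))
                        goto out_put;

                error = emit(res);
        out_put:
                release(res);                   /* single cleanup point */
                return error;
        }
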
168 151
169/* 152/*
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67f..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -211,8 +211,13 @@ xfs_vn_mknod(
211 * Irix uses Missed'em'V split, but doesn't want to see 211 * Irix uses Missed'em'V split, but doesn't want to see
212 * the upper 5 bits of (14bit) major. 212 * the upper 5 bits of (14bit) major.
213 */ 213 */
214 if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) 214 if (S_ISCHR(mode) || S_ISBLK(mode)) {
215 return -EINVAL; 215 if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
216 return -EINVAL;
217 rdev = sysv_encode_dev(rdev);
218 } else {
219 rdev = 0;
220 }
216 221
217 if (test_default_acl && test_default_acl(dir)) { 222 if (test_default_acl && test_default_acl(dir)) {
218 if (!_ACL_ALLOC(default_acl)) { 223 if (!_ACL_ALLOC(default_acl)) {
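
xfs_vn_mknod() now validates and encodes rdev only for character and block devices, and forces it to 0 for everything else instead of round-tripping a meaningless value through sysv_encode_dev(). The 9-bit major constraint in the check, isolated:

        #include <stdint.h>

        /* MAJOR(rdev) & ~0x1ff rejects device majors wider than 9 bits,
         * per the Irix on-disk limitation noted in the comment above. */
        static int major_fits_ondisk(uint32_t major)
        {
                return (major & ~0x1ffU) == 0;
        }
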
@@ -224,28 +229,11 @@ xfs_vn_mknod(
224 } 229 }
225 } 230 }
226 231
227 xfs_dentry_to_name(&name, dentry);
228
229 if (IS_POSIXACL(dir) && !default_acl) 232 if (IS_POSIXACL(dir) && !default_acl)
230 mode &= ~current->fs->umask; 233 mode &= ~current_umask();
231
232 switch (mode & S_IFMT) {
233 case S_IFCHR:
234 case S_IFBLK:
235 case S_IFIFO:
236 case S_IFSOCK:
237 rdev = sysv_encode_dev(rdev);
238 case S_IFREG:
239 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
240 break;
241 case S_IFDIR:
242 error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
243 break;
244 default:
245 error = EINVAL;
246 break;
247 }
248 234
235 xfs_dentry_to_name(&name, dentry);
236 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
249 if (unlikely(error)) 237 if (unlikely(error))
250 goto out_free_acl; 238 goto out_free_acl;
251 239
@@ -416,7 +404,7 @@ xfs_vn_symlink(
416 mode_t mode; 404 mode_t mode;
417 405
418 mode = S_IFLNK | 406 mode = S_IFLNK |
419 (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); 407 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
420 xfs_dentry_to_name(&name, dentry); 408 xfs_dentry_to_name(&name, dentry);
421 409
422 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 410 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
@@ -553,9 +541,6 @@ xfs_vn_getattr(
553 stat->uid = ip->i_d.di_uid; 541 stat->uid = ip->i_d.di_uid;
554 stat->gid = ip->i_d.di_gid; 542 stat->gid = ip->i_d.di_gid;
555 stat->ino = ip->i_ino; 543 stat->ino = ip->i_ino;
556#if XFS_BIG_INUMS
557 stat->ino += mp->m_inoadd;
558#endif
559 stat->atime = inode->i_atime; 544 stat->atime = inode->i_atime;
560 stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; 545 stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
561 stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; 546 stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 507492d6dccd..f65a53f8752f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -38,7 +38,6 @@
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h> 40#include <sv.h>
41#include <mutex.h>
42#include <time.h> 41#include <time.h>
43 42
44#include <support/ktrace.h> 43#include <support/ktrace.h>
@@ -51,6 +50,7 @@
51#include <linux/blkdev.h> 50#include <linux/blkdev.h>
52#include <linux/slab.h> 51#include <linux/slab.h>
53#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/mutex.h>
54#include <linux/file.h> 54#include <linux/file.h>
55#include <linux/swap.h> 55#include <linux/swap.h>
56#include <linux/errno.h> 56#include <linux/errno.h>
@@ -147,17 +147,6 @@
147#define SYNCHRONIZE() barrier() 147#define SYNCHRONIZE() barrier()
148#define __return_address __builtin_return_address(0) 148#define __return_address __builtin_return_address(0)
149 149
150/*
151 * IRIX (BSD) quotactl makes use of separate commands for user/group,
152 * whereas on Linux the syscall encodes this information into the cmd
153 * field (see the QCMD macro in quota.h). These macros help keep the
154 * code portable - they are not visible from the syscall interface.
155 */
156#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */
157#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */
158#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */
159#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */
160
161#define dfltprid 0 150#define dfltprid 0
162#define MAXPATHLEN 1024 151#define MAXPATHLEN 1024
163 152
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 000000000000..94d9a633d3d9
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2008, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_dmapi.h"
20#include "xfs_sb.h"
21#include "xfs_inum.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_log.h"
26#include "xfs_trans.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_inode.h"
29#include "quota/xfs_qm.h"
30#include <linux/quota.h>
31
32
33STATIC int
34xfs_quota_type(int type)
35{
36 switch (type) {
37 case USRQUOTA:
38 return XFS_DQ_USER;
39 case GRPQUOTA:
40 return XFS_DQ_GROUP;
41 default:
42 return XFS_DQ_PROJ;
43 }
44}
45
46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (!XFS_IS_QUOTA_RUNNING(mp))
54 return -ENOSYS;
55 return -xfs_sync_inodes(mp, SYNC_DELWRI);
56}
57
58STATIC int
59xfs_fs_get_xstate(
60 struct super_block *sb,
61 struct fs_quota_stat *fqs)
62{
63 struct xfs_mount *mp = XFS_M(sb);
64
65 if (!XFS_IS_QUOTA_RUNNING(mp))
66 return -ENOSYS;
67 return -xfs_qm_scall_getqstat(mp, fqs);
68}
69
70STATIC int
71xfs_fs_set_xstate(
72 struct super_block *sb,
73 unsigned int uflags,
74 int op)
75{
76 struct xfs_mount *mp = XFS_M(sb);
77 unsigned int flags = 0;
78
79 if (sb->s_flags & MS_RDONLY)
80 return -EROFS;
81 if (!XFS_IS_QUOTA_RUNNING(mp))
82 return -ENOSYS;
83 if (!capable(CAP_SYS_ADMIN))
84 return -EPERM;
85
86 if (uflags & XFS_QUOTA_UDQ_ACCT)
87 flags |= XFS_UQUOTA_ACCT;
88 if (uflags & XFS_QUOTA_PDQ_ACCT)
89 flags |= XFS_PQUOTA_ACCT;
90 if (uflags & XFS_QUOTA_GDQ_ACCT)
91 flags |= XFS_GQUOTA_ACCT;
92 if (uflags & XFS_QUOTA_UDQ_ENFD)
93 flags |= XFS_UQUOTA_ENFD;
94 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
95 flags |= XFS_OQUOTA_ENFD;
96
97 switch (op) {
98 case Q_XQUOTAON:
99 return -xfs_qm_scall_quotaon(mp, flags);
100 case Q_XQUOTAOFF:
101 if (!XFS_IS_QUOTA_ON(mp))
102 return -EINVAL;
103 return -xfs_qm_scall_quotaoff(mp, flags);
104 case Q_XQUOTARM:
105 if (XFS_IS_QUOTA_ON(mp))
106 return -EINVAL;
107 return -xfs_qm_scall_trunc_qfiles(mp, flags);
108 }
109
110 return -EINVAL;
111}
112
113STATIC int
114xfs_fs_get_xquota(
115 struct super_block *sb,
116 int type,
117 qid_t id,
118 struct fs_disk_quota *fdq)
119{
120 struct xfs_mount *mp = XFS_M(sb);
121
122 if (!XFS_IS_QUOTA_RUNNING(mp))
123 return -ENOSYS;
124 if (!XFS_IS_QUOTA_ON(mp))
125 return -ESRCH;
126
127 return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
128}
129
130STATIC int
131xfs_fs_set_xquota(
132 struct super_block *sb,
133 int type,
134 qid_t id,
135 struct fs_disk_quota *fdq)
136{
137 struct xfs_mount *mp = XFS_M(sb);
138
139 if (sb->s_flags & MS_RDONLY)
140 return -EROFS;
141 if (!XFS_IS_QUOTA_RUNNING(mp))
142 return -ENOSYS;
143 if (!XFS_IS_QUOTA_ON(mp))
144 return -ESRCH;
145 if (!capable(CAP_SYS_ADMIN))
146 return -EPERM;
147
148 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
149}
150
151struct quotactl_ops xfs_quotactl_operations = {
152 .quota_sync = xfs_fs_quota_sync,
153 .get_xstate = xfs_fs_get_xstate,
154 .set_xstate = xfs_fs_set_xstate,
155 .get_xquota = xfs_fs_get_xquota,
156 .set_xquota = xfs_fs_set_xquota,
157};
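
Each method above returns the negation of the core call because XFS internals report positive errno values while the VFS quota layer expects negative ones. In miniature, with internal_call() as a hypothetical XFS-style function:

        #include <errno.h>

        static int internal_call(void)          /* XFS-style: positive errno */
        {
                return EINVAL;
        }

        static int vfs_wrapper(void)            /* VFS-style: negative errno */
        {
                return -internal_call();        /* yields -EINVAL */
        }
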
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 32ae5028e96b..bb685269f832 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -68,7 +68,6 @@
68#include <linux/freezer.h> 68#include <linux/freezer.h>
69#include <linux/parser.h> 69#include <linux/parser.h>
70 70
71static struct quotactl_ops xfs_quotactl_operations;
72static struct super_operations xfs_super_operations; 71static struct super_operations xfs_super_operations;
73static kmem_zone_t *xfs_ioend_zone; 72static kmem_zone_t *xfs_ioend_zone;
74mempool_t *xfs_ioend_pool; 73mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
79#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ 78#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
80#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ 79#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
81#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ 80#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
82#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */
83#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ 81#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
84#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ 82#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
85#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ 83#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
180 int dswidth = 0; 178 int dswidth = 0;
181 int iosize = 0; 179 int iosize = 0;
182 int dmapi_implies_ikeep = 1; 180 int dmapi_implies_ikeep = 1;
183 uchar_t iosizelog = 0; 181 __uint8_t iosizelog = 0;
184 182
185 /* 183 /*
186 * Copy binary VFS mount flags we are interested in. 184 * Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
291 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; 289 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
292 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 290 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
293 mp->m_flags |= XFS_MOUNT_NORECOVERY; 291 mp->m_flags |= XFS_MOUNT_NORECOVERY;
294 } else if (!strcmp(this_char, MNTOPT_INO64)) {
295#if XFS_BIG_INUMS
296 mp->m_flags |= XFS_MOUNT_INO64;
297 mp->m_inoadd = XFS_INO64_OFFSET;
298#else
299 cmn_err(CE_WARN,
300 "XFS: %s option not allowed on this system",
301 this_char);
302 return EINVAL;
303#endif
304 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 292 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
305 mp->m_flags |= XFS_MOUNT_NOALIGN; 293 mp->m_flags |= XFS_MOUNT_NOALIGN;
306 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 294 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
529 /* the few simple ones we can get from the mount struct */ 517 /* the few simple ones we can get from the mount struct */
530 { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, 518 { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP },
531 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, 519 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
532 { XFS_MOUNT_INO64, "," MNTOPT_INO64 },
533 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, 520 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
534 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, 521 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
535 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 522 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
634 return (((__uint64_t)pagefactor) << bitshift) - 1; 621 return (((__uint64_t)pagefactor) << bitshift) - 1;
635} 622}
636 623
637int 624STATIC int
638xfs_blkdev_get( 625xfs_blkdev_get(
639 xfs_mount_t *mp, 626 xfs_mount_t *mp,
640 const char *name, 627 const char *name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
651 return -error; 638 return -error;
652} 639}
653 640
654void 641STATIC void
655xfs_blkdev_put( 642xfs_blkdev_put(
656 struct block_device *bdev) 643 struct block_device *bdev)
657{ 644{
@@ -872,7 +859,7 @@ xfsaild_wakeup(
872 wake_up_process(ailp->xa_task); 859 wake_up_process(ailp->xa_task);
873} 860}
874 861
875int 862STATIC int
876xfsaild( 863xfsaild(
877 void *data) 864 void *data)
878{ 865{
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
990 int sync) 977 int sync)
991{ 978{
992 struct xfs_inode *ip = XFS_I(inode); 979 struct xfs_inode *ip = XFS_I(inode);
980 struct xfs_mount *mp = ip->i_mount;
993 int error = 0; 981 int error = 0;
994 int flags = 0;
995 982
996 xfs_itrace_entry(ip); 983 xfs_itrace_entry(ip);
984
985 if (XFS_FORCED_SHUTDOWN(mp))
986 return XFS_ERROR(EIO);
987
997 if (sync) { 988 if (sync) {
998 error = xfs_wait_on_pages(ip, 0, -1); 989 error = xfs_wait_on_pages(ip, 0, -1);
999 if (error) 990 if (error)
1000 goto out_error; 991 goto out;
1001 flags |= FLUSH_SYNC;
1002 } 992 }
1003 error = xfs_inode_flush(ip, flags);
1004 993
1005out_error: 994 /*
995 * Bypass inodes which have already been cleaned by
996 * the inode flush clustering code inside xfs_iflush
997 */
998 if (xfs_inode_clean(ip))
999 goto out;
1000
1001 /*
1002 * We make this non-blocking if the inode is contended, return
1003 * EAGAIN to indicate to the caller that they did not succeed.
1004 * This prevents the flush path from blocking on inodes inside
1005 * another operation right now; they get caught later by xfs_sync.
1006 */
1007 if (sync) {
1008 xfs_ilock(ip, XFS_ILOCK_SHARED);
1009 xfs_iflock(ip);
1010
1011 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1012 } else {
1013 error = EAGAIN;
1014 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1015 goto out;
1016 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1017 goto out_unlock;
1018
1019 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1020 }
1021
1022 out_unlock:
1023 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1024 out:
1006 /* 1025 /*
1007 * if we failed to write out the inode then mark 1026 * if we failed to write out the inode then mark
1008 * it dirty again so we'll try again later. 1027 * it dirty again so we'll try again later.
1009 */ 1028 */
1010 if (error) 1029 if (error)
1011 xfs_mark_inode_dirty_sync(ip); 1030 xfs_mark_inode_dirty_sync(ip);
1012
1013 return -error; 1031 return -error;
1014} 1032}
1015 1033
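
In the async case, xfs_fs_write_inode() now takes the inode lock and flush lock with trylock semantics and reports EAGAIN on contention, so writeback backs off instead of stalling; a later xfs_sync pass retries the inode. A userspace analogue of that branch:

        #include <errno.h>
        #include <pthread.h>

        static int flush_nonblocking(pthread_mutex_t *lock)
        {
                if (pthread_mutex_trylock(lock) != 0)
                        return -EAGAIN;         /* contended: retry later */
                /* ... write the object back ... */
                pthread_mutex_unlock(lock);
                return 0;
        }
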
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
1169 statp->f_bfree = statp->f_bavail = 1187 statp->f_bfree = statp->f_bavail =
1170 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 1188 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1171 fakeinos = statp->f_bfree << sbp->sb_inopblog; 1189 fakeinos = statp->f_bfree << sbp->sb_inopblog;
1172#if XFS_BIG_INUMS
1173 fakeinos += mp->m_inoadd;
1174#endif
1175 statp->f_files = 1190 statp->f_files =
1176 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); 1191 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1177 if (mp->m_maxicount) 1192 if (mp->m_maxicount)
1178#if XFS_BIG_INUMS 1193 statp->f_files = min_t(typeof(statp->f_files),
1179 if (!mp->m_inoadd) 1194 statp->f_files,
1180#endif 1195 mp->m_maxicount);
1181 statp->f_files = min_t(typeof(statp->f_files),
1182 statp->f_files,
1183 mp->m_maxicount);
1184 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1196 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1185 spin_unlock(&mp->m_sb_lock); 1197 spin_unlock(&mp->m_sb_lock);
1186 1198
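The statfs hunk drops the XFS_BIG_INUMS special case and clamps f_files against m_maxicount unconditionally. A small standalone sketch of the same arithmetic, with placeholder values standing in for the superblock counters:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	uint64_t icount = 1000, fakeinos = 1u << 20;
	uint64_t maxinumber = UINT64_MAX >> 8;	/* placeholder for XFS_MAXINUMBER */
	uint64_t maxicount = 4096;		/* mp->m_maxicount, if configured */
	uint64_t f_files = MIN(icount + fakeinos, maxinumber);

	if (maxicount)		/* one unconditional clamp, no #if needed */
		f_files = MIN(f_files, maxicount);
	printf("f_files=%llu\n", (unsigned long long)f_files);
	return 0;
}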
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
1302 return -xfs_showargs(XFS_M(mnt->mnt_sb), m); 1314 return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
1303} 1315}
1304 1316
1305STATIC int
1306xfs_fs_quotasync(
1307 struct super_block *sb,
1308 int type)
1309{
1310 return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
1311}
1312
1313STATIC int
1314xfs_fs_getxstate(
1315 struct super_block *sb,
1316 struct fs_quota_stat *fqs)
1317{
1318 return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
1319}
1320
1321STATIC int
1322xfs_fs_setxstate(
1323 struct super_block *sb,
1324 unsigned int flags,
1325 int op)
1326{
1327 return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
1328}
1329
1330STATIC int
1331xfs_fs_getxquota(
1332 struct super_block *sb,
1333 int type,
1334 qid_t id,
1335 struct fs_disk_quota *fdq)
1336{
1337 return -XFS_QM_QUOTACTL(XFS_M(sb),
1338 (type == USRQUOTA) ? Q_XGETQUOTA :
1339 ((type == GRPQUOTA) ? Q_XGETGQUOTA :
1340 Q_XGETPQUOTA), id, (caddr_t)fdq);
1341}
1342
1343STATIC int
1344xfs_fs_setxquota(
1345 struct super_block *sb,
1346 int type,
1347 qid_t id,
1348 struct fs_disk_quota *fdq)
1349{
1350 return -XFS_QM_QUOTACTL(XFS_M(sb),
1351 (type == USRQUOTA) ? Q_XSETQLIM :
1352 ((type == GRPQUOTA) ? Q_XSETGQLIM :
1353 Q_XSETPQLIM), id, (caddr_t)fdq);
1354}
1355
1356/* 1317/*
1357 * This function fills in xfs_mount_t fields based on mount args. 1318 * This function fills in xfs_mount_t fields based on mount args.
1358 * Note: the superblock _has_ now been read in. 1319 * Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
1435 sb_min_blocksize(sb, BBSIZE); 1396 sb_min_blocksize(sb, BBSIZE);
1436 sb->s_xattr = xfs_xattr_handlers; 1397 sb->s_xattr = xfs_xattr_handlers;
1437 sb->s_export_op = &xfs_export_operations; 1398 sb->s_export_op = &xfs_export_operations;
1399#ifdef CONFIG_XFS_QUOTA
1438 sb->s_qcop = &xfs_quotactl_operations; 1400 sb->s_qcop = &xfs_quotactl_operations;
1401#endif
1439 sb->s_op = &xfs_super_operations; 1402 sb->s_op = &xfs_super_operations;
1440 1403
1441 error = xfs_dmops_get(mp); 1404 error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
1578 .show_options = xfs_fs_show_options, 1541 .show_options = xfs_fs_show_options,
1579}; 1542};
1580 1543
1581static struct quotactl_ops xfs_quotactl_operations = {
1582 .quota_sync = xfs_fs_quotasync,
1583 .get_xstate = xfs_fs_getxstate,
1584 .set_xstate = xfs_fs_setxstate,
1585 .get_xquota = xfs_fs_getxquota,
1586 .set_xquota = xfs_fs_setxquota,
1587};
1588
1589static struct file_system_type xfs_fs_type = { 1544static struct file_system_type xfs_fs_type = {
1590 .owner = THIS_MODULE, 1545 .owner = THIS_MODULE,
1591 .name = "xfs", 1546 .name = "xfs",
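With the quotactl table moved out of xfs_super.c, s_qcop is only wired up when quota support is compiled in. A toy sketch of guarding an optional ops pointer behind a build-time switch; CONFIG_TOY_QUOTA and the struct fields are made up for illustration:

#include <stdio.h>

struct quotactl_ops { int (*quota_sync)(void); };
struct super_block { const struct quotactl_ops *s_qcop; };

#ifdef CONFIG_TOY_QUOTA
static int toy_quota_sync(void) { return 0; }
static const struct quotactl_ops toy_qops = { .quota_sync = toy_quota_sync };
#endif

int main(void)
{
	struct super_block sb = { 0 };

#ifdef CONFIG_TOY_QUOTA
	sb.s_qcop = &toy_qops;	/* only present when quota is built in */
#endif
	printf("quota ops %s\n", sb.s_qcop ? "present" : "absent");
	return 0;
}

Built with or without -DCONFIG_TOY_QUOTA, callers can simply test the pointer, which is how the VFS treats a NULL s_qcop.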
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index d5d776d4cd67..5a2ea3a21781 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
93 93
94extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
95extern struct xattr_handler *xfs_xattr_handlers[]; 95extern struct xattr_handler *xfs_xattr_handlers[];
96extern struct quotactl_ops xfs_quotactl_operations;
96 97
97#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
98 99
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1efe1f6..04f058c848ae 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -19,6 +19,7 @@
19#define XFS_SYNC_H 1 19#define XFS_SYNC_H 1
20 20
21struct xfs_mount; 21struct xfs_mount;
22struct xfs_perag;
22 23
23typedef struct bhv_vfs_sync_work { 24typedef struct bhv_vfs_sync_work {
24 struct list_head w_list; 25 struct list_head w_list;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f65983a230d3..ad7fbead4c97 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -41,11 +41,6 @@ struct attrlist_cursor_kern;
41#define IO_INVIS 0x00020 /* don't update inode timestamps */ 41#define IO_INVIS 0x00020 /* don't update inode timestamps */
42 42
43/* 43/*
44 * Flags for xfs_inode_flush
45 */
46#define FLUSH_SYNC 1 /* wait for flush to complete */
47
48/*
49 * Flush/Invalidate options for vop_toss/flush/flushinval_pages. 44 * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
50 */ 45 */
51#define FI_NONE 0 /* none */ 46#define FI_NONE 0 /* none */
@@ -55,33 +50,6 @@ struct attrlist_cursor_kern;
55 the operation completes. */ 50 the operation completes. */
56 51
57/* 52/*
58 * Dealing with bad inodes
59 */
60static inline int VN_BAD(struct inode *vp)
61{
62 return is_bad_inode(vp);
63}
64
65/*
66 * Extracting atime values in various formats
67 */
68static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
69{
70 bs_atime->tv_sec = vp->i_atime.tv_sec;
71 bs_atime->tv_nsec = vp->i_atime.tv_nsec;
72}
73
74static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
75{
76 *ts = vp->i_atime;
77}
78
79static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
80{
81 *tt = vp->i_atime.tv_sec;
82}
83
84/*
85 * Some useful predicates. 53 * Some useful predicates.
86 */ 54 */
87#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) 55#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 6543c0b29753..e4babcc63423 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
804 uint flist_locked; 804 uint flist_locked;
805 xfs_dquot_t *d; 805 xfs_dquot_t *d;
806 806
807 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 807 ASSERT(mutex_is_locked(&qh->qh_lock));
808 808
809 flist_locked = B_FALSE; 809 flist_locked = B_FALSE;
810 810
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
877 /* 877 /*
878 * move the dquot to the front of the hashchain 878 * move the dquot to the front of the hashchain
879 */ 879 */
880 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 880 ASSERT(mutex_is_locked(&qh->qh_lock));
881 if (dqp->HL_PREVP != &qh->qh_next) { 881 if (dqp->HL_PREVP != &qh->qh_next) {
882 xfs_dqtrace_entry(dqp, 882 xfs_dqtrace_entry(dqp,
883 "DQLOOKUP: HASH MOVETOFRONT"); 883 "DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
892 } 892 }
893 xfs_dqtrace_entry(dqp, "LOOKUP END"); 893 xfs_dqtrace_entry(dqp, "LOOKUP END");
894 *O_dqpp = dqp; 894 *O_dqpp = dqp;
895 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 895 ASSERT(mutex_is_locked(&qh->qh_lock));
896 return (0); 896 return (0);
897 } 897 }
898 } 898 }
899 899
900 *O_dqpp = NULL; 900 *O_dqpp = NULL;
901 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 901 ASSERT(mutex_is_locked(&qh->qh_lock));
902 return (1); 902 return (1);
903} 903}
904 904
@@ -956,7 +956,7 @@ xfs_qm_dqget(
956 ASSERT(ip->i_gdquot == NULL); 956 ASSERT(ip->i_gdquot == NULL);
957 } 957 }
958#endif 958#endif
959 XFS_DQ_HASH_LOCK(h); 959 mutex_lock(&h->qh_lock);
960 960
961 /* 961 /*
962 * Look in the cache (hashtable). 962 * Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
971 */ 971 */
972 ASSERT(*O_dqpp); 972 ASSERT(*O_dqpp);
973 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); 973 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
974 XFS_DQ_HASH_UNLOCK(h); 974 mutex_unlock(&h->qh_lock);
975 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); 975 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
976 return (0); /* success */ 976 return (0); /* success */
977 } 977 }
@@ -991,7 +991,7 @@ xfs_qm_dqget(
991 * we don't keep the lock across a disk read 991 * we don't keep the lock across a disk read
992 */ 992 */
993 version = h->qh_version; 993 version = h->qh_version;
994 XFS_DQ_HASH_UNLOCK(h); 994 mutex_unlock(&h->qh_lock);
995 995
996 /* 996 /*
997 * Allocate the dquot on the kernel heap, and read the ondisk 997 * Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
1056 /* 1056 /*
1057 * Hashlock comes after ilock in lock order 1057 * Hashlock comes after ilock in lock order
1058 */ 1058 */
1059 XFS_DQ_HASH_LOCK(h); 1059 mutex_lock(&h->qh_lock);
1060 if (version != h->qh_version) { 1060 if (version != h->qh_version) {
1061 xfs_dquot_t *tmpdqp; 1061 xfs_dquot_t *tmpdqp;
1062 /* 1062 /*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
1072 * and start over. 1072 * and start over.
1073 */ 1073 */
1074 xfs_qm_dqput(tmpdqp); 1074 xfs_qm_dqput(tmpdqp);
1075 XFS_DQ_HASH_UNLOCK(h); 1075 mutex_unlock(&h->qh_lock);
1076 xfs_qm_dqdestroy(dqp); 1076 xfs_qm_dqdestroy(dqp);
1077 XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); 1077 XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
1078 goto again; 1078 goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
1083 * Put the dquot at the beginning of the hash-chain and mp's list 1083 * Put the dquot at the beginning of the hash-chain and mp's list
1084 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. 1084 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
1085 */ 1085 */
1086 ASSERT(XFS_DQ_IS_HASH_LOCKED(h)); 1086 ASSERT(mutex_is_locked(&h->qh_lock));
1087 dqp->q_hash = h; 1087 dqp->q_hash = h;
1088 XQM_HASHLIST_INSERT(h, dqp); 1088 XQM_HASHLIST_INSERT(h, dqp);
1089 1089
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
1102 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1102 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
1103 1103
1104 xfs_qm_mplist_unlock(mp); 1104 xfs_qm_mplist_unlock(mp);
1105 XFS_DQ_HASH_UNLOCK(h); 1105 mutex_unlock(&h->qh_lock);
1106 dqret: 1106 dqret:
1107 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1107 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
1108 xfs_dqtrace_entry(dqp, "DQGET DONE"); 1108 xfs_dqtrace_entry(dqp, "DQGET DONE");
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
1440 xfs_mount_t *mp = dqp->q_mount; 1440 xfs_mount_t *mp = dqp->q_mount;
1441 1441
1442 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1442 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
1443 ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); 1443 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1444 1444
1445 xfs_dqlock(dqp); 1445 xfs_dqlock(dqp);
1446 /* 1446 /*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
1453 */ 1453 */
1454 if (dqp->q_nrefs != 0) { 1454 if (dqp->q_nrefs != 0) {
1455 xfs_dqunlock(dqp); 1455 xfs_dqunlock(dqp);
1456 XFS_DQ_HASH_UNLOCK(dqp->q_hash); 1456 mutex_unlock(&dqp->q_hash->qh_lock);
1457 return (1); 1457 return (1);
1458 } 1458 }
1459 1459
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
1517 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1517 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1518 xfs_dqfunlock(dqp); 1518 xfs_dqfunlock(dqp);
1519 xfs_dqunlock(dqp); 1519 xfs_dqunlock(dqp);
1520 XFS_DQ_HASH_UNLOCK(thishash); 1520 mutex_unlock(&thishash->qh_lock);
1521 return (0); 1521 return (0);
1522} 1522}
1523 1523
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index d443e93b4331..de0f402ddb4c 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -34,7 +34,7 @@
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct xfs_dquot *qh_next;
37 mutex_t qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
81 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ 81 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ 82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 struct mutex q_qlock; /* quota lock */
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 atomic_t q_pincount; /* dquot pin count */ 86 atomic_t q_pincount; /* dquot pin count */
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
109 109
110#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) 110#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
111 111
112#ifdef DEBUG
113static inline int
114XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
115{
116 if (mutex_trylock(&dqp->q_qlock)) {
117 mutex_unlock(&dqp->q_qlock);
118 return 0;
119 }
120 return 1;
121}
122#endif
123
124
125/* 112/*
126 * Manage the q_flush completion queue embedded in the dquot. This completion 113 * Manage the q_flush completion queue embedded in the dquot. This completion
127 * queue synchronizes processes attempting to flush the in-core dquot back to 114 * queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
142 complete(&dqp->q_flush); 129 complete(&dqp->q_flush);
143} 130}
144 131
132#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
145#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) 133#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
146#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 134#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
147#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 135#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
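The removed DEBUG helper tested q_qlock by briefly taking and releasing it with mutex_trylock; the replacement asks mutex_is_locked directly, which never touches the lock. POSIX mutexes have no such query, so this sketch models it with an explicit flag; a rough analogy, not the kernel's mutex internals:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

struct toy_mutex {
	pthread_mutex_t m;
	atomic_int held;	/* models what mutex_is_locked() reads */
};

static void toy_lock(struct toy_mutex *tm)
{
	pthread_mutex_lock(&tm->m);
	atomic_store(&tm->held, 1);
}

static void toy_unlock(struct toy_mutex *tm)
{
	atomic_store(&tm->held, 0);
	pthread_mutex_unlock(&tm->m);
}

/* Cheap assertion-time query; never acquires the lock itself. */
static int toy_is_locked(struct toy_mutex *tm)
{
	return atomic_load(&tm->held);
}

int main(void)
{
	struct toy_mutex qlock = { PTHREAD_MUTEX_INITIALIZER, 0 };

	toy_lock(&qlock);
	assert(toy_is_locked(&qlock));	/* like ASSERT(XFS_DQ_IS_LOCKED(dqp)) */
	toy_unlock(&qlock);
	assert(!toy_is_locked(&qlock));
	return 0;
}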
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7a2beb64314f..5b6695049e00 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,7 +55,7 @@
55 * quota functionality, including maintaining the freelist and hash 55 * quota functionality, including maintaining the freelist and hash
56 * tables of dquots. 56 * tables of dquots.
57 */ 57 */
58mutex_t xfs_Gqm_lock; 58struct mutex xfs_Gqm_lock;
59struct xfs_qm *xfs_Gqm; 59struct xfs_qm *xfs_Gqm;
60uint ndquot; 60uint ndquot;
61 61
@@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *); 70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); 71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72STATIC int xfs_qm_mplist_nowait(xfs_mount_t *);
73STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
74 72
75STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 73STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
76STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
82}; 80};
83 81
84#ifdef DEBUG 82#ifdef DEBUG
85extern mutex_t qcheck_lock; 83extern struct mutex qcheck_lock;
86#endif 84#endif
87 85
88#ifdef QUOTADEBUG 86#ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
219 * the structure could disappear between the entry to this routine and 217 * the structure could disappear between the entry to this routine and
220 * a HOLD operation if not locked. 218 * a HOLD operation if not locked.
221 */ 219 */
222 XFS_QM_LOCK(xfs_Gqm); 220 mutex_lock(&xfs_Gqm_lock);
223 221
224 if (xfs_Gqm == NULL) 222 if (xfs_Gqm == NULL)
225 xfs_Gqm = xfs_Gqm_init(); 223 xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
228 * debugging and statistical purposes, but ... 226 * debugging and statistical purposes, but ...
229 * Just take a reference and get out. 227 * Just take a reference and get out.
230 */ 228 */
231 XFS_QM_HOLD(xfs_Gqm); 229 xfs_Gqm->qm_nrefs++;
232 XFS_QM_UNLOCK(xfs_Gqm); 230 mutex_unlock(&xfs_Gqm_lock);
233 231
234 return 0; 232 return 0;
235} 233}
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
277 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 275 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
278 * be restarted. 276 * be restarted.
279 */ 277 */
280 XFS_QM_LOCK(xfs_Gqm); 278 mutex_lock(&xfs_Gqm_lock);
281 XFS_QM_RELE(xfs_Gqm); 279 if (--xfs_Gqm->qm_nrefs == 0) {
282 if (xfs_Gqm->qm_nrefs == 0) {
283 xfs_qm_destroy(xfs_Gqm); 280 xfs_qm_destroy(xfs_Gqm);
284 xfs_Gqm = NULL; 281 xfs_Gqm = NULL;
285 } 282 }
286 XFS_QM_UNLOCK(xfs_Gqm); 283 mutex_unlock(&xfs_Gqm_lock);
287} 284}
288 285
289/* 286/*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
577 continue; 574 continue;
578 } 575 }
579 576
580 if (! xfs_qm_dqhashlock_nowait(dqp)) { 577 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
581 nrecl = XFS_QI_MPLRECLAIMS(mp); 578 nrecl = XFS_QI_MPLRECLAIMS(mp);
582 xfs_qm_mplist_unlock(mp); 579 xfs_qm_mplist_unlock(mp);
583 XFS_DQ_HASH_LOCK(dqp->q_hash); 580 mutex_lock(&dqp->q_hash->qh_lock);
584 xfs_qm_mplist_lock(mp); 581 xfs_qm_mplist_lock(mp);
585 582
586 /* 583 /*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
590 * this point, but somebody might be taking things off. 587 * this point, but somebody might be taking things off.
591 */ 588 */
592 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 589 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
593 XFS_DQ_HASH_UNLOCK(dqp->q_hash); 590 mutex_unlock(&dqp->q_hash->qh_lock);
594 goto again; 591 goto again;
595 } 592 }
596 } 593 }
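The dqpurge hunk above shows the standard escape from a lock-order inversion: if the hash lock cannot be trylocked while the mplist lock is held, drop the outer lock, take both in the safe order, then revalidate a reclaim counter to detect list churn. A compact pthread model of that dance, with toy names throughout:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mplist = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash = PTHREAD_MUTEX_INITIALIZER;
static int reclaims;	/* bumped whenever the list is reshuffled */

/* Called with mplist held; returns 1 if the caller must restart its walk. */
static int lock_hash_under_mplist(void)
{
	if (pthread_mutex_trylock(&hash) == 0)
		return 0;	/* fast path: no ordering problem */

	/* Contended: drop the outer lock, retake both in safe order. */
	int seen = reclaims;
	pthread_mutex_unlock(&mplist);
	pthread_mutex_lock(&hash);
	pthread_mutex_lock(&mplist);

	if (seen != reclaims) {		/* list changed while unlocked */
		pthread_mutex_unlock(&hash);
		return 1;
	}
	return 0;
}

int main(void)
{
	pthread_mutex_lock(&mplist);
	int restart = lock_hash_under_mplist();
	printf("restart=%d\n", restart);
	if (!restart)
		pthread_mutex_unlock(&hash);
	pthread_mutex_unlock(&mplist);
	return 0;
}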
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
632 xfs_dqid_t id, 629 xfs_dqid_t id,
633 uint type, 630 uint type,
634 uint doalloc, 631 uint doalloc,
635 uint dolock,
636 xfs_dquot_t *udqhint, /* hint */ 632 xfs_dquot_t *udqhint, /* hint */
637 xfs_dquot_t **IO_idqpp) 633 xfs_dquot_t **IO_idqpp)
638{ 634{
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
641 637
642 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 638 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
643 error = 0; 639 error = 0;
640
644 /* 641 /*
645 * See if we already have it in the inode itself. IO_idqpp is 642 * See if we already have it in the inode itself. IO_idqpp is
646 * &i_udquot or &i_gdquot. This made the code look weird, but 643 * &i_udquot or &i_gdquot. This made the code look weird, but
647 * made the logic a lot simpler. 644 * made the logic a lot simpler.
648 */ 645 */
649 if ((dqp = *IO_idqpp)) { 646 dqp = *IO_idqpp;
650 if (dolock) 647 if (dqp) {
651 xfs_dqlock(dqp);
652 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); 648 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
653 goto done; 649 return 0;
654 } 650 }
655 651
656 /* 652 /*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
659 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside 655 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
660 * the user dquot. 656 * the user dquot.
661 */ 657 */
662 ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); 658 if (udqhint) {
663 if (udqhint && !dolock) 659 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
664 xfs_dqlock(udqhint); 660 xfs_dqlock(udqhint);
665 661
666 /* 662 /*
667 * No need to take dqlock to look at the id. 663 * No need to take dqlock to look at the id.
668 * The ID can't change until it gets reclaimed, and it won't 664 *
669 * be reclaimed as long as we have a ref from inode and we hold 665 * The ID can't change until it gets reclaimed, and it won't
670 * the ilock. 666 * be reclaimed as long as we have a ref from inode and we
671 */ 667 * hold the ilock.
672 if (udqhint && 668 */
673 (dqp = udqhint->q_gdquot) && 669 dqp = udqhint->q_gdquot;
674 (be32_to_cpu(dqp->q_core.d_id) == id)) { 670 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
675 ASSERT(XFS_DQ_IS_LOCKED(udqhint)); 671 xfs_dqlock(dqp);
676 xfs_dqlock(dqp); 672 XFS_DQHOLD(dqp);
677 XFS_DQHOLD(dqp); 673 ASSERT(*IO_idqpp == NULL);
678 ASSERT(*IO_idqpp == NULL); 674 *IO_idqpp = dqp;
679 *IO_idqpp = dqp; 675
680 if (!dolock) {
681 xfs_dqunlock(dqp); 676 xfs_dqunlock(dqp);
682 xfs_dqunlock(udqhint); 677 xfs_dqunlock(udqhint);
678 return 0;
683 } 679 }
684 goto done; 680
685 } 681 /*
686 /* 682 * We can't hold a dquot lock when we call the dqget code.
687 * We can't hold a dquot lock when we call the dqget code. 683 * We'll deadlock in no time, because of (not conforming to)
688 * We'll deadlock in no time, because of (not conforming to) 684 * lock ordering - the inodelock comes before any dquot lock,
689 * lock ordering - the inodelock comes before any dquot lock, 685 * and we may drop and reacquire the ilock in xfs_qm_dqget().
690 * and we may drop and reacquire the ilock in xfs_qm_dqget(). 686 */
691 */
692 if (udqhint)
693 xfs_dqunlock(udqhint); 687 xfs_dqunlock(udqhint);
688 }
689
694 /* 690 /*
695 * Find the dquot from somewhere. This bumps the 691 * Find the dquot from somewhere. This bumps the
696 * reference count of dquot and returns it locked. 692 * reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
698 * disk and we didn't ask it to allocate; 694 * disk and we didn't ask it to allocate;
699 * ESRCH if quotas got turned off suddenly. 695 * ESRCH if quotas got turned off suddenly.
700 */ 696 */
701 if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type, 697 error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
702 doalloc|XFS_QMOPT_DOWARN, &dqp))) { 698 if (error)
703 if (udqhint && dolock) 699 return error;
704 xfs_dqlock(udqhint);
705 goto done;
706 }
707 700
708 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); 701 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
702
709 /* 703 /*
710 * dqget may have dropped and re-acquired the ilock, but it guarantees 704 * dqget may have dropped and re-acquired the ilock, but it guarantees
711 * that the dquot returned is the one that should go in the inode. 705 * that the dquot returned is the one that should go in the inode.
712 */ 706 */
713 *IO_idqpp = dqp; 707 *IO_idqpp = dqp;
714 ASSERT(dqp); 708 xfs_dqunlock(dqp);
715 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 709 return 0;
716 if (! dolock) {
717 xfs_dqunlock(dqp);
718 goto done;
719 }
720 if (! udqhint)
721 goto done;
722
723 ASSERT(udqhint);
724 ASSERT(dolock);
725 ASSERT(XFS_DQ_IS_LOCKED(dqp));
726 if (! xfs_qm_dqlock_nowait(udqhint)) {
727 xfs_dqunlock(dqp);
728 xfs_dqlock(udqhint);
729 xfs_dqlock(dqp);
730 }
731 done:
732#ifdef QUOTADEBUG
733 if (udqhint) {
734 if (dolock)
735 ASSERT(XFS_DQ_IS_LOCKED(udqhint));
736 }
737 if (! error) {
738 if (dolock)
739 ASSERT(XFS_DQ_IS_LOCKED(dqp));
740 }
741#endif
742 return error;
743} 710}
744 711
745 712
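The rewritten attach path drops the hint's dquot lock before calling xfs_qm_dqget, since the lookup may cycle the ilock and the documented order puts the ilock ahead of any dquot lock. A sketch of the rule, never hold the later lock while reacquiring the earlier one; the names are illustrative:

#include <pthread.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;  /* order: first  */
static pthread_mutex_t dqlock = PTHREAD_MUTEX_INITIALIZER; /* order: second */

/* Models xfs_qm_dqget(): may drop and retake the ilock internally. */
static void lookup_dquot(void)
{
	pthread_mutex_unlock(&ilock);
	/* ... sleep on disk I/O ... */
	pthread_mutex_lock(&ilock);
}

static void attach_one(void)
{
	pthread_mutex_lock(&ilock);

	pthread_mutex_lock(&dqlock);	/* inspect the cached hint */
	/* ... hint missed ... */
	pthread_mutex_unlock(&dqlock);	/* must drop before the lookup, or */
	lookup_dquot();			/* the ilock cycle above would     */
					/* invert the documented order     */
	pthread_mutex_unlock(&ilock);
}

int main(void)
{
	attach_one();
	return 0;
}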
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
754STATIC void 721STATIC void
755xfs_qm_dqattach_grouphint( 722xfs_qm_dqattach_grouphint(
756 xfs_dquot_t *udq, 723 xfs_dquot_t *udq,
757 xfs_dquot_t *gdq, 724 xfs_dquot_t *gdq)
758 uint locked)
759{ 725{
760 xfs_dquot_t *tmp; 726 xfs_dquot_t *tmp;
761 727
762#ifdef QUOTADEBUG 728 xfs_dqlock(udq);
763 if (locked) {
764 ASSERT(XFS_DQ_IS_LOCKED(udq));
765 ASSERT(XFS_DQ_IS_LOCKED(gdq));
766 }
767#endif
768 if (! locked)
769 xfs_dqlock(udq);
770 729
771 if ((tmp = udq->q_gdquot)) { 730 if ((tmp = udq->q_gdquot)) {
772 if (tmp == gdq) { 731 if (tmp == gdq) {
773 if (! locked) 732 xfs_dqunlock(udq);
774 xfs_dqunlock(udq);
775 return; 733 return;
776 } 734 }
777 735
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
781 * because the freelist lock comes before dqlocks. 739 * because the freelist lock comes before dqlocks.
782 */ 740 */
783 xfs_dqunlock(udq); 741 xfs_dqunlock(udq);
784 if (locked)
785 xfs_dqunlock(gdq);
786 /* 742 /*
787 * we took a hard reference once upon a time in dqget, 743 * we took a hard reference once upon a time in dqget,
788 * so give it back when the udquot no longer points at it 744 * so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
795 751
796 } else { 752 } else {
797 ASSERT(XFS_DQ_IS_LOCKED(udq)); 753 ASSERT(XFS_DQ_IS_LOCKED(udq));
798 if (! locked) { 754 xfs_dqlock(gdq);
799 xfs_dqlock(gdq);
800 }
801 } 755 }
802 756
803 ASSERT(XFS_DQ_IS_LOCKED(udq)); 757 ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
810 XFS_DQHOLD(gdq); 764 XFS_DQHOLD(gdq);
811 udq->q_gdquot = gdq; 765 udq->q_gdquot = gdq;
812 } 766 }
813 if (! locked) { 767
814 xfs_dqunlock(gdq); 768 xfs_dqunlock(gdq);
815 xfs_dqunlock(udq); 769 xfs_dqunlock(udq);
816 }
817} 770}
818 771
819 772
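With the locked parameter gone, the group-hint helper always takes the locks itself and manages the hint as a counted reference: drop the hold on a stale hint, take one on the new dquot, then publish it. A sketch of that publish-with-hold discipline using C11 atomics; locking is omitted for brevity, though the real code holds both dquot locks around the swap:

#include <assert.h>
#include <stdatomic.h>

struct dq {
	atomic_int nrefs;
	struct dq *gd_hint;	/* models udq->q_gdquot */
};

static void dq_hold(struct dq *d) { atomic_fetch_add(&d->nrefs, 1); }
static void dq_rele(struct dq *d) { atomic_fetch_sub(&d->nrefs, 1); }

/* Publish gdq as udq's hint, releasing any previous hint's reference. */
static void attach_grouphint(struct dq *udq, struct dq *gdq)
{
	struct dq *old = udq->gd_hint;

	if (old == gdq)
		return;
	if (old)
		dq_rele(old);	/* give back the hold taken at attach time */
	dq_hold(gdq);
	udq->gd_hint = gdq;
}

int main(void)
{
	struct dq u = {0}, g1 = {0}, g2 = {0};

	attach_grouphint(&u, &g1);
	attach_grouphint(&u, &g2);	/* g1's hold returned, g2 held */
	assert(atomic_load(&g1.nrefs) == 0 && atomic_load(&g2.nrefs) == 1);
	return 0;
}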
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
821 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON 774 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
822 * into account. 775 * into account.
823 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. 776 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
824 * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
825 * much made this code a complete mess, but it has been pretty useful.
826 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL. 777 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
827 * Inode may get unlocked and relocked in here, and the caller must deal with 778 * Inode may get unlocked and relocked in here, and the caller must deal with
828 * the consequences. 779 * the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
851 if (XFS_IS_UQUOTA_ON(mp)) { 802 if (XFS_IS_UQUOTA_ON(mp)) {
852 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, 803 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
853 flags & XFS_QMOPT_DQALLOC, 804 flags & XFS_QMOPT_DQALLOC,
854 flags & XFS_QMOPT_DQLOCK,
855 NULL, &ip->i_udquot); 805 NULL, &ip->i_udquot);
856 if (error) 806 if (error)
857 goto done; 807 goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
863 error = XFS_IS_GQUOTA_ON(mp) ? 813 error = XFS_IS_GQUOTA_ON(mp) ?
864 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 814 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
865 flags & XFS_QMOPT_DQALLOC, 815 flags & XFS_QMOPT_DQALLOC,
866 flags & XFS_QMOPT_DQLOCK,
867 ip->i_udquot, &ip->i_gdquot) : 816 ip->i_udquot, &ip->i_gdquot) :
868 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 817 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
869 flags & XFS_QMOPT_DQALLOC, 818 flags & XFS_QMOPT_DQALLOC,
870 flags & XFS_QMOPT_DQLOCK,
871 ip->i_udquot, &ip->i_gdquot); 819 ip->i_udquot, &ip->i_gdquot);
872 /* 820 /*
873 * Don't worry about the udquot that we may have 821 * Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
898 /* 846 /*
899 * Attach i_gdquot to the gdquot hint inside the i_udquot. 847 * Attach i_gdquot to the gdquot hint inside the i_udquot.
900 */ 848 */
901 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot, 849 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
902 flags & XFS_QMOPT_DQLOCK);
903 } 850 }
904 851
905 done: 852 done:
906 853
907#ifdef QUOTADEBUG 854#ifdef QUOTADEBUG
908 if (! error) { 855 if (! error) {
909 if (ip->i_udquot) {
910 if (flags & XFS_QMOPT_DQLOCK)
911 ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
912 }
913 if (ip->i_gdquot) {
914 if (flags & XFS_QMOPT_DQLOCK)
915 ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
916 }
917 if (XFS_IS_UQUOTA_ON(mp)) 856 if (XFS_IS_UQUOTA_ON(mp))
918 ASSERT(ip->i_udquot); 857 ASSERT(ip->i_udquot);
919 if (XFS_IS_OQUOTA_ON(mp)) 858 if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
2086 * a dqlookup process that holds the hashlock that is 2025 * a dqlookup process that holds the hashlock that is
2087 * waiting for the freelist lock. 2026 * waiting for the freelist lock.
2088 */ 2027 */
2089 if (! xfs_qm_dqhashlock_nowait(dqp)) { 2028 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2090 xfs_dqfunlock(dqp); 2029 xfs_dqfunlock(dqp);
2091 xfs_dqunlock(dqp); 2030 xfs_dqunlock(dqp);
2092 dqp = dqp->dq_flnext; 2031 dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
2103 /* XXX put a sentinel so that we can come back here */ 2042 /* XXX put a sentinel so that we can come back here */
2104 xfs_dqfunlock(dqp); 2043 xfs_dqfunlock(dqp);
2105 xfs_dqunlock(dqp); 2044 xfs_dqunlock(dqp);
2106 XFS_DQ_HASH_UNLOCK(hash); 2045 mutex_unlock(&hash->qh_lock);
2107 xfs_qm_freelist_unlock(xfs_Gqm); 2046 xfs_qm_freelist_unlock(xfs_Gqm);
2108 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2047 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2109 return nreclaimed; 2048 return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
2120 XQM_HASHLIST_REMOVE(hash, dqp); 2059 XQM_HASHLIST_REMOVE(hash, dqp);
2121 xfs_dqfunlock(dqp); 2060 xfs_dqfunlock(dqp);
2122 xfs_qm_mplist_unlock(dqp->q_mount); 2061 xfs_qm_mplist_unlock(dqp->q_mount);
2123 XFS_DQ_HASH_UNLOCK(hash); 2062 mutex_unlock(&hash->qh_lock);
2124 2063
2125 off_freelist: 2064 off_freelist:
2126 XQM_FREELIST_REMOVE(dqp); 2065 XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
2262 continue; 2201 continue;
2263 } 2202 }
2264 2203
2265 if (! xfs_qm_dqhashlock_nowait(dqp)) 2204 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2266 goto mplistunlock; 2205 goto mplistunlock;
2267 2206
2268 ASSERT(dqp->q_nrefs == 0); 2207 ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
2271 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); 2210 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2272 XQM_FREELIST_REMOVE(dqp); 2211 XQM_FREELIST_REMOVE(dqp);
2273 dqpout = dqp; 2212 dqpout = dqp;
2274 XFS_DQ_HASH_UNLOCK(dqp->q_hash); 2213 mutex_unlock(&dqp->q_hash->qh_lock);
2275 mplistunlock: 2214 mplistunlock:
2276 xfs_qm_mplist_unlock(dqp->q_mount); 2215 xfs_qm_mplist_unlock(dqp->q_mount);
2277 xfs_dqfunlock(dqp); 2216 xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2774{ 2713{
2775 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); 2714 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2776} 2715}
2777
2778STATIC int
2779xfs_qm_dqhashlock_nowait(
2780 xfs_dquot_t *dqp)
2781{
2782 int locked;
2783
2784 locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
2785 return locked;
2786}
2787
2788int
2789xfs_qm_freelist_lock_nowait(
2790 xfs_qm_t *xqm)
2791{
2792 int locked;
2793
2794 locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
2795 return locked;
2796}
2797
2798STATIC int
2799xfs_qm_mplist_nowait(
2800 xfs_mount_t *mp)
2801{
2802 int locked;
2803
2804 ASSERT(mp->m_quotainfo);
2805 locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
2806 return locked;
2807}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index ddf09166387c..a371954cae1b 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -27,7 +27,7 @@ struct xfs_qm;
27struct xfs_inode; 27struct xfs_inode;
28 28
29extern uint ndquot; 29extern uint ndquot;
30extern mutex_t xfs_Gqm_lock; 30extern struct mutex xfs_Gqm_lock;
31extern struct xfs_qm *xfs_Gqm; 31extern struct xfs_qm *xfs_Gqm;
32extern kmem_zone_t *qm_dqzone; 32extern kmem_zone_t *qm_dqzone;
33extern kmem_zone_t *qm_dqtrxzone; 33extern kmem_zone_t *qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t xfs_dqlist_t;
79typedef struct xfs_frlist { 79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next; 80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev; 81 struct xfs_dquot *qh_prev;
82 mutex_t qh_lock; 82 struct mutex qh_lock;
83 uint qh_version; 83 uint qh_version;
84 uint qh_nelems; 84 uint qh_nelems;
85} xfs_frlist_t; 85} xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
115 xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ 115 xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
116 xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ 116 xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
117 xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ 117 xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
118 mutex_t qi_quotaofflock;/* to serialize quotaoff */ 118 struct mutex qi_quotaofflock;/* to serialize quotaoff */
119 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ 119 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
120 uint qi_dqperchunk; /* # ondisk dqs in above chunk */ 120 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
121 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ 121 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
158#define XFS_QM_IWARNLIMIT 5 158#define XFS_QM_IWARNLIMIT 5
159#define XFS_QM_RTBWARNLIMIT 5 159#define XFS_QM_RTBWARNLIMIT 5
160 160
161#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock))
162#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
163#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
164#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
165
166extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 161extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
167extern void xfs_qm_mount_quotas(xfs_mount_t *); 162extern void xfs_qm_mount_quotas(xfs_mount_t *);
168extern int xfs_qm_quotacheck(xfs_mount_t *); 163extern int xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *);
178extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); 173extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
179extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 174extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
180 175
176/* quota ops */
177extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
178extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
179 fs_disk_quota_t *);
180extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
181 fs_disk_quota_t *);
182extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
183extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
184extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
185
181/* vop stuff */ 186/* vop stuff */
182extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, 187extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
183 uid_t, gid_t, prid_t, uint, 188 uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
194/* list stuff */ 199/* list stuff */
195extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); 200extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
196extern void xfs_qm_freelist_unlink(xfs_dquot_t *); 201extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
197extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *);
198
199/* system call interface */
200extern int xfs_qm_quotactl(struct xfs_mount *, int, int,
201 xfs_caddr_t);
202 202
203#ifdef DEBUG 203#ifdef DEBUG
204extern int xfs_qm_internalqcheck(xfs_mount_t *); 204extern int xfs_qm_internalqcheck(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bc6c5cca3e12..63037c689a4b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
235 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, 235 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
236 .xfs_dqstatvfs = xfs_qm_statvfs, 236 .xfs_dqstatvfs = xfs_qm_statvfs,
237 .xfs_dqsync = xfs_qm_sync, 237 .xfs_dqsync = xfs_qm_sync,
238 .xfs_quotactl = xfs_qm_quotactl,
239 .xfs_dqtrxops = &xfs_trans_dquot_ops, 238 .xfs_dqtrxops = &xfs_trans_dquot_ops,
240}; 239};
241EXPORT_SYMBOL(xfs_qmcore_xfs); 240EXPORT_SYMBOL(xfs_qmcore_xfs);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68139b38aede..c7b66f6506ce 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -57,135 +57,16 @@
57# define qdprintk(s, args...) do { } while (0) 57# define qdprintk(s, args...) do { } while (0)
58#endif 58#endif
59 59
60STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
61STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
62 fs_disk_quota_t *);
63STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
64STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
65 fs_disk_quota_t *);
66STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
67STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
68STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 60STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
69STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 61STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
70 uint); 62 uint);
71STATIC uint xfs_qm_import_flags(uint);
72STATIC uint xfs_qm_export_flags(uint); 63STATIC uint xfs_qm_export_flags(uint);
73STATIC uint xfs_qm_import_qtype_flags(uint);
74STATIC uint xfs_qm_export_qtype_flags(uint); 64STATIC uint xfs_qm_export_qtype_flags(uint);
75STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, 65STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
76 fs_disk_quota_t *); 66 fs_disk_quota_t *);
77 67
78 68
79/* 69/*
80 * The main distribution switch of all XFS quotactl system calls.
81 */
82int
83xfs_qm_quotactl(
84 xfs_mount_t *mp,
85 int cmd,
86 int id,
87 xfs_caddr_t addr)
88{
89 int error;
90
91 ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
92
93 /*
94 * The following commands are valid even when quotaoff.
95 */
96 switch (cmd) {
97 case Q_XQUOTARM:
98 /*
99 * Truncate quota files. quota must be off.
100 */
101 if (XFS_IS_QUOTA_ON(mp))
102 return XFS_ERROR(EINVAL);
103 if (mp->m_flags & XFS_MOUNT_RDONLY)
104 return XFS_ERROR(EROFS);
105 return (xfs_qm_scall_trunc_qfiles(mp,
106 xfs_qm_import_qtype_flags(*(uint *)addr)));
107
108 case Q_XGETQSTAT:
109 /*
110 * Get quota status information.
111 */
112 return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
113
114 case Q_XQUOTAON:
115 /*
116 * QUOTAON - enabling quota enforcement.
117 * Quota accounting must be turned on at mount time.
118 */
119 if (mp->m_flags & XFS_MOUNT_RDONLY)
120 return XFS_ERROR(EROFS);
121 return (xfs_qm_scall_quotaon(mp,
122 xfs_qm_import_flags(*(uint *)addr)));
123
124 case Q_XQUOTAOFF:
125 if (mp->m_flags & XFS_MOUNT_RDONLY)
126 return XFS_ERROR(EROFS);
127 break;
128
129 case Q_XQUOTASYNC:
130 return xfs_sync_inodes(mp, SYNC_DELWRI);
131
132 default:
133 break;
134 }
135
136 if (! XFS_IS_QUOTA_ON(mp))
137 return XFS_ERROR(ESRCH);
138
139 switch (cmd) {
140 case Q_XQUOTAOFF:
141 if (mp->m_flags & XFS_MOUNT_RDONLY)
142 return XFS_ERROR(EROFS);
143 error = xfs_qm_scall_quotaoff(mp,
144 xfs_qm_import_flags(*(uint *)addr),
145 B_FALSE);
146 break;
147
148 case Q_XGETQUOTA:
149 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
150 (fs_disk_quota_t *)addr);
151 break;
152 case Q_XGETGQUOTA:
153 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
154 (fs_disk_quota_t *)addr);
155 break;
156 case Q_XGETPQUOTA:
157 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
158 (fs_disk_quota_t *)addr);
159 break;
160
161 case Q_XSETQLIM:
162 if (mp->m_flags & XFS_MOUNT_RDONLY)
163 return XFS_ERROR(EROFS);
164 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
165 (fs_disk_quota_t *)addr);
166 break;
167 case Q_XSETGQLIM:
168 if (mp->m_flags & XFS_MOUNT_RDONLY)
169 return XFS_ERROR(EROFS);
170 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
171 (fs_disk_quota_t *)addr);
172 break;
173 case Q_XSETPQLIM:
174 if (mp->m_flags & XFS_MOUNT_RDONLY)
175 return XFS_ERROR(EROFS);
176 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
177 (fs_disk_quota_t *)addr);
178 break;
179
180 default:
181 error = XFS_ERROR(EINVAL);
182 break;
183 }
184
185 return (error);
186}
187
188/*
189 * Turn off quota accounting and/or enforcement for all udquots and/or 70 * Turn off quota accounting and/or enforcement for all udquots and/or
190 * gdquots. Called only at unmount time. 71 * gdquots. Called only at unmount time.
191 * 72 *
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
193 * incore, and modifies the ondisk dquot directly. Therefore, for example, 74 * incore, and modifies the ondisk dquot directly. Therefore, for example,
194 * it is an error to call this twice, without purging the cache. 75 * it is an error to call this twice, without purging the cache.
195 */ 76 */
196STATIC int 77int
197xfs_qm_scall_quotaoff( 78xfs_qm_scall_quotaoff(
198 xfs_mount_t *mp, 79 xfs_mount_t *mp,
199 uint flags, 80 uint flags)
200 boolean_t force)
201{ 81{
202 uint dqtype; 82 uint dqtype;
203 int error; 83 int error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
205 xfs_qoff_logitem_t *qoffstart; 85 xfs_qoff_logitem_t *qoffstart;
206 int nculprits; 86 int nculprits;
207 87
208 if (!force && !capable(CAP_SYS_ADMIN))
209 return XFS_ERROR(EPERM);
210 /* 88 /*
211 * No file system can have quotas enabled on disk but not in core. 89 * No file system can have quotas enabled on disk but not in core.
212 * Note that quota utilities (like quotaoff) _expect_ 90 * Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
375 return (error); 253 return (error);
376} 254}
377 255
378STATIC int 256int
379xfs_qm_scall_trunc_qfiles( 257xfs_qm_scall_trunc_qfiles(
380 xfs_mount_t *mp, 258 xfs_mount_t *mp,
381 uint flags) 259 uint flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
383 int error = 0, error2 = 0; 261 int error = 0, error2 = 0;
384 xfs_inode_t *qip; 262 xfs_inode_t *qip;
385 263
386 if (!capable(CAP_SYS_ADMIN))
387 return XFS_ERROR(EPERM);
388 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 264 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
389 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 265 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
390 return XFS_ERROR(EINVAL); 266 return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
416 * effect immediately. 292 * effect immediately.
417 * (Switching on quota accounting must be done at mount time.) 293 * (Switching on quota accounting must be done at mount time.)
418 */ 294 */
419STATIC int 295int
420xfs_qm_scall_quotaon( 296xfs_qm_scall_quotaon(
421 xfs_mount_t *mp, 297 xfs_mount_t *mp,
422 uint flags) 298 uint flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
426 uint accflags; 302 uint accflags;
427 __int64_t sbflags; 303 __int64_t sbflags;
428 304
429 if (!capable(CAP_SYS_ADMIN))
430 return XFS_ERROR(EPERM);
431
432 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 305 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
433 /* 306 /*
434 * Switching on quota accounting must be done at mount time. 307 * Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
517/* 390/*
518 * Return quota status information, such as uquota-off, enforcements, etc. 391 * Return quota status information, such as uquota-off, enforcements, etc.
519 */ 392 */
520STATIC int 393int
521xfs_qm_scall_getqstat( 394xfs_qm_scall_getqstat(
522 xfs_mount_t *mp, 395 xfs_mount_t *mp,
523 fs_quota_stat_t *out) 396 fs_quota_stat_t *out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
582/* 455/*
583 * Adjust quota limits, and start/stop timers accordingly. 456 * Adjust quota limits, and start/stop timers accordingly.
584 */ 457 */
585STATIC int 458int
586xfs_qm_scall_setqlim( 459xfs_qm_scall_setqlim(
587 xfs_mount_t *mp, 460 xfs_mount_t *mp,
588 xfs_dqid_t id, 461 xfs_dqid_t id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
595 int error; 468 int error;
596 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
597 470
598 if (!capable(CAP_SYS_ADMIN))
599 return XFS_ERROR(EPERM);
600
601 if ((newlim->d_fieldmask & 471 if ((newlim->d_fieldmask &
602 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
603 return (0); 473 return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
742 return error; 612 return error;
743} 613}
744 614
745STATIC int 615int
746xfs_qm_scall_getquota( 616xfs_qm_scall_getquota(
747 xfs_mount_t *mp, 617 xfs_mount_t *mp,
748 xfs_dqid_t id, 618 xfs_dqid_t id,
@@ -935,30 +805,6 @@ xfs_qm_export_dquot(
935} 805}
936 806
937STATIC uint 807STATIC uint
938xfs_qm_import_qtype_flags(
939 uint uflags)
940{
941 uint oflags = 0;
942
943 /*
944 * Can't be more than one, or none.
945 */
946 if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
947 (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
948 ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
949 (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
950 ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
951 (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
952 ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
953 return (0);
954
955 oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
956 oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
957 oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
958 return oflags;
959}
960
961STATIC uint
962xfs_qm_export_qtype_flags( 808xfs_qm_export_qtype_flags(
963 uint flags) 809 uint flags)
964{ 810{
@@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags(
979} 825}
980 826
981STATIC uint 827STATIC uint
982xfs_qm_import_flags(
983 uint uflags)
984{
985 uint flags = 0;
986
987 if (uflags & XFS_QUOTA_UDQ_ACCT)
988 flags |= XFS_UQUOTA_ACCT;
989 if (uflags & XFS_QUOTA_PDQ_ACCT)
990 flags |= XFS_PQUOTA_ACCT;
991 if (uflags & XFS_QUOTA_GDQ_ACCT)
992 flags |= XFS_GQUOTA_ACCT;
993 if (uflags & XFS_QUOTA_UDQ_ENFD)
994 flags |= XFS_UQUOTA_ENFD;
995 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
996 flags |= XFS_OQUOTA_ENFD;
997 return (flags);
998}
999
1000
1001STATIC uint
1002xfs_qm_export_flags( 828xfs_qm_export_flags(
1003 uint flags) 829 uint flags)
1004{ 830{
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
1134xfs_dqhash_t *qmtest_gdqtab; 960xfs_dqhash_t *qmtest_gdqtab;
1135int qmtest_hashmask; 961int qmtest_hashmask;
1136int qmtest_nfails; 962int qmtest_nfails;
1137mutex_t qcheck_lock; 963struct mutex qcheck_lock;
1138 964
1139#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ 965#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
1140 (__psunsigned_t)(id)) & \ 966 (__psunsigned_t)(id)) & \
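Deleting xfs_qm_quotactl trades one command multiplexer for per-operation entry points that the VFS reaches through quotactl_ops, with the scall helpers exported as the new surface. A toy ops-table dispatch showing the resulting shape; the names and signatures are invented:

#include <stdio.h>

/* Per-operation table, as the VFS quotactl_ops now provides. */
struct qops {
	int (*get_xstate)(void);
	int (*set_qlim)(int id);
};

static int toy_get_xstate(void) { return 0; }
static int toy_set_qlim(int id) { (void)id; return 0; }

static const struct qops ops = {
	.get_xstate = toy_get_xstate,
	.set_qlim   = toy_set_qlim,
};

int main(void)
{
	/* Callers invoke the specific op; no Q_X* switch to maintain. */
	printf("%d %d\n", ops.get_xstate(), ops.set_qlim(42));
	return 0;
}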
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index c4fcea600bc2..8286b2842b6b 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -42,34 +42,24 @@
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) 42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43 43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) 44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
46#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) 45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
47#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) 46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
48 47
49#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) 48#define xfs_qm_mplist_lock(mp) \
50#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) 49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
51#ifdef DEBUG 50#define xfs_qm_mplist_nowait(mp) \
52struct xfs_dqhash; 51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
53static inline int XQMISLCKD(struct xfs_dqhash *h) 52#define xfs_qm_mplist_unlock(mp) \
54{ 53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
55 if (mutex_trylock(&h->qh_lock)) { 54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
56 mutex_unlock(&h->qh_lock); 55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
57 return 0; 56
58 } 57#define xfs_qm_freelist_lock(qm) \
59 return 1; 58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
60} 59#define xfs_qm_freelist_lock_nowait(qm) \
61#endif 60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
62 61#define xfs_qm_freelist_unlock(qm) \
63#define XFS_DQ_HASH_LOCK(h) XQMLCK(h) 62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
64#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h)
65#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h)
66
67#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp)))
68#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
69#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
70
71#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
72#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
73 63
74/* 64/*
75 * Hash into a bucket in the dquot hash table, based on <mp, id>. 65 * Hash into a bucket in the dquot hash table, based on <mp, id>.
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 99611381e740..447173bcf96d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
624 xfs_qcnt_t *resbcountp; 624 xfs_qcnt_t *resbcountp;
625 xfs_quotainfo_t *q = mp->m_quotainfo; 625 xfs_quotainfo_t *q = mp->m_quotainfo;
626 626
627 if (! (flags & XFS_QMOPT_DQLOCK)) { 627
628 xfs_dqlock(dqp); 628 xfs_dqlock(dqp);
629 } 629
630 ASSERT(XFS_DQ_IS_LOCKED(dqp));
631 if (flags & XFS_TRANS_DQ_RES_BLKS) { 630 if (flags & XFS_TRANS_DQ_RES_BLKS) {
632 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); 631 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
633 if (!hardlimit) 632 if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
740 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 739 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
741 740
742error_return: 741error_return:
743 if (! (flags & XFS_QMOPT_DQLOCK)) { 742 xfs_dqunlock(dqp);
744 xfs_dqunlock(dqp); 743 return error;
745 }
746 return (error);
747} 744}
748 745
749 746
@@ -753,8 +750,7 @@ error_return:
753 * grp/prj quotas is important, because this follows a both-or-nothing 750 * grp/prj quotas is important, because this follows a both-or-nothing
754 * approach. 751 * approach.
755 * 752 *
756 * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. 753 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
757 * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
758 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. 754 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
759 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks 755 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
760 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks 756 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index ae5482965424..3f3610a7ee05 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -24,6 +24,7 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h" 25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 26#include "xfs_mount.h"
27#include "xfs_error.h"
27 28
28static char message[1024]; /* keep it off the stack */ 29static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock); 30static DEFINE_SPINLOCK(xfs_err_lock);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 5830c040ea7e..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,10 +17,6 @@
17 */ 17 */
18#include <xfs.h> 18#include <xfs.h>
19 19
20static DEFINE_MUTEX(uuid_monitor);
21static int uuid_table_size;
22static uuid_t *uuid_table;
23
24/* IRIX interpretation of an uuid_t */ 20/* IRIX interpretation of an uuid_t */
25typedef struct { 21typedef struct {
26 __be32 uu_timelow; 22 __be32 uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
46 fsid[1] = be32_to_cpu(uup->uu_timelow); 42 fsid[1] = be32_to_cpu(uup->uu_timelow);
47} 43}
48 44
49void
50uuid_create_nil(uuid_t *uuid)
51{
52 memset(uuid, 0, sizeof(*uuid));
53}
54
55int 45int
56uuid_is_nil(uuid_t *uuid) 46uuid_is_nil(uuid_t *uuid)
57{ 47{
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
71{ 61{
72 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; 62 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
73} 63}
74
75/*
76 * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
77 * 64-bit words. NOTE: This function can not be changed EVER. Although
78 * brain-dead, some applications depend on this 64-bit value remaining
79 * persistent. Specifically, DMI vendors store the value as a persistent
80 * filehandle.
81 */
82__uint64_t
83uuid_hash64(uuid_t *uuid)
84{
85 __uint64_t *sp = (__uint64_t *)uuid;
86
87 return sp[0] + sp[1];
88}
89
90int
91uuid_table_insert(uuid_t *uuid)
92{
93 int i, hole;
94
95 mutex_lock(&uuid_monitor);
96 for (i = 0, hole = -1; i < uuid_table_size; i++) {
97 if (uuid_is_nil(&uuid_table[i])) {
98 hole = i;
99 continue;
100 }
101 if (uuid_equal(uuid, &uuid_table[i])) {
102 mutex_unlock(&uuid_monitor);
103 return 0;
104 }
105 }
106 if (hole < 0) {
107 uuid_table = kmem_realloc(uuid_table,
108 (uuid_table_size + 1) * sizeof(*uuid_table),
109 uuid_table_size * sizeof(*uuid_table),
110 KM_SLEEP);
111 hole = uuid_table_size++;
112 }
113 uuid_table[hole] = *uuid;
114 mutex_unlock(&uuid_monitor);
115 return 1;
116}
117
118void
119uuid_table_remove(uuid_t *uuid)
120{
121 int i;
122
123 mutex_lock(&uuid_monitor);
124 for (i = 0; i < uuid_table_size; i++) {
125 if (uuid_is_nil(&uuid_table[i]))
126 continue;
127 if (!uuid_equal(uuid, &uuid_table[i]))
128 continue;
129 uuid_create_nil(&uuid_table[i]);
130 break;
131 }
132 ASSERT(i < uuid_table_size);
133 mutex_unlock(&uuid_monitor);
134}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index cff5b607d445..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,12 +22,8 @@ typedef struct {
22 unsigned char __u_bits[16]; 22 unsigned char __u_bits[16];
23} uuid_t; 23} uuid_t;
24 24
25extern void uuid_create_nil(uuid_t *uuid);
26extern int uuid_is_nil(uuid_t *uuid); 25extern int uuid_is_nil(uuid_t *uuid);
27extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
28extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
29extern __uint64_t uuid_hash64(uuid_t *uuid);
30extern int uuid_table_insert(uuid_t *uuid);
31extern void uuid_table_remove(uuid_t *uuid);
32 28
33#endif /* __XFS_SUPPORT_UUID_H__ */ 29#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 143d63ecb20a..c8641f713caa 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,8 +223,8 @@ typedef struct xfs_perag
223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) 223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
224#define XFS_MIN_FREELIST_PAG(pag,mp) \ 224#define XFS_MIN_FREELIST_PAG(pag,mp) \
225 (XFS_MIN_FREELIST_RAW( \ 225 (XFS_MIN_FREELIST_RAW( \
226 (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ 226 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
227 (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) 227 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
228 228
229#define XFS_AGB_TO_FSB(mp,agno,agbno) \ 229#define XFS_AGB_TO_FSB(mp,agno,agbno) \
230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) 230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 028e44e58ea9..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels(
1872} 1872}
1873 1873
1874/* 1874/*
1875 * Find the length of the longest extent in an AG.
1876 */
1877xfs_extlen_t
1878xfs_alloc_longest_free_extent(
1879 struct xfs_mount *mp,
1880 struct xfs_perag *pag)
1881{
1882 xfs_extlen_t need, delta = 0;
1883
1884 need = XFS_MIN_FREELIST_PAG(pag, mp);
1885 if (need > pag->pagf_flcount)
1886 delta = need - pag->pagf_flcount;
1887
1888 if (pag->pagf_longest > delta)
1889 return pag->pagf_longest - delta;
1890 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1891}
1892
1893/*
1875 * Decide whether to use this allocation group for this allocation. 1894 * Decide whether to use this allocation group for this allocation.
1876 * If so, fix up the btree freelist's size. 1895 * If so, fix up the btree freelist's size.
1877 */ 1896 */
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
1923 } 1942 }
1924 1943
1925 if (!(flags & XFS_ALLOC_FLAG_FREEING)) { 1944 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1926 need = XFS_MIN_FREELIST_PAG(pag, mp);
1927 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1928 /* 1945 /*
1929 * If it looks like there isn't a long enough extent, or enough 1946 * If it looks like there isn't a long enough extent, or enough
1930 * total blocks, reject it. 1947 * total blocks, reject it.
1931 */ 1948 */
1932 longest = (pag->pagf_longest > delta) ? 1949 need = XFS_MIN_FREELIST_PAG(pag, mp);
1933 (pag->pagf_longest - delta) : 1950 longest = xfs_alloc_longest_free_extent(mp, pag);
1934 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
1935 if ((args->minlen + args->alignment + args->minalignslop - 1) > 1951 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1936 longest || 1952 longest ||
1937 ((int)(pag->pagf_freeblks + pag->pagf_flcount - 1953 ((int)(pag->pagf_freeblks + pag->pagf_flcount -
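
The helper added above replaces the same open-coded computation in three places: xfs_alloc_fix_freelist() here, plus xfs_bmap_btalloc() and _xfs_filestream_pick_ag() in later hunks. The longest free extent an AG can actually offer is pagf_longest discounted by however many blocks the freelist still needs to reach its minimum size. A compilable stand-alone model of the logic; the struct below is a stand-in for the fields of xfs_perag, not the kernel type:

typedef unsigned int xfs_extlen_t;

struct perag_model {
	xfs_extlen_t pagf_flcount;	/* blocks on the AG freelist */
	xfs_extlen_t pagf_longest;	/* longest free extent in the AG */
	xfs_extlen_t min_freelist;	/* XFS_MIN_FREELIST_PAG() result */
};

static xfs_extlen_t longest_free_extent(const struct perag_model *pag)
{
	xfs_extlen_t delta = 0;

	if (pag->min_freelist > pag->pagf_flcount)
		delta = pag->min_freelist - pag->pagf_flcount;

	if (pag->pagf_longest > delta)
		return pag->pagf_longest - delta;

	/*
	 * Nothing usable beyond the freelist top-up: report 1 if any
	 * free space exists at all, else 0; the same boolean trick as
	 * the kernel version.
	 */
	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}

The boolean fallback matters: returning 0 only when the AG is truly empty lets callers distinguish "nothing at all" from "a single block might still be squeezed out".
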
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 588172796f7b..e704caee10df 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ 100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ 101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
102 102
103/*
104 * Find the length of the longest extent in an AG.
105 */
106xfs_extlen_t
107xfs_alloc_longest_free_extent(struct xfs_mount *mp,
108 struct xfs_perag *pag);
103 109
 104#ifdef __KERNEL__ 110#ifdef __KERNEL__
105 111
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 6c323f8a4cd1..afdc8911637d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
155 * minimum offset only needs to be the space required for 155 * minimum offset only needs to be the space required for
156 * the btree root. 156 * the btree root.
157 */ 157 */
158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) 158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
159 xfs_default_attroffset(dp))
159 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 160 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
160 break; 161 break;
161 162
@@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
298} 299}
299 300
300/* 301/*
302 * After the last attribute is removed revert to original inode format,
303 * making all literal area available to the data fork once more.
304 */
305STATIC void
306xfs_attr_fork_reset(
307 struct xfs_inode *ip,
308 struct xfs_trans *tp)
309{
310 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
311 ip->i_d.di_forkoff = 0;
312 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
313
314 ASSERT(ip->i_d.di_anextents == 0);
315 ASSERT(ip->i_afp == NULL);
316
317 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
318 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
319}
320
321/*
301 * Remove an attribute from the shortform attribute list structure. 322 * Remove an attribute from the shortform attribute list structure.
302 */ 323 */
303int 324int
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
344 */ 365 */
345 totsize -= size; 366 totsize -= size;
346 if (totsize == sizeof(xfs_attr_sf_hdr_t) && 367 if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
347 !(args->op_flags & XFS_DA_OP_ADDNAME) && 368 (mp->m_flags & XFS_MOUNT_ATTR2) &&
348 (mp->m_flags & XFS_MOUNT_ATTR2) && 369 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
349 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { 370 !(args->op_flags & XFS_DA_OP_ADDNAME)) {
350 /* 371 xfs_attr_fork_reset(dp, args->trans);
351 * Last attribute now removed, revert to original
352 * inode format making all literal area available
353 * to the data fork once more.
354 */
355 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
356 dp->i_d.di_forkoff = 0;
357 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
358 ASSERT(dp->i_d.di_anextents == 0);
359 ASSERT(dp->i_afp == NULL);
360 dp->i_df.if_ext_max =
361 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
362 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
363 } else { 372 } else {
364 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 373 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
365 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); 374 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
786 if (forkoff == -1) { 795 if (forkoff == -1) {
787 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); 796 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
788 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); 797 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
789 798 xfs_attr_fork_reset(dp, args->trans);
790 /*
791 * Last attribute was removed, revert to original
792 * inode format making all literal area available
793 * to the data fork once more.
794 */
795 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
796 dp->i_d.di_forkoff = 0;
797 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
798 ASSERT(dp->i_d.di_anextents == 0);
799 ASSERT(dp->i_afp == NULL);
800 dp->i_df.if_ext_max =
801 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
802 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
803 goto out; 799 goto out;
804 } 800 }
805 801
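
Both xfs_attr_shortform_remove() and xfs_attr_leaf_to_shortform() carried the identical revert-to-no-attr-fork sequence; the new xfs_attr_fork_reset() is a pure deduplication. In outline, the helper's job looks like this sketch, with a stand-in struct and assumed constants rather than the kernel types:

struct inode_model {
	int forkoff;		/* attr fork offset; 0 means no attr fork */
	int aformat;		/* attr fork format */
	int data_ext_max;	/* extents that fit in the literal area */
};

enum { FMT_EXTENTS = 2, LITERAL_AREA = 156, REC_SIZE = 16 };	/* assumed */

static void attr_fork_reset(struct inode_model *ip)
{
	ip->forkoff = 0;	/* data fork reclaims the whole literal area */
	ip->aformat = FMT_EXTENTS;
	ip->data_ext_max = LITERAL_AREA / REC_SIZE;
	/* the kernel helper then logs the inode core */
}
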
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c852cd65aaea..3a6ed426327a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2480 /* 2480 /*
2481 * If allocating at eof, and there's a previous real block, 2481 * If allocating at eof, and there's a previous real block,
2482 * try to use it's last block as our starting point. 2482 * try to use its last block as our starting point.
2483 */ 2483 */
2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2485 !isnullstartblock(ap->prevp->br_startblock) && 2485 !isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
2712 xfs_agnumber_t startag; 2712 xfs_agnumber_t startag;
2713 xfs_alloc_arg_t args; 2713 xfs_alloc_arg_t args;
2714 xfs_extlen_t blen; 2714 xfs_extlen_t blen;
2715 xfs_extlen_t delta;
2716 xfs_extlen_t longest;
2717 xfs_extlen_t need;
2718 xfs_extlen_t nextminlen = 0; 2715 xfs_extlen_t nextminlen = 0;
2719 xfs_perag_t *pag; 2716 xfs_perag_t *pag;
2720 int nullfb; /* true if ap->firstblock isn't set */ 2717 int nullfb; /* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
2796 * See xfs_alloc_fix_freelist... 2793 * See xfs_alloc_fix_freelist...
2797 */ 2794 */
2798 if (pag->pagf_init) { 2795 if (pag->pagf_init) {
2799 need = XFS_MIN_FREELIST_PAG(pag, mp); 2796 xfs_extlen_t longest;
2800 delta = need > pag->pagf_flcount ? 2797 longest = xfs_alloc_longest_free_extent(mp, pag);
2801 need - pag->pagf_flcount : 0;
2802 longest = (pag->pagf_longest > delta) ?
2803 (pag->pagf_longest - delta) :
2804 (pag->pagf_flcount > 0 ||
2805 pag->pagf_longest > 0);
2806 if (blen < longest) 2798 if (blen < longest)
2807 blen = longest; 2799 blen = longest;
2808 } else 2800 } else
@@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree(
3577} 3569}
3578 3570
3579/* 3571/*
3572 * Calculate the default attribute fork offset for newly created inodes.
3573 */
3574uint
3575xfs_default_attroffset(
3576 struct xfs_inode *ip)
3577{
3578 struct xfs_mount *mp = ip->i_mount;
3579 uint offset;
3580
3581 if (mp->m_sb.sb_inodesize == 256) {
3582 offset = XFS_LITINO(mp) -
3583 XFS_BMDR_SPACE_CALC(MINABTPTRS);
3584 } else {
3585 offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
3586 }
3587
3588 ASSERT(offset < XFS_LITINO(mp));
3589 return offset;
3590}
3591
3592/*
3580 * Helper routine to reset inode di_forkoff field when switching 3593 * Helper routine to reset inode di_forkoff field when switching
3581 * attribute fork from local to extent format - we reset it where 3594 * attribute fork from local to extent format - we reset it where
3582 * possible to make space available for inline data fork extents. 3595 * possible to make space available for inline data fork extents.
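
xfs_default_attroffset() replaces the cached mp->m_attroffset: 256-byte inodes place the attr fork so the data fork keeps room for a minimal bmap btree root, while larger inodes reserve a fixed 6 * MINABTPTRS worth of space. A userspace sketch; bmdr_space_calc() and the constants below are illustrative stand-ins for XFS_BMDR_SPACE_CALC() and MINABTPTRS, not the real on-disk arithmetic:

#include <assert.h>

enum { MINABTPTRS = 2 };		/* assumed value */

/* stand-in for XFS_BMDR_SPACE_CALC(): root header plus key/ptr pairs */
static unsigned int bmdr_space_calc(unsigned int nptrs)
{
	return 8 + nptrs * (8 + 16);	/* illustrative sizes */
}

/* litino = bytes left in the inode after the fixed dinode header */
static unsigned int default_attroffset(unsigned int inodesize,
				       unsigned int litino)
{
	unsigned int offset;

	if (inodesize == 256)
		offset = litino - bmdr_space_calc(MINABTPTRS);
	else
		offset = bmdr_space_calc(6 * MINABTPTRS);

	assert(offset < litino);
	return offset;
}
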
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
3588 int whichfork) 3601 int whichfork)
3589{ 3602{
3590 if (whichfork == XFS_ATTR_FORK && 3603 if (whichfork == XFS_ATTR_FORK &&
3591 (ip->i_d.di_format != XFS_DINODE_FMT_DEV) && 3604 ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
3592 (ip->i_d.di_format != XFS_DINODE_FMT_UUID) && 3605 ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
3593 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 3606 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3594 ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) { 3607 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3595 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 3608
3596 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / 3609 if (dfl_forkoff > ip->i_d.di_forkoff) {
3597 (uint)sizeof(xfs_bmbt_rec_t); 3610 ip->i_d.di_forkoff = dfl_forkoff;
3598 ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / 3611 ip->i_df.if_ext_max =
3599 (uint)sizeof(xfs_bmbt_rec_t); 3612 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3613 ip->i_afp->if_ext_max =
3614 XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3615 }
3600 } 3616 }
3601} 3617}
3602 3618
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
4065 case XFS_DINODE_FMT_BTREE: 4081 case XFS_DINODE_FMT_BTREE:
4066 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); 4082 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
4067 if (!ip->i_d.di_forkoff) 4083 if (!ip->i_d.di_forkoff)
4068 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 4084 ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
4069 else if (mp->m_flags & XFS_MOUNT_ATTR2) 4085 else if (mp->m_flags & XFS_MOUNT_ATTR2)
4070 version = 2; 4086 version = 2;
4071 break; 4087 break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
4212 * (a signed 16-bit number, xfs_aextnum_t). 4228 * (a signed 16-bit number, xfs_aextnum_t).
4213 * 4229 *
4214 * Note that we can no longer assume that if we are in ATTR1 that 4230 * Note that we can no longer assume that if we are in ATTR1 that
4215 * the fork offset of all the inodes will be (m_attroffset >> 3) 4231 * the fork offset of all the inodes will be
4216 * because we could have mounted with ATTR2 and then mounted back 4232 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
4217 * with ATTR1, keeping the di_forkoff's fixed but probably at 4233 * with ATTR2 and then mounted back with ATTR1, keeping the
4218 * various positions. Therefore, for both ATTR1 and ATTR2 4234 * di_forkoff's fixed but probably at various positions. Therefore,
4219 * we have to assume the worst case scenario of a minimum size 4235 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
4220 * available. 4236 * of a minimum size available.
4221 */ 4237 */
4222 if (whichfork == XFS_DATA_FORK) { 4238 if (whichfork == XFS_DATA_FORK) {
4223 maxleafents = MAXEXTNUM; 4239 maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
4804 xfs_extlen_t minlen; /* min allocation size */ 4820 xfs_extlen_t minlen; /* min allocation size */
4805 xfs_mount_t *mp; /* xfs mount structure */ 4821 xfs_mount_t *mp; /* xfs mount structure */
4806 int n; /* current extent index */ 4822 int n; /* current extent index */
4807 int nallocs; /* number of extents alloc\'d */ 4823 int nallocs; /* number of extents alloc'd */
4808 xfs_extnum_t nextents; /* number of extents in file */ 4824 xfs_extnum_t nextents; /* number of extents in file */
4809 xfs_fileoff_t obno; /* old block number (offset) */ 4825 xfs_fileoff_t obno; /* old block number (offset) */
4810 xfs_bmbt_irec_t prev; /* previous file extent record */ 4826 xfs_bmbt_irec_t prev; /* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
6204 return(bp); 6220 return(bp);
6205} 6221}
6206 6222
6207void 6223STATIC void
6208xfs_check_block( 6224xfs_check_block(
6209 struct xfs_btree_block *block, 6225 struct xfs_btree_block *block,
6210 xfs_mount_t *mp, 6226 xfs_mount_t *mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
6494 block = XFS_BUF_TO_BLOCK(bp); 6510 block = XFS_BUF_TO_BLOCK(bp);
6495 6511
6496 if (--level) { 6512 if (--level) {
6497 /* Not at node above leafs, count this level of nodes */ 6513 /* Not at node above leaves, count this level of nodes */
6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6514 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6499 while (nextbno != NULLFSBLOCK) { 6515 while (nextbno != NULLFSBLOCK) {
6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6516 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index be2979d88d32..1b8ff9256bd0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ 125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
126 xfs_extlen_t alen; /* i/o length asked/allocated */ 126 xfs_extlen_t alen; /* i/o length asked/allocated */
127 xfs_extlen_t total; /* total blocks needed for xaction */ 127 xfs_extlen_t total; /* total blocks needed for xaction */
128 xfs_extlen_t minlen; /* mininum allocation size (blocks) */ 128 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
129 xfs_extlen_t minleft; /* amount must be left after alloc */ 129 xfs_extlen_t minleft; /* amount must be left after alloc */
130 char eof; /* set if allocating past last extent */ 130 char eof; /* set if allocating past last extent */
131 char wasdel; /* replacing a delayed allocation */ 131 char wasdel; /* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
338 xfs_extnum_t idx, 338 xfs_extnum_t idx,
339 xfs_extnum_t num); 339 xfs_extnum_t num);
340 340
341uint
342xfs_default_attroffset(
343 struct xfs_inode *ip);
344
 341#ifdef __KERNEL__ 345#ifdef __KERNEL__
342 346
343/* 347/*
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e73c332eb23f..e9df99574829 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
1883 1883
1884 /* 1884 /*
1885 * We add one entry to the left side and remove one for the right side. 1885 * We add one entry to the left side and remove one for the right side.
1886 * Accout for it here, the changes will be updated on disk and logged 1886 * Account for it here, the changes will be updated on disk and logged
1887 * later. 1887 * later.
1888 */ 1888 */
1889 lrecs++; 1889 lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
3535 XFS_BTREE_STATS_INC(cur, join); 3535 XFS_BTREE_STATS_INC(cur, join);
3536 3536
3537 /* 3537 /*
3538 * Fix up the the number of records and right block pointer in the 3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it. 3539 * surviving block, and log it.
3540 */ 3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs); 3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 789fffdf8b2f..4f852b735b96 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -41,7 +41,7 @@ extern kmem_zone_t *xfs_btree_cur_zone;
41/* 41/*
42 * Generic btree header. 42 * Generic btree header.
43 * 43 *
44 * This is a comination of the actual format used on disk for short and long 44 * This is a combination of the actual format used on disk for short and long
45 * format btrees. The first three fields are shared by both format, but 45 * format btrees. The first three fields are shared by both format, but
46 * the pointers are different and should be used with care. 46 * the pointers are different and should be used with care.
47 * 47 *
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c45f74ff1a5b..9ff6e57a5075 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1503 * This is implemented with some source-level loop unrolling. 1503 * This is implemented with some source-level loop unrolling.
1504 */ 1504 */
1505xfs_dahash_t 1505xfs_dahash_t
1506xfs_da_hashname(const uchar_t *name, int namelen) 1506xfs_da_hashname(const __uint8_t *name, int namelen)
1507{ 1507{
1508 xfs_dahash_t hash; 1508 xfs_dahash_t hash;
1509 1509
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 70b710c1792d..8c536167bf75 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -91,9 +91,9 @@ enum xfs_dacmp {
91 * Structure to ease passing around component names. 91 * Structure to ease passing around component names.
92 */ 92 */
93typedef struct xfs_da_args { 93typedef struct xfs_da_args {
94 const uchar_t *name; /* string (maybe not NULL terminated) */ 94 const __uint8_t *name; /* string (maybe not NULL terminated) */
95 int namelen; /* length of string (maybe no NULL) */ 95 int namelen; /* length of string (maybe no NULL) */
96 uchar_t *value; /* set of bytes (maybe contain NULLs) */ 96 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
97 int valuelen; /* length of value */ 97 int valuelen; /* length of value */
98 int flags; /* argument flags (eg: ATTR_NOCREATE) */ 98 int flags; /* argument flags (eg: ATTR_NOCREATE) */
99 xfs_dahash_t hashval; /* hash value of name */ 99 xfs_dahash_t hashval; /* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
185 unsigned char inleaf; /* insert into 1->lf, 0->splf */ 185 unsigned char inleaf; /* insert into 1->lf, 0->splf */
186 unsigned char extravalid; /* T/F: extrablk is in use */ 186 unsigned char extravalid; /* T/F: extrablk is in use */
187 unsigned char extraafter; /* T/F: extrablk is after new */ 187 unsigned char extraafter; /* T/F: extrablk is after new */
188 xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ 188 xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
189 /* for dirv2 extrablk is data */ 189 /* for dirv2 extrablk is data */
190} xfs_da_state_t; 190} xfs_da_state_t;
191 191
@@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
252 xfs_dabuf_t *dead_buf); 252 xfs_dabuf_t *dead_buf);
253 253
254uint xfs_da_hashname(const uchar_t *name_string, int name_length); 254uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
256 const char *name, int len); 256 const char *name, int len);
257 257
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
268 268
269extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
270extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
271extern const struct xfs_nameops xfs_default_nameops;
271 272
272#endif /* __XFS_DA_BTREE_H__ */ 273#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f8278cfcc1d3..e6d839bddbf0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -79,6 +79,12 @@ xfs_swapext(
79 goto out_put_target_file; 79 goto out_put_target_file;
80 } 80 }
81 81
82 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
83 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
84 error = XFS_ERROR(EINVAL);
85 goto out_put_target_file;
86 }
87
82 ip = XFS_I(file->f_path.dentry->d_inode); 88 ip = XFS_I(file->f_path.dentry->d_inode);
83 tip = XFS_I(target_file->f_path.dentry->d_inode); 89 tip = XFS_I(target_file->f_path.dentry->d_inode);
84 90
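
The added check refuses to swap extents when either inode backs an active swap file, since swapon() pins the file's block mapping and relocating its extents behind the VM's back would corrupt swap I/O. The guard sits before any locking or transaction setup. A minimal model; is_swapfile() and the flags field are stand-ins for the kernel's IS_SWAPFILE() flag test:

#include <errno.h>

#define MODEL_S_SWAPFILE 0x1		/* stand-in for S_SWAPFILE */

struct inode_flags_model {
	unsigned int flags;
};

static int is_swapfile(const struct inode_flags_model *ip)
{
	return ip->flags & MODEL_S_SWAPFILE;
}

/* reject early, before locks or transactions, as the hunk above does */
static int swapext_precheck(const struct inode_flags_model *a,
			    const struct inode_flags_model *b)
{
	if (is_swapfile(a) || is_swapfile(b))
		return EINVAL;
	return 0;
}
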
@@ -118,19 +124,17 @@ xfs_swap_extents(
118 xfs_bstat_t *sbp = &sxp->sx_stat; 124 xfs_bstat_t *sbp = &sxp->sx_stat;
119 xfs_ifork_t *tempifp, *ifp, *tifp; 125 xfs_ifork_t *tempifp, *ifp, *tifp;
120 int ilf_fields, tilf_fields; 126 int ilf_fields, tilf_fields;
121 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
122 int error = 0; 127 int error = 0;
123 int aforkblks = 0; 128 int aforkblks = 0;
124 int taforkblks = 0; 129 int taforkblks = 0;
125 __uint64_t tmp; 130 __uint64_t tmp;
126 char locked = 0;
127 131
128 mp = ip->i_mount; 132 mp = ip->i_mount;
129 133
130 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 134 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
131 if (!tempifp) { 135 if (!tempifp) {
132 error = XFS_ERROR(ENOMEM); 136 error = XFS_ERROR(ENOMEM);
133 goto error0; 137 goto out;
134 } 138 }
135 139
136 sbp = &sxp->sx_stat; 140 sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
143 */ 147 */
144 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 148 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
145 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 149 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
146 locked = 1;
147 150
148 /* Verify that both files have the same format */ 151 /* Verify that both files have the same format */
149 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 152 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
150 error = XFS_ERROR(EINVAL); 153 error = XFS_ERROR(EINVAL);
151 goto error0; 154 goto out_unlock;
152 } 155 }
153 156
154 /* Verify both files are either real-time or non-realtime */ 157 /* Verify both files are either real-time or non-realtime */
155 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { 158 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
156 error = XFS_ERROR(EINVAL); 159 error = XFS_ERROR(EINVAL);
157 goto error0; 160 goto out_unlock;
158 } 161 }
159 162
160 /* Should never get a local format */ 163 /* Should never get a local format */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || 164 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
162 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { 165 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
163 error = XFS_ERROR(EINVAL); 166 error = XFS_ERROR(EINVAL);
164 goto error0; 167 goto out_unlock;
165 } 168 }
166 169
167 if (VN_CACHED(VFS_I(tip)) != 0) { 170 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
169 error = xfs_flushinval_pages(tip, 0, -1, 172 error = xfs_flushinval_pages(tip, 0, -1,
170 FI_REMAPF_LOCKED); 173 FI_REMAPF_LOCKED);
171 if (error) 174 if (error)
172 goto error0; 175 goto out_unlock;
173 } 176 }
174 177
175 /* Verify O_DIRECT for ftmp */ 178 /* Verify O_DIRECT for ftmp */
176 if (VN_CACHED(VFS_I(tip)) != 0) { 179 if (VN_CACHED(VFS_I(tip)) != 0) {
177 error = XFS_ERROR(EINVAL); 180 error = XFS_ERROR(EINVAL);
178 goto error0; 181 goto out_unlock;
179 } 182 }
180 183
181 /* Verify all data are being swapped */ 184 /* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
183 sxp->sx_length != ip->i_d.di_size || 186 sxp->sx_length != ip->i_d.di_size ||
184 sxp->sx_length != tip->i_d.di_size) { 187 sxp->sx_length != tip->i_d.di_size) {
185 error = XFS_ERROR(EFAULT); 188 error = XFS_ERROR(EFAULT);
186 goto error0; 189 goto out_unlock;
187 } 190 }
188 191
189 /* 192 /*
@@ -193,7 +196,7 @@ xfs_swap_extents(
193 */ 196 */
194 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
195 error = XFS_ERROR(EINVAL); 198 error = XFS_ERROR(EINVAL);
196 goto error0; 199 goto out_unlock;
197 } 200 }
198 201
199 /* 202 /*
@@ -208,7 +211,7 @@ xfs_swap_extents(
208 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || 211 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
209 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { 212 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
210 error = XFS_ERROR(EBUSY); 213 error = XFS_ERROR(EBUSY);
211 goto error0; 214 goto out_unlock;
212 } 215 }
213 216
214 /* We need to fail if the file is memory mapped. Once we have tossed 217 /* We need to fail if the file is memory mapped. Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
219 */ 222 */
220 if (VN_MAPPED(VFS_I(ip))) { 223 if (VN_MAPPED(VFS_I(ip))) {
221 error = XFS_ERROR(EBUSY); 224 error = XFS_ERROR(EBUSY);
222 goto error0; 225 goto out_unlock;
223 } 226 }
224 227
225 xfs_iunlock(ip, XFS_ILOCK_EXCL); 228 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
242 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 245 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
243 xfs_iunlock(tip, XFS_IOLOCK_EXCL); 246 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
244 xfs_trans_cancel(tp, 0); 247 xfs_trans_cancel(tp, 0);
245 locked = 0; 248 goto out;
246 goto error0;
247 } 249 }
248 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 250 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
249 251
@@ -253,19 +255,15 @@ xfs_swap_extents(
253 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && 255 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
254 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 256 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
255 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); 257 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
256 if (error) { 258 if (error)
257 xfs_trans_cancel(tp, 0); 259 goto out_trans_cancel;
258 goto error0;
259 }
260 } 260 }
261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && 261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, 263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
264 &taforkblks); 264 &taforkblks);
265 if (error) { 265 if (error)
266 xfs_trans_cancel(tp, 0); 266 goto out_trans_cancel;
267 goto error0;
268 }
269 } 267 }
270 268
271 /* 269 /*
@@ -332,10 +330,10 @@ xfs_swap_extents(
332 330
333 331
334 IHOLD(ip); 332 IHOLD(ip);
335 xfs_trans_ijoin(tp, ip, lock_flags); 333 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
336 334
337 IHOLD(tip); 335 IHOLD(tip);
338 xfs_trans_ijoin(tp, tip, lock_flags); 336 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
339 337
340 xfs_trans_log_inode(tp, ip, ilf_fields); 338 xfs_trans_log_inode(tp, ip, ilf_fields);
341 xfs_trans_log_inode(tp, tip, tilf_fields); 339 xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
344 * If this is a synchronous mount, make sure that the 342 * If this is a synchronous mount, make sure that the
345 * transaction goes to disk before returning to the user. 343 * transaction goes to disk before returning to the user.
346 */ 344 */
347 if (mp->m_flags & XFS_MOUNT_WSYNC) { 345 if (mp->m_flags & XFS_MOUNT_WSYNC)
348 xfs_trans_set_sync(tp); 346 xfs_trans_set_sync(tp);
349 }
350 347
351 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
352 locked = 0;
353 349
354 error0: 350out_unlock:
355 if (locked) { 351 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
356 xfs_iunlock(ip, lock_flags); 352 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
357 xfs_iunlock(tip, lock_flags); 353out:
358 } 354 kmem_free(tempifp);
359 if (tempifp != NULL)
360 kmem_free(tempifp);
361 return error; 355 return error;
356
357out_trans_cancel:
358 xfs_trans_cancel(tp, 0);
359 goto out_unlock;
362} 360}
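
The rewrite of xfs_swap_extents() drops the locked flag and the static lock_flags variable in favour of the usual kernel unwind idiom: one label per acquired resource, jumped to in reverse acquisition order, with the failure-after-reservation path getting its own out_trans_cancel label. The shape of the idiom, reduced to a self-contained sketch in which acquire_locks(), release_locks() and do_work() are hypothetical stand-ins:

#include <errno.h>
#include <stdlib.h>

struct ctx_model { int locked; };

static int acquire_locks(struct ctx_model *c)  { c->locked = 1; return 0; }
static void release_locks(struct ctx_model *c) { c->locked = 0; }
static int do_work(struct ctx_model *c)        { (void)c; return -EINVAL; }

static int swap_like_operation(struct ctx_model *c)
{
	char *tempifp;
	int error;

	tempifp = malloc(64);
	if (!tempifp)
		return -ENOMEM;		/* nothing held yet */

	error = acquire_locks(c);
	if (error)
		goto out_free;

	error = do_work(c);
	if (error)
		goto out_unlock;

	/* commit path would go here and fall through the unwind */

out_unlock:
	release_locks(c);
out_free:
	free(tempifp);
	return error;
}

Because every exit runs the unwind from the point matching what is actually held, no "are we locked?" state variable has to be threaded through the function.
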
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 162e8726df5e..e5b153b2e6a3 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
103/* 103/*
104 * Inode size for given fs. 104 * Inode size for given fs.
105 */ 105 */
106#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) \
107 ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
108
107#define XFS_BROOT_SIZE_ADJ \ 109#define XFS_BROOT_SIZE_ADJ \
108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) 110 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
109 111
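
XFS_LITINO() now derives the inode literal area directly from the superblock inode size instead of a value cached in the mount structure, which keeps it correct for any inode size without extra state. Numerically it behaves like the sketch below; the dinode header size is an assumption for illustration, not the real sizeof(struct xfs_dinode):

#include <stdio.h>

enum { DINODE_HEADER = 100 };	/* assumed, not the real header size */

static int litino(int sb_inodesize)
{
	return sb_inodesize - DINODE_HEADER;
}

int main(void)
{
	printf("256-byte inodes: %d literal bytes\n", litino(256));
	printf("512-byte inodes: %d literal bytes\n", litino(512));
	return 0;
}
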
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1afb12278b8d..c657bec6d951 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,8 +46,6 @@
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = {"..", 2};
48 48
49extern const struct xfs_nameops xfs_default_nameops;
50
51/* 49/*
52 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
53 * used in IRIX. 51 * used in IRIX.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e1f0a06aaf04..ab52e9e1c1ee 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
448 xfs_mount_t *mp; /* filesystem mount point */ 448 xfs_mount_t *mp; /* filesystem mount point */
449 char *ptr; /* current data entry */ 449 char *ptr; /* current data entry */
450 int wantoff; /* starting block offset */ 450 int wantoff; /* starting block offset */
451 xfs_ino_t ino;
452 xfs_off_t cook; 451 xfs_off_t cook;
453 452
454 mp = dp->i_mount; 453 mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
509 508
510 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 509 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
511 (char *)dep - (char *)block); 510 (char *)dep - (char *)block);
512 ino = be64_to_cpu(dep->inumber);
513#if XFS_BIG_INUMS
514 ino += mp->m_inoadd;
515#endif
516 511
517 /* 512 /*
518 * If it didn't fit, set the final offset to here & return. 513 * If it didn't fit, set the final offset to here & return.
519 */ 514 */
520 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 515 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
521 ino, DT_UNKNOWN)) { 516 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
522 *offset = cook & 0x7fffffff; 517 *offset = cook & 0x7fffffff;
523 xfs_da_brelse(NULL, bp); 518 xfs_da_brelse(NULL, bp);
524 return 0; 519 return 0;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index b816e0252739..efbc290c7fec 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -38,7 +38,7 @@ struct xfs_trans;
38 38
39/* 39/*
40 * Directory address space divided into sections, 40 * Directory address space divided into sections,
41 * spaces separated by 32gb. 41 * spaces separated by 32GB.
42 */ 42 */
43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) 43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
44#define XFS_DIR2_DATA_SPACE 0 44#define XFS_DIR2_DATA_SPACE 0
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ef805a374eec..fa913e459442 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
549 * Check the internal consistency of a leaf1 block. 549 * Check the internal consistency of a leaf1 block.
550 * Pop an assert if something is wrong. 550 * Pop an assert if something is wrong.
551 */ 551 */
552void 552STATIC void
553xfs_dir2_leaf_check( 553xfs_dir2_leaf_check(
554 xfs_inode_t *dp, /* incore directory inode */ 554 xfs_inode_t *dp, /* incore directory inode */
555 xfs_dabuf_t *bp) /* leaf's buffer */ 555 xfs_dabuf_t *bp) /* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
780 int ra_index; /* *map index for read-ahead */ 780 int ra_index; /* *map index for read-ahead */
781 int ra_offset; /* map entry offset for ra */ 781 int ra_offset; /* map entry offset for ra */
782 int ra_want; /* readahead count wanted */ 782 int ra_want; /* readahead count wanted */
783 xfs_ino_t ino;
784 783
785 /* 784 /*
786 * If the offset is at or past the largest allowed value, 785 * If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
1076 continue; 1075 continue;
1077 } 1076 }
1078 1077
1079 /*
1080 * Copy the entry into the putargs, and try formatting it.
1081 */
1082 dep = (xfs_dir2_data_entry_t *)ptr; 1078 dep = (xfs_dir2_data_entry_t *)ptr;
1083
1084 length = xfs_dir2_data_entsize(dep->namelen); 1079 length = xfs_dir2_data_entsize(dep->namelen);
1085 1080
1086 ino = be64_to_cpu(dep->inumber);
1087#if XFS_BIG_INUMS
1088 ino += mp->m_inoadd;
1089#endif
1090
1091 /*
1092 * Won't fit. Return to caller.
1093 */
1094 if (filldir(dirent, dep->name, dep->namelen, 1081 if (filldir(dirent, dep->name, dep->namelen,
1095 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1082 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1096 ino, DT_UNKNOWN)) 1083 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1097 break; 1084 break;
1098 1085
1099 /* 1086 /*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index fa6c3a5ddbc6..5a81ccd1045b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
1104 } 1104 }
1105 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1106 /* 1106 /*
1107 * Return indication of whether this leaf block is emtpy enough 1107 * Return indication of whether this leaf block is empty enough
1108 * to justify trying to join it with a neighbor. 1108 * to justify trying to join it with a neighbor.
1109 */ 1109 */
1110 *rval = 1110 *rval =
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index a8a8a6efad5b..e89734e84646 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
748 * Put . entry unless we're starting past it. 748 * Put . entry unless we're starting past it.
749 */ 749 */
750 if (*offset <= dot_offset) { 750 if (*offset <= dot_offset) {
751 ino = dp->i_ino; 751 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
752#if XFS_BIG_INUMS
753 ino += mp->m_inoadd;
754#endif
755 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
756 *offset = dot_offset & 0x7fffffff; 752 *offset = dot_offset & 0x7fffffff;
757 return 0; 753 return 0;
758 } 754 }
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
763 */ 759 */
764 if (*offset <= dotdot_offset) { 760 if (*offset <= dotdot_offset) {
765 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); 761 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
766#if XFS_BIG_INUMS
767 ino += mp->m_inoadd;
768#endif
769 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 762 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
770 *offset = dotdot_offset & 0x7fffffff; 763 *offset = dotdot_offset & 0x7fffffff;
771 return 0; 764 return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
786 } 779 }
787 780
788 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 781 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
789#if XFS_BIG_INUMS
790 ino += mp->m_inoadd;
791#endif
792
793 if (filldir(dirent, sfep->name, sfep->namelen, 782 if (filldir(dirent, sfep->name, sfep->namelen,
794 off & 0x7fffffff, ino, DT_UNKNOWN)) { 783 off & 0x7fffffff, ino, DT_UNKNOWN)) {
795 *offset = off & 0x7fffffff; 784 *offset = off & 0x7fffffff;
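
All three getdents paths above (block, leaf and shortform) lose the XFS_BIG_INUMS bias: the on-disk big-endian inode number is decoded and handed straight to filldir() with no m_inoadd adjustment. For reference, decoding a big-endian 64-bit value portably looks like this userspace stand-in for be64_to_cpu() (the kernel helper takes a __be64 value rather than a pointer):

#include <stdint.h>

/* reassemble the value from its in-memory bytes, most significant
 * first, independent of host byte order */
static uint64_t be64_decode(const void *p)
{
	const unsigned char *b = p;
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v = (v << 8) | b[i];
	return v;
}
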
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2f049f63e85f..0d22c56fdf64 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
33 * conversion routine. 33 * conversion routine.
34 */ 34 */
35 35
36#ifndef HAVE_FORMAT32
37typedef struct xfs_extent_32 { 36typedef struct xfs_extent_32 {
38 __uint64_t ext_start; 37 __uint64_t ext_start;
39 __uint32_t ext_len; 38 __uint32_t ext_len;
40} __attribute__((packed)) xfs_extent_32_t; 39} __attribute__((packed)) xfs_extent_32_t;
41#endif
42 40
43typedef struct xfs_extent_64 { 41typedef struct xfs_extent_64 {
44 __uint64_t ext_start; 42 __uint64_t ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
59 xfs_extent_t efi_extents[1]; /* array of extents to free */ 57 xfs_extent_t efi_extents[1]; /* array of extents to free */
60} xfs_efi_log_format_t; 58} xfs_efi_log_format_t;
61 59
62#ifndef HAVE_FORMAT32
63typedef struct xfs_efi_log_format_32 { 60typedef struct xfs_efi_log_format_32 {
64 __uint16_t efi_type; /* efi log item type */ 61 __uint16_t efi_type; /* efi log item type */
65 __uint16_t efi_size; /* size of this item */ 62 __uint16_t efi_size; /* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
67 __uint64_t efi_id; /* efi identifier */ 64 __uint64_t efi_id; /* efi identifier */
68 xfs_extent_32_t efi_extents[1]; /* array of extents to free */ 65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
69} __attribute__((packed)) xfs_efi_log_format_32_t; 66} __attribute__((packed)) xfs_efi_log_format_32_t;
70#endif
71 67
72typedef struct xfs_efi_log_format_64 { 68typedef struct xfs_efi_log_format_64 {
73 __uint16_t efi_type; /* efi log item type */ 69 __uint16_t efi_type; /* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
90 xfs_extent_t efd_extents[1]; /* array of extents freed */ 86 xfs_extent_t efd_extents[1]; /* array of extents freed */
91} xfs_efd_log_format_t; 87} xfs_efd_log_format_t;
92 88
93#ifndef HAVE_FORMAT32
94typedef struct xfs_efd_log_format_32 { 89typedef struct xfs_efd_log_format_32 {
95 __uint16_t efd_type; /* efd log item type */ 90 __uint16_t efd_type; /* efd log item type */
96 __uint16_t efd_size; /* size of this item */ 91 __uint16_t efd_size; /* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
98 __uint64_t efd_efi_id; /* id of corresponding efi */ 93 __uint64_t efd_efi_id; /* id of corresponding efi */
99 xfs_extent_32_t efd_extents[1]; /* array of extents freed */ 94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
100} __attribute__((packed)) xfs_efd_log_format_32_t; 95} __attribute__((packed)) xfs_efd_log_format_32_t;
101#endif
102 96
103typedef struct xfs_efd_log_format_64 { 97typedef struct xfs_efd_log_format_64 {
104 __uint16_t efd_type; /* efd log item type */ 98 __uint16_t efd_type; /* efd log item type */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index f3bb75da384e..6c87c8f304ef 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
140 xfs_extlen_t minlen) 140 xfs_extlen_t minlen)
141{ 141{
142 int err, trylock, nscan; 142 int err, trylock, nscan;
143 xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; 143 xfs_extlen_t longest, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
145 struct xfs_perag *pag; 145 struct xfs_perag *pag;
146 146
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
186 goto next_ag; 186 goto next_ag;
187 } 187 }
188 188
189 need = XFS_MIN_FREELIST_PAG(pag, mp); 189 longest = xfs_alloc_longest_free_extent(mp, pag);
190 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
191 longest = (pag->pagf_longest > delta) ?
192 (pag->pagf_longest - delta) :
193 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
194
195 if (((minlen && longest >= minlen) || 190 if (((minlen && longest >= minlen) ||
196 (!minlen && pag->pagf_freeblks >= minfree)) && 191 (!minlen && pag->pagf_freeblks >= minfree)) &&
197 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || 192 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 680d0e0ec932..8379e3bca26c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -576,7 +576,7 @@ out:
576 if (fdblks_delta) { 576 if (fdblks_delta) {
577 /* 577 /*
578 * If we are putting blocks back here, m_resblks_avail is 578 * If we are putting blocks back here, m_resblks_avail is
579 * already at it's max so this will put it in the free pool. 579 * already at its max so this will put it in the free pool.
580 * 580 *
581 * If we need space, we'll either succeed in getting it 581 * If we need space, we'll either succeed in getting it
582 * from the free block count or we'll get an enospc. If 582 * from the free block count or we'll get an enospc. If
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab016e5ae7be..3120a3a5e20f 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; 230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
231 231
232 /* Allow space for the inode btree to split. */ 232 /* Allow space for the inode btree to split. */
233 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 233 args.minleft = args.mp->m_in_maxlevels - 1;
234 if ((error = xfs_alloc_vextent(&args))) 234 if ((error = xfs_alloc_vextent(&args)))
235 return error; 235 return error;
236 } else 236 } else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
270 /* 270 /*
271 * Allow space for the inode btree to split. 271 * Allow space for the inode btree to split.
272 */ 272 */
273 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 273 args.minleft = args.mp->m_in_maxlevels - 1;
274 if ((error = xfs_alloc_vextent(&args))) 274 if ((error = xfs_alloc_vextent(&args)))
275 return error; 275 return error;
276 } 276 }
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
349 * Initialize all inodes in this buffer and then log them. 349 * Initialize all inodes in this buffer and then log them.
350 * 350 *
351 * XXX: It would be much better if we had just one transaction to 351 * XXX: It would be much better if we had just one transaction to
352 * log a whole cluster of inodes instead of all the indivdual 352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic. 353 * transactions causing a lot of log traffic.
354 */ 354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
944 XFS_INODES_PER_CHUNK) == 0); 944 XFS_INODES_PER_CHUNK) == 0);
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 XFS_INOBT_CLR_FREE(&rec, offset); 946 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 947 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
949 rec.ir_free))) 949 rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
1105 */ 1105 */
1106 off = agino - rec.ir_startino; 1106 off = agino - rec.ir_startino;
1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); 1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1108 ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); 1108 ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1109 /* 1109 /*
1110 * Mark the inode free & increment the count. 1110 * Mark the inode free & increment the count.
1111 */ 1111 */
1112 XFS_INOBT_SET_FREE(&rec, off); 1112 rec.ir_free |= XFS_INOBT_MASK(off);
1113 rec.ir_freecount++; 1113 rec.ir_freecount++;
1114 1114
1115 /* 1115 /*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 99f2408e8d8e..c282a9af5393 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
164} 164}
165 165
166/* 166/*
167 * intial value of ptr for lookup 167 * initial value of ptr for lookup
168 */ 168 */
169STATIC void 169STATIC void
170xfs_inobt_init_ptr_from_cur( 170xfs_inobt_init_ptr_from_cur(
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 5580e255ff06..f782ad0c4769 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -32,14 +32,14 @@ struct xfs_mount;
32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ 32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
33 33
34typedef __uint64_t xfs_inofree_t; 34typedef __uint64_t xfs_inofree_t;
35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) 35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) 36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
38#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
38 39
39static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 40static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
40{ 41{
41 return (((n) >= XFS_INODES_PER_CHUNK ? \ 42 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
42 (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
43} 43}
44 44
45/* 45/*
@@ -69,20 +69,6 @@ typedef struct xfs_inobt_key {
69typedef __be32 xfs_inobt_ptr_t; 69typedef __be32 xfs_inobt_ptr_t;
70 70
71/* 71/*
72 * Bit manipulations for ir_free.
73 */
74#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
75#define XFS_INOBT_IS_FREE(rp,i) \
76 (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
77#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
78#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
79
80/*
81 * Maximum number of inode btree levels.
82 */
83#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
84
85/*
86 * block numbers in the AG. 72 * block numbers in the AG.
87 */ 73 */
88#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) 74#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
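
XFS_INOBT_MASK() moves up next to the other chunk constants, and the open-coded IS_FREE/SET_FREE/CLR_FREE macros are gone; callers in xfs_ialloc.c now manipulate rec.ir_free directly, as the earlier hunks show. The mask arithmetic, checked in a self-contained program (64 inodes per chunk, one bit each; types mirror the header but this is a userspace sketch):

#include <assert.h>
#include <stdint.h>

typedef uint64_t xfs_inofree_t;

#define INODES_PER_CHUNK (8 * (int)sizeof(xfs_inofree_t))	/* 64 */
#define INOBT_MASK(i)	 ((xfs_inofree_t)1 << (i))

/* n consecutive bits starting at bit i; n == 64 would shift by the
 * type width, hence the special case that yields all-ones via the
 * unsigned wraparound of 0 - 1 */
static xfs_inofree_t inobt_maskn(int i, int n)
{
	return ((n >= INODES_PER_CHUNK ? 0 : INOBT_MASK(n)) - 1) << i;
}

int main(void)
{
	xfs_inofree_t free_mask = inobt_maskn(0, 64);	/* all free */

	free_mask &= ~INOBT_MASK(5);		/* allocate inode 5 */
	assert(!(free_mask & INOBT_MASK(5)));

	free_mask |= INOBT_MASK(5);		/* free it again */
	assert(free_mask == inobt_maskn(0, 64));
	return 0;
}
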
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1f175fa34b22..f879c1bc4b96 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
122 122
123/* 123/*
124 * NOTE: This structure must be kept identical to struct xfs_dinode 124 * NOTE: This structure must be kept identical to struct xfs_dinode
125 * in xfs_dinode.h except for the endianess annotations. 125 * in xfs_dinode.h except for the endianness annotations.
126 */ 126 */
127typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
128 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 128 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9957d0602d54..a52ac125f055 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
40 __int32_t ilf_boffset; /* off of inode in buffer */ 40 __int32_t ilf_boffset; /* off of inode in buffer */
41} xfs_inode_log_format_t; 41} xfs_inode_log_format_t;
42 42
43#ifndef HAVE_FORMAT32
44typedef struct xfs_inode_log_format_32 { 43typedef struct xfs_inode_log_format_32 {
45 __uint16_t ilf_type; /* inode log item type */ 44 __uint16_t ilf_type; /* inode log item type */
46 __uint16_t ilf_size; /* size of this item */ 45 __uint16_t ilf_size; /* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
56 __int32_t ilf_len; /* len of inode buffer */ 55 __int32_t ilf_len; /* len of inode buffer */
57 __int32_t ilf_boffset; /* off of inode in buffer */ 56 __int32_t ilf_boffset; /* off of inode in buffer */
58} __attribute__((packed)) xfs_inode_log_format_32_t; 57} __attribute__((packed)) xfs_inode_log_format_32_t;
59#endif
60 58
61typedef struct xfs_inode_log_format_64 { 59typedef struct xfs_inode_log_format_64 {
62 __uint16_t ilf_type; /* inode log item type */ 60 __uint16_t ilf_type; /* inode log item type */
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c134cc2..a1cc1322fc0f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -63,7 +63,7 @@ typedef enum {
63 */ 63 */
64 64
65typedef struct xfs_iomap { 65typedef struct xfs_iomap {
66 xfs_daddr_t iomap_bn; /* first 512b blk of mapping */ 66 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
67 xfs_buftarg_t *iomap_target; 67 xfs_buftarg_t *iomap_target;
68 xfs_off_t iomap_offset; /* offset of mapping, bytes */ 68 xfs_off_t iomap_offset; /* offset of mapping, bytes */
69 xfs_off_t iomap_bsize; /* size of mapping, bytes */ 69 xfs_off_t iomap_bsize; /* size of mapping, bytes */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf98a805ec90..aeb2d2221c7d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
83 buf->bs_uid = dic->di_uid; 83 buf->bs_uid = dic->di_uid;
84 buf->bs_gid = dic->di_gid; 84 buf->bs_gid = dic->di_gid;
85 buf->bs_size = dic->di_size; 85 buf->bs_size = dic->di_size;
86 vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); 86 /*
87 * We are reading the atime from the Linux inode because the
88 * dinode might not be uptodate.
89 */
90 buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
91 buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
87 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 92 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
88 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 93 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
89 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 94 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
579 * first inode of the cluster. 584 * first inode of the cluster.
580 * 585 *
581 * Careful with clustidx. There can be 586 * Careful with clustidx. There can be
582 * multple clusters per chunk, a single 587 * multiple clusters per chunk, a single
583 * cluster per chunk or a cluster that has 588 * cluster per chunk or a cluster that has
584 * inodes represented from several different 589 * inodes represented from several different
585 * chunks (if blocksize is large). 590 * chunks (if blocksize is large).
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f4726f702a9e..f76c6d7cea21 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -574,7 +574,7 @@ xfs_log_mount(
574 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
575 if (error) { 575 if (error) {
576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
577 goto error; 577 goto out_free_log;
578 } 578 }
579 mp->m_log->l_ailp = mp->m_ail; 579 mp->m_log->l_ailp = mp->m_ail;
580 580
@@ -594,20 +594,22 @@ xfs_log_mount(
594 mp->m_flags |= XFS_MOUNT_RDONLY; 594 mp->m_flags |= XFS_MOUNT_RDONLY;
595 if (error) { 595 if (error) {
596 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 596 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
597 goto error; 597 goto out_destroy_ail;
598 } 598 }
599 } 599 }
600 600
601 /* Normal transactions can now occur */ 601 /* Normal transactions can now occur */
602 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 602 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
603 603
604 /* End mounting message in xfs_log_mount_finish */
605 return 0; 604 return 0;
606error: 605
607 xfs_log_unmount_dealloc(mp); 606out_destroy_ail:
607 xfs_trans_ail_destroy(mp);
608out_free_log:
609 xlog_dealloc_log(mp->m_log);
608out: 610out:
609 return error; 611 return error;
610} /* xfs_log_mount */ 612}
611 613
612/* 614/*
613 * Finish the recovery of the file system. This is separate from 615 * Finish the recovery of the file system. This is separate from
@@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
633} 635}
634 636
635/* 637/*
636 * Unmount processing for the log.
637 */
638int
639xfs_log_unmount(xfs_mount_t *mp)
640{
641 int error;
642
643 error = xfs_log_unmount_write(mp);
644 xfs_log_unmount_dealloc(mp);
645 return error;
646}
647
648/*
649 * Final log writes as part of unmount. 638 * Final log writes as part of unmount.
650 * 639 *
651 * Mark the filesystem clean as unmount happens. Note that during relocation 640 * Mark the filesystem clean as unmount happens. Note that during relocation
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
795 * and deallocate the log as the aild references the log. 784 * and deallocate the log as the aild references the log.
796 */ 785 */
797void 786void
798xfs_log_unmount_dealloc(xfs_mount_t *mp) 787xfs_log_unmount(xfs_mount_t *mp)
799{ 788{
800 xfs_trans_ail_destroy(mp); 789 xfs_trans_ail_destroy(mp);
801 xlog_dealloc_log(mp->m_log); 790 xlog_dealloc_log(mp->m_log);
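
With the unmount-record write split out, xfs_log_unmount() is now just this teardown pair, and the comment above it states the whole contract: the AIL must be destroyed before the log is deallocated because the aild thread dereferences the log. The same ordering backs the new xfs_log_mount() error labels (out_destroy_ail before out_free_log). Reduced to a sketch with hypothetical stand-ins:

struct mount_model {
	int ail_running;
	int log_allocated;
};

static void trans_ail_destroy(struct mount_model *mp) { mp->ail_running = 0; }
static void dealloc_log(struct mount_model *mp)       { mp->log_allocated = 0; }

static void log_unmount(struct mount_model *mp)
{
	trans_ail_destroy(mp);	/* first: aild still references the log */
	dealloc_log(mp);	/* only then free the log itself */
}
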
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
1109/* 1098/*
1110 * Return size of each in-core log record buffer. 1099 * Return size of each in-core log record buffer.
1111 * 1100 *
1112 * All machines get 8 x 32KB buffers by default, unless tuned otherwise. 1101 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1113 * 1102 *
1114 * If the filesystem blocksize is too large, we may need to choose a 1103 * If the filesystem blocksize is too large, we may need to choose a
1115 * larger size since the directory code currently logs entire blocks. 1104 * larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1139 } 1128 }
1140 1129
1141 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1130 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1142 /* # headers = size / 32K 1131 /* # headers = size / 32k
1143 * one header holds cycles from 32K of data 1132 * one header holds cycles from 32k of data
1144 */ 1133 */
1145 1134
1146 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; 1135 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1156 goto done; 1145 goto done;
1157 } 1146 }
1158 1147
1159 /* All machines use 32KB buffers by default. */ 1148 /* All machines use 32kB buffers by default. */
1160 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; 1149 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1161 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; 1150 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1162 1151
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1164 log->l_iclog_hsize = BBSIZE; 1153 log->l_iclog_hsize = BBSIZE;
1165 log->l_iclog_heads = 1; 1154 log->l_iclog_heads = 1;
1166 1155
1167 /* 1156done:
1168 * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use 1157 /* are we being asked to make the sizes selected above visible? */
1169 * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
1170 */
1171 if (mp->m_sb.sb_blocksize >= 16*1024) {
1172 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1173 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1174 if (mp->m_logbufs <= 0) {
1175 switch (mp->m_sb.sb_blocksize) {
1176 case 16*1024: /* 16 KB */
1177 log->l_iclog_bufs = 3;
1178 break;
1179 case 32*1024: /* 32 KB */
1180 log->l_iclog_bufs = 4;
1181 break;
1182 case 64*1024: /* 64 KB */
1183 log->l_iclog_bufs = 8;
1184 break;
1185 default:
1186 xlog_panic("XFS: Invalid blocksize");
1187 break;
1188 }
1189 }
1190 }
1191
1192done: /* are we being asked to make the sizes selected above visible? */
1193 if (mp->m_logbufs == 0) 1158 if (mp->m_logbufs == 0)
1194 mp->m_logbufs = log->l_iclog_bufs; 1159 mp->m_logbufs = log->l_iclog_bufs;
1195 if (mp->m_logbsize == 0) 1160 if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3214 */ 3179 */
3215 3180
3216/* 3181/*
3217 * Free a used ticket when it's refcount falls to zero. 3182 * Free a used ticket when its refcount falls to zero.
3218 */ 3183 */
3219void 3184void
3220xfs_log_ticket_put( 3185xfs_log_ticket_put(
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 8a3e84e900a3..d0c9baa50b1a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp,
170 int nentries, 170 int nentries,
171 xfs_log_ticket_t ticket, 171 xfs_log_ticket_t ticket,
172 xfs_lsn_t *start_lsn); 172 xfs_lsn_t *start_lsn);
173int xfs_log_unmount(struct xfs_mount *mp);
174int xfs_log_unmount_write(struct xfs_mount *mp); 173int xfs_log_unmount_write(struct xfs_mount *mp);
175void xfs_log_unmount_dealloc(struct xfs_mount *mp); 174void xfs_log_unmount(struct xfs_mount *mp);
176int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 175int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
177int xfs_log_need_covered(struct xfs_mount *mp); 176int xfs_log_need_covered(struct xfs_mount *mp);
178 177
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 654167be0efb..bcad5f4c1fd1 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
359 int ic_size; 359 int ic_size;
360 int ic_offset; 360 int ic_offset;
361 int ic_bwritecnt; 361 int ic_bwritecnt;
362 ushort_t ic_state; 362 unsigned short ic_state;
363 char *ic_datap; /* pointer to iclog data */ 363 char *ic_datap; /* pointer to iclog data */
364#ifdef XFS_LOG_TRACE 364#ifdef XFS_LOG_TRACE
365 struct ktrace *ic_trace; 365 struct ktrace *ic_trace;
@@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log);
455 455
456extern struct xfs_buf *xlog_get_bp(xlog_t *, int); 456extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
457extern void xlog_put_bp(struct xfs_buf *); 457extern void xlog_put_bp(struct xfs_buf *);
458extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
459 458
460extern kmem_zone_t *xfs_log_ticket_zone; 459extern kmem_zone_t *xfs_log_ticket_zone;
461 460
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 61af610d79b3..7ba450116d4f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -94,12 +94,30 @@ xlog_put_bp(
94 xfs_buf_free(bp); 94 xfs_buf_free(bp);
95} 95}
96 96
97STATIC xfs_caddr_t
98xlog_align(
99 xlog_t *log,
100 xfs_daddr_t blk_no,
101 int nbblks,
102 xfs_buf_t *bp)
103{
104 xfs_caddr_t ptr;
105
106 if (!log->l_sectbb_log)
107 return XFS_BUF_PTR(bp);
108
109 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
110 ASSERT(XFS_BUF_SIZE(bp) >=
111 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
112 return ptr;
113}
114
97 115
98/* 116/*
99 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 117 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
100 */ 118 */
101int 119STATIC int
102xlog_bread( 120xlog_bread_noalign(
103 xlog_t *log, 121 xlog_t *log,
104 xfs_daddr_t blk_no, 122 xfs_daddr_t blk_no,
105 int nbblks, 123 int nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
137 return error; 155 return error;
138} 156}
139 157
158STATIC int
159xlog_bread(
160 xlog_t *log,
161 xfs_daddr_t blk_no,
162 int nbblks,
163 xfs_buf_t *bp,
164 xfs_caddr_t *offset)
165{
166 int error;
167
168 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
169 if (error)
170 return error;
171
172 *offset = xlog_align(log, blk_no, nbblks, bp);
173 return 0;
174}
175
140/* 176/*
141 * Write out the buffer at the given block for the given number of blocks. 177 * Write out the buffer at the given block for the given number of blocks.
142 * The buffer is kept locked across the write and is returned locked. 178 * The buffer is kept locked across the write and is returned locked.
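Taken together, the two hunks above invert the read/align pairing in log recovery: the previously exported xlog_bread() becomes the static xlog_bread_noalign(), and a new static xlog_bread() wrapper performs the read plus the sector alignment in one step, handing the aligned data pointer back through a new output parameter. A sketch of the caller pattern before and after this change, with illustrative variable names (the actual conversions fill the rest of this file's hunks):

	/* before: every caller followed the read with an explicit align */
	if ((error = xlog_bread(log, blk_no, nbblks, bp)))
		goto out;
	offset = xlog_align(log, blk_no, nbblks, bp);

	/* after: one call yields the sector-aligned pointer directly */
	error = xlog_bread(log, blk_no, nbblks, bp, &offset);
	if (error)
		goto out;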
@@ -180,24 +216,6 @@ xlog_bwrite(
180 return error; 216 return error;
181} 217}
182 218
183STATIC xfs_caddr_t
184xlog_align(
185 xlog_t *log,
186 xfs_daddr_t blk_no,
187 int nbblks,
188 xfs_buf_t *bp)
189{
190 xfs_caddr_t ptr;
191
192 if (!log->l_sectbb_log)
193 return XFS_BUF_PTR(bp);
194
195 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
196 ASSERT(XFS_BUF_SIZE(bp) >=
197 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
198 return ptr;
199}
200
201#ifdef DEBUG 219#ifdef DEBUG
202/* 220/*
203 * dump debug superblock and log record information 221 * dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
211 229
212 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 230 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
213 for (b = 0; b < 16; b++) 231 for (b = 0; b < 16; b++)
214 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 232 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
215 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 233 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
216 cmn_err(CE_DEBUG, " log : uuid = "); 234 cmn_err(CE_DEBUG, " log : uuid = ");
217 for (b = 0; b < 16; b++) 235 for (b = 0; b < 16; b++)
218 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 236 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
219 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 237 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
220} 238}
221#else 239#else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
321 339
322 mid_blk = BLK_AVG(first_blk, *last_blk); 340 mid_blk = BLK_AVG(first_blk, *last_blk);
323 while (mid_blk != first_blk && mid_blk != *last_blk) { 341 while (mid_blk != first_blk && mid_blk != *last_blk) {
324 if ((error = xlog_bread(log, mid_blk, 1, bp))) 342 error = xlog_bread(log, mid_blk, 1, bp, &offset);
343 if (error)
325 return error; 344 return error;
326 offset = xlog_align(log, mid_blk, 1, bp);
327 mid_cycle = xlog_get_cycle(offset); 345 mid_cycle = xlog_get_cycle(offset);
328 if (mid_cycle == cycle) { 346 if (mid_cycle == cycle) {
329 *last_blk = mid_blk; 347 *last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
379 397
380 bcount = min(bufblks, (start_blk + nbblks - i)); 398 bcount = min(bufblks, (start_blk + nbblks - i));
381 399
382 if ((error = xlog_bread(log, i, bcount, bp))) 400 error = xlog_bread(log, i, bcount, bp, &buf);
401 if (error)
383 goto out; 402 goto out;
384 403
385 buf = xlog_align(log, i, bcount, bp);
386 for (j = 0; j < bcount; j++) { 404 for (j = 0; j < bcount; j++) {
387 cycle = xlog_get_cycle(buf); 405 cycle = xlog_get_cycle(buf);
388 if (cycle == stop_on_cycle_no) { 406 if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
436 return ENOMEM; 454 return ENOMEM;
437 smallmem = 1; 455 smallmem = 1;
438 } else { 456 } else {
439 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 457 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
458 if (error)
440 goto out; 459 goto out;
441 offset = xlog_align(log, start_blk, num_blks, bp);
442 offset += ((num_blks - 1) << BBSHIFT); 460 offset += ((num_blks - 1) << BBSHIFT);
443 } 461 }
444 462
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
453 } 471 }
454 472
455 if (smallmem) { 473 if (smallmem) {
456 if ((error = xlog_bread(log, i, 1, bp))) 474 error = xlog_bread(log, i, 1, bp, &offset);
475 if (error)
457 goto out; 476 goto out;
458 offset = xlog_align(log, i, 1, bp);
459 } 477 }
460 478
461 head = (xlog_rec_header_t *)offset; 479 head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
559 bp = xlog_get_bp(log, 1); 577 bp = xlog_get_bp(log, 1);
560 if (!bp) 578 if (!bp)
561 return ENOMEM; 579 return ENOMEM;
562 if ((error = xlog_bread(log, 0, 1, bp))) 580
581 error = xlog_bread(log, 0, 1, bp, &offset);
582 if (error)
563 goto bp_err; 583 goto bp_err;
564 offset = xlog_align(log, 0, 1, bp); 584
565 first_half_cycle = xlog_get_cycle(offset); 585 first_half_cycle = xlog_get_cycle(offset);
566 586
567 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 587 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
568 if ((error = xlog_bread(log, last_blk, 1, bp))) 588 error = xlog_bread(log, last_blk, 1, bp, &offset);
589 if (error)
569 goto bp_err; 590 goto bp_err;
570 offset = xlog_align(log, last_blk, 1, bp); 591
571 last_half_cycle = xlog_get_cycle(offset); 592 last_half_cycle = xlog_get_cycle(offset);
572 ASSERT(last_half_cycle != 0); 593 ASSERT(last_half_cycle != 0);
573 594
@@ -817,9 +838,10 @@ xlog_find_tail(
817 if (!bp) 838 if (!bp)
818 return ENOMEM; 839 return ENOMEM;
819 if (*head_blk == 0) { /* special case */ 840 if (*head_blk == 0) { /* special case */
820 if ((error = xlog_bread(log, 0, 1, bp))) 841 error = xlog_bread(log, 0, 1, bp, &offset);
842 if (error)
821 goto bread_err; 843 goto bread_err;
822 offset = xlog_align(log, 0, 1, bp); 844
823 if (xlog_get_cycle(offset) == 0) { 845 if (xlog_get_cycle(offset) == 0) {
824 *tail_blk = 0; 846 *tail_blk = 0;
825 /* leave all other log inited values alone */ 847 /* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
832 */ 854 */
833 ASSERT(*head_blk < INT_MAX); 855 ASSERT(*head_blk < INT_MAX);
834 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 856 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
835 if ((error = xlog_bread(log, i, 1, bp))) 857 error = xlog_bread(log, i, 1, bp, &offset);
858 if (error)
836 goto bread_err; 859 goto bread_err;
837 offset = xlog_align(log, i, 1, bp); 860
838 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 861 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
839 found = 1; 862 found = 1;
840 break; 863 break;
@@ -848,9 +871,10 @@ xlog_find_tail(
848 */ 871 */
849 if (!found) { 872 if (!found) {
850 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 873 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
851 if ((error = xlog_bread(log, i, 1, bp))) 874 error = xlog_bread(log, i, 1, bp, &offset);
875 if (error)
852 goto bread_err; 876 goto bread_err;
853 offset = xlog_align(log, i, 1, bp); 877
854 if (XLOG_HEADER_MAGIC_NUM == 878 if (XLOG_HEADER_MAGIC_NUM ==
855 be32_to_cpu(*(__be32 *)offset)) { 879 be32_to_cpu(*(__be32 *)offset)) {
856 found = 2; 880 found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
922 if (*head_blk == after_umount_blk && 946 if (*head_blk == after_umount_blk &&
923 be32_to_cpu(rhead->h_num_logops) == 1) { 947 be32_to_cpu(rhead->h_num_logops) == 1) {
924 umount_data_blk = (i + hblks) % log->l_logBBsize; 948 umount_data_blk = (i + hblks) % log->l_logBBsize;
925 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 949 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
950 if (error)
926 goto bread_err; 951 goto bread_err;
927 } 952
928 offset = xlog_align(log, umount_data_blk, 1, bp);
929 op_head = (xlog_op_header_t *)offset; 953 op_head = (xlog_op_header_t *)offset;
930 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 954 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
931 /* 955 /*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
1017 bp = xlog_get_bp(log, 1); 1041 bp = xlog_get_bp(log, 1);
1018 if (!bp) 1042 if (!bp)
1019 return ENOMEM; 1043 return ENOMEM;
1020 if ((error = xlog_bread(log, 0, 1, bp))) 1044 error = xlog_bread(log, 0, 1, bp, &offset);
1045 if (error)
1021 goto bp_err; 1046 goto bp_err;
1022 offset = xlog_align(log, 0, 1, bp); 1047
1023 first_cycle = xlog_get_cycle(offset); 1048 first_cycle = xlog_get_cycle(offset);
1024 if (first_cycle == 0) { /* completely zeroed log */ 1049 if (first_cycle == 0) { /* completely zeroed log */
1025 *blk_no = 0; 1050 *blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
1028 } 1053 }
1029 1054
1030 /* check partially zeroed log */ 1055 /* check partially zeroed log */
1031 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1056 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1057 if (error)
1032 goto bp_err; 1058 goto bp_err;
1033 offset = xlog_align(log, log_bbnum-1, 1, bp); 1059
1034 last_cycle = xlog_get_cycle(offset); 1060 last_cycle = xlog_get_cycle(offset);
1035 if (last_cycle != 0) { /* log completely written to */ 1061 if (last_cycle != 0) { /* log completely written to */
1036 xlog_put_bp(bp); 1062 xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
1152 */ 1178 */
1153 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1179 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1154 if (balign != start_block) { 1180 if (balign != start_block) {
1155 if ((error = xlog_bread(log, start_block, 1, bp))) { 1181 error = xlog_bread_noalign(log, start_block, 1, bp);
1156 xlog_put_bp(bp); 1182 if (error)
1157 return error; 1183 goto out_put_bp;
1158 } 1184
1159 j = start_block - balign; 1185 j = start_block - balign;
1160 } 1186 }
1161 1187
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
1175 balign = BBTOB(ealign - start_block); 1201 balign = BBTOB(ealign - start_block);
1176 error = XFS_BUF_SET_PTR(bp, offset + balign, 1202 error = XFS_BUF_SET_PTR(bp, offset + balign,
1177 BBTOB(sectbb)); 1203 BBTOB(sectbb));
1178 if (!error) 1204 if (error)
1179 error = xlog_bread(log, ealign, sectbb, bp); 1205 break;
1180 if (!error) 1206
1181 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1207 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1208 if (error)
1209 break;
1210
1211 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1182 if (error) 1212 if (error)
1183 break; 1213 break;
1184 } 1214 }
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
1195 start_block += endcount; 1225 start_block += endcount;
1196 j = 0; 1226 j = 0;
1197 } 1227 }
1228
1229 out_put_bp:
1198 xlog_put_bp(bp); 1230 xlog_put_bp(bp);
1199 return error; 1231 return error;
1200} 1232}
@@ -2511,16 +2543,10 @@ xlog_recover_do_inode_trans(
2511 } 2543 }
2512 2544
2513write_inode_buffer: 2545write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2546 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2515 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2547 bp->b_mount = mp;
2516 bp->b_mount = mp; 2548 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2517 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2549 xfs_bdwrite(mp, bp);
2518 xfs_bdwrite(mp, bp);
2519 } else {
2520 XFS_BUF_STALE(bp);
2521 error = xfs_bwrite(mp, bp);
2522 }
2523
2524error: 2550error:
2525 if (need_free) 2551 if (need_free)
2526 kmem_free(in_f); 2552 kmem_free(in_f);
@@ -2769,51 +2795,48 @@ xlog_recover_do_trans(
2769 int error = 0; 2795 int error = 0;
2770 xlog_recover_item_t *item, *first_item; 2796 xlog_recover_item_t *item, *first_item;
2771 2797
2772 if ((error = xlog_recover_reorder_trans(trans))) 2798 error = xlog_recover_reorder_trans(trans);
2799 if (error)
2773 return error; 2800 return error;
2801
2774 first_item = item = trans->r_itemq; 2802 first_item = item = trans->r_itemq;
2775 do { 2803 do {
2776 /* 2804 switch (ITEM_TYPE(item)) {
2777 * we don't need to worry about the block number being 2805 case XFS_LI_BUF:
2778 * truncated in > 1 TB buffers because in user-land, 2806 error = xlog_recover_do_buffer_trans(log, item, pass);
2779 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so 2807 break;
2780 * the blknos will get through the user-mode buffer 2808 case XFS_LI_INODE:
2781 * cache properly. The only bad case is o32 kernels 2809 error = xlog_recover_do_inode_trans(log, item, pass);
2782 * where xfs_daddr_t is 32-bits but mount will warn us 2810 break;
2783 * off a > 1 TB filesystem before we get here. 2811 case XFS_LI_EFI:
2784 */ 2812 error = xlog_recover_do_efi_trans(log, item,
2785 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2813 trans->r_lsn, pass);
2786 if ((error = xlog_recover_do_buffer_trans(log, item, 2814 break;
2787 pass))) 2815 case XFS_LI_EFD:
2788 break;
2789 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2790 if ((error = xlog_recover_do_inode_trans(log, item,
2791 pass)))
2792 break;
2793 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2794 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2795 pass)))
2796 break;
2797 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2798 xlog_recover_do_efd_trans(log, item, pass); 2816 xlog_recover_do_efd_trans(log, item, pass);
2799 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2817 error = 0;
2800 if ((error = xlog_recover_do_dquot_trans(log, item, 2818 break;
2801 pass))) 2819 case XFS_LI_DQUOT:
2802 break; 2820 error = xlog_recover_do_dquot_trans(log, item, pass);
2803 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2821 break;
2804 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2822 case XFS_LI_QUOTAOFF:
2805 pass))) 2823 error = xlog_recover_do_quotaoff_trans(log, item,
2806 break; 2824 pass);
2807 } else { 2825 break;
2808 xlog_warn("XFS: xlog_recover_do_trans"); 2826 default:
2827 xlog_warn(
2828 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2809 ASSERT(0); 2829 ASSERT(0);
2810 error = XFS_ERROR(EIO); 2830 error = XFS_ERROR(EIO);
2811 break; 2831 break;
2812 } 2832 }
2833
2834 if (error)
2835 return error;
2813 item = item->ri_next; 2836 item = item->ri_next;
2814 } while (first_item != item); 2837 } while (first_item != item);
2815 2838
2816 return error; 2839 return 0;
2817} 2840}
2818 2841
2819/* 2842/*
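The rewrite above turns the if/else chain in xlog_recover_do_trans() into a switch on ITEM_TYPE(item), funnelling every case into a single error test; the EFD case, which cannot fail, now sets error = 0 explicitly, and an unknown item type still asserts and returns EIO, now naming the offending type in the warning. Reduced to its control-flow skeleton (most cases elided), the new loop looks like:

	do {
		switch (ITEM_TYPE(item)) {
		case XFS_LI_BUF:
			error = xlog_recover_do_buffer_trans(log, item, pass);
			break;
		/* ... one case per recoverable log item type ... */
		default:
			ASSERT(0);
			error = XFS_ERROR(EIO);
			break;
		}
		if (error)		/* single exit point for every item type */
			return error;
		item = item->ri_next;
	} while (first_item != item);
	return 0;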
@@ -3490,9 +3513,11 @@ xlog_do_recovery_pass(
3490 hbp = xlog_get_bp(log, 1); 3513 hbp = xlog_get_bp(log, 1);
3491 if (!hbp) 3514 if (!hbp)
3492 return ENOMEM; 3515 return ENOMEM;
3493 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3516
3517 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3518 if (error)
3494 goto bread_err1; 3519 goto bread_err1;
3495 offset = xlog_align(log, tail_blk, 1, hbp); 3520
3496 rhead = (xlog_rec_header_t *)offset; 3521 rhead = (xlog_rec_header_t *)offset;
3497 error = xlog_valid_rec_header(log, rhead, tail_blk); 3522 error = xlog_valid_rec_header(log, rhead, tail_blk);
3498 if (error) 3523 if (error)
@@ -3526,9 +3551,10 @@ xlog_do_recovery_pass(
3526 memset(rhash, 0, sizeof(rhash)); 3551 memset(rhash, 0, sizeof(rhash));
3527 if (tail_blk <= head_blk) { 3552 if (tail_blk <= head_blk) {
3528 for (blk_no = tail_blk; blk_no < head_blk; ) { 3553 for (blk_no = tail_blk; blk_no < head_blk; ) {
3529 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3554 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3555 if (error)
3530 goto bread_err2; 3556 goto bread_err2;
3531 offset = xlog_align(log, blk_no, hblks, hbp); 3557
3532 rhead = (xlog_rec_header_t *)offset; 3558 rhead = (xlog_rec_header_t *)offset;
3533 error = xlog_valid_rec_header(log, rhead, blk_no); 3559 error = xlog_valid_rec_header(log, rhead, blk_no);
3534 if (error) 3560 if (error)
@@ -3536,10 +3562,11 @@ xlog_do_recovery_pass(
3536 3562
3537 /* blocks in data section */ 3563 /* blocks in data section */
3538 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3564 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3539 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3565 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3566 &offset);
3540 if (error) 3567 if (error)
3541 goto bread_err2; 3568 goto bread_err2;
3542 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3569
3543 xlog_unpack_data(rhead, offset, log); 3570 xlog_unpack_data(rhead, offset, log);
3544 if ((error = xlog_recover_process_data(log, 3571 if ((error = xlog_recover_process_data(log,
3545 rhash, rhead, offset, pass))) 3572 rhash, rhead, offset, pass)))
@@ -3562,10 +3589,10 @@ xlog_do_recovery_pass(
3562 wrapped_hblks = 0; 3589 wrapped_hblks = 0;
3563 if (blk_no + hblks <= log->l_logBBsize) { 3590 if (blk_no + hblks <= log->l_logBBsize) {
3564 /* Read header in one read */ 3591 /* Read header in one read */
3565 error = xlog_bread(log, blk_no, hblks, hbp); 3592 error = xlog_bread(log, blk_no, hblks, hbp,
3593 &offset);
3566 if (error) 3594 if (error)
3567 goto bread_err2; 3595 goto bread_err2;
3568 offset = xlog_align(log, blk_no, hblks, hbp);
3569 } else { 3596 } else {
3570 /* This LR is split across physical log end */ 3597 /* This LR is split across physical log end */
3571 if (blk_no != log->l_logBBsize) { 3598 if (blk_no != log->l_logBBsize) {
@@ -3573,12 +3600,13 @@ xlog_do_recovery_pass(
3573 ASSERT(blk_no <= INT_MAX); 3600 ASSERT(blk_no <= INT_MAX);
3574 split_hblks = log->l_logBBsize - (int)blk_no; 3601 split_hblks = log->l_logBBsize - (int)blk_no;
3575 ASSERT(split_hblks > 0); 3602 ASSERT(split_hblks > 0);
3576 if ((error = xlog_bread(log, blk_no, 3603 error = xlog_bread(log, blk_no,
3577 split_hblks, hbp))) 3604 split_hblks, hbp,
3605 &offset);
3606 if (error)
3578 goto bread_err2; 3607 goto bread_err2;
3579 offset = xlog_align(log, blk_no,
3580 split_hblks, hbp);
3581 } 3608 }
3609
3582 /* 3610 /*
3583 * Note: this black magic still works with 3611 * Note: this black magic still works with
3584 * large sector sizes (non-512) only because: 3612 * large sector sizes (non-512) only because:
@@ -3596,14 +3624,19 @@ xlog_do_recovery_pass(
3596 error = XFS_BUF_SET_PTR(hbp, 3624 error = XFS_BUF_SET_PTR(hbp,
3597 bufaddr + BBTOB(split_hblks), 3625 bufaddr + BBTOB(split_hblks),
3598 BBTOB(hblks - split_hblks)); 3626 BBTOB(hblks - split_hblks));
3599 if (!error) 3627 if (error)
3600 error = xlog_bread(log, 0, 3628 goto bread_err2;
3601 wrapped_hblks, hbp); 3629
3602 if (!error) 3630 error = xlog_bread_noalign(log, 0,
3603 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3631 wrapped_hblks, hbp);
3632 if (error)
3633 goto bread_err2;
3634
3635 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3604 BBTOB(hblks)); 3636 BBTOB(hblks));
3605 if (error) 3637 if (error)
3606 goto bread_err2; 3638 goto bread_err2;
3639
3607 if (!offset) 3640 if (!offset)
3608 offset = xlog_align(log, 0, 3641 offset = xlog_align(log, 0,
3609 wrapped_hblks, hbp); 3642 wrapped_hblks, hbp);
@@ -3619,10 +3652,10 @@ xlog_do_recovery_pass(
3619 3652
3620 /* Read in data for log record */ 3653 /* Read in data for log record */
3621 if (blk_no + bblks <= log->l_logBBsize) { 3654 if (blk_no + bblks <= log->l_logBBsize) {
3622 error = xlog_bread(log, blk_no, bblks, dbp); 3655 error = xlog_bread(log, blk_no, bblks, dbp,
3656 &offset);
3623 if (error) 3657 if (error)
3624 goto bread_err2; 3658 goto bread_err2;
3625 offset = xlog_align(log, blk_no, bblks, dbp);
3626 } else { 3659 } else {
3627 /* This log record is split across the 3660 /* This log record is split across the
3628 * physical end of log */ 3661 * physical end of log */
@@ -3636,12 +3669,13 @@ xlog_do_recovery_pass(
3636 split_bblks = 3669 split_bblks =
3637 log->l_logBBsize - (int)blk_no; 3670 log->l_logBBsize - (int)blk_no;
3638 ASSERT(split_bblks > 0); 3671 ASSERT(split_bblks > 0);
3639 if ((error = xlog_bread(log, blk_no, 3672 error = xlog_bread(log, blk_no,
3640 split_bblks, dbp))) 3673 split_bblks, dbp,
3674 &offset);
3675 if (error)
3641 goto bread_err2; 3676 goto bread_err2;
3642 offset = xlog_align(log, blk_no,
3643 split_bblks, dbp);
3644 } 3677 }
3678
3645 /* 3679 /*
3646 * Note: this black magic still works with 3680 * Note: this black magic still works with
3647 * large sector sizes (non-512) only because: 3681 * large sector sizes (non-512) only because:
@@ -3658,15 +3692,19 @@ xlog_do_recovery_pass(
3658 error = XFS_BUF_SET_PTR(dbp, 3692 error = XFS_BUF_SET_PTR(dbp,
3659 bufaddr + BBTOB(split_bblks), 3693 bufaddr + BBTOB(split_bblks),
3660 BBTOB(bblks - split_bblks)); 3694 BBTOB(bblks - split_bblks));
3661 if (!error)
3662 error = xlog_bread(log, wrapped_hblks,
3663 bblks - split_bblks,
3664 dbp);
3665 if (!error)
3666 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3667 h_size);
3668 if (error) 3695 if (error)
3669 goto bread_err2; 3696 goto bread_err2;
3697
3698 error = xlog_bread_noalign(log, wrapped_hblks,
3699 bblks - split_bblks,
3700 dbp);
3701 if (error)
3702 goto bread_err2;
3703
3704 error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3705 if (error)
3706 goto bread_err2;
3707
3670 if (!offset) 3708 if (!offset)
3671 offset = xlog_align(log, wrapped_hblks, 3709 offset = xlog_align(log, wrapped_hblks,
3672 bblks - split_bblks, dbp); 3710 bblks - split_bblks, dbp);
@@ -3683,17 +3721,21 @@ xlog_do_recovery_pass(
3683 3721
3684 /* read first part of physical log */ 3722 /* read first part of physical log */
3685 while (blk_no < head_blk) { 3723 while (blk_no < head_blk) {
3686 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3724 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3725 if (error)
3687 goto bread_err2; 3726 goto bread_err2;
3688 offset = xlog_align(log, blk_no, hblks, hbp); 3727
3689 rhead = (xlog_rec_header_t *)offset; 3728 rhead = (xlog_rec_header_t *)offset;
3690 error = xlog_valid_rec_header(log, rhead, blk_no); 3729 error = xlog_valid_rec_header(log, rhead, blk_no);
3691 if (error) 3730 if (error)
3692 goto bread_err2; 3731 goto bread_err2;
3732
3693 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3733 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3694 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3734 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3735 &offset);
3736 if (error)
3695 goto bread_err2; 3737 goto bread_err2;
3696 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3738
3697 xlog_unpack_data(rhead, offset, log); 3739 xlog_unpack_data(rhead, offset, log);
3698 if ((error = xlog_recover_process_data(log, rhash, 3740 if ((error = xlog_recover_process_data(log, rhash,
3699 rhead, offset, pass))) 3741 rhead, offset, pass)))
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 35300250e86d..b101990df027 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h" 46#include "xfs_utils.h"
47 47
48STATIC int xfs_uuid_mount(xfs_mount_t *);
49STATIC void xfs_unmountfs_wait(xfs_mount_t *); 48STATIC void xfs_unmountfs_wait(xfs_mount_t *);
50 49
51 50
@@ -121,6 +120,84 @@ static const struct {
121 { sizeof(xfs_sb_t), 0 } 120 { sizeof(xfs_sb_t), 0 }
122}; 121};
123 122
123static DEFINE_MUTEX(xfs_uuid_table_mutex);
124static int xfs_uuid_table_size;
125static uuid_t *xfs_uuid_table;
126
127/*
128 * See if the UUID is unique among mounted XFS filesystems.
129 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
130 */
131STATIC int
132xfs_uuid_mount(
133 struct xfs_mount *mp)
134{
135 uuid_t *uuid = &mp->m_sb.sb_uuid;
136 int hole, i;
137
138 if (mp->m_flags & XFS_MOUNT_NOUUID)
139 return 0;
140
141 if (uuid_is_nil(uuid)) {
142 cmn_err(CE_WARN,
143 "XFS: Filesystem %s has nil UUID - can't mount",
144 mp->m_fsname);
145 return XFS_ERROR(EINVAL);
146 }
147
148 mutex_lock(&xfs_uuid_table_mutex);
149 for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
150 if (uuid_is_nil(&xfs_uuid_table[i])) {
151 hole = i;
152 continue;
153 }
154 if (uuid_equal(uuid, &xfs_uuid_table[i]))
155 goto out_duplicate;
156 }
157
158 if (hole < 0) {
159 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
160 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
161 xfs_uuid_table_size * sizeof(*xfs_uuid_table),
162 KM_SLEEP);
163 hole = xfs_uuid_table_size++;
164 }
165 xfs_uuid_table[hole] = *uuid;
166 mutex_unlock(&xfs_uuid_table_mutex);
167
168 return 0;
169
170 out_duplicate:
171 mutex_unlock(&xfs_uuid_table_mutex);
172 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
173 mp->m_fsname);
174 return XFS_ERROR(EINVAL);
175}
176
177STATIC void
178xfs_uuid_unmount(
179 struct xfs_mount *mp)
180{
181 uuid_t *uuid = &mp->m_sb.sb_uuid;
182 int i;
183
184 if (mp->m_flags & XFS_MOUNT_NOUUID)
185 return;
186
187 mutex_lock(&xfs_uuid_table_mutex);
188 for (i = 0; i < xfs_uuid_table_size; i++) {
189 if (uuid_is_nil(&xfs_uuid_table[i]))
190 continue;
191 if (!uuid_equal(uuid, &xfs_uuid_table[i]))
192 continue;
193 memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
194 break;
195 }
196 ASSERT(i < xfs_uuid_table_size);
197 mutex_unlock(&xfs_uuid_table_mutex);
198}
199
200
124/* 201/*
125 * Free up the resources associated with a mount structure. Assume that 202 * Free up the resources associated with a mount structure. Assume that
126 * the structure was initially zeroed, so we can tell which fields got 203 * the structure was initially zeroed, so we can tell which fields got
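The block above brings UUID tracking into XFS proper: a private table guarded by xfs_uuid_table_mutex, where unmounting nils out an entry so the slot becomes a reusable hole, and kmem_realloc() grows the table by one slot only when no hole is free. A sketch of how the pair is used, based on the xfs_mountfs()/xfs_unmountfs() hunks below (error handling trimmed):

	error = xfs_uuid_mount(mp);	/* EINVAL on a nil or duplicate sb_uuid */
	if (error)
		goto out;
	/* ... filesystem is mounted and running ... */
	xfs_uuid_unmount(mp);		/* on teardown, nil this FS's slot */

Both helpers are no-ops when the XFS_MOUNT_NOUUID mount flag is set.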
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
256 return XFS_ERROR(ENOSYS); 333 return XFS_ERROR(ENOSYS);
257 } 334 }
258 335
336 /*
337 * Currently only very few inode sizes are supported.
338 */
339 switch (sbp->sb_inodesize) {
340 case 256:
341 case 512:
342 case 1024:
343 case 2048:
344 break;
345 default:
346 xfs_fs_mount_cmn_err(flags,
347 "inode size of %d bytes not supported",
348 sbp->sb_inodesize);
349 return XFS_ERROR(ENOSYS);
350 }
351
259 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 352 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
260 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 353 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
261 xfs_fs_mount_cmn_err(flags, 354 xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
574 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 667 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
575 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 668 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
576 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 669 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
577 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
578 mp->m_blockmask = sbp->sb_blocksize - 1; 670 mp->m_blockmask = sbp->sb_blocksize - 1;
579 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 671 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
580 mp->m_blockwmask = mp->m_blockwsize - 1; 672 mp->m_blockwmask = mp->m_blockwsize - 1;
581 673
582 /*
583 * Setup for attributes, in case they get created.
584 * This value is for inodes getting attributes for the first time,
585 * the per-inode value is for old attribute values.
586 */
587 ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
588 switch (sbp->sb_inodesize) {
589 case 256:
590 mp->m_attroffset = XFS_LITINO(mp) -
591 XFS_BMDR_SPACE_CALC(MINABTPTRS);
592 break;
593 case 512:
594 case 1024:
595 case 2048:
596 mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
597 break;
598 default:
599 ASSERT(0);
600 }
601 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
602
603 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); 674 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
604 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); 675 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
605 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; 676 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
645 for (index = 0; index < agcount; index++) { 716 for (index = 0; index < agcount; index++) {
646 /* 717 /*
647 * read the agf, then the agi. This gets us 718 * read the agf, then the agi. This gets us
648 * all the inforamtion we need and populates the 719 * all the information we need and populates the
649 * per-ag structures for us. 720 * per-ag structures for us.
650 */ 721 */
651 error = xfs_alloc_pagf_init(mp, NULL, index, 0); 722 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
886} 957}
887 958
888/* 959/*
889 * xfs_mountfs
890 *
891 * This function does the following on an initial mount of a file system: 960 * This function does the following on an initial mount of a file system:
892 * - reads the superblock from disk and init the mount struct 961 * - reads the superblock from disk and init the mount struct
893 * - if we're a 32-bit kernel, do a size check on the superblock 962 * - if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
905 xfs_inode_t *rip; 974 xfs_inode_t *rip;
906 __uint64_t resblks; 975 __uint64_t resblks;
907 uint quotamount, quotaflags; 976 uint quotamount, quotaflags;
908 int uuid_mounted = 0;
909 int error = 0; 977 int error = 0;
910 978
911 xfs_mount_common(mp, sbp); 979 xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@ xfs_mountfs(
960 */ 1028 */
961 error = xfs_update_alignment(mp); 1029 error = xfs_update_alignment(mp);
962 if (error) 1030 if (error)
963 goto error1; 1031 goto out;
964 1032
965 xfs_alloc_compute_maxlevels(mp); 1033 xfs_alloc_compute_maxlevels(mp);
966 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); 1034 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@ xfs_mountfs(
971 1039
972 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); 1040 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
973 1041
974 /* 1042 error = xfs_uuid_mount(mp);
975 * XFS uses the uuid from the superblock as the unique 1043 if (error)
976 * identifier for fsid. We can not use the uuid from the volume 1044 goto out;
977 * since a single partition filesystem is identical to a single
978 * partition volume/filesystem.
979 */
980 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
981 if (xfs_uuid_mount(mp)) {
982 error = XFS_ERROR(EINVAL);
983 goto error1;
984 }
985 uuid_mounted=1;
986 }
987 1045
988 /* 1046 /*
989 * Set the minimum read and write sizes 1047 * Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@ xfs_mountfs(
1007 */ 1065 */
1008 error = xfs_check_sizes(mp); 1066 error = xfs_check_sizes(mp);
1009 if (error) 1067 if (error)
1010 goto error1; 1068 goto out_remove_uuid;
1011 1069
1012 /* 1070 /*
1013 * Initialize realtime fields in the mount structure 1071 * Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@ xfs_mountfs(
1015 error = xfs_rtmount_init(mp); 1073 error = xfs_rtmount_init(mp);
1016 if (error) { 1074 if (error) {
1017 cmn_err(CE_WARN, "XFS: RT mount failed"); 1075 cmn_err(CE_WARN, "XFS: RT mount failed");
1018 goto error1; 1076 goto out_remove_uuid;
1019 } 1077 }
1020 1078
1021 /* 1079 /*
@@ -1045,26 +1103,26 @@ xfs_mountfs(
1045 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), 1103 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
1046 KM_MAYFAIL); 1104 KM_MAYFAIL);
1047 if (!mp->m_perag) 1105 if (!mp->m_perag)
1048 goto error1; 1106 goto out_remove_uuid;
1049 1107
1050 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); 1108 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1051 1109
1110 if (!sbp->sb_logblocks) {
1111 cmn_err(CE_WARN, "XFS: no log defined");
1112 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1113 error = XFS_ERROR(EFSCORRUPTED);
1114 goto out_free_perag;
1115 }
1116
1052 /* 1117 /*
1053 * log's mount-time initialization. Perform 1st part recovery if needed 1118 * log's mount-time initialization. Perform 1st part recovery if needed
1054 */ 1119 */
1055 if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ 1120 error = xfs_log_mount(mp, mp->m_logdev_targp,
1056 error = xfs_log_mount(mp, mp->m_logdev_targp, 1121 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1057 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1122 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1058 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1123 if (error) {
1059 if (error) { 1124 cmn_err(CE_WARN, "XFS: log mount failed");
1060 cmn_err(CE_WARN, "XFS: log mount failed"); 1125 goto out_free_perag;
1061 goto error2;
1062 }
1063 } else { /* No log has been defined */
1064 cmn_err(CE_WARN, "XFS: no log defined");
1065 XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
1066 error = XFS_ERROR(EFSCORRUPTED);
1067 goto error2;
1068 } 1126 }
1069 1127
1070 /* 1128 /*
@@ -1086,15 +1144,14 @@ xfs_mountfs(
1086 * If we are currently making the filesystem, the initialisation will 1144 * If we are currently making the filesystem, the initialisation will
1087 * fail as the perag data is in an undefined state. 1145 * fail as the perag data is in an undefined state.
1088 */ 1146 */
1089
1090 if (xfs_sb_version_haslazysbcount(&mp->m_sb) && 1147 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1091 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && 1148 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1092 !mp->m_sb.sb_inprogress) { 1149 !mp->m_sb.sb_inprogress) {
1093 error = xfs_initialize_perag_data(mp, sbp->sb_agcount); 1150 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1094 if (error) { 1151 if (error)
1095 goto error2; 1152 goto out_free_perag;
1096 }
1097 } 1153 }
1154
1098 /* 1155 /*
1099 * Get and sanity-check the root inode. 1156 * Get and sanity-check the root inode.
1100 * Save the pointer to it in the mount structure. 1157 * Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@ xfs_mountfs(
1102 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); 1159 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
1103 if (error) { 1160 if (error) {
1104 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1161 cmn_err(CE_WARN, "XFS: failed to read root inode");
1105 goto error3; 1162 goto out_log_dealloc;
1106 } 1163 }
1107 1164
1108 ASSERT(rip != NULL); 1165 ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@ xfs_mountfs(
1116 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1173 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
1117 mp); 1174 mp);
1118 error = XFS_ERROR(EFSCORRUPTED); 1175 error = XFS_ERROR(EFSCORRUPTED);
1119 goto error4; 1176 goto out_rele_rip;
1120 } 1177 }
1121 mp->m_rootip = rip; /* save it */ 1178 mp->m_rootip = rip; /* save it */
1122 1179
@@ -1131,7 +1188,7 @@ xfs_mountfs(
1131 * Free up the root inode. 1188 * Free up the root inode.
1132 */ 1189 */
1133 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1190 cmn_err(CE_WARN, "XFS: failed to read RT inodes");
1134 goto error4; 1191 goto out_rele_rip;
1135 } 1192 }
1136 1193
1137 /* 1194 /*
@@ -1143,7 +1200,7 @@ xfs_mountfs(
1143 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1200 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1144 if (error) { 1201 if (error) {
1145 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1202 cmn_err(CE_WARN, "XFS: failed to write sb changes");
1146 goto error4; 1203 goto out_rtunmount;
1147 } 1204 }
1148 } 1205 }
1149 1206
@@ -1152,7 +1209,7 @@ xfs_mountfs(
1152 */ 1209 */
1153 error = XFS_QM_INIT(mp, &quotamount, &quotaflags); 1210 error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
1154 if (error) 1211 if (error)
1155 goto error4; 1212 goto out_rtunmount;
1156 1213
1157 /* 1214 /*
1158 * Finish recovering the file system. This part needed to be 1215 * Finish recovering the file system. This part needed to be
@@ -1162,7 +1219,7 @@ xfs_mountfs(
1162 error = xfs_log_mount_finish(mp); 1219 error = xfs_log_mount_finish(mp);
1163 if (error) { 1220 if (error) {
1164 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1221 cmn_err(CE_WARN, "XFS: log mount finish failed");
1165 goto error4; 1222 goto out_rtunmount;
1166 } 1223 }
1167 1224
1168 /* 1225 /*
@@ -1170,7 +1227,7 @@ xfs_mountfs(
1170 */ 1227 */
1171 error = XFS_QM_MOUNT(mp, quotamount, quotaflags); 1228 error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
1172 if (error) 1229 if (error)
1173 goto error4; 1230 goto out_rtunmount;
1174 1231
1175 /* 1232 /*
1176 * Now we are mounted, reserve a small amount of unused space for 1233 * Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@ xfs_mountfs(
1194 1251
1195 return 0; 1252 return 0;
1196 1253
1197 error4: 1254 out_rtunmount:
1198 /* 1255 xfs_rtunmount_inodes(mp);
1199 * Free up the root inode. 1256 out_rele_rip:
1200 */
1201 IRELE(rip); 1257 IRELE(rip);
1202 error3: 1258 out_log_dealloc:
1203 xfs_log_unmount_dealloc(mp); 1259 xfs_log_unmount(mp);
1204 error2: 1260 out_free_perag:
1205 xfs_free_perag(mp); 1261 xfs_free_perag(mp);
1206 error1: 1262 out_remove_uuid:
1207 if (uuid_mounted) 1263 xfs_uuid_unmount(mp);
1208 uuid_table_remove(&mp->m_sb.sb_uuid); 1264 out:
1209 return error; 1265 return error;
1210} 1266}
1211 1267
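The relabelled exit path above swaps the numbered error1..error4 targets for labels that name the teardown step they begin with, so each failure site jumps to exactly the undo work it needs and falls through the rest in reverse order of setup; the uuid_mounted flag becomes unnecessary because the new xfs_uuid_unmount() handles the no-UUID case itself. The general shape of the idiom, reduced to two hypothetical resources for illustration (acquire_a/release_a and friends are not XFS functions):

	int setup(void)
	{
		int error;

		error = acquire_a();		/* illustrative names only */
		if (error)
			goto out;
		error = acquire_b();
		if (error)
			goto out_release_a;
		return 0;

	out_release_a:
		release_a();			/* undo in reverse order of setup */
	out:
		return error;
	}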
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
1226 */ 1282 */
1227 XFS_QM_UNMOUNT(mp); 1283 XFS_QM_UNMOUNT(mp);
1228 1284
1229 if (mp->m_rbmip) 1285 xfs_rtunmount_inodes(mp);
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1233 IRELE(mp->m_rootip); 1286 IRELE(mp->m_rootip);
1234 1287
1235 /* 1288 /*
1236 * We can potentially deadlock here if we have an inode cluster 1289 * We can potentially deadlock here if we have an inode cluster
1237 * that has been freed has it's buffer still pinned in memory because 1290 * that has been freed has its buffer still pinned in memory because
1238 * the transaction is still sitting in a iclog. The stale inodes 1291 * the transaction is still sitting in a iclog. The stale inodes
1239 * on that buffer will have their flush locks held until the 1292 * on that buffer will have their flush locks held until the
1240 * transaction hits the disk and the callbacks run. the inode 1293 * transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@ xfs_unmountfs(
1266 * Unreserve any blocks we have so that when we unmount we don't account 1319 * Unreserve any blocks we have so that when we unmount we don't account
1267 * the reserved free space as used. This is really only necessary for 1320 * the reserved free space as used. This is really only necessary for
1268 * lazy superblock counting because it trusts the incore superblock 1321 * lazy superblock counting because it trusts the incore superblock
1269 * counters to be aboslutely correct on clean unmount. 1322 * counters to be absolutely correct on clean unmount.
1270 * 1323 *
1271 * We don't bother correcting this elsewhere for lazy superblock 1324 * We don't bother correcting this elsewhere for lazy superblock
1272 * counting because on mount of an unclean filesystem we reconstruct the 1325 * counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@ xfs_unmountfs(
1288 "Freespace may not be correct on next mount."); 1341 "Freespace may not be correct on next mount.");
1289 xfs_unmountfs_writesb(mp); 1342 xfs_unmountfs_writesb(mp);
1290 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1343 xfs_unmountfs_wait(mp); /* wait for async bufs */
1291 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1344 xfs_log_unmount_write(mp);
1292 1345 xfs_log_unmount(mp);
1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1346 xfs_uuid_unmount(mp);
1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1295 1347
1296#if defined(DEBUG) 1348#if defined(DEBUG)
1297 xfs_errortag_clearall(mp, 0); 1349 xfs_errortag_clearall(mp, 0);
@@ -1793,29 +1845,6 @@ xfs_freesb(
1793} 1845}
1794 1846
1795/* 1847/*
1796 * See if the UUID is unique among mounted XFS filesystems.
1797 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
1798 */
1799STATIC int
1800xfs_uuid_mount(
1801 xfs_mount_t *mp)
1802{
1803 if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
1804 cmn_err(CE_WARN,
1805 "XFS: Filesystem %s has nil UUID - can't mount",
1806 mp->m_fsname);
1807 return -1;
1808 }
1809 if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
1810 cmn_err(CE_WARN,
1811 "XFS: Filesystem %s has duplicate UUID - can't mount",
1812 mp->m_fsname);
1813 return -1;
1814 }
1815 return 0;
1816}
1817
1818/*
1819 * Used to log changes to the superblock unit and width fields which could 1848 * Used to log changes to the superblock unit and width fields which could
1820 * be altered by the mount options, as well as any potential sb_features2 1849 * be altered by the mount options, as well as any potential sb_features2
1821 * fixup. Only the first superblock is updated. 1850 * fixup. Only the first superblock is updated.
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
1868 * we disable the per-cpu counter and go through the slow path. 1897 * we disable the per-cpu counter and go through the slow path.
1869 * 1898 *
1870 * The slow path is the current xfs_mod_incore_sb() function. This means that 1899 * The slow path is the current xfs_mod_incore_sb() function. This means that
1871 * when we disable a per-cpu counter, we need to drain it's resources back to 1900 * when we disable a per-cpu counter, we need to drain its resources back to
1872 * the global superblock. We do this after disabling the counter to prevent 1901 * the global superblock. We do this after disabling the counter to prevent
1873 * more threads from queueing up on the counter. 1902 * more threads from queueing up on the counter.
1874 * 1903 *
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f5e9937f9bdb..7af44adffc8f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
136 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
140 139
141typedef struct xfs_qmops { 140typedef struct xfs_qmops {
142 xfs_qminit_t xfs_qminit; 141 xfs_qminit_t xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
154 xfs_dqvopchownresv_t xfs_dqvopchownresv; 153 xfs_dqvopchownresv_t xfs_dqvopchownresv;
155 xfs_dqstatvfs_t xfs_dqstatvfs; 154 xfs_dqstatvfs_t xfs_dqstatvfs;
156 xfs_dqsync_t xfs_dqsync; 155 xfs_dqsync_t xfs_dqsync;
157 xfs_quotactl_t xfs_quotactl;
158 struct xfs_dqtrxops *xfs_dqtrxops; 156 struct xfs_dqtrxops *xfs_dqtrxops;
159} xfs_qmops_t; 157} xfs_qmops_t;
160 158
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
188 (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp) 186 (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
189#define XFS_QM_DQSYNC(mp, flags) \ 187#define XFS_QM_DQSYNC(mp, flags) \
190 (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags) 188 (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
191#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
192 (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
193 189
194#ifdef HAVE_PERCPU_SB 190#ifdef HAVE_PERCPU_SB
195 191
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
273 uint m_inobt_mnr[2]; /* min inobt btree records */ 269 uint m_inobt_mnr[2]; /* min inobt btree records */
274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 270 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 271 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 272 uint m_in_maxlevels; /* max inobt btree levels. */
277 struct xfs_perag *m_perag; /* per-ag accounting info */ 273 struct xfs_perag *m_perag; /* per-ag accounting info */
278 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ 274 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */
279 struct mutex m_growlock; /* growfs mutex */ 275 struct mutex m_growlock; /* growfs mutex */
280 int m_fixedfsid[2]; /* unchanged for life of FS */ 276 int m_fixedfsid[2]; /* unchanged for life of FS */
281 uint m_dmevmask; /* DMI events for this FS */ 277 uint m_dmevmask; /* DMI events for this FS */
282 __uint64_t m_flags; /* global mount flags */ 278 __uint64_t m_flags; /* global mount flags */
283 uint m_attroffset; /* inode attribute offset */
284 uint m_dir_node_ents; /* #entries in a dir danode */ 279 uint m_dir_node_ents; /* #entries in a dir danode */
285 uint m_attr_node_ents; /* #entries in attr danode */ 280 uint m_attr_node_ents; /* #entries in attr danode */
286 int m_ialloc_inos; /* inodes in inode allocation */ 281 int m_ialloc_inos; /* inodes in inode allocation */
287 int m_ialloc_blks; /* blocks in inode allocation */ 282 int m_ialloc_blks; /* blocks in inode allocation */
288 int m_litino; /* size of inode union area */
289 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 283 int m_inoalign_mask;/* mask sb_inoalignmt if used */
290 uint m_qflags; /* quota status flags */ 284 uint m_qflags; /* quota status flags */
291 xfs_trans_reservations_t m_reservations;/* precomputed res values */ 285 xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
293 __uint64_t m_maxioffset; /* maximum inode offset */ 287 __uint64_t m_maxioffset; /* maximum inode offset */
294 __uint64_t m_resblks; /* total reserved blocks */ 288 __uint64_t m_resblks; /* total reserved blocks */
295 __uint64_t m_resblks_avail;/* available reserved blocks */ 289 __uint64_t m_resblks_avail;/* available reserved blocks */
296#if XFS_BIG_INUMS
297 xfs_ino_t m_inoadd; /* add value for ino64_offset */
298#endif
299 int m_dalign; /* stripe unit */ 290 int m_dalign; /* stripe unit */
300 int m_swidth; /* stripe width */ 291 int m_swidth; /* stripe width */
301 int m_sinoalign; /* stripe unit inode alignment */ 292 int m_sinoalign; /* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
337#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 328#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
338 must be synchronous except 329 must be synchronous except
339 for space allocations */ 330 for space allocations */
340#define XFS_MOUNT_INO64 (1ULL << 1)
341#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 331#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
342#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 332#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
343#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 333#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
389 * Synchronous read and write sizes. This should be 379 * Synchronous read and write sizes. This should be
390 * better for NFSv2 wsync filesystems. 380 * better for NFSv2 wsync filesystems.
391 */ 381 */
392#define XFS_WSYNC_READIO_LOG 15 /* 32K */ 382#define XFS_WSYNC_READIO_LOG 15 /* 32k */
393#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */ 383#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
394 384
395/* 385/*
396 * Allow large block sizes to be reported to userspace programs if the 386 * Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
500 int64_t msb_delta; /* Change to make to specified field */ 490 int64_t msb_delta; /* Change to make to specified field */
501} xfs_mod_sb_t; 491} xfs_mod_sb_t;
502 492
503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
505
506extern int xfs_log_sbcount(xfs_mount_t *, uint); 493extern int xfs_log_sbcount(xfs_mount_t *, uint);
507extern int xfs_mountfs(xfs_mount_t *mp); 494extern int xfs_mountfs(xfs_mount_t *mp);
508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 495extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 27f80581520a..e101790ea8e7 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
126 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, 126 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr,
127 .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval, 127 .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval,
128 .xfs_dqsync = (xfs_dqsync_t) fs_noerr, 128 .xfs_dqsync = (xfs_dqsync_t) fs_noerr,
129 .xfs_quotactl = (xfs_quotactl_t) fs_nosys,
130}; 129};
131 130
132int 131int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 48965ecaa155..f5d1202dde25 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_QUOTA_H__ 18#ifndef __XFS_QUOTA_H__
19#define __XFS_QUOTA_H__ 19#define __XFS_QUOTA_H__
20 20
21struct xfs_trans;
22
21/* 23/*
22 * The ondisk form of a dquot structure. 24 * The ondisk form of a dquot structure.
23 */ 25 */
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
185 * to a single function. None of these XFS_QMOPT_* flags are meant to have 187 * to a single function. None of these XFS_QMOPT_* flags are meant to have
186 * persistent values (ie. their values can and will change between versions) 188 * persistent values (ie. their values can and will change between versions)
187 */ 189 */
188#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */
189#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ 190#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
190#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ 191#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
191#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ 192#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c5bb86f3ec05..385f6dceba5d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
2288 return 0; 2288 return 0;
2289} 2289}
2290 2290
2291void
2292xfs_rtunmount_inodes(
2293 struct xfs_mount *mp)
2294{
2295 if (mp->m_rbmip)
2296 IRELE(mp->m_rbmip);
2297 if (mp->m_rsumip)
2298 IRELE(mp->m_rsumip);
2299}
2300
2291/* 2301/*
2292 * Pick an extent for allocation at the start of a new realtime file. 2302 * Pick an extent for allocation at the start of a new realtime file.
2293 * Use the sequence number stored in the atime field of the bitmap inode. 2303 * Use the sequence number stored in the atime field of the bitmap inode.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8d8dcd215716..b2d67adb6a08 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,8 +23,8 @@ struct xfs_trans;
23 23
24/* Min and max rt extent sizes, specified in bytes */ 24/* Min and max rt extent sizes, specified in bytes */
25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ 25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ 26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */ 27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
28 28
29/* 29/*
30 * Constants for bit manipulations. 30 * Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
108int /* error */ 108int /* error */
109xfs_rtmount_init( 109xfs_rtmount_init(
110 struct xfs_mount *mp); /* file system mount structure */ 110 struct xfs_mount *mp); /* file system mount structure */
111void
112xfs_rtunmount_inodes(
113 struct xfs_mount *mp);
111 114
112/* 115/*
113 * Get the bitmap and summary inodes into the mount structure 116 * Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
146# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
147# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
148# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m)
149#endif /* CONFIG_XFS_RT */ 153#endif /* CONFIG_XFS_RT */
150 154
151#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d6fe4a88d79f..775249a54f6f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
292 * In a write transaction we can allocate a maximum of 2 292 * In a write transaction we can allocate a maximum of 2
293 * extents. This gives: 293 * extents. This gives:
294 * the inode getting the new extents: inode size 294 * the inode getting the new extents: inode size
295 * the inode\'s bmap btree: max depth * block size 295 * the inode's bmap btree: max depth * block size
296 * the agfs of the ags from which the extents are allocated: 2 * sector 296 * the agfs of the ags from which the extents are allocated: 2 * sector
297 * the superblock free block counter: sector size 297 * the superblock free block counter: sector size
298 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size 298 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
321/* 321/*
322 * In truncating a file we free up to two extents at once. We can modify: 322 * In truncating a file we free up to two extents at once. We can modify:
323 * the inode being truncated: inode size 323 * the inode being truncated: inode size
324 * the inode\'s bmap btree: (max depth + 1) * block size 324 * the inode's bmap btree: (max depth + 1) * block size
325 * And the bmap_finish transaction can free the blocks and bmap blocks: 325 * And the bmap_finish transaction can free the blocks and bmap blocks:
326 * the agf for each of the ags: 4 * sector size 326 * the agf for each of the ags: 4 * sector size
327 * the agfl for each of the ags: 4 * sector size 327 * the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
343 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ 343 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
344 (128 * 5) + \ 344 (128 * 5) + \
345 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 345 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
346 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 346 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
347 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 347 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
348 348
349#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 349#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
431 * the new inode: inode size 431 * the new inode: inode size
432 * the inode btree entry: 1 block 432 * the inode btree entry: 1 block
433 * the directory btree: (max depth + v2) * dir block size 433 * the directory btree: (max depth + v2) * dir block size
434 * the directory inode\'s bmap btree: (max depth + v2) * block size 434 * the directory inode's bmap btree: (max depth + v2) * block size
435 * the blocks for the symlink: 1 KB 435 * the blocks for the symlink: 1 kB
436 * Or in the first xact we allocate some inodes giving: 436 * Or in the first xact we allocate some inodes giving:
437 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 437 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
438 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 438 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
449 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ 449 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
450 (2 * (mp)->m_sb.sb_sectsize + \ 450 (2 * (mp)->m_sb.sb_sectsize + \
451 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ 451 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
452 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ 452 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
453 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 453 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
454 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 454 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
455 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 455 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
456 456
457#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 457#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the inode btree entry: block size
  *    the superblock for the nlink flag: sector size
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
 	(3 * (mp)->m_sb.sb_sectsize + \
 	 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
 	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	  XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
 	(128 * 5) + \
 	XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	(128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	(128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 
 
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Removing the attribute fork of a file
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
  * And the bmap_finish transaction can free the blocks and bmap blocks:
  *    the agf for each of the ags: 4 * sector size
  *    the agfl for each of the ags: 4 * sector size
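
The reservation macros in these hunks all share one shape: take the worse of the "modify" path and the "free" path, charge each logged region its byte size, and add what appears to be a fixed 128-byte per-region logging overhead (the 128 * N terms). A minimal standalone model of that arithmetic, with an invented struct and made-up geometry rather than the kernel's xfs_mount:

#include <stdio.h>

/* Illustrative stand-in for the few geometry fields the reservation
 * macros consult; this is not the real xfs_mount structure. */
struct model_mount {
	unsigned int sectsize;		/* sector size in bytes */
	unsigned int blocksize;		/* fs block size in bytes */
	unsigned int inobt_maxlevels;	/* analogue of m_in_maxlevels */
};

/* Fixed per-region log overhead, mirroring the 128 * N terms above. */
#define LOG_REGION_OVERHEAD	128

/* Worst-case bytes to log 'nregions' regions of 'bytes' each: the
 * data itself plus the fixed per-region overhead. */
static unsigned int log_cost(unsigned int nregions, unsigned int bytes)
{
	return nregions * (bytes + LOG_REGION_OVERHEAD);
}

int main(void)
{
	struct model_mount mp = { 512, 4096, 3 };

	/* "Modify" path: the inode core plus a full bmap btree split. */
	unsigned int modify = log_cost(1, 256) +
			      log_cost(mp.inobt_maxlevels + 1, mp.blocksize);

	/* "Free" path: AGF and AGFL headers for four AGs plus the
	 * freed blocks themselves. */
	unsigned int free_path = log_cost(4, mp.sectsize) +
				 log_cost(4, mp.sectsize) +
				 log_cost(4, mp.blocksize);

	/* A permanent reservation must cover whichever path is worse. */
	printf("reservation = %u bytes\n",
	       modify > free_path ? modify : free_path);
	return 0;
}

Compiled on its own this prints whichever path dominates; the macros above do the same arithmetic at preprocessor level against the live superblock geometry.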
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2d47f10f8bed..f31271c30de9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
  * the push is run asynchronously in a separate thread, so we return the tail
  * of the log right now instead of the tail after the push. This means we will
  * either continue right away, or we will sleep waiting on the async thread to
- * do it's work.
+ * do its work.
  *
  * We do this unlocked - we only need to know whether there is anything in the
  * AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
 /*
  * Now that the traversal is complete, we need to remove the cursor
  * from the list of traversing cursors. Avoid removing the embedded
- * push cursor, but use the fact it is alway present to make the
+ * push cursor, but use the fact it is always present to make the
  * list deletion simple.
  */
 void
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index e110bf57d7f4..eb3fc57f9eef 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,7 +22,7 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+/* XXX: from here down needed until struct xfs_trans has its own ailp */
 #include "xfs_bit.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 4ea2e5074bdd..7d2c920dfb9c 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define	XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define	XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
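
The rewritten XFS_IALLOC_SPACE_RES reads straight off the mount structure: the blocks of the inode chunk itself plus up to (mp)->m_in_maxlevels - 1 further blocks for inode-btree splits. A toy calculation with assumed geometry (the numbers are illustrative, not taken from any real superblock):

#include <stdio.h>

/* Assumed numbers: a 16 KB inode cluster on 4 KB blocks gives 4 chunk
 * blocks; with a 3-level inode btree the macro charges two extra
 * blocks for possible splits below the existing root. */
int main(void)
{
	unsigned int ialloc_blocks = 4;	/* XFS_IALLOC_BLOCKS analogue */
	unsigned int in_maxlevels = 3;	/* m_in_maxlevels analogue */

	printf("XFS_IALLOC_SPACE_RES = %u blocks\n",
	       ialloc_blocks + in_maxlevels - 1);	/* prints 6 */
	return 0;
}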
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b2f724502f1b..d725428c9df6 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -21,14 +21,6 @@
 #ifdef __KERNEL__
 
 /*
- * POSIX Extensions
- */
-typedef unsigned char		uchar_t;
-typedef unsigned short		ushort_t;
-typedef unsigned int		uint_t;
-typedef unsigned long		ulong_t;
-
-/*
  * Additional type declarations for XFS
  */
 typedef signed char		__int8_t;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fcc2285d03ed..79b9e5ea5359 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -374,7 +374,7 @@ xfs_truncate_file(
 
 	/*
 	 * Follow the normal truncate locking protocol.  Since we
-	 * hold the inode in the transaction, we know that it's number
+	 * hold the inode in the transaction, we know that its number
 	 * of references will stay constant.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0e55c5d7db5f..7394c7af5de5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1136,7 +1136,7 @@ xfs_inactive(
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
 	 */
-	if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
+	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
 		ASSERT(ip->i_df.if_real_bytes == 0);
 		ASSERT(ip->i_df.if_broot_bytes == 0);
 		return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t		*ip;
-	xfs_trans_t		*tp;
+	int			is_dir = S_ISDIR(mode);
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_inode	*ip = NULL;
+	struct xfs_trans	*tp = NULL;
 	int			error;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		unlock_dp_on_error = B_FALSE;
-	int			dm_event_sent = 0;
 	uint			cancel_flags;
 	int			committed;
 	xfs_prid_t		prid;
-	struct xfs_dquot	*udqp, *gdqp;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
+	uint			log_res;
+	uint			log_count;
 
-	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 					dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
 
 		if (error)
 			return error;
-		dm_event_sent = 1;
 	}
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/* Return through std_return after this point. */
-
-	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = dfltprid;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = XFS_QM_DQVOPALLOC(mp, dp,
 			current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
 
-	ip = NULL;
+	if (is_dir) {
+		rdev = 0;
+		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+		log_res = XFS_MKDIR_LOG_RES(mp);
+		log_count = XFS_MKDIR_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	} else {
+		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+		log_res = XFS_CREATE_LOG_RES(mp);
+		log_count = XFS_CREATE_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+
 	/*
 	 * Initially assume that the file does not exist and
 	 * reserve the resources for that case.  If that is not
 	 * the case we'll drop the one we have and get a more
 	 * appropriate transaction later.
 	 */
-	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, 0, log_res, 0,
+				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error) {
 		cancel_flags = 0;
-		goto error_return;
+		goto out_trans_cancel;
 	}
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = B_TRUE;
 
-	xfs_bmap_init(&free_list, &first_block);
+	/*
+	 * Check for directory link count overflow.
+	 */
+	if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto out_trans_cancel;
+	}
 
-	ASSERT(ip == NULL);
+	xfs_bmap_init(&free_list, &first_block);
 
 	/*
 	 * Reserve disk quota and the inode.
 	 */
 	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
 	if (error)
-		goto error_return;
+		goto out_trans_cancel;
 
 	error = xfs_dir_canenter(tp, dp, name, resblks);
 	if (error)
-		goto error_return;
-	error = xfs_dir_ialloc(&tp, dp, mode, 1,
-			rdev, credp, prid, resblks > 0,
-			&ip, &committed);
+		goto out_trans_cancel;
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to them, but a directory also the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
-			goto error_return;
-		goto abort_return;
+			goto out_trans_cancel;
+		goto out_trans_abort;
 	}
-	xfs_itrace_ref(ip);
 
 	/*
 	 * At this point, we've gotten a newly allocated inode.
 	 * It is locked (and joined to the transaction).
 	 */
-
+	xfs_itrace_ref(ip);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	/*
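
The reworked prologue above folds mkdir into create: it picks the transaction type, space reservation, log reservation and log count from is_dir up front, then reserves with a fallback, retrying with zero blocks when the full reservation returns ENOSPC. A self-contained sketch of just that retry shape, using hypothetical stand-ins (struct trans and trans_reserve are stubs, not the kernel API):

#include <errno.h>
#include <stdio.h>

/* Hypothetical, simplified stand-ins; only the control flow matters. */
struct trans { int dummy; };

/* Stub: pretend a full block reservation fails with ENOSPC but a
 * zero-block reservation succeeds. */
static int trans_reserve(struct trans *tp, unsigned int blocks,
			 unsigned int log_res, unsigned int log_count)
{
	(void)tp; (void)log_res; (void)log_count;
	return blocks ? ENOSPC : 0;
}

/* First ask for the full block reservation; if the fs is out of
 * space, retry with zero blocks so an operation that fits in
 * already-allocated metadata can still proceed. */
static int reserve_with_retry(struct trans *tp, unsigned int *resblks,
			      unsigned int log_res, unsigned int log_count)
{
	int error = trans_reserve(tp, *resblks, log_res, log_count);

	if (error == ENOSPC) {
		*resblks = 0;
		error = trans_reserve(tp, 0, log_res, log_count);
	}
	return error;
}

int main(void)
{
	struct trans tp;
	unsigned int resblks = 64;
	int error = reserve_with_retry(&tp, &resblks, 1024, 2);

	printf("error=%d resblks=%u\n", error, resblks);  /* error=0 resblks=0 */
	return 0;
}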
@@ -1508,19 +1526,28 @@ xfs_create(
 					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
-		goto abort_return;
+		goto out_trans_abort;
 	}
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
+	if (is_dir) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			goto out_bmap_cancel;
+
+		error = xfs_bumplink(tp, dp);
+		if (error)
+			goto out_bmap_cancel;
+	}
+
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * create transaction goes to disk before returning to
 	 * the user.
 	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
-	}
 
 	/*
 	 * Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
 	IHOLD(ip);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		xfs_bmap_cancel(&free_list);
-		goto abort_rele;
-	}
+	if (error)
+		goto out_abort_rele;
 
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		IRELE(ip);
-		tp = NULL;
-		goto error_return;
+		goto out_dqrele;
 	}
 
 	XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
 	*ipp = ip;
 
 	/* Fallthrough to std_return with error = 0 */
-
-std_return:
-	if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
-	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-			dp, DM_RIGHT_NULL,
-			*ipp ? ip : NULL,
-			DM_RIGHT_NULL, name->name, NULL,
-			mode, error, 0);
+ std_return:
+	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
+		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL, name->name, NULL, mode,
+				error, 0);
 	}
+
 	return error;
 
- abort_return:
+ out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+ out_trans_abort:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
- error_return:
-	if (tp != NULL)
-		xfs_trans_cancel(tp, cancel_flags);
-
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ out_dqrele:
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 
@@ -1583,20 +1603,18 @@ std_return:
 
 	goto std_return;
 
- abort_rele:
+ out_abort_rele:
 	/*
 	 * Wait until after the current transaction is aborted to
 	 * release the inode. This prevents recursive transactions
 	 * and deadlocks from xfs_inactive.
 	 */
+	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
 	xfs_trans_cancel(tp, cancel_flags);
 	IRELE(ip);
-
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-
-	goto std_return;
+	unlock_dp_on_error = B_FALSE;
+	goto out_dqrele;
 }
 
 #ifdef DEBUG
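
The renamed error labels in xfs_create() now form a stack: each out_* label undoes one step and falls through into the cleanup for everything acquired before it, replacing the old abort_return/error_return pair and the duplicated cleanup in abort_rele. A compilable toy showing the same fall-through discipline (every name here is hypothetical):

#include <stdio.h>

/* Stub step: print what it would do and optionally fail. */
static int step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail;
}

/* Each label undoes one step, then falls through to the cleanup for
 * everything set up earlier, so no cleanup code is duplicated. */
static int do_op(int fail_at)
{
	int error;

	if ((error = step("alloc transaction", fail_at == 1)))
		goto out_trans_cancel;
	if ((error = step("start bmap free list", fail_at == 2)))
		goto out_trans_abort;
	if ((error = step("create name", fail_at == 3)))
		goto out_bmap_cancel;
	return 0;

out_bmap_cancel:
	printf("cancel bmap free list\n");
out_trans_abort:
	printf("mark transaction aborted\n");
out_trans_cancel:
	printf("cancel transaction\n");
	return error;
}

int main(void)
{
	return do_op(3);	/* fail at the last step, unwind all three */
}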
@@ -2004,8 +2022,10 @@ xfs_link(
 	/* Return through std_return after this point. */
 
 	error = XFS_QM_DQATTACH(mp, sip, 0);
-	if (!error && sip != tdp)
-		error = XFS_QM_DQATTACH(mp, tdp, 0);
+	if (error)
+		goto std_return;
+
+	error = XFS_QM_DQATTACH(mp, tdp, 0);
 	if (error)
 		goto std_return;
 
@@ -2110,209 +2130,6 @@ std_return:
 	goto std_return;
 }
 
-
-int
-xfs_mkdir(
-	xfs_inode_t		*dp,
-	struct xfs_name		*dir_name,
-	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
-{
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t		*cdp;	/* inode of created dir */
-	xfs_trans_t		*tp;
-	int			cancel_flags;
-	int			error;
-	int			committed;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	boolean_t		unlock_dp_on_error = B_FALSE;
-	boolean_t		created = B_FALSE;
-	int			dm_event_sent = 0;
-	xfs_prid_t		prid;
-	struct xfs_dquot	*udqp, *gdqp;
-	uint			resblks;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	tp = NULL;
-
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-					dp, DM_RIGHT_NULL, NULL,
-					DM_RIGHT_NULL, dir_name->name, NULL,
-					mode, 0, 0);
-		if (error)
-			return error;
-		dm_event_sent = 1;
-	}
-
-	/* Return through std_return after this point. */
-
-	xfs_itrace_entry(dp);
-
-	mp = dp->i_mount;
-	udqp = gdqp = NULL;
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
-	else
-		prid = (xfs_prid_t)dfltprid;
-
-	/*
-	 * Make sure that we have allocated dquot(s) on disk.
-	 */
-	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
-	if (error)
-		goto std_return;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
-	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
-				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
-	if (error == ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_MKDIR_LOG_COUNT);
-	}
-	if (error) {
-		cancel_flags = 0;
-		goto error_return;
-	}
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-	unlock_dp_on_error = B_TRUE;
-
-	/*
-	 * Check for directory link count overflow.
-	 */
-	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
-		error = XFS_ERROR(EMLINK);
-		goto error_return;
-	}
-
-	/*
-	 * Reserve disk quota and the inode.
-	 */
-	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
-	if (error)
-		goto error_return;
-
-	error = xfs_dir_canenter(tp, dp, dir_name, resblks);
-	if (error)
-		goto error_return;
-	/*
-	 * create the directory inode.
-	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, 2,
-			0, credp, prid, resblks > 0,
-		&cdp, NULL);
-	if (error) {
-		if (error == ENOSPC)
-			goto error_return;
-		goto abort_return;
-	}
-	xfs_itrace_ref(cdp);
-
-	/*
-	 * Now we add the directory inode to the transaction.
-	 * We waited until now since xfs_dir_ialloc might start
-	 * a new transaction.  Had we joined the transaction
-	 * earlier, the locks might have gotten released. An error
-	 * from here on will result in the transaction cancel
-	 * unlocking dp so don't do it explicitly in the error path.
-	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	unlock_dp_on_error = B_FALSE;
-
-	xfs_bmap_init(&free_list, &first_block);
-
-	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
-					&first_block, &free_list, resblks ?
-					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-	if (error) {
-		ASSERT(error != ENOSPC);
-		goto error1;
-	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	error = xfs_dir_init(tp, cdp, dp);
-	if (error)
-		goto error2;
-
-	error = xfs_bumplink(tp, dp);
-	if (error)
-		goto error2;
-
-	created = B_TRUE;
-
-	*ipp = cdp;
-	IHOLD(cdp);
-
-	/*
-	 * Attach the dquots to the new inode and modify the icount incore.
-	 */
-	XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * mkdir transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-		xfs_trans_set_sync(tp);
-	}
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		IRELE(cdp);
-		goto error2;
-	}
-
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-	if (error) {
-		IRELE(cdp);
-	}
-
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit. */
-
-std_return:
-	if ((created || (error != 0 && dm_event_sent != 0)) &&
-	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-					dp, DM_RIGHT_NULL,
-					created ? cdp : NULL,
-					DM_RIGHT_NULL,
-					dir_name->name, NULL,
-					mode, error, 0);
-	}
-	return error;
-
- error2:
- error1:
-	xfs_bmap_cancel(&free_list);
- abort_return:
-	cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-	xfs_trans_cancel(tp, cancel_flags);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-
-	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
-}
-
 int
 xfs_symlink(
 	xfs_inode_t		*dp,
@@ -2587,51 +2404,6 @@ std_return:
 }
 
 int
-xfs_inode_flush(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		error = 0;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		return 0;
-
-	/*
-	 * We make this non-blocking if the inode is contended,
-	 * return EAGAIN to indicate to the caller that they
-	 * did not succeed. This prevents the flush path from
-	 * blocking on inodes inside another operation right
-	 * now, they get caught later by xfs_sync.
-	 */
-	if (flags & FLUSH_SYNC) {
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-	} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			return EAGAIN;
-		}
-	} else {
-		return EAGAIN;
-	}
-
-	error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
-				    : XFS_IFLUSH_ASYNC_NOBLOCK);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	return error;
-}
-
-
-int
 xfs_set_dmattrs(
 	xfs_inode_t	*ip,
 	u_int		evmask,
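
The deleted xfs_inode_flush() is a compact example of a sync/async locking split: a synchronous flush blocks for the inode lock, while the async path only trylocks and reports EAGAIN so a contended inode gets picked up by a later sync pass instead of stalling the caller. A user-space sketch of that shape, with a pthread mutex standing in for the kernel's inode locks:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

/* Sync callers block for the lock; async callers trylock and hand
 * back EAGAIN so the flush can be retried by a later pass. */
static int flush_inode(int sync)
{
	if (sync) {
		pthread_mutex_lock(&ilock);
	} else if (pthread_mutex_trylock(&ilock) != 0) {
		return EAGAIN;	/* contended: don't block, retry later */
	}

	printf("flushing (%s)\n", sync ? "sync" : "async");
	pthread_mutex_unlock(&ilock);
	return 0;
}

int main(void)
{
	printf("rc=%d\n", flush_inode(0));	/* async, uncontended: 0 */
	printf("rc=%d\n", flush_inode(1));	/* sync: 0 */
	return 0;
}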
@@ -2676,7 +2448,7 @@ xfs_reclaim(
 	ASSERT(!VN_MAPPED(VFS_I(ip)));
 
 	/* bad inode, get out here ASAP */
-	if (VN_BAD(VFS_I(ip))) {
+	if (is_bad_inode(VFS_I(ip))) {
 		xfs_ireclaim(ip);
 		return 0;
 	}
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
-	 * If its a realtime file & can't use unwritten extents then we
+	 * If it's a realtime file & can't use unwritten extents then we
 	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
 	 * will take care of it for us.
 	 */
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 76df328c61b4..04373c6c61ff 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
-		mode_t mode, struct xfs_inode **ipp, cred_t *credp);
 int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
 		xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, mode_t mode, struct xfs_inode **ipp,
 		cred_t *credp);
-int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,