author    Thomas Gleixner <tglx@linutronix.de>  2009-05-20 03:02:28 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2009-05-20 03:02:28 -0400
commit    521c180874dae86f675d23c4eade4dba8b1f2cc8 (patch)
tree      7509303da3a9a1b40a26f6811f321c89cd31737b /fs
parent    f1a11e0576c7a73d759d05d776692b2b2d37172b (diff)
parent    64d1304a64477629cb16b75491a77bafe6f86963 (diff)
Merge branch 'core/urgent' into core/futexes
Merge reason: this branch was on a pre-rc1 base; merge it up to -rc6+ to get the latest upstream fixes.

Conflicts:
	kernel/futex.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_super.c | 12
-rw-r--r--  fs/Kconfig | 27
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/affs/super.c | 3
-rw-r--r--  fs/afs/file.c | 2
-rw-r--r--  fs/afs/netdevices.c | 3
-rw-r--r--  fs/afs/super.c | 7
-rw-r--r--  fs/autofs/dirhash.c | 34
-rw-r--r--  fs/autofs4/dev-ioctl.c | 12
-rw-r--r--  fs/autofs4/expire.c | 4
-rw-r--r--  fs/befs/debug.c | 1
-rw-r--r--  fs/befs/super.c | 1
-rw-r--r--  fs/binfmt_elf_fdpic.c | 4
-rw-r--r--  fs/bio.c | 125
-rw-r--r--  fs/btrfs/Makefile | 19
-rw-r--r--  fs/btrfs/acl.c | 18
-rw-r--r--  fs/btrfs/async-thread.c | 60
-rw-r--r--  fs/btrfs/async-thread.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 56
-rw-r--r--  fs/btrfs/ctree.h | 6
-rw-r--r--  fs/btrfs/disk-io.c | 104
-rw-r--r--  fs/btrfs/extent-tree.c | 51
-rw-r--r--  fs/btrfs/extent_io.c | 167
-rw-r--r--  fs/btrfs/extent_map.c | 17
-rw-r--r--  fs/btrfs/file.c | 95
-rw-r--r--  fs/btrfs/free-space-cache.c | 15
-rw-r--r--  fs/btrfs/inode-map.c | 2
-rw-r--r--  fs/btrfs/inode.c | 185
-rw-r--r--  fs/btrfs/ioctl.c | 62
-rw-r--r--  fs/btrfs/ordered-data.c | 2
-rw-r--r--  fs/btrfs/super.c | 56
-rw-r--r--  fs/btrfs/transaction.c | 6
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 159
-rw-r--r--  fs/btrfs/volumes.h | 16
-rw-r--r--  fs/buffer.c | 101
-rw-r--r--  fs/cifs/CHANGES | 16
-rw-r--r--  fs/cifs/README | 10
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 32
-rw-r--r--  fs/cifs/cifs_spnego.c | 2
-rw-r--r--  fs/cifs/cifs_unicode.c | 198
-rw-r--r--  fs/cifs/cifs_unicode.h | 23
-rw-r--r--  fs/cifs/cifsfs.c | 54
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 19
-rw-r--r--  fs/cifs/cifspdu.h | 8
-rw-r--r--  fs/cifs/cifsproto.h | 5
-rw-r--r--  fs/cifs/cifssmb.c | 221
-rw-r--r--  fs/cifs/connect.c | 1357
-rw-r--r--  fs/cifs/dir.c | 160
-rw-r--r--  fs/cifs/dns_resolve.c | 2
-rw-r--r--  fs/cifs/file.c | 141
-rw-r--r--  fs/cifs/inode.c | 98
-rw-r--r--  fs/cifs/link.c | 114
-rw-r--r--  fs/cifs/misc.c | 71
-rw-r--r--  fs/cifs/netmisc.c | 2
-rw-r--r--  fs/cifs/nterr.h | 9
-rw-r--r--  fs/cifs/ntlmssp.h | 68
-rw-r--r--  fs/cifs/readdir.c | 78
-rw-r--r--  fs/cifs/sess.c | 373
-rw-r--r--  fs/cifs/smberr.h | 1
-rw-r--r--  fs/compat.c | 52
-rw-r--r--  fs/compat_ioctl.c | 7
-rw-r--r--  fs/configfs/symlink.c | 2
-rw-r--r--  fs/dcache.c | 3
-rw-r--r--  fs/debugfs/inode.c | 16
-rw-r--r--  fs/devpts/inode.c | 23
-rw-r--r--  fs/direct-io.c | 4
-rw-r--r--  fs/ecryptfs/crypto.c | 21
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 1
-rw-r--r--  fs/ecryptfs/inode.c | 37
-rw-r--r--  fs/ecryptfs/main.c | 19
-rw-r--r--  fs/ecryptfs/messaging.c | 82
-rw-r--r--  fs/ecryptfs/miscdev.c | 43
-rw-r--r--  fs/ecryptfs/mmap.c | 11
-rw-r--r--  fs/ecryptfs/read_write.c | 32
-rw-r--r--  fs/ecryptfs/super.c | 7
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 111
-rw-r--r--  fs/ext2/inode.c | 44
-rw-r--r--  fs/ext2/super.c | 4
-rw-r--r--  fs/ext3/Kconfig | 19
-rw-r--r--  fs/ext3/inode.c | 23
-rw-r--r--  fs/ext3/super.c | 8
-rw-r--r--  fs/ext4/extents.c | 39
-rw-r--r--  fs/ext4/ialloc.c | 6
-rw-r--r--  fs/ext4/inode.c | 58
-rw-r--r--  fs/ext4/super.c | 9
-rw-r--r--  fs/fat/Kconfig | 3
-rw-r--r--  fs/fcntl.c | 6
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/fuse/file.c | 8
-rw-r--r--  fs/fuse/inode.c | 4
-rw-r--r--  fs/gfs2/glock.c | 11
-rw-r--r--  fs/gfs2/glops.c | 6
-rw-r--r--  fs/gfs2/inode.c | 8
-rw-r--r--  fs/gfs2/inode.h | 14
-rw-r--r--  fs/gfs2/ops_file.c | 12
-rw-r--r--  fs/gfs2/ops_fstype.c | 13
-rw-r--r--  fs/gfs2/ops_inode.c | 1
-rw-r--r--  fs/gfs2/quota.c | 4
-rw-r--r--  fs/gfs2/rgrp.c | 13
-rw-r--r--  fs/hfs/inode.c | 4
-rw-r--r--  fs/hfs/mdb.c | 1
-rw-r--r--  fs/hpfs/super.c | 3
-rw-r--r--  fs/hugetlbfs/inode.c | 14
-rw-r--r--  fs/inode.c | 117
-rw-r--r--  fs/ioctl.c | 75
-rw-r--r--  fs/jbd/commit.c | 9
-rw-r--r--  fs/jbd/revoke.c | 44
-rw-r--r--  fs/jbd2/commit.c | 16
-rw-r--r--  fs/jbd2/revoke.c | 21
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/malloc.c | 6
-rw-r--r--  fs/libfs.c | 19
-rw-r--r--  fs/lockd/svc.c | 15
-rw-r--r--  fs/lockd/svclock.c | 13
-rw-r--r--  fs/namei.c | 15
-rw-r--r--  fs/namespace.c | 30
-rw-r--r--  fs/ncpfs/ioctl.c | 21
-rw-r--r--  fs/nfs/dir.c | 3
-rw-r--r--  fs/nfs/file.c | 8
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/super.c | 24
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs3proc.c | 10
-rw-r--r--  fs/nfsd/nfs4callback.c | 47
-rw-r--r--  fs/nfsd/nfs4proc.c | 246
-rw-r--r--  fs/nfsd/nfs4recover.c | 120
-rw-r--r--  fs/nfsd/nfs4state.c | 1195
-rw-r--r--  fs/nfsd/nfs4xdr.c | 649
-rw-r--r--  fs/nfsd/nfsctl.c | 38
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 88
-rw-r--r--  fs/nfsd/vfs.c | 71
-rw-r--r--  fs/nilfs2/Makefile | 5
-rw-r--r--  fs/nilfs2/alloc.c | 504
-rw-r--r--  fs/nilfs2/alloc.h | 72
-rw-r--r--  fs/nilfs2/bmap.c | 788
-rw-r--r--  fs/nilfs2/bmap.h | 244
-rw-r--r--  fs/nilfs2/bmap_union.h | 42
-rw-r--r--  fs/nilfs2/btnode.c | 316
-rw-r--r--  fs/nilfs2/btnode.h | 58
-rw-r--r--  fs/nilfs2/btree.c | 2269
-rw-r--r--  fs/nilfs2/btree.h | 117
-rw-r--r--  fs/nilfs2/cpfile.c | 925
-rw-r--r--  fs/nilfs2/cpfile.h | 45
-rw-r--r--  fs/nilfs2/dat.c | 430
-rw-r--r--  fs/nilfs2/dat.h | 52
-rw-r--r--  fs/nilfs2/dir.c | 711
-rw-r--r--  fs/nilfs2/direct.c | 436
-rw-r--r--  fs/nilfs2/direct.h | 78
-rw-r--r--  fs/nilfs2/file.c | 160
-rw-r--r--  fs/nilfs2/gcdat.c | 84
-rw-r--r--  fs/nilfs2/gcinode.c | 288
-rw-r--r--  fs/nilfs2/ifile.c | 150
-rw-r--r--  fs/nilfs2/ifile.h | 53
-rw-r--r--  fs/nilfs2/inode.c | 785
-rw-r--r--  fs/nilfs2/ioctl.c | 665
-rw-r--r--  fs/nilfs2/mdt.c | 564
-rw-r--r--  fs/nilfs2/mdt.h | 125
-rw-r--r--  fs/nilfs2/namei.c | 474
-rw-r--r--  fs/nilfs2/nilfs.h | 314
-rw-r--r--  fs/nilfs2/page.c | 541
-rw-r--r--  fs/nilfs2/page.h | 76
-rw-r--r--  fs/nilfs2/recovery.c | 919
-rw-r--r--  fs/nilfs2/sb.h | 102
-rw-r--r--  fs/nilfs2/segbuf.c | 439
-rw-r--r--  fs/nilfs2/segbuf.h | 201
-rw-r--r--  fs/nilfs2/seglist.h | 85
-rw-r--r--  fs/nilfs2/segment.c | 2978
-rw-r--r--  fs/nilfs2/segment.h | 244
-rw-r--r--  fs/nilfs2/sufile.c | 558
-rw-r--r--  fs/nilfs2/sufile.h | 125
-rw-r--r--  fs/nilfs2/super.c | 1326
-rw-r--r--  fs/nilfs2/the_nilfs.c | 641
-rw-r--r--  fs/nilfs2/the_nilfs.h | 298
-rw-r--r--  fs/notify/inotify/inotify_user.c | 2
-rw-r--r--  fs/ocfs2/dcache.c | 15
-rw-r--r--  fs/ocfs2/dir.c | 4
-rw-r--r--  fs/ocfs2/export.c | 9
-rw-r--r--  fs/ocfs2/file.c | 94
-rw-r--r--  fs/ocfs2/journal.h | 5
-rw-r--r--  fs/ocfs2/namei.c | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 21
-rw-r--r--  fs/ocfs2/symlink.c | 77
-rw-r--r--  fs/open.c | 2
-rw-r--r--  fs/partitions/check.c | 4
-rw-r--r--  fs/pipe.c | 42
-rw-r--r--  fs/proc/array.c | 13
-rw-r--r--  fs/proc/base.c | 9
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/root.c | 3
-rw-r--r--  fs/proc/stat.c | 5
-rw-r--r--  fs/proc/task_mmu.c | 8
-rw-r--r--  fs/proc/task_nommu.c | 4
-rw-r--r--  fs/quota/Makefile | 9
-rw-r--r--  fs/ramfs/inode.c | 19
-rw-r--r--  fs/read_write.c | 14
-rw-r--r--  fs/reiserfs/dir.c | 24
-rw-r--r--  fs/reiserfs/namei.c | 17
-rw-r--r--  fs/reiserfs/super.c | 11
-rw-r--r--  fs/reiserfs/xattr.c | 260
-rw-r--r--  fs/reiserfs/xattr_security.c | 12
-rw-r--r--  fs/romfs/Kconfig | 48
-rw-r--r--  fs/romfs/Makefile | 9
-rw-r--r--  fs/romfs/inode.c | 665
-rw-r--r--  fs/romfs/internal.h | 47
-rw-r--r--  fs/romfs/mmap-nommu.c | 75
-rw-r--r--  fs/romfs/storage.c | 293
-rw-r--r--  fs/romfs/super.c | 654
-rw-r--r--  fs/splice.c | 355
-rw-r--r--  fs/squashfs/Makefile | 1
-rw-r--r--  fs/squashfs/cache.c | 1
-rw-r--r--  fs/squashfs/export.c | 1
-rw-r--r--  fs/squashfs/super.c | 10
-rw-r--r--  fs/stat.c | 137
-rw-r--r--  fs/super.c | 84
-rw-r--r--  fs/sysfs/bin.c | 13
-rw-r--r--  fs/sysfs/file.c | 16
-rw-r--r--  fs/ubifs/budget.c | 37
-rw-r--r--  fs/ubifs/debug.c | 6
-rw-r--r--  fs/ubifs/file.c | 16
-rw-r--r--  fs/ubifs/find.c | 12
-rw-r--r--  fs/ubifs/gc.c | 428
-rw-r--r--  fs/ubifs/journal.c | 7
-rw-r--r--  fs/ubifs/key.h | 6
-rw-r--r--  fs/ubifs/log.c | 5
-rw-r--r--  fs/ubifs/lpt_commit.c | 34
-rw-r--r--  fs/ubifs/recovery.c | 70
-rw-r--r--  fs/ubifs/replay.c | 2
-rw-r--r--  fs/ubifs/sb.c | 36
-rw-r--r--  fs/ubifs/shrinker.c | 6
-rw-r--r--  fs/ubifs/super.c | 40
-rw-r--r--  fs/ubifs/tnc.c | 2
-rw-r--r--  fs/ubifs/ubifs-media.h | 30
-rw-r--r--  fs/ubifs/ubifs.h | 13
-rw-r--r--  fs/ufs/dir.c | 2
-rw-r--r--  fs/ufs/file.c | 2
-rw-r--r--  fs/ufs/ufs.h | 2
-rw-r--r--  fs/xattr.c | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 38
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 14
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 18
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 78
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 9
-rw-r--r--  fs/xfs/xfs_bmap.c | 192
-rw-r--r--  fs/xfs/xfs_iget.c | 23
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 61
-rw-r--r--  fs/xfs/xfs_iomap.h | 3
-rw-r--r--  fs/xfs/xfs_log.c | 78
-rw-r--r--  fs/xfs/xfs_mount.c | 3
-rw-r--r--  fs/xfs/xfs_mount.h | 2
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 7
259 files changed, 26685 insertions(+), 5261 deletions(-)
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 5f8ab8adb5f5..ab5547ff29a1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -155,6 +156,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	root = d_alloc_root(inode);
 	if (!root) {
+		iput(inode);
 		retval = -ENOMEM;
 		goto release_sb;
 	}
@@ -173,10 +175,7 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	return 0;
 
 release_sb:
-	if (sb) {
-		up_write(&sb->s_umount);
-		deactivate_super(sb);
-	}
+	deactivate_locked_super(sb);
 
 free_stat:
 	kfree(st);
@@ -230,9 +229,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses;
 
+	lock_kernel();
+	v9ses = sb->s_fs_info;
 	v9fs_session_cancel(v9ses);
+	unlock_kernel();
 }
 
 static const struct super_operations v9fs_super_ops = {
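The v9fs hunk above fixes an inode leak: d_alloc_root() consumes the inode reference only on success, so the failure path must drop it itself. A minimal userspace sketch of that ownership rule (the struct and helper names are illustrative, not kernel API):

#include <stdio.h>
#include <stdlib.h>

struct inode { int refs; };

/* Consumes the inode reference on success only -- on failure the
 * caller still owns it and must put it, which is what the added
 * iput() in the hunk above restores. */
static struct inode *make_root(struct inode *inode, int fail)
{
	if (fail)
		return NULL;	/* reference NOT consumed */
	return inode;		/* reference now owned by the root */
}

int main(void)
{
	struct inode *inode = calloc(1, sizeof(*inode));
	inode->refs = 1;

	if (!make_root(inode, 1)) {
		inode->refs--;	/* the error path must drop its ref */
		if (inode->refs == 0)
			free(inode);
		fprintf(stderr, "root allocation failed, inode released\n");
		return 1;
	}
	return 0;
}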
diff --git a/fs/Kconfig b/fs/Kconfig
index 86b203fc3c56..9f7270f36b2a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -175,9 +175,34 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
-
 source "fs/exofs/Kconfig"
 
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 70b2aed87133..af6d04700d9c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
+obj-$(CONFIG_NILFS2_FS)	+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5ce695e707fe..63f5183f263b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -507,8 +507,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
-	kfree(sb->s_options);
-	sb->s_options = new_opts;
+	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
 	sbi->s_mode  = mode;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7a1d942ef68d..0149dab365e7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -102,6 +102,7 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
  */
@@ -117,6 +118,7 @@ static void afs_file_readpage_read_complete(struct page *page,
 	SetPageUptodate(page);
 	unlock_page(page);
 }
+#endif
 
 /*
  * AFS read page from file, directory or symlink
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 49f189423063..7ad36506c256 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
 	struct net_device *dev;
 	int ret = -ENODEV;
 
-	if (maclen != ETH_ALEN)
-		BUG();
+	BUG_ON(maclen != ETH_ALEN);
 
 	rtnl_lock();
 	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index aee239a048cb..76828e5f8a39 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -405,21 +405,20 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		sb->s_flags = flags;
 		ret = afs_fill_super(sb, &params);
 		if (ret < 0) {
-			up_write(&sb->s_umount);
-			deactivate_super(sb);
+			deactivate_locked_super(sb);
 			goto error;
 		}
-		sb->s_options = new_opts;
+		save_mount_options(sb, new_opts);
 		sb->s_flags |= MS_ACTIVE;
 	} else {
 		_debug("reuse");
-		kfree(new_opts);
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
 	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
+	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
 	return 0;
 
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index bf8c8af98004..4eb4d8dfb2f1 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 {
 	struct autofs_dirhash *dh = &sbi->dirhash;
 	struct autofs_dir_ent *ent;
-	struct dentry *dentry;
 	unsigned long timeout = sbi->exp_timeout;
 
 	while (1) {
+		struct path path;
+		int umount_ok;
+
 		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
 			return NULL;	/* No entries */
 		/* We keep the list sorted by last_usage and want old stuff */
@@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 			return ent; /* Symlinks are always expirable */
 
 		/* Get the dentry for the autofs subdirectory */
-		dentry = ent->dentry;
+		path.dentry = ent->dentry;
 
-		if ( !dentry ) {
+		if (!path.dentry) {
 			/* Should only happen in catatonic mode */
 			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
 			autofs_delete_usage(ent);
 			continue;
 		}
 
-		if ( !dentry->d_inode ) {
-			dput(dentry);
+		if (!path.dentry->d_inode) {
+			dput(path.dentry);
 			printk("autofs: negative dentry on expiry queue: %s\n",
 				ent->name);
 			autofs_delete_usage(ent);
@@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 
 		/* Make sure entry is mounted and unused; note that dentry will
 		   point to the mounted-on-top root. */
-		if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) {
+		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+		    !d_mountpoint(path.dentry)) {
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		mntget(mnt);
-		dget(dentry);
-		if (!follow_down(&mnt, &dentry)) {
-			dput(dentry);
-			mntput(mnt);
+		path.mnt = mnt;
+		path_get(&path);
+		if (!follow_down(&path.mnt, &path.dentry)) {
+			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
+		while (d_mountpoint(path.dentry) &&
+		       follow_down(&path.mnt, &path.dentry))
 			;
-		dput(dentry);
+		umount_ok = may_umount(path.mnt);
+		path_put(&path);
 
-		if ( may_umount(mnt) ) {
-			mntput(mnt);
+		if (umount_ok) {
 			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
 			return ent; /* Expirable! */
 		}
 		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-		mntput(mnt);
 	}
 	return NULL;		/* No expirable entries */
 }
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9e5ae8a4f5c8..84168c0dcc2d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -54,11 +54,10 @@ static int check_name(const char *name)
  * Check a string doesn't overrun the chunk of
  * memory we copied from user land.
  */
-static int invalid_str(char *str, void *end)
+static int invalid_str(char *str, size_t size)
 {
-	while ((void *) str <= end)
-		if (!*str++)
-			return 0;
+	if (memchr(str, 0, size))
+		return 0;
 	return -EINVAL;
 }
 
@@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = invalid_str(param->path,
-				  (void *) ((size_t) param + param->size));
+		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
 			AUTOFS_WARN(
 			  "path string terminator missing for cmd(0x%08x)",
@@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	}
 
 	path = param->path;
-	devid = sbi->sb->s_dev;
+	devid = new_encode_dev(sbi->sb->s_dev);
 
 	param->requester.uid = param->requester.gid = -1;
 
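The invalid_str() rework above replaces a hand-rolled scan, whose `(void *) str <= end` bound could step past the copied buffer, with a memchr() lookup for the NUL terminator. A small userspace sketch of the same contract (illustrative only, not the kernel code):

#include <stdio.h>
#include <string.h>

/* Return 0 if a NUL terminator occurs within the first `size` bytes,
 * -1 otherwise -- the same contract as the reworked invalid_str(). */
static int invalid_str(const char *str, size_t size)
{
	return memchr(str, 0, size) ? 0 : -1;
}

int main(void)
{
	char ok[8] = "path";			/* NUL within 8 bytes */
	char bad[4] = {'p', 'a', 't', 'h'};	/* no terminator in 4 bytes */

	printf("%d %d\n", invalid_str(ok, sizeof(ok)),
	       invalid_str(bad, sizeof(bad)));	/* prints: 0 -1 */
	return 0;
}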
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 75f7ddacf7d6..3077d8f16523 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -70,8 +70,10 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	 * Otherwise it's an offset mount and we need to check
 	 * if we can umount its mount, if there is one.
 	 */
-	if (!d_mountpoint(dentry))
+	if (!d_mountpoint(dentry)) {
+		status = 0;
 		goto done;
+	}
 	}
 
 	/* Update the expiry counter if fs is busy */
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 41f2b4d0093e..ca40f828f64d 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/fs.h>
+#include <asm/page.h>	/* for PAGE_SIZE */
 
 #include "befs.h"
 #include "super.h"
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 70cfc4b84ae0..fdb66faa24f1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1388,7 +1388,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->parent);
+	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1433,7 +1433,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_psargs[len] = 0;
 
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->parent);
+	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
diff --git a/fs/bio.c b/fs/bio.c
index e0c9e545bbfa..98711647ece4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	struct bio_vec *bvl;
 
 	/*
-	 * If 'bs' is given, lookup the pool and do the mempool alloc.
-	 * If not, this is a bio_kmalloc() allocation and just do a
-	 * kzalloc() for the exact number of vecs right away.
-	 */
-	if (!bs)
-		bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
-
-	/*
 	 * see comment near bvec_array define!
 	 */
 	switch (nr) {
@@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 	mempool_free(p, bs->bio_pool);
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_has_allocated_vec(bio))
-		kfree(bio->bi_io_vec);
-	kfree(bio);
-}
-
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -301,21 +278,15 @@ void bio_init(struct bio *bio)
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
-	struct bio *bio = NULL;
-	unsigned long idx = 0;
-	void *p = NULL;
-
-	if (bs) {
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
-		if (!p)
-			goto err;
-		bio = p + bs->front_pad;
-	} else {
-		bio = kmalloc(sizeof(*bio), gfp_mask);
-		if (!bio)
-			goto err;
-	}
+	struct bio *bio;
+	void *p;
+
+	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (unlikely(!p))
+		return NULL;
+	bio = p + bs->front_pad;
 
 	bio_init(bio);
 
@@ -332,22 +303,33 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		nr_iovecs = bvec_nr_vecs(idx);
 	}
+out_set:
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
-out_set:
 	bio->bi_io_vec = bvl;
-
 	return bio;
 
 err_free:
-	if (bs)
-		mempool_free(p, bs->bio_pool);
-	else
-		kfree(bio);
-err:
+	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
 
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
+}
+
+/**
+ * bio_alloc - allocate a new bio, memory pool backed
+ * @gfp_mask: allocation mask to use
+ * @nr_iovecs: number of iovecs
+ *
+ * Allocate a new bio with @nr_iovecs bvecs.  If @gfp_mask
+ * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ *
+ * RETURNS:
+ * Pointer to new bio on success, NULL on failure.
+ */
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
@@ -358,19 +340,45 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
-/*
- * Like bio_alloc(), but doesn't use a mempool backing. This means that
- * it CAN fail, but while bio_alloc() can only be used for allocations
- * that have a short (finite) life span, bio_kmalloc() should be used
- * for more permanent bio allocations (like allocating some bio's for
- * initalization or setup purposes).
- */
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+	kfree(bio);
+}
+
+/**
+ * bio_alloc - allocate a bio for I/O
+ * @gfp_mask:   the GFP_ mask given to the slab allocator
+ * @nr_iovecs:	number of iovecs to pre-allocate
+ *
+ * Description:
+ *   bio_alloc will allocate a bio and associated bio_vec array that can hold
+ *   at least @nr_iovecs entries. Allocations will be done from the
+ *   fs_bio_set. Also see @bio_alloc_bioset.
+ *
+ *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ *   a bio. This is due to the mempool guarantees. To make this work, callers
+ *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   that need to allocate more than 1 bio must always submit the previously
+ *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   do so can cause livelocks under memory pressure.
+ *
+ **/
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+	struct bio *bio;
 
-	if (bio)
-		bio->bi_destructor = bio_kmalloc_destructor;
+	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
+		      gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+
+	bio_init(bio);
+	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bio->bi_destructor = bio_kmalloc_destructor;
 
 	return bio;
 }
@@ -809,12 +817,15 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		len += iov[i].iov_len;
 	}
 
+	if (offset)
+		nr_pages++;
+
 	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -938,7 +949,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1122,7 +1133,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 	int offset, i;
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
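The reworked bio_kmalloc() above sizes a single kmalloc() to hold the struct bio plus its bio_vec array and points bi_io_vec at the inline tail, so one kfree() releases everything. A hedged userspace sketch of that single-allocation layout (type and field names are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

struct vec { void *base; unsigned int len; };

/* One allocation carries the header and its trailing vector array,
 * mirroring how bio_kmalloc() lays out bi_inline_vecs. */
struct io_req {
	unsigned int max_vecs;
	struct vec *vecs;		/* points into the same allocation */
	struct vec inline_vecs[];	/* C99 flexible array member */
};

static struct io_req *io_req_alloc(unsigned int nr)
{
	struct io_req *req = malloc(sizeof(*req) + nr * sizeof(struct vec));
	if (!req)
		return NULL;
	req->max_vecs = nr;
	req->vecs = req->inline_vecs;
	return req;
}

int main(void)
{
	struct io_req *req = io_req_alloc(4);
	if (!req)
		return 1;
	printf("room for %u vecs, one free()\n", req->max_vecs);
	free(req);			/* header and vecs go together */
	return 0;
}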
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9adf5e4f7e96..94212844a9bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,25 +1,10 @@
-ifneq ($(KERNELRELEASE),)
-# kbuild part of makefile
 
 obj-$(CONFIG_BTRFS_FS) := btrfs.o
-btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
 	   compression.o delayed-ref.o
-else
-
-# Normal Makefile
-
-KERNELDIR := /lib/modules/`uname -r`/build
-all:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
-
-modules_install:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
-
-endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7fdd184a528d..cbba000dccbe 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		return ERR_PTR(-EINVAL);
 	}
 
+	/* Handle the cached NULL acl case without locking */
+	acl = ACCESS_ONCE(*p_acl);
+	if (!acl)
+		return acl;
+
 	spin_lock(&inode->i_lock);
-	if (*p_acl != BTRFS_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*p_acl);
+	acl = *p_acl;
+	if (acl != BTRFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(acl);
 	spin_unlock(&inode->i_lock);
 
-	if (acl)
+	if (acl != BTRFS_ACL_NOT_CACHED)
 		return acl;
 
-
 	size = __btrfs_getxattr(inode, name, "", 0);
 	if (size > 0) {
 		value = kzalloc(size, GFP_NOFS);
@@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 			btrfs_update_cached_acl(inode, p_acl, acl);
 		}
 		kfree(value);
-	} else if (size == -ENOENT) {
+	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
+		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
 		btrfs_update_cached_acl(inode, p_acl, acl);
+	} else {
+		acl = ERR_PTR(-EIO);
 	}
 
 	return acl;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc8fcda..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
 	/* list of struct btrfs_work that are waiting for service */
 	struct list_head pending;
+	struct list_head prio_pending;
 
 	/* list of worker threads from struct btrfs_workers */
 	struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while (!list_empty(&workers->order_list)) {
-		work = list_entry(workers->order_list.next,
-				  struct btrfs_work, order_list);
-
+	while (1) {
+		if (!list_empty(&workers->prio_order_list)) {
+			work = list_entry(workers->prio_order_list.next,
+					  struct btrfs_work, order_list);
+		} else if (!list_empty(&workers->order_list)) {
+			work = list_entry(workers->order_list.next,
+					  struct btrfs_work, order_list);
+		} else {
+			break;
+		}
 		if (!test_bit(WORK_DONE_BIT, &work->flags))
 			break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
 	do {
 		spin_lock_irq(&worker->lock);
 again_locked:
-		while (!list_empty(&worker->pending)) {
-			cur = worker->pending.next;
+		while (1) {
+			if (!list_empty(&worker->prio_pending))
+				cur = worker->prio_pending.next;
+			else if (!list_empty(&worker->pending))
+				cur = worker->pending.next;
+			else
+				break;
+
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
 			clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
 			spin_lock_irq(&worker->lock);
 			check_idle_worker(worker);
-
 		}
 		if (freezing(current)) {
 			worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
 			 * jump_in?
 			 */
 			smp_mb();
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				continue;
 
 			/*
@@ -191,7 +205,8 @@ again_locked:
 			 */
 			schedule_timeout(1);
 			smp_mb();
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				continue;
 
 			if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
 			/* still no more work?, sleep for real */
 			spin_lock_irq(&worker->lock);
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				goto again_locked;
 
 			/*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
 	INIT_LIST_HEAD(&workers->order_list);
+	INIT_LIST_HEAD(&workers->prio_order_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 	}
 
 	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
 	INIT_LIST_HEAD(&worker->worker_list);
 	spin_lock_init(&worker->lock);
 	atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
 	return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	worker = find_worker(workers);
 	if (workers->ordered) {
 		spin_lock_irqsave(&workers->lock, flags);
-		list_add_tail(&work->order_list, &workers->order_list);
+		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+			list_add_tail(&work->order_list,
+				      &workers->prio_order_list);
+		} else {
+			list_add_tail(&work->order_list, &workers->order_list);
+		}
 		spin_unlock_irqrestore(&workers->lock, flags);
 	} else {
 		INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 
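The async-thread changes above add a second pending list per worker and always drain prio_pending before pending, which is how synchronous submissions jump the queue. A simplified, lock-free userspace sketch of that dequeue order (names are illustrative, not the btrfs structures):

#include <stdio.h>

struct work { const char *name; struct work *next; };
struct worker { struct work *pending, *prio_pending; };

/* Mirror of the worker_loop() pick: high-priority work always wins. */
static struct work *pick_next(struct worker *w)
{
	struct work **q = w->prio_pending ? &w->prio_pending : &w->pending;
	struct work *cur = *q;
	if (cur)
		*q = cur->next;		/* pop from whichever list was chosen */
	return cur;
}

int main(void)
{
	struct work a = {"normal", 0}, b = {"high", 0};
	struct worker w = {&a, &b};
	for (struct work *cur; (cur = pick_next(&w)); )
		printf("%s\n", cur->name);	/* "high" then "normal" */
	return 0;
}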
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
 	 * of work items waiting for completion
 	 */
 	struct list_head order_list;
+	struct list_head prio_order_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533b691a..fedf8b9f03a2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	int ret = 0;
 	int blocksize;
 
-	parent = path->nodes[level - 1];
+	parent = path->nodes[level + 1];
 	if (!parent)
 		return 0;
 
 	nritems = btrfs_header_nritems(parent);
-	slot = path->slots[level];
+	slot = path->slots[level + 1];
 	blocksize = btrfs_level_size(root, level);
 
 	if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 		block1 = 0;
 		free_extent_buffer(eb);
 	}
-	if (slot < nritems) {
+	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
 		eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	}
 	if (block1 || block2) {
 		ret = -EAGAIN;
+
+		/* release the whole path */
 		btrfs_release_path(root, path);
+
+		/* read the blocks */
 		if (block1)
 			readahead_tree_block(root, block1, blocksize, 0);
 		if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 		eb = read_tree_block(root, block1, blocksize, 0);
 		free_extent_buffer(eb);
 	}
-	if (block1) {
+	if (block2) {
 		eb = read_tree_block(root, block2, blocksize, 0);
 		free_extent_buffer(eb);
 	}
@@ -1465,6 +1469,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	u32 blocksize;
 	struct extent_buffer *b = *eb_ret;
 	struct extent_buffer *tmp;
+	int ret;
 
 	blocknr = btrfs_node_blockptr(b, slot);
 	gen = btrfs_node_ptr_generation(b, slot);
@@ -1472,6 +1477,10 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
 	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+		/*
+		 * we found an up to date block without sleeping, return
+		 * right away
+		 */
 		*eb_ret = tmp;
 		return 0;
 	}
@@ -1479,18 +1488,34 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	/*
 	 * reduce lock contention at high levels
 	 * of the btree by dropping locks before
-	 * we read.
+	 * we read.  Don't release the lock on the current
+	 * level because we need to walk this node to figure
+	 * out which blocks to read.
 	 */
-	btrfs_release_path(NULL, p);
+	btrfs_unlock_up_safe(p, level + 1);
+	btrfs_set_path_blocking(p);
+
 	if (tmp)
 		free_extent_buffer(tmp);
 	if (p->reada)
 		reada_for_search(root, p, level, slot, key->objectid);
 
+	btrfs_release_path(NULL, p);
+
+	ret = -EAGAIN;
 	tmp = read_tree_block(root, blocknr, blocksize, gen);
-	if (tmp)
+	if (tmp) {
+		/*
+		 * If the read above didn't mark this buffer up to date,
+		 * it will never end up being up to date.  Set ret to EIO now
+		 * and give up so that our caller doesn't loop forever
+		 * on our EAGAINs.
+		 */
+		if (!btrfs_buffer_uptodate(tmp, 0))
+			ret = -EIO;
 		free_extent_buffer(tmp);
-	return -EAGAIN;
+	}
+	return ret;
 }
 
@@ -1689,6 +1714,9 @@ cow_done:
 		if (ret == -EAGAIN)
 			goto again;
 
+		if (ret == -EIO)
+			goto done;
+
 		if (!p->skip_locking) {
 			int lret;
 
@@ -1731,6 +1759,8 @@ done:
 	 */
 	if (!p->leave_spinning)
 		btrfs_set_path_blocking(p);
+	if (ret < 0)
+		btrfs_release_path(root, p);
 	return ret;
 }
 
@@ -4205,6 +4235,11 @@ again:
 	if (ret == -EAGAIN)
 		goto again;
 
+	if (ret < 0) {
+		btrfs_release_path(root, path);
+		goto done;
+	}
+
 	if (!path->skip_locking) {
 		ret = btrfs_try_spin_lock(next);
 		if (!ret) {
@@ -4239,6 +4274,11 @@ again:
 		if (ret == -EAGAIN)
 			goto again;
 
+		if (ret < 0) {
+			btrfs_release_path(root, path);
+			goto done;
+		}
+
 		if (!path->skip_locking) {
 			btrfs_assert_tree_locked(path->nodes[level]);
 			ret = btrfs_try_spin_lock(next);
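The ctree.c changes above make read_block_for_search() distinguish a transient miss (-EAGAIN: release the path and retry) from a block that can never become up to date (-EIO: give up), so btrfs_search_slot() no longer loops forever. A minimal sketch of that retry discipline (stand-in functions, not the btrfs API):

#include <stdio.h>
#include <errno.h>

/* Stand-in for read_block_for_search(): succeeds on the third try. */
static int read_block(int *attempts)
{
	if (++*attempts < 3)
		return -EAGAIN;	/* transient: caller should retry */
	return 0;		/* would be -EIO if the buffer can
				   never become up to date */
}

static int search(void)
{
	int attempts = 0, ret;
again:
	ret = read_block(&attempts);
	if (ret == -EAGAIN)
		goto again;	/* retry, as btrfs_search_slot() does */
	if (ret < 0)
		return ret;	/* -EIO and friends end the loop */
	return 0;
}

int main(void)
{
	printf("search() = %d\n", search());
	return 0;
}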
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad96495dedc5..4414a5d9983a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -881,6 +881,9 @@ struct btrfs_fs_info {
 	u64 metadata_alloc_profile;
 	u64 system_alloc_profile;
 
+	unsigned data_chunk_allocations;
+	unsigned metadata_ratio;
+
 	void *bdev_holder;
 };
 
@@ -2174,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+		       u64 start, u64 end, u64 locked_end,
+		       u64 inline_limit, u64 *hint_block);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa8035f36..4b0ea0b80c23 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		memcpy(&found, result, csum_size);
 
 		read_extent_buffer(buf, &val, 0, csum_size);
-		printk(KERN_INFO "btrfs: %s checksum verify failed "
-		       "on %llu wanted %X found %X level %d\n",
-		       root->fs_info->sb->s_id,
-		       buf->start, val, found, btrfs_header_level(buf));
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs: %s checksum verify "
+			       "failed on %llu wanted %X found %X "
+			       "level %d\n",
+			       root->fs_info->sb->s_id,
+			       (unsigned long long)buf->start, val, found,
+			       btrfs_header_level(buf));
+		}
 		if (result != (char *)&inline_result)
 			kfree(result);
 		return 1;
@@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	printk("parent transid verify failed on %llu wanted %llu found %llu\n",
-	       (unsigned long long)eb->start,
-	       (unsigned long long)parent_transid,
-	       (unsigned long long)btrfs_header_generation(eb));
+	if (printk_ratelimit()) {
+		printk("parent transid verify failed on %llu wanted %llu "
+		       "found %llu\n",
+		       (unsigned long long)eb->start,
+		       (unsigned long long)parent_transid,
+		       (unsigned long long)btrfs_header_generation(eb));
+	}
 	ret = 1;
 	clear_extent_buffer_uptodate(io_tree, eb);
 out:
@@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
-		       (unsigned long long)found_start,
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad tree block start "
+			       "%llu %llu\n",
+			       (unsigned long long)found_start,
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
@@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
@@ -579,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio_flags = bio_flags;
 
 	atomic_inc(&fs_info->nr_async_submits);
+
+	if (rw & (1 << BIO_RW_SYNCIO))
+		btrfs_set_work_high_prio(&async->work);
+
 	btrfs_queue_worker(&fs_info->workers, &async->work);
-#if 0
-	int limit = btrfs_async_submit_limit(fs_info);
-	if (atomic_read(&fs_info->nr_async_submits) > limit) {
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_submits) < limit),
-			   HZ/10);
 
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_bios) < limit),
-			   HZ/10);
-	}
-#endif
 	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
@@ -656,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
 	}
+
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
@@ -765,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	}
 }
 
-#if 0
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct buffer_head *bh;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct buffer_head *head;
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	head = page_buffers(page);
-	bh = head;
-	do {
-		if (buffer_dirty(bh))
-			csum_tree_block(root, bh, 0);
-		bh = bh->b_this_page;
-	} while (bh != head);
-	return block_write_full_page(page, btree_get_block, wbc);
-}
-#endif
-
 static struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
 	.writepage	= btree_writepage,
@@ -863,8 +848,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	if (ret == 0)
 		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
-	else
-		WARN_ON(1);
 	return buf;
 
 }
@@ -1273,11 +1256,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	int ret = 0;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
-#if 0
-	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    btrfs_congested_async(info, 0))
-		return 1;
-#endif
+
 	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
@@ -1599,6 +1578,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+	fs_info->metadata_ratio = 8;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1689,7 +1669,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (features) {
 		printk(KERN_ERR "BTRFS: couldn't mount because of "
 		       "unsupported optional features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -1699,7 +1679,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1699 if (!(sb->s_flags & MS_RDONLY) && features) { 1679 if (!(sb->s_flags & MS_RDONLY) && features) {
1700 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " 1680 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1701 "unsupported option features (%Lx).\n", 1681 "unsupported option features (%Lx).\n",
1702 features); 1682 (unsigned long long)features);
1703 err = -EINVAL; 1683 err = -EINVAL;
1704 goto fail_iput; 1684 goto fail_iput;
1705 } 1685 }
@@ -2095,10 +2075,10 @@ static int write_dev_supers(struct btrfs_device *device,
2095 device->barriers = 0; 2075 device->barriers = 0;
2096 get_bh(bh); 2076 get_bh(bh);
2097 lock_buffer(bh); 2077 lock_buffer(bh);
2098 ret = submit_bh(WRITE, bh); 2078 ret = submit_bh(WRITE_SYNC, bh);
2099 } 2079 }
2100 } else { 2080 } else {
2101 ret = submit_bh(WRITE, bh); 2081 ret = submit_bh(WRITE_SYNC, bh);
2102 } 2082 }
2103 2083
2104 if (!ret && wait) { 2084 if (!ret && wait) {
@@ -2291,7 +2271,7 @@ int close_ctree(struct btrfs_root *root)
2291 2271
2292 if (fs_info->delalloc_bytes) { 2272 if (fs_info->delalloc_bytes) {
2293 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2273 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2294 fs_info->delalloc_bytes); 2274 (unsigned long long)fs_info->delalloc_bytes);
2295 } 2275 }
2296 if (fs_info->total_ref_cache_size) { 2276 if (fs_info->total_ref_cache_size) {
2297 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", 2277 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
@@ -2328,16 +2308,6 @@ int close_ctree(struct btrfs_root *root)
2328 btrfs_stop_workers(&fs_info->endio_write_workers); 2308 btrfs_stop_workers(&fs_info->endio_write_workers);
2329 btrfs_stop_workers(&fs_info->submit_workers); 2309 btrfs_stop_workers(&fs_info->submit_workers);
2330 2310
2331#if 0
2332 while (!list_empty(&fs_info->hashers)) {
2333 struct btrfs_hasher *hasher;
2334 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2335 hashers);
2336 list_del(&hasher->hashers);
2337 crypto_free_hash(&fs_info->hash_tfm);
2338 kfree(hasher);
2339 }
2340#endif
2341 btrfs_close_devices(fs_info->fs_devices); 2311 btrfs_close_devices(fs_info->fs_devices);
2342 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2312 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2343 2313
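Many hunks in this commit wrap u64 arguments to printk() in (unsigned long long) casts, as in the two open_ctree() messages and the close_ctree() delalloc message above. A minimal userspace sketch of why the cast matters; u64 here stands in for the kernel typedef:

#include <stdio.h>
#include <stdint.h>

/* u64 stands in for the kernel typedef: it is unsigned long on some
 * 64-bit targets and unsigned long long on 32-bit ones, so passing
 * it straight to %llu (or the old %Lx) is not portable and can
 * trigger format warnings. The explicit cast makes the argument
 * type match the format everywhere. */
typedef uint64_t u64;

int main(void)
{
    u64 delalloc_bytes = 123456789ULL;
    printf("btrfs: at unmount delalloc count %llu\n",
           (unsigned long long)delalloc_bytes);
    return 0;
}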
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 178df4c67de4..3e2c7c738f23 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -312,7 +312,7 @@ btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
312} 312}
313 313
314/* 314/*
315 * return the block group that contains teh given bytenr 315 * return the block group that contains the given bytenr
316 */ 316 */
317struct btrfs_block_group_cache *btrfs_lookup_block_group( 317struct btrfs_block_group_cache *btrfs_lookup_block_group(
318 struct btrfs_fs_info *info, 318 struct btrfs_fs_info *info,
@@ -1844,10 +1844,14 @@ again:
1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1845 ", %llu bytes_used, %llu bytes_reserved, " 1845 ", %llu bytes_used, %llu bytes_reserved, "
1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use" 1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
1847 "%llu total\n", bytes, data_sinfo->bytes_delalloc, 1847 "%llu total\n", (unsigned long long)bytes,
1848 data_sinfo->bytes_used, data_sinfo->bytes_reserved, 1848 (unsigned long long)data_sinfo->bytes_delalloc,
1849 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, 1849 (unsigned long long)data_sinfo->bytes_used,
1850 data_sinfo->bytes_may_use, data_sinfo->total_bytes); 1850 (unsigned long long)data_sinfo->bytes_reserved,
1851 (unsigned long long)data_sinfo->bytes_pinned,
1852 (unsigned long long)data_sinfo->bytes_readonly,
1853 (unsigned long long)data_sinfo->bytes_may_use,
1854 (unsigned long long)data_sinfo->total_bytes);
1851 return -ENOSPC; 1855 return -ENOSPC;
1852 } 1856 }
1853 data_sinfo->bytes_may_use += bytes; 1857 data_sinfo->bytes_may_use += bytes;
@@ -1918,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1918 spin_unlock(&info->lock); 1922 spin_unlock(&info->lock);
1919} 1923}
1920 1924
1925static void force_metadata_allocation(struct btrfs_fs_info *info)
1926{
1927 struct list_head *head = &info->space_info;
1928 struct btrfs_space_info *found;
1929
1930 rcu_read_lock();
1931 list_for_each_entry_rcu(found, head, list) {
1932 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
1933 found->force_alloc = 1;
1934 }
1935 rcu_read_unlock();
1936}
1937
1921static int do_chunk_alloc(struct btrfs_trans_handle *trans, 1938static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1922 struct btrfs_root *extent_root, u64 alloc_bytes, 1939 struct btrfs_root *extent_root, u64 alloc_bytes,
1923 u64 flags, int force) 1940 u64 flags, int force)
1924{ 1941{
1925 struct btrfs_space_info *space_info; 1942 struct btrfs_space_info *space_info;
1943 struct btrfs_fs_info *fs_info = extent_root->fs_info;
1926 u64 thresh; 1944 u64 thresh;
1927 int ret = 0; 1945 int ret = 0;
1928 1946
1929 mutex_lock(&extent_root->fs_info->chunk_mutex); 1947 mutex_lock(&fs_info->chunk_mutex);
1930 1948
1931 flags = btrfs_reduce_alloc_profile(extent_root, flags); 1949 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1932 1950
@@ -1958,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1958 } 1976 }
1959 spin_unlock(&space_info->lock); 1977 spin_unlock(&space_info->lock);
1960 1978
1979 /*
1980 * if we're doing a data chunk, go ahead and make sure that
1981 * we keep a reasonable number of metadata chunks allocated in the
1982 * FS as well.
1983 */
1984 if (flags & BTRFS_BLOCK_GROUP_DATA) {
1985 fs_info->data_chunk_allocations++;
1986 if (!(fs_info->data_chunk_allocations %
1987 fs_info->metadata_ratio))
1988 force_metadata_allocation(fs_info);
1989 }
1990
1961 ret = btrfs_alloc_chunk(trans, extent_root, flags); 1991 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1962 if (ret) 1992 if (ret)
1963 space_info->full = 1; 1993 space_info->full = 1;
@@ -2798,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
2798 info->bytes_pinned - info->bytes_reserved), 2828 info->bytes_pinned - info->bytes_reserved),
2799 (info->full) ? "" : "not "); 2829 (info->full) ? "" : "not ");
2800 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 2830 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
2801 " may_use=%llu, used=%llu\n", info->total_bytes, 2831 " may_use=%llu, used=%llu\n",
2802 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, 2832 (unsigned long long)info->total_bytes,
2803 info->bytes_used); 2833 (unsigned long long)info->bytes_pinned,
2834 (unsigned long long)info->bytes_delalloc,
2835 (unsigned long long)info->bytes_may_use,
2836 (unsigned long long)info->bytes_used);
2804 2837
2805 down_read(&info->groups_sem); 2838 down_read(&info->groups_sem);
2806 list_for_each_entry(cache, &info->block_groups, list) { 2839 list_for_each_entry(cache, &info->block_groups, list) {
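The do_chunk_alloc() hunk above counts data chunk allocations and, on every metadata_ratio-th one, calls force_metadata_allocation(). A stand-alone sketch of that trigger arithmetic; the printout stands in for the actual forced allocation, and the default ratio of 8 comes from the open_ctree() hunk earlier in this diff:

#include <stdio.h>

/* Every metadata_ratio-th data chunk allocation also forces a
 * metadata chunk, so metadata space grows alongside data instead of
 * running dry while data chunks fill the disk. */
static unsigned long data_chunk_allocations;
static unsigned long metadata_ratio = 8;    /* default from open_ctree() */

static void alloc_data_chunk(void)
{
    data_chunk_allocations++;
    if (!(data_chunk_allocations % metadata_ratio))
        printf("allocation %lu: forcing a metadata chunk\n",
               data_chunk_allocations);
}

int main(void)
{
    for (int i = 0; i < 20; i++)
        alloc_data_chunk();
    return 0;
}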
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8b7fbf..fe9eb990e443 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,12 +17,6 @@
17#include "ctree.h" 17#include "ctree.h"
18#include "btrfs_inode.h" 18#include "btrfs_inode.h"
19 19
20/* temporary define until extent_map moves out of btrfs */
21struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
22 unsigned long extra_flags,
23 void (*ctor)(void *, struct kmem_cache *,
24 unsigned long));
25
26static struct kmem_cache *extent_state_cache; 20static struct kmem_cache *extent_state_cache;
27static struct kmem_cache *extent_buffer_cache; 21static struct kmem_cache *extent_buffer_cache;
28 22
@@ -50,20 +44,23 @@ struct extent_page_data {
50 /* tells writepage not to lock the state bits for this range 44 /* tells writepage not to lock the state bits for this range
51 * it still does the unlocking 45 * it still does the unlocking
52 */ 46 */
53 int extent_locked; 47 unsigned int extent_locked:1;
48
49 /* tells the submit_bio code to use a WRITE_SYNC */
50 unsigned int sync_io:1;
54}; 51};
55 52
56int __init extent_io_init(void) 53int __init extent_io_init(void)
57{ 54{
58 extent_state_cache = btrfs_cache_create("extent_state", 55 extent_state_cache = kmem_cache_create("extent_state",
59 sizeof(struct extent_state), 0, 56 sizeof(struct extent_state), 0,
60 NULL); 57 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
61 if (!extent_state_cache) 58 if (!extent_state_cache)
62 return -ENOMEM; 59 return -ENOMEM;
63 60
64 extent_buffer_cache = btrfs_cache_create("extent_buffers", 61 extent_buffer_cache = kmem_cache_create("extent_buffers",
65 sizeof(struct extent_buffer), 0, 62 sizeof(struct extent_buffer), 0,
66 NULL); 63 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
67 if (!extent_buffer_cache) 64 if (!extent_buffer_cache)
68 goto free_state_cache; 65 goto free_state_cache;
69 return 0; 66 return 0;
@@ -1404,69 +1401,6 @@ out:
1404 return total_bytes; 1401 return total_bytes;
1405} 1402}
1406 1403
1407#if 0
1408/*
1409 * helper function to lock both pages and extents in the tree.
1410 * pages must be locked first.
1411 */
1412static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1413{
1414 unsigned long index = start >> PAGE_CACHE_SHIFT;
1415 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1416 struct page *page;
1417 int err;
1418
1419 while (index <= end_index) {
1420 page = grab_cache_page(tree->mapping, index);
1421 if (!page) {
1422 err = -ENOMEM;
1423 goto failed;
1424 }
1425 if (IS_ERR(page)) {
1426 err = PTR_ERR(page);
1427 goto failed;
1428 }
1429 index++;
1430 }
1431 lock_extent(tree, start, end, GFP_NOFS);
1432 return 0;
1433
1434failed:
1435 /*
1436 * we failed above in getting the page at 'index', so we undo here
1437 * up to but not including the page at 'index'
1438 */
1439 end_index = index;
1440 index = start >> PAGE_CACHE_SHIFT;
1441 while (index < end_index) {
1442 page = find_get_page(tree->mapping, index);
1443 unlock_page(page);
1444 page_cache_release(page);
1445 index++;
1446 }
1447 return err;
1448}
1449
1450/*
1451 * helper function to unlock both pages and extents in the tree.
1452 */
1453static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1454{
1455 unsigned long index = start >> PAGE_CACHE_SHIFT;
1456 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1457 struct page *page;
1458
1459 while (index <= end_index) {
1460 page = find_get_page(tree->mapping, index);
1461 unlock_page(page);
1462 page_cache_release(page);
1463 index++;
1464 }
1465 unlock_extent(tree, start, end, GFP_NOFS);
1466 return 0;
1467}
1468#endif
1469
1470/* 1404/*
1471 * set the private field for a given byte offset in the tree. If there isn't 1405 * set the private field for a given byte offset in the tree. If there isn't
1472 * an extent_state there already, this does nothing. 1406 * an extent_state there already, this does nothing.
@@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2101 return ret; 2035 return ret;
2102} 2036}
2103 2037
2038static noinline void update_nr_written(struct page *page,
2039 struct writeback_control *wbc,
2040 unsigned long nr_written)
2041{
2042 wbc->nr_to_write -= nr_written;
2043 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2044 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2045 page->mapping->writeback_index = page->index + nr_written;
2046}
2047
2104/* 2048/*
2105 * the writepage semantics are similar to regular writepage. extent 2049 * the writepage semantics are similar to regular writepage. extent
2106 * records are inserted to lock ranges in the tree, and as dirty areas 2050 * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 u64 delalloc_end; 2080 u64 delalloc_end;
2137 int page_started; 2081 int page_started;
2138 int compressed; 2082 int compressed;
2083 int write_flags;
2139 unsigned long nr_written = 0; 2084 unsigned long nr_written = 0;
2140 2085
2086 if (wbc->sync_mode == WB_SYNC_ALL)
2087 write_flags = WRITE_SYNC_PLUG;
2088 else
2089 write_flags = WRITE;
2090
2141 WARN_ON(!PageLocked(page)); 2091 WARN_ON(!PageLocked(page));
2142 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2092 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2143 if (page->index > end_index || 2093 if (page->index > end_index ||
@@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2164 delalloc_end = 0; 2114 delalloc_end = 0;
2165 page_started = 0; 2115 page_started = 0;
2166 if (!epd->extent_locked) { 2116 if (!epd->extent_locked) {
2117 /*
2118 * make sure the wbc mapping index is at least updated
2119 * to this page.
2120 */
2121 update_nr_written(page, wbc, 0);
2122
2167 while (delalloc_end < page_end) { 2123 while (delalloc_end < page_end) {
2168 nr_delalloc = find_lock_delalloc_range(inode, tree, 2124 nr_delalloc = find_lock_delalloc_range(inode, tree,
2169 page, 2125 page,
@@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2185 */ 2141 */
2186 if (page_started) { 2142 if (page_started) {
2187 ret = 0; 2143 ret = 0;
2188 goto update_nr_written; 2144 /*
2145 * we've unlocked the page, so we can't update
2146 * the mapping's writeback index, just update
2147 * nr_to_write.
2148 */
2149 wbc->nr_to_write -= nr_written;
2150 goto done_unlocked;
2189 } 2151 }
2190 } 2152 }
2191 lock_extent(tree, start, page_end, GFP_NOFS); 2153 lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2198 if (ret == -EAGAIN) { 2160 if (ret == -EAGAIN) {
2199 unlock_extent(tree, start, page_end, GFP_NOFS); 2161 unlock_extent(tree, start, page_end, GFP_NOFS);
2200 redirty_page_for_writepage(wbc, page); 2162 redirty_page_for_writepage(wbc, page);
2163 update_nr_written(page, wbc, nr_written);
2201 unlock_page(page); 2164 unlock_page(page);
2202 ret = 0; 2165 ret = 0;
2203 goto update_nr_written; 2166 goto done_unlocked;
2204 } 2167 }
2205 } 2168 }
2206 2169
2207 nr_written++; 2170 /*
2171 * we don't want to touch the inode after unlocking the page,
2172 * so we update the mapping writeback index now
2173 */
2174 update_nr_written(page, wbc, nr_written + 1);
2208 2175
2209 end = page_end; 2176 end = page_end;
2210 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) 2177 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2314 (unsigned long long)end); 2281 (unsigned long long)end);
2315 } 2282 }
2316 2283
2317 ret = submit_extent_page(WRITE, tree, page, sector, 2284 ret = submit_extent_page(write_flags, tree, page,
2318 iosize, pg_offset, bdev, 2285 sector, iosize, pg_offset,
2319 &epd->bio, max_nr, 2286 bdev, &epd->bio, max_nr,
2320 end_bio_extent_writepage, 2287 end_bio_extent_writepage,
2321 0, 0, 0); 2288 0, 0, 0);
2322 if (ret) 2289 if (ret)
@@ -2336,11 +2303,8 @@ done:
2336 unlock_extent(tree, unlock_start, page_end, GFP_NOFS); 2303 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2337 unlock_page(page); 2304 unlock_page(page);
2338 2305
2339update_nr_written: 2306done_unlocked:
2340 wbc->nr_to_write -= nr_written; 2307
2341 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2342 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2343 page->mapping->writeback_index = page->index + nr_written;
2344 return 0; 2308 return 0;
2345} 2309}
2346 2310
@@ -2460,15 +2424,23 @@ retry:
2460 return ret; 2424 return ret;
2461} 2425}
2462 2426
2463static noinline void flush_write_bio(void *data) 2427static void flush_epd_write_bio(struct extent_page_data *epd)
2464{ 2428{
2465 struct extent_page_data *epd = data;
2466 if (epd->bio) { 2429 if (epd->bio) {
2467 submit_one_bio(WRITE, epd->bio, 0, 0); 2430 if (epd->sync_io)
2431 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2432 else
2433 submit_one_bio(WRITE, epd->bio, 0, 0);
2468 epd->bio = NULL; 2434 epd->bio = NULL;
2469 } 2435 }
2470} 2436}
2471 2437
2438static noinline void flush_write_bio(void *data)
2439{
2440 struct extent_page_data *epd = data;
2441 flush_epd_write_bio(epd);
2442}
2443
2472int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2444int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2473 get_extent_t *get_extent, 2445 get_extent_t *get_extent,
2474 struct writeback_control *wbc) 2446 struct writeback_control *wbc)
@@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2480 .tree = tree, 2452 .tree = tree,
2481 .get_extent = get_extent, 2453 .get_extent = get_extent,
2482 .extent_locked = 0, 2454 .extent_locked = 0,
2455 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2483 }; 2456 };
2484 struct writeback_control wbc_writepages = { 2457 struct writeback_control wbc_writepages = {
2485 .bdi = wbc->bdi, 2458 .bdi = wbc->bdi,
2486 .sync_mode = WB_SYNC_NONE, 2459 .sync_mode = wbc->sync_mode,
2487 .older_than_this = NULL, 2460 .older_than_this = NULL,
2488 .nr_to_write = 64, 2461 .nr_to_write = 64,
2489 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2462 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2490 .range_end = (loff_t)-1, 2463 .range_end = (loff_t)-1,
2491 }; 2464 };
2492 2465
2493
2494 ret = __extent_writepage(page, wbc, &epd); 2466 ret = __extent_writepage(page, wbc, &epd);
2495 2467
2496 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2468 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2497 __extent_writepage, &epd, flush_write_bio); 2469 __extent_writepage, &epd, flush_write_bio);
2498 if (epd.bio) 2470 flush_epd_write_bio(&epd);
2499 submit_one_bio(WRITE, epd.bio, 0, 0);
2500 return ret; 2471 return ret;
2501} 2472}
2502 2473
@@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2515 .tree = tree, 2486 .tree = tree,
2516 .get_extent = get_extent, 2487 .get_extent = get_extent,
2517 .extent_locked = 1, 2488 .extent_locked = 1,
2489 .sync_io = mode == WB_SYNC_ALL,
2518 }; 2490 };
2519 struct writeback_control wbc_writepages = { 2491 struct writeback_control wbc_writepages = {
2520 .bdi = inode->i_mapping->backing_dev_info, 2492 .bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2540 start += PAGE_CACHE_SIZE; 2512 start += PAGE_CACHE_SIZE;
2541 } 2513 }
2542 2514
2543 if (epd.bio) 2515 flush_epd_write_bio(&epd);
2544 submit_one_bio(WRITE, epd.bio, 0, 0);
2545 return ret; 2516 return ret;
2546} 2517}
2547 2518
@@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree,
2556 .tree = tree, 2527 .tree = tree,
2557 .get_extent = get_extent, 2528 .get_extent = get_extent,
2558 .extent_locked = 0, 2529 .extent_locked = 0,
2530 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2559 }; 2531 };
2560 2532
2561 ret = extent_write_cache_pages(tree, mapping, wbc, 2533 ret = extent_write_cache_pages(tree, mapping, wbc,
2562 __extent_writepage, &epd, 2534 __extent_writepage, &epd,
2563 flush_write_bio); 2535 flush_write_bio);
2564 if (epd.bio) 2536 flush_epd_write_bio(&epd);
2565 submit_one_bio(WRITE, epd.bio, 0, 0);
2566 return ret; 2537 return ret;
2567} 2538}
2568 2539
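extent_io.c threads a sync_io bit through extent_page_data so that flush_epd_write_bio() can submit with WRITE_SYNC whenever the writeback_control asked for WB_SYNC_ALL. A compilable sketch of the pattern; the flag values and the submit function are stand-ins, not the kernel's:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's request flags. */
#define WRITE      0
#define WRITE_SYNC 1

struct extent_page_data {
    void *bio;                      /* bio being built up across pages */
    unsigned int extent_locked:1;   /* caller already holds the range lock */
    unsigned int sync_io:1;         /* set when wbc->sync_mode == WB_SYNC_ALL */
};

static void submit_one_bio(int rw, void *bio)
{
    printf("submitting bio, rw=%s\n",
           rw == WRITE_SYNC ? "WRITE_SYNC" : "WRITE");
}

/* Mirrors flush_epd_write_bio(): pick the request flag from the
 * sync_io bit recorded when the extent_page_data was set up. */
static void flush_epd_write_bio(struct extent_page_data *epd)
{
    if (epd->bio) {
        submit_one_bio(epd->sync_io ? WRITE_SYNC : WRITE, epd->bio);
        epd->bio = NULL;
    }
}

int main(void)
{
    int dummy;
    struct extent_page_data epd = { .bio = &dummy, .sync_io = 1 };
    flush_epd_write_bio(&epd);
    return 0;
}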
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b187917b36fa..30c9365861e6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -6,19 +6,14 @@
6#include <linux/hardirq.h> 6#include <linux/hardirq.h>
7#include "extent_map.h" 7#include "extent_map.h"
8 8
9/* temporary define until extent_map moves out of btrfs */
10struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
11 unsigned long extra_flags,
12 void (*ctor)(void *, struct kmem_cache *,
13 unsigned long));
14 9
15static struct kmem_cache *extent_map_cache; 10static struct kmem_cache *extent_map_cache;
16 11
17int __init extent_map_init(void) 12int __init extent_map_init(void)
18{ 13{
19 extent_map_cache = btrfs_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("extent_map",
20 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
21 NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
22 if (!extent_map_cache) 17 if (!extent_map_cache)
23 return -ENOMEM; 18 return -ENOMEM;
24 return 0; 19 return 0;
@@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43 tree->map.rb_node = NULL; 38 tree->map.rb_node = NULL;
44 spin_lock_init(&tree->lock); 39 spin_lock_init(&tree->lock);
45} 40}
46EXPORT_SYMBOL(extent_map_tree_init);
47 41
48/** 42/**
49 * alloc_extent_map - allocate new extent map structure 43 * alloc_extent_map - allocate new extent map structure
@@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask)
64 atomic_set(&em->refs, 1); 58 atomic_set(&em->refs, 1);
65 return em; 59 return em;
66} 60}
67EXPORT_SYMBOL(alloc_extent_map);
68 61
69/** 62/**
70 * free_extent_map - drop reference count of an extent_map 63 * free_extent_map - drop reference count of an extent_map
@@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em)
83 kmem_cache_free(extent_map_cache, em); 76 kmem_cache_free(extent_map_cache, em);
84 } 77 }
85} 78}
86EXPORT_SYMBOL(free_extent_map);
87 79
88static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 80static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
89 struct rb_node *node) 81 struct rb_node *node)
@@ -264,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
264out: 256out:
265 return ret; 257 return ret;
266} 258}
267EXPORT_SYMBOL(add_extent_mapping);
268 259
269/* simple helper to do math around the end of an extent, handling wrap */ 260/* simple helper to do math around the end of an extent, handling wrap */
270static u64 range_end(u64 start, u64 len) 261static u64 range_end(u64 start, u64 len)
@@ -326,7 +317,6 @@ found:
326out: 317out:
327 return em; 318 return em;
328} 319}
329EXPORT_SYMBOL(lookup_extent_mapping);
330 320
331/** 321/**
332 * remove_extent_mapping - removes an extent_map from the extent tree 322 * remove_extent_mapping - removes an extent_map from the extent tree
@@ -346,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
346 em->in_tree = 0; 336 em->in_tree = 0;
347 return ret; 337 return ret;
348} 338}
349EXPORT_SYMBOL(remove_extent_mapping);
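extent_map.c (like extent_io.c and inode.c in this commit) drops the temporary btrfs_cache_create() wrapper and its per-file forward declarations, calling kmem_cache_create() directly with SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD; the EXPORT_SYMBOL markers go away because the helpers are now used only inside btrfs. A userspace stand-in showing the refactored call shape — the cache struct, flag values, and allocator body below are invented for the demo:

#include <stdio.h>
#include <stddef.h>

#define SLAB_RECLAIM_ACCOUNT 0x1UL
#define SLAB_MEM_SPREAD      0x2UL

struct kmem_cache { const char *name; size_t size; unsigned long flags; };

/* The deleted wrapper only folded the two SLAB flags into this call,
 * so inlining the flags at each call site lets the wrapper go away. */
static struct kmem_cache *kmem_cache_create(const char *name, size_t size,
                                            size_t align, unsigned long flags,
                                            void (*ctor)(void *))
{
    static struct kmem_cache cache;
    cache = (struct kmem_cache){ name, size, flags };
    (void)align; (void)ctor;
    return &cache;
}

int main(void)
{
    struct kmem_cache *c =
        kmem_cache_create("extent_map", 64, 0,
                          SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
    printf("cache %s: size %zu, flags %#lx\n", c->name, c->size, c->flags);
    return 0;
}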
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46ccd08..1d51dc38bb49 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
272 return 0; 272 return 0;
273} 273}
274 274
275int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
276{
277 return 0;
278#if 0
279 struct btrfs_path *path;
280 struct btrfs_key found_key;
281 struct extent_buffer *leaf;
282 struct btrfs_file_extent_item *extent;
283 u64 last_offset = 0;
284 int nritems;
285 int slot;
286 int found_type;
287 int ret;
288 int err = 0;
289 u64 extent_end = 0;
290
291 path = btrfs_alloc_path();
292 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
293 last_offset, 0);
294 while (1) {
295 nritems = btrfs_header_nritems(path->nodes[0]);
296 if (path->slots[0] >= nritems) {
297 ret = btrfs_next_leaf(root, path);
298 if (ret)
299 goto out;
300 nritems = btrfs_header_nritems(path->nodes[0]);
301 }
302 slot = path->slots[0];
303 leaf = path->nodes[0];
304 btrfs_item_key_to_cpu(leaf, &found_key, slot);
305 if (found_key.objectid != inode->i_ino)
306 break;
307 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
308 goto out;
309
310 if (found_key.offset < last_offset) {
311 WARN_ON(1);
312 btrfs_print_leaf(root, leaf);
313 printk(KERN_ERR "inode %lu found offset %llu "
314 "expected %llu\n", inode->i_ino,
315 (unsigned long long)found_key.offset,
316 (unsigned long long)last_offset);
317 err = 1;
318 goto out;
319 }
320 extent = btrfs_item_ptr(leaf, slot,
321 struct btrfs_file_extent_item);
322 found_type = btrfs_file_extent_type(leaf, extent);
323 if (found_type == BTRFS_FILE_EXTENT_REG) {
324 extent_end = found_key.offset +
325 btrfs_file_extent_num_bytes(leaf, extent);
326 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
327 struct btrfs_item *item;
328 item = btrfs_item_nr(leaf, slot);
329 extent_end = found_key.offset +
330 btrfs_file_extent_inline_len(leaf, extent);
331 extent_end = (extent_end + root->sectorsize - 1) &
332 ~((u64)root->sectorsize - 1);
333 }
334 last_offset = extent_end;
335 path->slots[0]++;
336 }
337 if (0 && last_offset < inode->i_size) {
338 WARN_ON(1);
339 btrfs_print_leaf(root, leaf);
340 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
341 inode->i_ino, (unsigned long long)last_offset,
342 (unsigned long long)inode->i_size);
343 err = 1;
344
345 }
346out:
347 btrfs_free_path(path);
348 return err;
349#endif
350}
351
352/* 275/*
353 * this is very complex, but the basic idea is to drop all extents 276 * this is very complex, but the basic idea is to drop all extents
354 * in the range start - end. hint_block is filled in with a block number 277 * in the range start - end. hint_block is filled in with a block number
@@ -363,15 +286,16 @@ out:
363 */ 286 */
364noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, 287noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
365 struct btrfs_root *root, struct inode *inode, 288 struct btrfs_root *root, struct inode *inode,
366 u64 start, u64 end, u64 inline_limit, u64 *hint_byte) 289 u64 start, u64 end, u64 locked_end,
290 u64 inline_limit, u64 *hint_byte)
367{ 291{
368 u64 extent_end = 0; 292 u64 extent_end = 0;
369 u64 locked_end = end;
370 u64 search_start = start; 293 u64 search_start = start;
371 u64 leaf_start; 294 u64 leaf_start;
372 u64 ram_bytes = 0; 295 u64 ram_bytes = 0;
373 u64 orig_parent = 0; 296 u64 orig_parent = 0;
374 u64 disk_bytenr = 0; 297 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end;
375 u8 compression; 299 u8 compression;
376 u8 encryption; 300 u8 encryption;
377 u16 other_encoding = 0; 301 u16 other_encoding = 0;
@@ -684,11 +608,10 @@ next_slot:
684 } 608 }
685out: 609out:
686 btrfs_free_path(path); 610 btrfs_free_path(path);
687 if (locked_end > end) { 611 if (locked_end > orig_locked_end) {
688 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, 612 unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
689 GFP_NOFS); 613 locked_end - 1, GFP_NOFS);
690 } 614 }
691 btrfs_check_file(root, inode);
692 return ret; 615 return ret;
693} 616}
694 617
@@ -830,7 +753,7 @@ again:
830 753
831 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 754 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
832 BUG_ON(ret); 755 BUG_ON(ret);
833 goto done; 756 goto release;
834 } else if (split == start) { 757 } else if (split == start) {
835 if (locked_end < extent_end) { 758 if (locked_end < extent_end) {
836 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 759 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +849,8 @@ again:
926 } 849 }
927done: 850done:
928 btrfs_mark_buffer_dirty(leaf); 851 btrfs_mark_buffer_dirty(leaf);
852
853release:
929 btrfs_release_path(root, path); 854 btrfs_release_path(root, path);
930 if (split_end && split == start) { 855 if (split_end && split == start) {
931 split = end; 856 split = end;
@@ -1131,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1131 if (will_write) { 1056 if (will_write) {
1132 btrfs_fdatawrite_range(inode->i_mapping, pos, 1057 btrfs_fdatawrite_range(inode->i_mapping, pos,
1133 pos + write_bytes - 1, 1058 pos + write_bytes - 1,
1134 WB_SYNC_NONE); 1059 WB_SYNC_ALL);
1135 } else { 1060 } else {
1136 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1061 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1137 num_pages); 1062 num_pages);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 768b9523662d..0bc93657b460 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
332 printk(KERN_ERR "couldn't find space %llu to free\n", 332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset); 333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", 334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid, 335 block_group->cached,
336 block_group->key.offset); 336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes); 338 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) { 339 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " 340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n", 341 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes); 342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
342 } 346 }
343 WARN_ON(1); 347 WARN_ON(1);
344 } 348 }
@@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
357 info = rb_entry(n, struct btrfs_free_space, offset_index); 361 info = rb_entry(n, struct btrfs_free_space, offset_index);
358 if (info->bytes >= bytes) 362 if (info->bytes >= bytes)
359 count++; 363 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, 364 printk(KERN_ERR "entry offset %llu, bytes %llu\n",
361 info->bytes); 365 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes);
362 } 367 }
363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
364 "\n", count); 369 "\n", count);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc7334d833c9..9abbced1123d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
79 } 79 }
80 path = btrfs_alloc_path(); 80 path = btrfs_alloc_path();
81 BUG_ON(!path); 81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); 82 search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start; 83 search_key.objectid = search_start;
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
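The inode-map.c one-liner casts BTRFS_FIRST_FREE_OBJECTID to u64 because the kernel's max() macro rejects mismatched operand types. A GNU C illustration with a simplified type-checking max(); the constant's value of 256 matches btrfs's on-disk definition:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

/* Simplified version of the kernel's type-safe max(): the pointer
 * comparison makes the compiler warn whenever x and y have different
 * types, which is exactly what the (u64) cast silences. */
#define max(x, y) ({            \
    typeof(x) _x = (x);         \
    typeof(y) _y = (y);         \
    (void)(&_x == &_y);         \
    _x > _y ? _x : _y; })

#define BTRFS_FIRST_FREE_OBJECTID 256   /* int-typed without a cast */

int main(void)
{
    u64 search_start = 17;
    search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
    printf("search_start = %llu\n", (unsigned long long)search_start);
    return 0;
}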
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd492a58..1c8b0190d031 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops;
70static struct kmem_cache *btrfs_inode_cachep; 70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
75 74
76#define S_SHIFT 12 75#define S_SHIFT 12
@@ -234,7 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
234 } 233 }
235 234
236 ret = btrfs_drop_extents(trans, root, inode, start, 235 ret = btrfs_drop_extents(trans, root, inode, start,
237 aligned_end, start, &hint_byte); 236 aligned_end, aligned_end, start, &hint_byte);
238 BUG_ON(ret); 237 BUG_ON(ret);
239 238
240 if (isize > actual_end) 239 if (isize > actual_end)
@@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1439 struct inode *inode, u64 file_pos, 1438 struct inode *inode, u64 file_pos,
1440 u64 disk_bytenr, u64 disk_num_bytes, 1439 u64 disk_bytenr, u64 disk_num_bytes,
1441 u64 num_bytes, u64 ram_bytes, 1440 u64 num_bytes, u64 ram_bytes,
1441 u64 locked_end,
1442 u8 compression, u8 encryption, 1442 u8 compression, u8 encryption,
1443 u16 other_encoding, int extent_type) 1443 u16 other_encoding, int extent_type)
1444{ 1444{
@@ -1455,7 +1455,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1455 1455
1456 path->leave_spinning = 1; 1456 path->leave_spinning = 1;
1457 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1458 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, locked_end,
1459 file_pos, &hint);
1459 BUG_ON(ret); 1460 BUG_ON(ret);
1460 1461
1461 ins.objectid = inode->i_ino; 1462 ins.objectid = inode->i_ino;
@@ -1590,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1590 ordered_extent->disk_len, 1591 ordered_extent->disk_len,
1591 ordered_extent->len, 1592 ordered_extent->len,
1592 ordered_extent->len, 1593 ordered_extent->len,
1594 ordered_extent->file_offset +
1595 ordered_extent->len,
1593 compressed, 0, 0, 1596 compressed, 0, 0,
1594 BTRFS_FILE_EXTENT_REG); 1597 BTRFS_FILE_EXTENT_REG);
1595 BUG_ON(ret); 1598 BUG_ON(ret);
@@ -1819,10 +1822,12 @@ good:
1819 return 0; 1822 return 0;
1820 1823
1821zeroit: 1824zeroit:
1822 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 1825 if (printk_ratelimit()) {
1823 "private %llu\n", page->mapping->host->i_ino, 1826 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1824 (unsigned long long)start, csum, 1827 "private %llu\n", page->mapping->host->i_ino,
1825 (unsigned long long)private); 1828 (unsigned long long)start, csum,
1829 (unsigned long long)private);
1830 }
1826 memset(kaddr + offset, 1, end - start + 1); 1831 memset(kaddr + offset, 1, end - start + 1);
1827 flush_dcache_page(page); 1832 flush_dcache_page(page);
1828 kunmap_atomic(kaddr, KM_USER0); 1833 kunmap_atomic(kaddr, KM_USER0);
@@ -2011,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2011} 2016}
2012 2017
2013/* 2018/*
2019 * very simple check to peek ahead in the leaf looking for xattrs. If we
2020 * don't find any xattrs, we know there can't be any acls.
2021 *
2022 * slot is the slot the inode is in, objectid is the objectid of the inode
2023 */
2024static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2025 int slot, u64 objectid)
2026{
2027 u32 nritems = btrfs_header_nritems(leaf);
2028 struct btrfs_key found_key;
2029 int scanned = 0;
2030
2031 slot++;
2032 while (slot < nritems) {
2033 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2034
2035 /* we found a different objectid, there must not be acls */
2036 if (found_key.objectid != objectid)
2037 return 0;
2038
2039 /* we found an xattr, assume we've got an acl */
2040 if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2041 return 1;
2042
2043 /*
2044 * we found a key greater than an xattr key, there can't
2045 * be any acls later on
2046 */
2047 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2048 return 0;
2049
2050 slot++;
2051 scanned++;
2052
2053 /*
2054 * it goes inode, inode backrefs, xattrs, extents,
2055 * so if there are a ton of hard links to an inode there can
2056 * be a lot of backrefs. Don't waste time searching too hard,
2057 * this is just an optimization
2058 */
2059 if (scanned >= 8)
2060 break;
2061 }
2062 /* we hit the end of the leaf before we found an xattr or
2063 * something larger than an xattr. We have to assume the inode
2064 * has acls
2065 */
2066 return 1;
2067}
2068
2069/*
2014 * read an inode from the btree into the in-memory inode 2070 * read an inode from the btree into the in-memory inode
2015 */ 2071 */
2016void btrfs_read_locked_inode(struct inode *inode) 2072void btrfs_read_locked_inode(struct inode *inode)
@@ -2021,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2021 struct btrfs_timespec *tspec; 2077 struct btrfs_timespec *tspec;
2022 struct btrfs_root *root = BTRFS_I(inode)->root; 2078 struct btrfs_root *root = BTRFS_I(inode)->root;
2023 struct btrfs_key location; 2079 struct btrfs_key location;
2080 int maybe_acls;
2024 u64 alloc_group_block; 2081 u64 alloc_group_block;
2025 u32 rdev; 2082 u32 rdev;
2026 int ret; 2083 int ret;
@@ -2067,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode)
2067 2124
2068 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2125 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2069 2126
2127 /*
2128 * try to precache a NULL acl entry for files that don't have
2129 * any xattrs or acls
2130 */
2131 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
2132 if (!maybe_acls) {
2133 BTRFS_I(inode)->i_acl = NULL;
2134 BTRFS_I(inode)->i_default_acl = NULL;
2135 }
2136
2070 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2137 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2071 alloc_group_block, 0); 2138 alloc_group_block, 0);
2072 btrfs_free_path(path); 2139 btrfs_free_path(path);
@@ -2877,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2877 err = btrfs_drop_extents(trans, root, inode, 2944 err = btrfs_drop_extents(trans, root, inode,
2878 cur_offset, 2945 cur_offset,
2879 cur_offset + hole_size, 2946 cur_offset + hole_size,
2947 block_end,
2880 cur_offset, &hint_byte); 2948 cur_offset, &hint_byte);
2881 if (err) 2949 if (err)
2882 break; 2950 break;
@@ -3041,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode)
3041{ 3109{
3042 struct btrfs_inode *bi = BTRFS_I(inode); 3110 struct btrfs_inode *bi = BTRFS_I(inode);
3043 3111
3044 bi->i_acl = NULL; 3112 bi->i_acl = BTRFS_ACL_NOT_CACHED;
3045 bi->i_default_acl = NULL; 3113 bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
3046 3114
3047 bi->generation = 0; 3115 bi->generation = 0;
3048 bi->sequence = 0; 3116 bi->sequence = 0;
@@ -3054,6 +3122,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3054 bi->flags = 0; 3122 bi->flags = 0;
3055 bi->index_cnt = (u64)-1; 3123 bi->index_cnt = (u64)-1;
3056 bi->last_unlink_trans = 0; 3124 bi->last_unlink_trans = 0;
3125 bi->ordered_data_close = 0;
3057 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3126 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3058 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3127 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3059 inode->i_mapping, GFP_NOFS); 3128 inode->i_mapping, GFP_NOFS);
@@ -4227,7 +4296,6 @@ out:
4227 } 4296 }
4228 if (err) { 4297 if (err) {
4229 free_extent_map(em); 4298 free_extent_map(em);
4230 WARN_ON(1);
4231 return ERR_PTR(err); 4299 return ERR_PTR(err);
4232 } 4300 }
4233 return em; 4301 return em;
@@ -4634,47 +4702,36 @@ void btrfs_destroy_cachep(void)
4634 kmem_cache_destroy(btrfs_trans_handle_cachep); 4702 kmem_cache_destroy(btrfs_trans_handle_cachep);
4635 if (btrfs_transaction_cachep) 4703 if (btrfs_transaction_cachep)
4636 kmem_cache_destroy(btrfs_transaction_cachep); 4704 kmem_cache_destroy(btrfs_transaction_cachep);
4637 if (btrfs_bit_radix_cachep)
4638 kmem_cache_destroy(btrfs_bit_radix_cachep);
4639 if (btrfs_path_cachep) 4705 if (btrfs_path_cachep)
4640 kmem_cache_destroy(btrfs_path_cachep); 4706 kmem_cache_destroy(btrfs_path_cachep);
4641} 4707}
4642 4708
4643struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4644 unsigned long extra_flags,
4645 void (*ctor)(void *))
4646{
4647 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4648 SLAB_MEM_SPREAD | extra_flags), ctor);
4649}
4650
4651int btrfs_init_cachep(void) 4709int btrfs_init_cachep(void)
4652{ 4710{
4653 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", 4711 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
4654 sizeof(struct btrfs_inode), 4712 sizeof(struct btrfs_inode), 0,
4655 0, init_once); 4713 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
4656 if (!btrfs_inode_cachep) 4714 if (!btrfs_inode_cachep)
4657 goto fail; 4715 goto fail;
4658 btrfs_trans_handle_cachep = 4716
4659 btrfs_cache_create("btrfs_trans_handle_cache", 4717 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
4660 sizeof(struct btrfs_trans_handle), 4718 sizeof(struct btrfs_trans_handle), 0,
4661 0, NULL); 4719 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4662 if (!btrfs_trans_handle_cachep) 4720 if (!btrfs_trans_handle_cachep)
4663 goto fail; 4721 goto fail;
4664 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", 4722
4665 sizeof(struct btrfs_transaction), 4723 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
4666 0, NULL); 4724 sizeof(struct btrfs_transaction), 0,
4725 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4667 if (!btrfs_transaction_cachep) 4726 if (!btrfs_transaction_cachep)
4668 goto fail; 4727 goto fail;
4669 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", 4728
4670 sizeof(struct btrfs_path), 4729 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
4671 0, NULL); 4730 sizeof(struct btrfs_path), 0,
4731 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4672 if (!btrfs_path_cachep) 4732 if (!btrfs_path_cachep)
4673 goto fail; 4733 goto fail;
4674 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, 4734
4675 SLAB_DESTROY_BY_RCU, NULL);
4676 if (!btrfs_bit_radix_cachep)
4677 goto fail;
4678 return 0; 4735 return 0;
4679fail: 4736fail:
4680 btrfs_destroy_cachep(); 4737 btrfs_destroy_cachep();
@@ -4970,10 +5027,10 @@ out_fail:
4970 return err; 5027 return err;
4971} 5028}
4972 5029
4973static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 5030static int prealloc_file_range(struct btrfs_trans_handle *trans,
4974 u64 alloc_hint, int mode) 5031 struct inode *inode, u64 start, u64 end,
5032 u64 locked_end, u64 alloc_hint, int mode)
4975{ 5033{
4976 struct btrfs_trans_handle *trans;
4977 struct btrfs_root *root = BTRFS_I(inode)->root; 5034 struct btrfs_root *root = BTRFS_I(inode)->root;
4978 struct btrfs_key ins; 5035 struct btrfs_key ins;
4979 u64 alloc_size; 5036 u64 alloc_size;
@@ -4981,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4981 u64 num_bytes = end - start; 5038 u64 num_bytes = end - start;
4982 int ret = 0; 5039 int ret = 0;
4983 5040
4984 trans = btrfs_join_transaction(root, 1);
4985 BUG_ON(!trans);
4986 btrfs_set_trans_block_group(trans, inode);
4987
4988 while (num_bytes > 0) { 5041 while (num_bytes > 0) {
4989 alloc_size = min(num_bytes, root->fs_info->max_extent); 5042 alloc_size = min(num_bytes, root->fs_info->max_extent);
4990 ret = btrfs_reserve_extent(trans, root, alloc_size, 5043 ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -4997,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4997 ret = insert_reserved_file_extent(trans, inode, 5050 ret = insert_reserved_file_extent(trans, inode,
4998 cur_offset, ins.objectid, 5051 cur_offset, ins.objectid,
4999 ins.offset, ins.offset, 5052 ins.offset, ins.offset,
5000 ins.offset, 0, 0, 0, 5053 ins.offset, locked_end,
5054 0, 0, 0,
5001 BTRFS_FILE_EXTENT_PREALLOC); 5055 BTRFS_FILE_EXTENT_PREALLOC);
5002 BUG_ON(ret); 5056 BUG_ON(ret);
5003 num_bytes -= ins.offset; 5057 num_bytes -= ins.offset;
@@ -5015,7 +5069,6 @@ out:
5015 BUG_ON(ret); 5069 BUG_ON(ret);
5016 } 5070 }
5017 5071
5018 btrfs_end_transaction(trans, root);
5019 return ret; 5072 return ret;
5020} 5073}
5021 5074
@@ -5027,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5027 u64 alloc_start; 5080 u64 alloc_start;
5028 u64 alloc_end; 5081 u64 alloc_end;
5029 u64 alloc_hint = 0; 5082 u64 alloc_hint = 0;
5083 u64 locked_end;
5030 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5084 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5031 struct extent_map *em; 5085 struct extent_map *em;
5086 struct btrfs_trans_handle *trans;
5032 int ret; 5087 int ret;
5033 5088
5034 alloc_start = offset & ~mask; 5089 alloc_start = offset & ~mask;
5035 alloc_end = (offset + len + mask) & ~mask; 5090 alloc_end = (offset + len + mask) & ~mask;
5036 5091
5092 /*
5093 * wait for ordered IO before we have any locks. We'll loop again
5094 * below with the locks held.
5095 */
5096 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
5097
5037 mutex_lock(&inode->i_mutex); 5098 mutex_lock(&inode->i_mutex);
5038 if (alloc_start > inode->i_size) { 5099 if (alloc_start > inode->i_size) {
5039 ret = btrfs_cont_expand(inode, alloc_start); 5100 ret = btrfs_cont_expand(inode, alloc_start);
@@ -5041,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5041 goto out; 5102 goto out;
5042 } 5103 }
5043 5104
5105 locked_end = alloc_end - 1;
5044 while (1) { 5106 while (1) {
5045 struct btrfs_ordered_extent *ordered; 5107 struct btrfs_ordered_extent *ordered;
5046 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, 5108
5047 alloc_end - 1, GFP_NOFS); 5109 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5110 if (!trans) {
5111 ret = -EIO;
5112 goto out;
5113 }
5114
5115 /* the extent lock is ordered inside the running
5116 * transaction
5117 */
5118 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5119 GFP_NOFS);
5048 ordered = btrfs_lookup_first_ordered_extent(inode, 5120 ordered = btrfs_lookup_first_ordered_extent(inode,
5049 alloc_end - 1); 5121 alloc_end - 1);
5050 if (ordered && 5122 if (ordered &&
@@ -5052,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5052 ordered->file_offset < alloc_end) { 5124 ordered->file_offset < alloc_end) {
5053 btrfs_put_ordered_extent(ordered); 5125 btrfs_put_ordered_extent(ordered);
5054 unlock_extent(&BTRFS_I(inode)->io_tree, 5126 unlock_extent(&BTRFS_I(inode)->io_tree,
5055 alloc_start, alloc_end - 1, GFP_NOFS); 5127 alloc_start, locked_end, GFP_NOFS);
5128 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5129
5130 /*
5131 * we can't wait on the range with the transaction
5132 * running or with the extent lock held
5133 */
5056 btrfs_wait_ordered_range(inode, alloc_start, 5134 btrfs_wait_ordered_range(inode, alloc_start,
5057 alloc_end - alloc_start); 5135 alloc_end - alloc_start);
5058 } else { 5136 } else {
@@ -5070,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5070 last_byte = min(extent_map_end(em), alloc_end); 5148 last_byte = min(extent_map_end(em), alloc_end);
5071 last_byte = (last_byte + mask) & ~mask; 5149 last_byte = (last_byte + mask) & ~mask;
5072 if (em->block_start == EXTENT_MAP_HOLE) { 5150 if (em->block_start == EXTENT_MAP_HOLE) {
5073 ret = prealloc_file_range(inode, cur_offset, 5151 ret = prealloc_file_range(trans, inode, cur_offset,
5074 last_byte, alloc_hint, mode); 5152 last_byte, locked_end + 1,
5153 alloc_hint, mode);
5075 if (ret < 0) { 5154 if (ret < 0) {
5076 free_extent_map(em); 5155 free_extent_map(em);
5077 break; 5156 break;
@@ -5087,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5087 break; 5166 break;
5088 } 5167 }
5089 } 5168 }
5090 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, 5169 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5091 GFP_NOFS); 5170 GFP_NOFS);
5171
5172 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5092out: 5173out:
5093 mutex_unlock(&inode->i_mutex); 5174 mutex_unlock(&inode->i_mutex);
5094 return ret; 5175 return ret;
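The btrfs_fallocate() rework above pins down a lock order: start the transaction, then take the extent lock, and drop both before waiting on ordered IO. A schematic of that retry discipline; every function here is a printf stand-in for the corresponding btrfs call:

#include <stdio.h>

/* Ordering the fallocate hunks enforce:
 *   1. start the transaction
 *   2. lock the extent range (the lock is "ordered inside" the
 *      running transaction, per the comment in the diff)
 *   3. if ordered IO is pending: unlock, end the transaction,
 *      wait with nothing held, then retry
 * Waiting while either is held could deadlock against writeback,
 * which needs the same transaction and extent lock. */
static int ordered_io_pending = 1;

static void start_transaction(void)   { printf("transaction started\n"); }
static void end_transaction(void)     { printf("transaction ended\n"); }
static void lock_extent_range(void)   { printf("extent range locked\n"); }
static void unlock_extent_range(void) { printf("extent range unlocked\n"); }

static void wait_ordered_range(void)
{
    printf("waited for ordered IO\n");
    ordered_io_pending = 0;
}

int main(void)
{
    for (;;) {
        start_transaction();
        lock_extent_range();
        if (ordered_io_pending) {
            unlock_extent_range();
            end_transaction();
            wait_ordered_range();   /* safe: nothing is held */
            continue;
        }
        printf("preallocating with transaction + extent lock held\n");
        unlock_extent_range();
        end_transaction();
        break;
    }
    return 0;
}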
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7594bec1be10..2624b53ea783 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -437,10 +437,6 @@ out_unlock:
437 return 0; 437 return 0;
438} 438}
439 439
440/*
441 * Called inside transaction, so use GFP_NOFS
442 */
443
444static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) 440static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
445{ 441{
446 u64 new_size; 442 u64 new_size;
@@ -461,15 +457,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
461 if (!capable(CAP_SYS_ADMIN)) 457 if (!capable(CAP_SYS_ADMIN))
462 return -EPERM; 458 return -EPERM;
463 459
464 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 460 vol_args = memdup_user(arg, sizeof(*vol_args));
465 461 if (IS_ERR(vol_args))
466 if (!vol_args) 462 return PTR_ERR(vol_args);
467 return -ENOMEM;
468
469 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
470 ret = -EFAULT;
471 goto out;
472 }
473 463
474 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 464 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
475 namelen = strlen(vol_args->name); 465 namelen = strlen(vol_args->name);
@@ -483,11 +473,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
483 *devstr = '\0'; 473 *devstr = '\0';
484 devstr = vol_args->name; 474 devstr = vol_args->name;
485 devid = simple_strtoull(devstr, &end, 10); 475 devid = simple_strtoull(devstr, &end, 10);
486 printk(KERN_INFO "resizing devid %llu\n", devid); 476 printk(KERN_INFO "resizing devid %llu\n",
477 (unsigned long long)devid);
487 } 478 }
488 device = btrfs_find_device(root, devid, NULL, NULL); 479 device = btrfs_find_device(root, devid, NULL, NULL);
489 if (!device) { 480 if (!device) {
490 printk(KERN_INFO "resizer unable to find device %llu\n", devid); 481 printk(KERN_INFO "resizer unable to find device %llu\n",
482 (unsigned long long)devid);
491 ret = -EINVAL; 483 ret = -EINVAL;
492 goto out_unlock; 484 goto out_unlock;
493 } 485 }
@@ -545,7 +537,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
545 537
546out_unlock: 538out_unlock:
547 mutex_unlock(&root->fs_info->volume_mutex); 539 mutex_unlock(&root->fs_info->volume_mutex);
548out:
549 kfree(vol_args); 540 kfree(vol_args);
550 return ret; 541 return ret;
551} 542}
@@ -565,15 +556,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
565 if (root->fs_info->sb->s_flags & MS_RDONLY) 556 if (root->fs_info->sb->s_flags & MS_RDONLY)
566 return -EROFS; 557 return -EROFS;
567 558
568 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 559 vol_args = memdup_user(arg, sizeof(*vol_args));
569 560 if (IS_ERR(vol_args))
570 if (!vol_args) 561 return PTR_ERR(vol_args);
571 return -ENOMEM;
572
573 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
574 ret = -EFAULT;
575 goto out;
576 }
577 562
578 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 563 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
579 namelen = strlen(vol_args->name); 564 namelen = strlen(vol_args->name);
@@ -675,19 +660,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
675 if (!capable(CAP_SYS_ADMIN)) 660 if (!capable(CAP_SYS_ADMIN))
676 return -EPERM; 661 return -EPERM;
677 662
678 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 663 vol_args = memdup_user(arg, sizeof(*vol_args));
679 664 if (IS_ERR(vol_args))
680 if (!vol_args) 665 return PTR_ERR(vol_args);
681 return -ENOMEM;
682 666
683 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
684 ret = -EFAULT;
685 goto out;
686 }
687 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 667 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
688 ret = btrfs_init_new_device(root, vol_args->name); 668 ret = btrfs_init_new_device(root, vol_args->name);
689 669
690out:
691 kfree(vol_args); 670 kfree(vol_args);
692 return ret; 671 return ret;
693} 672}
@@ -703,19 +682,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
703 if (root->fs_info->sb->s_flags & MS_RDONLY) 682 if (root->fs_info->sb->s_flags & MS_RDONLY)
704 return -EROFS; 683 return -EROFS;
705 684
706 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 685 vol_args = memdup_user(arg, sizeof(*vol_args));
707 686 if (IS_ERR(vol_args))
708 if (!vol_args) 687 return PTR_ERR(vol_args);
709 return -ENOMEM;
710 688
711 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
712 ret = -EFAULT;
713 goto out;
714 }
715 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 689 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
716 ret = btrfs_rm_device(root, vol_args->name); 690 ret = btrfs_rm_device(root, vol_args->name);
717 691
718out:
719 kfree(vol_args); 692 kfree(vol_args);
720 return ret; 693 return ret;
721} 694}
@@ -830,7 +803,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
830 BUG_ON(!trans); 803 BUG_ON(!trans);
831 804
832 /* punch hole in destination first */ 805 /* punch hole in destination first */
833 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); 806 btrfs_drop_extents(trans, root, inode, off, off + len,
807 off + len, 0, &hint_byte);
834 808
835 /* clone data */ 809 /* clone data */
836 key.objectid = src->i_ino; 810 key.objectid = src->i_ino;
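Several ioctl paths above replace the kmalloc()-plus-copy_from_user() pair (and its goto-out error path) with memdup_user(), which hands back either the copied buffer or an ERR_PTR-encoded error. A rough userspace approximation; the error encoding mimics the kernel's ERR_PTR()/IS_ERR() convention but is a demo only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

/* One call that allocates, copies, and reports failure through the
 * pointer itself, so callers need a single error check. */
static void *memdup_user(const void *src, size_t len)
{
    void *p = malloc(len);
    if (!p)
        return (void *)(unsigned long)-ENOMEM;  /* crude ERR_PTR stand-in */
    memcpy(p, src, len);        /* copy_from_user() in the kernel */
    return p;
}

static int is_err(const void *p)
{
    return (unsigned long)p >= (unsigned long)-4095L;
}

int main(void)
{
    struct { char name[16]; } user_args = { "devname" };
    char *vol_args = memdup_user(&user_args, sizeof(user_args));

    if (is_err(vol_args))
        return -(int)(long)vol_args;    /* propagate the errno */
    printf("copied args: %s\n", vol_args);
    free(vol_args);
    return 0;
}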
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b197d70..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
489 /* start IO across the range first to instantiate any delalloc 489 /* start IO across the range first to instantiate any delalloc
490 * extents 490 * extents
491 */ 491 */
492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
493 493
494 /* The compression code will leave pages locked but return from 494 /* The compression code will leave pages locked but return from
495 * writepage without setting the page writeback. Starting again 495 * writepage without setting the page writeback. Starting again
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9744af9d71e9..2ff7cd2db25f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err, 71 Opt_ratio, Opt_flushoncommit, Opt_err,
72}; 72};
73 73
74static match_table_t tokens = { 74static match_table_t tokens = {
@@ -87,6 +87,7 @@ static match_table_t tokens = {
87 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"}, 88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"}, 89 {Opt_flushoncommit, "flushoncommit"},
90 {Opt_ratio, "metadata_ratio=%d"},
90 {Opt_err, NULL}, 91 {Opt_err, NULL},
91}; 92};
92 93
@@ -195,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
195 info->max_extent = max_t(u64, 196 info->max_extent = max_t(u64,
196 info->max_extent, root->sectorsize); 197 info->max_extent, root->sectorsize);
197 printk(KERN_INFO "btrfs: max_extent at %llu\n", 198 printk(KERN_INFO "btrfs: max_extent at %llu\n",
198 info->max_extent); 199 (unsigned long long)info->max_extent);
199 } 200 }
200 break; 201 break;
201 case Opt_max_inline: 202 case Opt_max_inline:
@@ -210,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
210 root->sectorsize); 211 root->sectorsize);
211 } 212 }
212 printk(KERN_INFO "btrfs: max_inline at %llu\n", 213 printk(KERN_INFO "btrfs: max_inline at %llu\n",
213 info->max_inline); 214 (unsigned long long)info->max_inline);
214 } 215 }
215 break; 216 break;
216 case Opt_alloc_start: 217 case Opt_alloc_start:
@@ -220,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
220 kfree(num); 221 kfree(num);
221 printk(KERN_INFO 222 printk(KERN_INFO
222 "btrfs: allocations start at %llu\n", 223 "btrfs: allocations start at %llu\n",
223 info->alloc_start); 224 (unsigned long long)info->alloc_start);
224 } 225 }
225 break; 226 break;
226 case Opt_noacl: 227 case Opt_noacl:
@@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); 235 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); 236 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break; 237 break;
238 case Opt_ratio:
239 intarg = 0;
240 match_int(&args[0], &intarg);
241 if (intarg) {
242 info->metadata_ratio = intarg;
243 printk(KERN_INFO "btrfs: metadata ratio %d\n",
244 info->metadata_ratio);
245 }
246 break;
237 default: 247 default:
238 break; 248 break;
239 } 249 }
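The new metadata_ratio=%d mount option above is matched through the tokens table and parsed with match_int(). A simplified userspace equivalent of that parse step; strncmp() and atoi() stand in for the kernel's match_table machinery:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A zero ratio is ignored, as in the Opt_ratio case in the diff. */
static void parse_option(const char *opt, int *metadata_ratio)
{
    const char *prefix = "metadata_ratio=";

    if (strncmp(opt, prefix, strlen(prefix)) == 0) {
        int intarg = atoi(opt + strlen(prefix));  /* match_int() stand-in */
        if (intarg) {
            *metadata_ratio = intarg;
            printf("btrfs: metadata ratio %d\n", intarg);
        }
    }
}

int main(void)
{
    int metadata_ratio = 8;     /* default from open_ctree() */
    parse_option("metadata_ratio=4", &metadata_ratio);
    return 0;
}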
@@ -410,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
410 if (btrfs_test_opt(root, NOBARRIER)) 420 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier"); 421 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1) 422 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent); 423 seq_printf(seq, ",max_extent=%llu",
424 (unsigned long long)info->max_extent);
414 if (info->max_inline != 8192 * 1024) 425 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline); 426 seq_printf(seq, ",max_inline=%llu",
427 (unsigned long long)info->max_inline);
416 if (info->alloc_start != 0) 428 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start); 429 seq_printf(seq, ",alloc_start=%llu",
430 (unsigned long long)info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long, 431 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8)) 432 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -423,9 +436,9 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
423 if (btrfs_test_opt(root, SSD)) 436 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd"); 437 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG)) 438 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog"); 439 seq_puts(seq, ",notreelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT)) 440 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit"); 441 seq_puts(seq, ",flushoncommit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 442 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl"); 443 seq_puts(seq, ",noacl");
431 return 0; 444 return 0;
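All of the printk()/seq_printf() changes in this file add the same cast for a reason: u64 is unsigned long (not unsigned long long) on some 64-bit architectures, so handing it straight to a %llu conversion triggers a format warning there. A runnable userspace model of the fix, with uint64_t standing in for the kernel's u64:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t max_extent = (uint64_t)-1;	/* stands in for info->max_extent */

	/* the explicit cast keeps %llu correct on every architecture,
	 * whether uint64_t is unsigned long or unsigned long long */
	printf("max_extent=%llu\n", (unsigned long long)max_extent);
	return 0;
}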
@@ -489,8 +502,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
489 502
490 if (s->s_root) { 503 if (s->s_root) {
491 if ((flags ^ s->s_flags) & MS_RDONLY) { 504 if ((flags ^ s->s_flags) & MS_RDONLY) {
492 up_write(&s->s_umount); 505 deactivate_locked_super(s);
493 deactivate_super(s);
494 error = -EBUSY; 506 error = -EBUSY;
495 goto error_close_devices; 507 goto error_close_devices;
496 } 508 }
@@ -504,8 +516,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
504 error = btrfs_fill_super(s, fs_devices, data, 516 error = btrfs_fill_super(s, fs_devices, data,
505 flags & MS_SILENT ? 1 : 0); 517 flags & MS_SILENT ? 1 : 0);
506 if (error) { 518 if (error) {
507 up_write(&s->s_umount); 519 deactivate_locked_super(s);
508 deactivate_super(s);
509 goto error_free_subvol_name; 520 goto error_free_subvol_name;
510 } 521 }
511 522
@@ -522,15 +533,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
522 mutex_unlock(&s->s_root->d_inode->i_mutex); 533 mutex_unlock(&s->s_root->d_inode->i_mutex);
523 534
524 if (IS_ERR(root)) { 535 if (IS_ERR(root)) {
525 up_write(&s->s_umount); 536 deactivate_locked_super(s);
526 deactivate_super(s);
527 error = PTR_ERR(root); 537 error = PTR_ERR(root);
528 goto error_free_subvol_name; 538 goto error_free_subvol_name;
529 } 539 }
530 if (!root->d_inode) { 540 if (!root->d_inode) {
531 dput(root); 541 dput(root);
532 up_write(&s->s_umount); 542 deactivate_locked_super(s);
533 deactivate_super(s);
534 error = -ENXIO; 543 error = -ENXIO;
535 goto error_free_subvol_name; 544 goto error_free_subvol_name;
536 } 545 }
@@ -635,14 +644,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
635 if (!capable(CAP_SYS_ADMIN)) 644 if (!capable(CAP_SYS_ADMIN))
636 return -EPERM; 645 return -EPERM;
637 646
638 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 647 vol = memdup_user((void __user *)arg, sizeof(*vol));
639 if (!vol) 648 if (IS_ERR(vol))
640 return -ENOMEM; 649 return PTR_ERR(vol);
641
642 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
643 ret = -EFAULT;
644 goto out;
645 }
646 650
647 switch (cmd) { 651 switch (cmd) {
648 case BTRFS_IOC_SCAN_DEV: 652 case BTRFS_IOC_SCAN_DEV:
@@ -650,7 +654,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
650 &btrfs_fs_type, &fs_devices); 654 &btrfs_fs_type, &fs_devices);
651 break; 655 break;
652 } 656 }
653out: 657
654 kfree(vol); 658 kfree(vol);
655 return ret; 659 return ret;
656} 660}
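memdup_user() collapses the old three-step sequence (kmalloc(), copy_from_user(), and the -EFAULT unwind) into one call that returns an ERR_PTR on failure, which is why the out: label disappears. A minimal sketch of the consuming pattern; demo_ioctl() is a hypothetical stand-in for btrfs_control_ioctl():

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

static long demo_ioctl(void __user *arg, size_t len)
{
	void *vol = memdup_user(arg, len);

	if (IS_ERR(vol))
		return PTR_ERR(vol);	/* -ENOMEM or -EFAULT */

	/* ... act on the copied-in argument ... */

	kfree(vol);
	return 0;
}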
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2869b3361eb6..01b143605ec1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
687 prepare_to_wait(&info->transaction_wait, &wait, 687 prepare_to_wait(&info->transaction_wait, &wait,
688 TASK_UNINTERRUPTIBLE); 688 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex); 689 mutex_unlock(&info->trans_mutex);
690
691 atomic_dec(&info->throttles);
692 wake_up(&info->transaction_throttle);
693
690 schedule(); 694 schedule();
695
696 atomic_inc(&info->throttles);
691 mutex_lock(&info->trans_mutex); 697 mutex_lock(&info->trans_mutex);
692 finish_wait(&info->transaction_wait, &wait); 698 finish_wait(&info->transaction_wait, &wait);
693 } 699 }
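The added pairs bracket the sleep for deadlock avoidance: a waiter that is itself counted in info->throttles must leave that count, and wake anyone blocked on transaction_throttle, before it sleeps waiting on the committer, otherwise each side can end up waiting on the other. A hedged sketch of the shape (the trans_mutex handling from the hunk is elided; demo_info stands in for btrfs_fs_info):

#include <linux/sched.h>
#include <linux/wait.h>
#include <asm/atomic.h>

struct demo_info {
	atomic_t throttles;
	wait_queue_head_t transaction_wait;
	wait_queue_head_t transaction_throttle;
};

static void demo_wait_for_commit(struct demo_info *info)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE);

	/* stop counting ourselves as a throttled writer before sleeping */
	atomic_dec(&info->throttles);
	wake_up(&info->transaction_throttle);

	schedule();

	/* rejoin the throttled set once the committer has made progress */
	atomic_inc(&info->throttles);
	finish_wait(&info->transaction_wait, &wait);
}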
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 25f20ea11f27..db5e212e8445 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
536 saved_nbytes = inode_get_bytes(inode); 536 saved_nbytes = inode_get_bytes(inode);
537 /* drop any overlapping extents */ 537 /* drop any overlapping extents */
538 ret = btrfs_drop_extents(trans, root, inode, 538 ret = btrfs_drop_extents(trans, root, inode,
539 start, extent_end, start, &alloc_hint); 539 start, extent_end, extent_end, start, &alloc_hint);
540 BUG_ON(ret); 540 BUG_ON(ret);
541 541
542 if (found_type == BTRFS_FILE_EXTENT_REG || 542 if (found_type == BTRFS_FILE_EXTENT_REG ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e469728..5f01dad4b696 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
125 return NULL; 125 return NULL;
126} 126}
127 127
128static void requeue_list(struct btrfs_pending_bios *pending_bios,
129 struct bio *head, struct bio *tail)
130{
131
132 struct bio *old_head;
133
134 old_head = pending_bios->head;
135 pending_bios->head = head;
136 if (pending_bios->tail)
137 tail->bi_next = old_head;
138 else
139 pending_bios->tail = tail;
140}
141
128/* 142/*
129 * we try to collect pending bios for a device so we don't get a large 143 * we try to collect pending bios for a device so we don't get a large
130 * number of procs sending bios down to the same device. This greatly 144 * number of procs sending bios down to the same device. This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
141 struct bio *pending; 155 struct bio *pending;
142 struct backing_dev_info *bdi; 156 struct backing_dev_info *bdi;
143 struct btrfs_fs_info *fs_info; 157 struct btrfs_fs_info *fs_info;
158 struct btrfs_pending_bios *pending_bios;
144 struct bio *tail; 159 struct bio *tail;
145 struct bio *cur; 160 struct bio *cur;
146 int again = 0; 161 int again = 0;
147 unsigned long num_run = 0; 162 unsigned long num_run;
163 unsigned long num_sync_run;
148 unsigned long limit; 164 unsigned long limit;
149 unsigned long last_waited = 0; 165 unsigned long last_waited = 0;
150 166
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
153 limit = btrfs_async_submit_limit(fs_info); 169 limit = btrfs_async_submit_limit(fs_info);
154 limit = limit * 2 / 3; 170 limit = limit * 2 / 3;
155 171
172 /* we want to make sure that every time we switch from the sync
173 * list to the normal list, we unplug
174 */
175 num_sync_run = 0;
176
156loop: 177loop:
157 spin_lock(&device->io_lock); 178 spin_lock(&device->io_lock);
179 num_run = 0;
158 180
159loop_lock: 181loop_lock:
182
160 /* take all the bios off the list at once and process them 183 /* take all the bios off the list at once and process them
161 * later on (without the lock held). But, remember the 184 * later on (without the lock held). But, remember the
162 * tail and other pointers so the bios can be properly reinserted 185 * tail and other pointers so the bios can be properly reinserted
163 * into the list if we hit congestion 186 * into the list if we hit congestion
164 */ 187 */
165 pending = device->pending_bios; 188 if (device->pending_sync_bios.head)
166 tail = device->pending_bio_tail; 189 pending_bios = &device->pending_sync_bios;
190 else
191 pending_bios = &device->pending_bios;
192
193 pending = pending_bios->head;
194 tail = pending_bios->tail;
167 WARN_ON(pending && !tail); 195 WARN_ON(pending && !tail);
168 device->pending_bios = NULL;
169 device->pending_bio_tail = NULL;
170 196
171 /* 197 /*
172 * if pending was null this time around, no bios need processing 198 * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
176 * device->running_pending is used to synchronize with the 202 * device->running_pending is used to synchronize with the
177 * schedule_bio code. 203 * schedule_bio code.
178 */ 204 */
179 if (pending) { 205 if (device->pending_sync_bios.head == NULL &&
180 again = 1; 206 device->pending_bios.head == NULL) {
181 device->running_pending = 1;
182 } else {
183 again = 0; 207 again = 0;
184 device->running_pending = 0; 208 device->running_pending = 0;
209 } else {
210 again = 1;
211 device->running_pending = 1;
185 } 212 }
213
214 pending_bios->head = NULL;
215 pending_bios->tail = NULL;
216
186 spin_unlock(&device->io_lock); 217 spin_unlock(&device->io_lock);
187 218
219 /*
220 * if we're doing the regular priority list, make sure we unplug
221 * for any high prio bios we've sent down
222 */
223 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
224 num_sync_run = 0;
225 blk_run_backing_dev(bdi, NULL);
226 }
227
188 while (pending) { 228 while (pending) {
229
230 rmb();
231 if (pending_bios != &device->pending_sync_bios &&
232 device->pending_sync_bios.head &&
233 num_run > 16) {
234 cond_resched();
235 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail);
237 goto loop_lock;
238 }
239
189 cur = pending; 240 cur = pending;
190 pending = pending->bi_next; 241 pending = pending->bi_next;
191 cur->bi_next = NULL; 242 cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
196 wake_up(&fs_info->async_submit_wait); 247 wake_up(&fs_info->async_submit_wait);
197 248
198 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 249 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
199 bio_get(cur);
200 submit_bio(cur->bi_rw, cur); 250 submit_bio(cur->bi_rw, cur);
201 bio_put(cur);
202 num_run++; 251 num_run++;
252 if (bio_sync(cur))
253 num_sync_run++;
254
255 if (need_resched()) {
256 if (num_sync_run) {
257 blk_run_backing_dev(bdi, NULL);
258 num_sync_run = 0;
259 }
260 cond_resched();
261 }
203 262
204 /* 263 /*
205 * we made progress, there is more work to do and the bdi 264 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
208 */ 267 */
209 if (pending && bdi_write_congested(bdi) && num_run > 16 && 268 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
210 fs_info->fs_devices->open_devices > 1) { 269 fs_info->fs_devices->open_devices > 1) {
211 struct bio *old_head;
212 struct io_context *ioc; 270 struct io_context *ioc;
213 271
214 ioc = current->io_context; 272 ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
233 * against it before looping 291 * against it before looping
234 */ 292 */
235 last_waited = ioc->last_waited; 293 last_waited = ioc->last_waited;
294 if (need_resched()) {
295 if (num_sync_run) {
296 blk_run_backing_dev(bdi, NULL);
297 num_sync_run = 0;
298 }
299 cond_resched();
300 }
236 continue; 301 continue;
237 } 302 }
238 spin_lock(&device->io_lock); 303 spin_lock(&device->io_lock);
239 304 requeue_list(pending_bios, pending, tail);
240 old_head = device->pending_bios;
241 device->pending_bios = pending;
242 if (device->pending_bio_tail)
243 tail->bi_next = old_head;
244 else
245 device->pending_bio_tail = tail;
246
247 device->running_pending = 1; 305 device->running_pending = 1;
248 306
249 spin_unlock(&device->io_lock); 307 spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
251 goto done; 309 goto done;
252 } 310 }
253 } 311 }
312
313 if (num_sync_run) {
314 num_sync_run = 0;
315 blk_run_backing_dev(bdi, NULL);
316 }
317
318 cond_resched();
254 if (again) 319 if (again)
255 goto loop; 320 goto loop;
256 321
257 spin_lock(&device->io_lock); 322 spin_lock(&device->io_lock);
258 if (device->pending_bios) 323 if (device->pending_bios.head || device->pending_sync_bios.head)
259 goto loop_lock; 324 goto loop_lock;
260 spin_unlock(&device->io_lock); 325 spin_unlock(&device->io_lock);
261 326
@@ -1478,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1478 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1543 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1479 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1544 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1480 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1545 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1481 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1546 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1482 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1547 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1483 btrfs_mark_buffer_dirty(leaf); 1548 btrfs_mark_buffer_dirty(leaf);
1484 1549
@@ -1875,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1875 device->total_bytes = new_size; 1940 device->total_bytes = new_size;
1876 if (device->writeable) 1941 if (device->writeable)
1877 device->fs_devices->total_rw_bytes -= diff; 1942 device->fs_devices->total_rw_bytes -= diff;
1878 ret = btrfs_update_device(trans, device);
1879 if (ret) {
1880 unlock_chunks(root);
1881 btrfs_end_transaction(trans, root);
1882 goto done;
1883 }
1884 WARN_ON(diff > old_total);
1885 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1886 unlock_chunks(root); 1943 unlock_chunks(root);
1887 btrfs_end_transaction(trans, root); 1944 btrfs_end_transaction(trans, root);
1888 1945
@@ -1914,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1914 length = btrfs_dev_extent_length(l, dev_extent); 1971 length = btrfs_dev_extent_length(l, dev_extent);
1915 1972
1916 if (key.offset + length <= new_size) 1973 if (key.offset + length <= new_size)
1917 goto done; 1974 break;
1918 1975
1919 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 1976 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1920 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 1977 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1927,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1927 goto done; 1984 goto done;
1928 } 1985 }
1929 1986
1987 /* Shrinking succeeded, else we would be at "done". */
1988 trans = btrfs_start_transaction(root, 1);
1989 if (!trans) {
1990 ret = -ENOMEM;
1991 goto done;
1992 }
1993 lock_chunks(root);
1994
1995 device->disk_total_bytes = new_size;
1996 /* Now btrfs_update_device() will change the on-disk size. */
1997 ret = btrfs_update_device(trans, device);
1998 if (ret) {
1999 unlock_chunks(root);
2000 btrfs_end_transaction(trans, root);
2001 goto done;
2002 }
2003 WARN_ON(diff > old_total);
2004 btrfs_set_super_total_bytes(super_copy, old_total - diff);
2005 unlock_chunks(root);
2006 btrfs_end_transaction(trans, root);
1930done: 2007done:
1931 btrfs_free_path(path); 2008 btrfs_free_path(path);
1932 return ret; 2009 return ret;
@@ -2497,7 +2574,7 @@ again:
2497 max_errors = 1; 2574 max_errors = 1;
2498 } 2575 }
2499 } 2576 }
2500 if (multi_ret && rw == WRITE && 2577 if (multi_ret && (rw & (1 << BIO_RW)) &&
2501 stripes_allocated < stripes_required) { 2578 stripes_allocated < stripes_required) {
2502 stripes_allocated = map->num_stripes; 2579 stripes_allocated = map->num_stripes;
2503 free_extent_map(em); 2580 free_extent_map(em);
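The switch from rw == WRITE to rw & (1 << BIO_RW) matters because rw can carry extra bits: a sync write is the WRITE bit plus a sync flag, so an equality test silently misclassifies it and skips the write-only stripe re-allocation here. A runnable userspace model; the DEMO_* bits are made-up stand-ins for the bio rw flags:

#include <assert.h>

#define DEMO_WRITE	(1 << 0)	/* stands in for (1 << BIO_RW) */
#define DEMO_SYNC	(1 << 1)	/* stands in for the sync flag */

int main(void)
{
	int rw = DEMO_WRITE | DEMO_SYNC;	/* a sync write */

	assert(rw != DEMO_WRITE);	/* equality wrongly says "not a write" */
	assert(rw & DEMO_WRITE);	/* the bit test classifies it correctly */
	return 0;
}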
@@ -2762,6 +2839,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2762 int rw, struct bio *bio) 2839 int rw, struct bio *bio)
2763{ 2840{
2764 int should_queue = 1; 2841 int should_queue = 1;
2842 struct btrfs_pending_bios *pending_bios;
2765 2843
2766 /* don't bother with additional async steps for reads, right now */ 2844 /* don't bother with additional async steps for reads, right now */
2767 if (!(rw & (1 << BIO_RW))) { 2845 if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2861,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
2783 bio->bi_rw |= rw; 2861 bio->bi_rw |= rw;
2784 2862
2785 spin_lock(&device->io_lock); 2863 spin_lock(&device->io_lock);
2864 if (bio_sync(bio))
2865 pending_bios = &device->pending_sync_bios;
2866 else
2867 pending_bios = &device->pending_bios;
2786 2868
2787 if (device->pending_bio_tail) 2869 if (pending_bios->tail)
2788 device->pending_bio_tail->bi_next = bio; 2870 pending_bios->tail->bi_next = bio;
2789 2871
2790 device->pending_bio_tail = bio; 2872 pending_bios->tail = bio;
2791 if (!device->pending_bios) 2873 if (!pending_bios->head)
2792 device->pending_bios = bio; 2874 pending_bios->head = bio;
2793 if (device->running_pending) 2875 if (device->running_pending)
2794 should_queue = 0; 2876 should_queue = 0;
2795 2877
@@ -3006,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
3006 unsigned long ptr; 3088 unsigned long ptr;
3007 3089
3008 device->devid = btrfs_device_id(leaf, dev_item); 3090 device->devid = btrfs_device_id(leaf, dev_item);
3009 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3091 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3092 device->total_bytes = device->disk_total_bytes;
3010 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3093 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3011 device->type = btrfs_device_type(leaf, dev_item); 3094 device->type = btrfs_device_type(leaf, dev_item);
3012 device->io_align = btrfs_device_io_align(leaf, dev_item); 3095 device->io_align = btrfs_device_io_align(leaf, dev_item);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de72ff7d..5c3ff6d02fd7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
23#include "async-thread.h" 23#include "async-thread.h"
24 24
25struct buffer_head; 25struct buffer_head;
26struct btrfs_pending_bios {
27 struct bio *head;
28 struct bio *tail;
29};
30
26struct btrfs_device { 31struct btrfs_device {
27 struct list_head dev_list; 32 struct list_head dev_list;
28 struct list_head dev_alloc_list; 33 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices; 34 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root; 35 struct btrfs_root *dev_root;
31 struct bio *pending_bios; 36
32 struct bio *pending_bio_tail; 37 /* regular prio bios */
38 struct btrfs_pending_bios pending_bios;
39 /* WRITE_SYNC bios */
40 struct btrfs_pending_bios pending_sync_bios;
41
33 int running_pending; 42 int running_pending;
34 u64 generation; 43 u64 generation;
35 44
@@ -52,6 +61,9 @@ struct btrfs_device {
52 /* size of the device */ 61 /* size of the device */
53 u64 total_bytes; 62 u64 total_bytes;
54 63
64 /* size of the disk */
65 u64 disk_total_bytes;
66
55 /* bytes used */ 67 /* bytes used */
56 u64 bytes_used; 68 u64 bytes_used;
57 69
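The new struct btrfs_pending_bios bundles the head/tail pair that previously lived loose in btrfs_device, one instance per priority level. Only two operations touch it: an O(1) tail append (schedule_bio) and an O(1) splice-to-front (requeue_list). A runnable userspace model of both, with struct bio trimmed to its list pointer:

#include <assert.h>
#include <stddef.h>

struct bio { struct bio *bi_next; };
struct pending_bios { struct bio *head, *tail; };

/* tail append, as in schedule_bio() */
static void enqueue(struct pending_bios *q, struct bio *bio)
{
	bio->bi_next = NULL;
	if (q->tail)
		q->tail->bi_next = bio;
	q->tail = bio;
	if (!q->head)
		q->head = bio;
}

/* splice a detached run back onto the front, as in requeue_list() */
static void requeue(struct pending_bios *q, struct bio *head, struct bio *tail)
{
	struct bio *old_head = q->head;

	q->head = head;
	if (q->tail)
		tail->bi_next = old_head;	/* non-empty: link the runs */
	else
		q->tail = tail;			/* empty: we become the tail */
}

int main(void)
{
	struct pending_bios q = { NULL, NULL };
	struct bio a, b, c;

	enqueue(&q, &a);
	enqueue(&q, &b);

	/* drain the queue, then put the unprocessed run (a..b) back in
	 * front after a new bio (c) arrived, as run_scheduled_bios() does */
	q.head = q.tail = NULL;
	enqueue(&q, &c);
	requeue(&q, &a, &b);

	assert(q.head == &a && a.bi_next == &b && b.bi_next == &c);
	assert(q.tail == &c && c.bi_next == NULL);
	return 0;
}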
diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a896ff78..aed297739eb0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -360,7 +360,7 @@ still_busy:
360 * Completion handler for block_write_full_page() - pages which are unlocked 360 * Completion handler for block_write_full_page() - pages which are unlocked
361 * during I/O, and which have PageWriteback cleared upon I/O completion. 361 * during I/O, and which have PageWriteback cleared upon I/O completion.
362 */ 362 */
363static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 363void end_buffer_async_write(struct buffer_head *bh, int uptodate)
364{ 364{
365 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
366 unsigned long flags; 366 unsigned long flags;
@@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
438 set_buffer_async_read(bh); 438 set_buffer_async_read(bh);
439} 439}
440 440
441void mark_buffer_async_write(struct buffer_head *bh) 441void mark_buffer_async_write_endio(struct buffer_head *bh,
442 bh_end_io_t *handler)
442{ 443{
443 bh->b_end_io = end_buffer_async_write; 444 bh->b_end_io = handler;
444 set_buffer_async_write(bh); 445 set_buffer_async_write(bh);
445} 446}
447
448void mark_buffer_async_write(struct buffer_head *bh)
449{
450 mark_buffer_async_write_endio(bh, end_buffer_async_write);
451}
446EXPORT_SYMBOL(mark_buffer_async_write); 452EXPORT_SYMBOL(mark_buffer_async_write);
447 453
448 454
@@ -547,7 +553,7 @@ repeat:
547 return err; 553 return err;
548} 554}
549 555
550void do_thaw_all(unsigned long unused) 556void do_thaw_all(struct work_struct *work)
551{ 557{
552 struct super_block *sb; 558 struct super_block *sb;
553 char b[BDEVNAME_SIZE]; 559 char b[BDEVNAME_SIZE];
@@ -567,6 +573,7 @@ restart:
567 goto restart; 573 goto restart;
568 } 574 }
569 spin_unlock(&sb_lock); 575 spin_unlock(&sb_lock);
576 kfree(work);
570 printk(KERN_WARNING "Emergency Thaw complete\n"); 577 printk(KERN_WARNING "Emergency Thaw complete\n");
571} 578}
572 579
@@ -577,7 +584,13 @@ restart:
577 */ 584 */
578void emergency_thaw_all(void) 585void emergency_thaw_all(void)
579{ 586{
580 pdflush_operation(do_thaw_all, 0); 587 struct work_struct *work;
588
589 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590 if (work) {
591 INIT_WORK(work, do_thaw_all);
592 schedule_work(work);
593 }
581} 594}
582 595
583/** 596/**
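The pdflush_operation() call gives way to a one-shot work item that the submitter allocates and the handler frees, so emergency_thaw_all() can fire and forget even from atomic context. The pattern in isolation (demo_* names are hypothetical):

#include <linux/slab.h>
#include <linux/workqueue.h>

static void demo_handler(struct work_struct *work)
{
	/* ... do the deferred work ... */
	kfree(work);		/* the handler owns the allocation */
}

static void demo_kick(void)
{
	struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

	if (work) {		/* best effort: silently drop on ENOMEM */
		INIT_WORK(work, demo_handler);
		schedule_work(work);
	}
}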
@@ -737,7 +750,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
737{ 750{
738 struct buffer_head *bh; 751 struct buffer_head *bh;
739 struct list_head tmp; 752 struct list_head tmp;
740 struct address_space *mapping; 753 struct address_space *mapping, *prev_mapping = NULL;
741 int err = 0, err2; 754 int err = 0, err2;
742 755
743 INIT_LIST_HEAD(&tmp); 756 INIT_LIST_HEAD(&tmp);
@@ -762,7 +775,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
762 * contents - it is a noop if I/O is still in 775 * contents - it is a noop if I/O is still in
763 * flight on potentially older contents. 776 * flight on potentially older contents.
764 */ 777 */
765 ll_rw_block(SWRITE_SYNC, 1, &bh); 778 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
779
780 /*
781 * Kick off IO for the previous mapping. Note
782 * that we will not run the very last mapping,
783 * wait_on_buffer() will do that for us
784 * through sync_buffer().
785 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
766 brelse(bh); 790 brelse(bh);
767 spin_lock(lock); 791 spin_lock(lock);
768 } 792 }
@@ -1585,9 +1609,20 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1585 * locked buffer. This only can happen if someone has written the buffer 1609 * locked buffer. This only can happen if someone has written the buffer
1586 * directly, with submit_bh(). At the address_space level PageWriteback 1610 * directly, with submit_bh(). At the address_space level PageWriteback
1587 * prevents this contention from occurring. 1611 * prevents this contention from occurring.
1612 *
1613 * If block_write_full_page() is called with wbc->sync_mode ==
1614 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1615 * causes the writes to be flagged as synchronous writes, but the
1616 * block device queue will NOT be unplugged, since usually many pages
 1617 * will be pushed out before the higher-level caller actually
 1618 * waits for the writes to be completed. The various wait functions,
 1619 * such as wait_on_writeback_range(), will ultimately call sync_page()
1620 * which will ultimately call blk_run_backing_dev(), which will end up
1621 * unplugging the device queue.
1588 */ 1622 */
1589static int __block_write_full_page(struct inode *inode, struct page *page, 1623static int __block_write_full_page(struct inode *inode, struct page *page,
1590 get_block_t *get_block, struct writeback_control *wbc) 1624 get_block_t *get_block, struct writeback_control *wbc,
1625 bh_end_io_t *handler)
1591{ 1626{
1592 int err; 1627 int err;
1593 sector_t block; 1628 sector_t block;
@@ -1595,7 +1630,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1595 struct buffer_head *bh, *head; 1630 struct buffer_head *bh, *head;
1596 const unsigned blocksize = 1 << inode->i_blkbits; 1631 const unsigned blocksize = 1 << inode->i_blkbits;
1597 int nr_underway = 0; 1632 int nr_underway = 0;
1598 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); 1633 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1634 WRITE_SYNC_PLUG : WRITE);
1599 1635
1600 BUG_ON(!PageLocked(page)); 1636 BUG_ON(!PageLocked(page));
1601 1637
@@ -1671,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1671 continue; 1707 continue;
1672 } 1708 }
1673 if (test_clear_buffer_dirty(bh)) { 1709 if (test_clear_buffer_dirty(bh)) {
1674 mark_buffer_async_write(bh); 1710 mark_buffer_async_write_endio(bh, handler);
1675 } else { 1711 } else {
1676 unlock_buffer(bh); 1712 unlock_buffer(bh);
1677 } 1713 }
@@ -1724,7 +1760,7 @@ recover:
1724 if (buffer_mapped(bh) && buffer_dirty(bh) && 1760 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1725 !buffer_delay(bh)) { 1761 !buffer_delay(bh)) {
1726 lock_buffer(bh); 1762 lock_buffer(bh);
1727 mark_buffer_async_write(bh); 1763 mark_buffer_async_write_endio(bh, handler);
1728 } else { 1764 } else {
1729 /* 1765 /*
1730 * The buffer may have been set dirty during 1766 * The buffer may have been set dirty during
@@ -2361,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2361 if ((page->mapping != inode->i_mapping) || 2397 if ((page->mapping != inode->i_mapping) ||
2362 (page_offset(page) > size)) { 2398 (page_offset(page) > size)) {
2363 /* page got truncated out from underneath us */ 2399 /* page got truncated out from underneath us */
2364 goto out_unlock; 2400 unlock_page(page);
2401 goto out;
2365 } 2402 }
2366 2403
2367 /* page is wholly or partially inside EOF */ 2404 /* page is wholly or partially inside EOF */
@@ -2375,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2375 ret = block_commit_write(page, 0, end); 2412 ret = block_commit_write(page, 0, end);
2376 2413
2377 if (unlikely(ret)) { 2414 if (unlikely(ret)) {
2415 unlock_page(page);
2378 if (ret == -ENOMEM) 2416 if (ret == -ENOMEM)
2379 ret = VM_FAULT_OOM; 2417 ret = VM_FAULT_OOM;
2380 else /* -ENOSPC, -EIO, etc */ 2418 else /* -ENOSPC, -EIO, etc */
2381 ret = VM_FAULT_SIGBUS; 2419 ret = VM_FAULT_SIGBUS;
2382 } 2420 } else
2421 ret = VM_FAULT_LOCKED;
2383 2422
2384out_unlock: 2423out:
2385 unlock_page(page);
2386 return ret; 2424 return ret;
2387} 2425}
2388 2426
@@ -2650,7 +2688,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
2650out: 2688out:
2651 ret = mpage_writepage(page, get_block, wbc); 2689 ret = mpage_writepage(page, get_block, wbc);
2652 if (ret == -EAGAIN) 2690 if (ret == -EAGAIN)
2653 ret = __block_write_full_page(inode, page, get_block, wbc); 2691 ret = __block_write_full_page(inode, page, get_block, wbc,
2692 end_buffer_async_write);
2654 return ret; 2693 return ret;
2655} 2694}
2656EXPORT_SYMBOL(nobh_writepage); 2695EXPORT_SYMBOL(nobh_writepage);
@@ -2808,9 +2847,10 @@ out:
2808 2847
2809/* 2848/*
2810 * The generic ->writepage function for buffer-backed address_spaces 2849 * The generic ->writepage function for buffer-backed address_spaces
2850 * this form passes in the end_io handler used to finish the IO.
2811 */ 2851 */
2812int block_write_full_page(struct page *page, get_block_t *get_block, 2852int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2813 struct writeback_control *wbc) 2853 struct writeback_control *wbc, bh_end_io_t *handler)
2814{ 2854{
2815 struct inode * const inode = page->mapping->host; 2855 struct inode * const inode = page->mapping->host;
2816 loff_t i_size = i_size_read(inode); 2856 loff_t i_size = i_size_read(inode);
@@ -2819,7 +2859,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2819 2859
2820 /* Is the page fully inside i_size? */ 2860 /* Is the page fully inside i_size? */
2821 if (page->index < end_index) 2861 if (page->index < end_index)
2822 return __block_write_full_page(inode, page, get_block, wbc); 2862 return __block_write_full_page(inode, page, get_block, wbc,
2863 handler);
2823 2864
2824 /* Is the page fully outside i_size? (truncate in progress) */ 2865 /* Is the page fully outside i_size? (truncate in progress) */
2825 offset = i_size & (PAGE_CACHE_SIZE-1); 2866 offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2842,9 +2883,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2842 * writes to that region are not written out to the file." 2883 * writes to that region are not written out to the file."
2843 */ 2884 */
2844 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2885 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2845 return __block_write_full_page(inode, page, get_block, wbc); 2886 return __block_write_full_page(inode, page, get_block, wbc, handler);
2846} 2887}
2847 2888
2889/*
2890 * The generic ->writepage function for buffer-backed address_spaces
2891 */
2892int block_write_full_page(struct page *page, get_block_t *get_block,
2893 struct writeback_control *wbc)
2894{
2895 return block_write_full_page_endio(page, get_block, wbc,
2896 end_buffer_async_write);
2897}
2898
2899
2848sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2900sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2849 get_block_t *get_block) 2901 get_block_t *get_block)
2850{ 2902{
@@ -2957,12 +3009,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2957 for (i = 0; i < nr; i++) { 3009 for (i = 0; i < nr; i++) {
2958 struct buffer_head *bh = bhs[i]; 3010 struct buffer_head *bh = bhs[i];
2959 3011
2960 if (rw == SWRITE || rw == SWRITE_SYNC) 3012 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2961 lock_buffer(bh); 3013 lock_buffer(bh);
2962 else if (!trylock_buffer(bh)) 3014 else if (!trylock_buffer(bh))
2963 continue; 3015 continue;
2964 3016
2965 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 3017 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3018 rw == SWRITE_SYNC_PLUG) {
2966 if (test_clear_buffer_dirty(bh)) { 3019 if (test_clear_buffer_dirty(bh)) {
2967 bh->b_end_io = end_buffer_write_sync; 3020 bh->b_end_io = end_buffer_write_sync;
2968 get_bh(bh); 3021 get_bh(bh);
@@ -2998,7 +3051,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2998 if (test_clear_buffer_dirty(bh)) { 3051 if (test_clear_buffer_dirty(bh)) {
2999 get_bh(bh); 3052 get_bh(bh);
3000 bh->b_end_io = end_buffer_write_sync; 3053 bh->b_end_io = end_buffer_write_sync;
3001 ret = submit_bh(WRITE, bh); 3054 ret = submit_bh(WRITE_SYNC, bh);
3002 wait_on_buffer(bh); 3055 wait_on_buffer(bh);
3003 if (buffer_eopnotsupp(bh)) { 3056 if (buffer_eopnotsupp(bh)) {
3004 clear_buffer_eopnotsupp(bh); 3057 clear_buffer_eopnotsupp(bh);
@@ -3312,9 +3365,11 @@ EXPORT_SYMBOL(block_read_full_page);
3312EXPORT_SYMBOL(block_sync_page); 3365EXPORT_SYMBOL(block_sync_page);
3313EXPORT_SYMBOL(block_truncate_page); 3366EXPORT_SYMBOL(block_truncate_page);
3314EXPORT_SYMBOL(block_write_full_page); 3367EXPORT_SYMBOL(block_write_full_page);
3368EXPORT_SYMBOL(block_write_full_page_endio);
3315EXPORT_SYMBOL(cont_write_begin); 3369EXPORT_SYMBOL(cont_write_begin);
3316EXPORT_SYMBOL(end_buffer_read_sync); 3370EXPORT_SYMBOL(end_buffer_read_sync);
3317EXPORT_SYMBOL(end_buffer_write_sync); 3371EXPORT_SYMBOL(end_buffer_write_sync);
3372EXPORT_SYMBOL(end_buffer_async_write);
3318EXPORT_SYMBOL(file_fsync); 3373EXPORT_SYMBOL(file_fsync);
3319EXPORT_SYMBOL(generic_block_bmap); 3374EXPORT_SYMBOL(generic_block_bmap);
3320EXPORT_SYMBOL(generic_cont_expand_simple); 3375EXPORT_SYMBOL(generic_cont_expand_simple);
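Exporting end_buffer_async_write() alongside the new block_write_full_page_endio() lets a filesystem keep the stock buffer writeout path while interposing its own per-buffer completion callback. A hedged sketch of a consumer; demo_get_block() and the private bookkeeping are hypothetical:

#include <linux/buffer_head.h>
#include <linux/mm.h>
#include <linux/writeback.h>

static int demo_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	return -EIO;	/* stub: a real filesystem maps iblock here */
}

static void demo_end_write(struct buffer_head *bh, int uptodate)
{
	/* filesystem-private completion bookkeeping would go here ... */
	end_buffer_async_write(bh, uptodate);	/* then the stock handler */
}

static int demo_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, demo_get_block, wbc,
					   demo_end_write);
}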
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 65984006192c..f20c4069c220 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,16 @@
1Version 1.58
2------------
3Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
4when the UTF-8 string is composed of unusually long (more than 4-byte) converted
5characters. Add support for mounting root of a share which redirects immediately
6to DFS target. Convert string conversion functions from Unicode to more
7accurately mark string length before allocating memory (which may help the
8rare cases where a UTF-8 string is much larger than the UCS2 string that
9we converted from). Fix endianness of the vcnum field used during
10session setup to distinguish multiple mounts to same server from different
11userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
12flag to be set to 2, and mount must enable krb5 to turn on extended security).
13
1Version 1.57 14Version 1.57
2------------ 15------------
3Improve support for multiple security contexts to the same server. We 16Improve support for multiple security contexts to the same server. We
@@ -15,7 +28,8 @@ Posix file open support added (turned off after one attempt if server
15fails to support it properly, as with Samba server versions prior to 3.3.2) 28fails to support it properly, as with Samba server versions prior to 3.3.2)
16Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too 29Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
17little memory for the "nativeFileSystem" field returned by the server 30little memory for the "nativeFileSystem" field returned by the server
18during mount). 31during mount). Endian convert inode numbers if necessary (makes it easier
32to compare inode numbers on network files from big endian systems).
19 33
20Version 1.56 34Version 1.56
21------------ 35------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 07434181623b..db208ddb9899 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -651,7 +651,15 @@ Experimental When set to 1 used to enable certain experimental
651 signing turned on in case buffer was modified 651 signing turned on in case buffer was modified
652 just before it was sent, also this flag will 652 just before it was sent, also this flag will
653 be used to use the new experimental directory change 653 be used to use the new experimental directory change
 654 notification code). 654 notification code). When set to 2, it enables
655 an additional experimental feature, "raw ntlmssp"
656 session establishment support (which allows
657 specifying "sec=ntlmssp" on mount). The Linux cifs
658 module will use ntlmv2 authentication encapsulated
659 in "raw ntlmssp" (not using SPNEGO) when
660 "sec=ntlmssp" is specified on mount.
661 This support also requires building cifs with
662 the CONFIG_CIFS_EXPERIMENTAL configuration flag.
655 663
656These experimental features and tracing can be enabled by changing flags in 664These experimental features and tracing can be enabled by changing flags in
657/proc/fs/cifs (after the cifs module has been installed or built into the 665/proc/fs/cifs (after the cifs module has been installed or built into the
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 5fdbf8a14472..83d62759c7c7 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -340,28 +340,24 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
341 341
342 for (i = 0; i < num_referrals; i++) { 342 for (i = 0; i < num_referrals; i++) {
343 int len;
343 dump_referral(referrals+i); 344 dump_referral(referrals+i);
344 /* connect to a storage node */ 345 /* connect to a node */
345 if (referrals[i].flags & DFSREF_STORAGE_SERVER) { 346 len = strlen(referrals[i].node_name);
346 int len; 347 if (len < 2) {
347 len = strlen(referrals[i].node_name); 348 cERROR(1, ("%s: Net Address path too short: %s",
348 if (len < 2) {
349 cERROR(1, ("%s: Net Address path too short: %s",
350 __func__, referrals[i].node_name)); 349 __func__, referrals[i].node_name));
351 rc = -EINVAL; 350 rc = -EINVAL;
352 goto out_err; 351 goto out_err;
353 } 352 }
354 mnt = cifs_dfs_do_refmount(nd->path.mnt, 353 mnt = cifs_dfs_do_refmount(nd->path.mnt,
355 nd->path.dentry, 354 nd->path.dentry, referrals + i);
356 referrals + i); 355 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
357 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
358 __func__,
359 referrals[i].node_name, mnt)); 356 referrals[i].node_name, mnt));
360 357
 361 /* complete mount procedure if we acquired submount */ 358 /* complete mount procedure if we acquired submount */
362 if (!IS_ERR(mnt)) 359 if (!IS_ERR(mnt))
363 break; 360 break;
364 }
365 } 361 }
366 362
 367 /* we need this because the for() above could exit without a valid submount */ 363 /* we need this because the for() above could exit without a valid submount */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 3fd3a9df043a..67bf93a40d2e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -41,7 +41,7 @@ cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
41 41
42 /* attach the data */ 42 /* attach the data */
43 memcpy(payload, data, datalen); 43 memcpy(payload, data, datalen);
44 rcu_assign_pointer(key->payload.data, payload); 44 key->payload.data = payload;
45 ret = 0; 45 ret = 0;
46 46
47error: 47error:
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7d75272a6b3f..60e3c4253de0 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifs_unicode.c 2 * fs/cifs/cifs_unicode.c
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2000,2005 4 * Copyright (c) International Business Machines Corp., 2000,2009
5 * Modified by Steve French (sfrench@us.ibm.com) 5 * Modified by Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -26,31 +26,157 @@
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27 27
28/* 28/*
29 * NAME: cifs_strfromUCS() 29 * cifs_ucs2_bytes - how long will a string be after conversion?
 30 * 30 * @from - pointer to input string
31 * FUNCTION: Convert little-endian unicode string to character string 31 * @maxbytes - don't go past this many bytes of input string
32 * @codepage - destination codepage
32 * 33 *
34 * Walk a ucs2le string and return the number of bytes that the string will
35 * be after being converted to the given charset, not including any null
36 * termination required. Don't walk past maxbytes in the source buffer.
33 */ 37 */
34int 38int
35cifs_strfromUCS_le(char *to, const __le16 *from, 39cifs_ucs2_bytes(const __le16 *from, int maxbytes,
36 int len, const struct nls_table *codepage) 40 const struct nls_table *codepage)
37{ 41{
38 int i; 42 int i;
39 int outlen = 0; 43 int charlen, outlen = 0;
44 int maxwords = maxbytes / 2;
45 char tmp[NLS_MAX_CHARSET_SIZE];
40 46
41 for (i = 0; (i < len) && from[i]; i++) { 47 for (i = 0; from[i] && i < maxwords; i++) {
42 int charlen; 48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
43 /* 2.4.0 kernel or greater */ 49 NLS_MAX_CHARSET_SIZE);
44 charlen = 50 if (charlen > 0)
45 codepage->uni2char(le16_to_cpu(from[i]), &to[outlen],
46 NLS_MAX_CHARSET_SIZE);
47 if (charlen > 0) {
48 outlen += charlen; 51 outlen += charlen;
49 } else { 52 else
50 to[outlen++] = '?'; 53 outlen++;
54 }
55
56 return outlen;
57}
58
59/*
60 * cifs_mapchar - convert a little-endian char to proper char in codepage
61 * @target - where converted character should be copied
62 * @src_char - 2 byte little-endian source character
63 * @cp - codepage to which character should be converted
64 * @mapchar - should character be mapped according to mapchars mount option?
65 *
66 * This function handles the conversion of a single character. It is the
67 * responsibility of the caller to ensure that the target buffer is large
68 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
69 */
70static int
71cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
72 bool mapchar)
73{
74 int len = 1;
75
76 if (!mapchar)
77 goto cp_convert;
78
79 /*
80 * BB: Cannot handle remapping UNI_SLASH until all the calls to
81 * build_path_from_dentry are modified, as they use slash as
82 * separator.
83 */
84 switch (le16_to_cpu(src_char)) {
85 case UNI_COLON:
86 *target = ':';
87 break;
88 case UNI_ASTERIK:
89 *target = '*';
90 break;
91 case UNI_QUESTION:
92 *target = '?';
93 break;
94 case UNI_PIPE:
95 *target = '|';
96 break;
97 case UNI_GRTRTHAN:
98 *target = '>';
99 break;
100 case UNI_LESSTHAN:
101 *target = '<';
102 break;
103 default:
104 goto cp_convert;
105 }
106
107out:
108 return len;
109
110cp_convert:
111 len = cp->uni2char(le16_to_cpu(src_char), target,
112 NLS_MAX_CHARSET_SIZE);
113 if (len <= 0) {
114 *target = '?';
115 len = 1;
116 }
117 goto out;
118}
119
120/*
121 * cifs_from_ucs2 - convert utf16le string to local charset
122 * @to - destination buffer
123 * @from - source buffer
124 * @tolen - destination buffer size (in bytes)
125 * @fromlen - source buffer size (in bytes)
126 * @codepage - codepage to which characters should be converted
127 * @mapchar - should characters be remapped according to the mapchars option?
128 *
129 * Convert a little-endian ucs2le string (as sent by the server) to a string
130 * in the provided codepage. The tolen and fromlen parameters are to ensure
131 * that the code doesn't walk off of the end of the buffer (which is always
132 * a danger if the alignment of the source buffer is off). The destination
133 * string is always properly null terminated and fits in the destination
134 * buffer. Returns the length of the destination string in bytes (including
135 * null terminator).
136 *
 137 * Note that some Windows versions actually send multiword UTF-16 characters
 138 * instead of straight UCS-2. The Linux nls routines, however, aren't able to
139 * deal with those characters properly. In the event that we get some of
140 * those characters, they won't be translated properly.
141 */
142int
143cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
144 const struct nls_table *codepage, bool mapchar)
145{
146 int i, charlen, safelen;
147 int outlen = 0;
148 int nullsize = nls_nullsize(codepage);
149 int fromwords = fromlen / 2;
150 char tmp[NLS_MAX_CHARSET_SIZE];
151
152 /*
153 * because the chars can be of varying widths, we need to take care
154 * not to overflow the destination buffer when we get close to the
155 * end of it. Until we get to this offset, we don't need to check
 156 * for overflow, however.
157 */
158 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
159
160 for (i = 0; i < fromwords && from[i]; i++) {
161 /*
162 * check to see if converting this character might make the
163 * conversion bleed into the null terminator
164 */
165 if (outlen >= safelen) {
166 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
167 if ((outlen + charlen) > (tolen - nullsize))
168 break;
51 } 169 }
170
171 /* put converted char into 'to' buffer */
172 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
173 outlen += charlen;
52 } 174 }
53 to[outlen] = 0; 175
176 /* properly null-terminate string */
177 for (i = 0; i < nullsize; i++)
178 to[outlen++] = 0;
179
54 return outlen; 180 return outlen;
55} 181}
56 182
@@ -88,3 +214,41 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
88 return i; 214 return i;
89} 215}
90 216
217/*
218 * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
219 * @src - source string
220 * @maxlen - don't walk past this many bytes in the source string
221 * @is_unicode - is this a unicode string?
222 * @codepage - destination codepage
223 *
224 * Take a string given by the server, convert it to the local codepage and
225 * put it in a new buffer. Returns a pointer to the new string or NULL on
226 * error.
227 */
228char *
229cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
230 const struct nls_table *codepage)
231{
232 int len;
233 char *dst;
234
235 if (is_unicode) {
236 len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
237 len += nls_nullsize(codepage);
238 dst = kmalloc(len, GFP_KERNEL);
239 if (!dst)
240 return NULL;
241 cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
242 false);
243 } else {
244 len = strnlen(src, maxlen);
245 len++;
246 dst = kmalloc(len, GFP_KERNEL);
247 if (!dst)
248 return NULL;
249 strlcpy(dst, src, len);
250 }
251
252 return dst;
253}
254
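The sizing logic in cifs_ucs2_bytes() is the core of the overrun fix: measure the post-conversion length first, never walking past maxbytes of input, and only then allocate. A runnable userspace model for BMP code points with a UTF-8 target (surrogate pairs ignored, matching the limitation noted in the comments above):

#include <assert.h>
#include <stdint.h>

/* count the UTF-8 bytes a NUL-terminated UTF-16LE string needs,
 * reading at most maxbytes of input */
static int utf8_len_of_ucs2le(const uint8_t *from, int maxbytes)
{
	int i, outlen = 0;
	int maxwords = maxbytes / 2;

	for (i = 0; i < maxwords; i++) {
		uint16_t c = from[2 * i] | (from[2 * i + 1] << 8);

		if (c == 0)
			break;
		if (c < 0x80)
			outlen += 1;	/* ASCII */
		else if (c < 0x800)
			outlen += 2;
		else
			outlen += 3;	/* rest of the BMP */
	}
	return outlen;
}

int main(void)
{
	/* "A" U+00E9 U+20AC NUL in UTF-16LE: 1 + 2 + 3 UTF-8 bytes */
	const uint8_t s[] = { 'A', 0x00, 0xE9, 0x00, 0xAC, 0x20, 0x00, 0x00 };

	assert(utf8_len_of_ucs2le(s, sizeof(s)) == 6);
	return 0;
}

cifs_strndup_from_ucs() above then adds nls_nullsize() to this measured length and allocates exactly that much before converting.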
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 14eb9a2395d3..650638275a6f 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -5,7 +5,7 @@
5 * Convert a unicode character to upper or lower case using 5 * Convert a unicode character to upper or lower case using
6 * compressed tables. 6 * compressed tables.
7 * 7 *
8 * Copyright (c) International Business Machines Corp., 2000,2007 8 * Copyright (c) International Business Machines Corp., 2000,2009
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by 11 * it under the terms of the GNU General Public License as published by
@@ -37,6 +37,19 @@
37 37
38#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ 38#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
39 39
40/*
41 * Windows maps these to the user defined 16 bit Unicode range since they are
42 * reserved symbols (along with \ and /), otherwise illegal to store
43 * in filenames in NTFS
44 */
45#define UNI_ASTERIK (__u16) ('*' + 0xF000)
46#define UNI_QUESTION (__u16) ('?' + 0xF000)
47#define UNI_COLON (__u16) (':' + 0xF000)
48#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
49#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
50#define UNI_PIPE (__u16) ('|' + 0xF000)
51#define UNI_SLASH (__u16) ('\\' + 0xF000)
52
40/* Just define what we want from uniupr.h. We don't want to define the tables 53/* Just define what we want from uniupr.h. We don't want to define the tables
41 * in each source file. 54 * in each source file.
42 */ 55 */
@@ -59,8 +72,14 @@ extern struct UniCaseRange UniLowerRange[];
59#endif /* UNIUPR_NOLOWER */ 72#endif /* UNIUPR_NOLOWER */
60 73
61#ifdef __KERNEL__ 74#ifdef __KERNEL__
62int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *); 75int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
76 const struct nls_table *codepage, bool mapchar);
77int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
78 const struct nls_table *codepage);
63int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); 79int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
80char *cifs_strndup_from_ucs(const char *src, const int maxlen,
81 const bool is_unicode,
82 const struct nls_table *codepage);
64#endif 83#endif
65 84
66/* 85/*
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 38491fd3871d..5e6d35804d73 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,6 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h>
38#include "cifsfs.h" 39#include "cifsfs.h"
39#include "cifspdu.h" 40#include "cifspdu.h"
40#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -66,9 +67,6 @@ unsigned int sign_CIFS_PDUs = 1;
66extern struct task_struct *oplockThread; /* remove sparse warning */ 67extern struct task_struct *oplockThread; /* remove sparse warning */
67struct task_struct *oplockThread = NULL; 68struct task_struct *oplockThread = NULL;
68/* extern struct task_struct * dnotifyThread; remove sparse warning */ 69/* extern struct task_struct * dnotifyThread; remove sparse warning */
69#ifdef CONFIG_CIFS_EXPERIMENTAL
70static struct task_struct *dnotifyThread = NULL;
71#endif
72static const struct super_operations cifs_super_ops; 70static const struct super_operations cifs_super_ops;
73unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 71unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
74module_param(CIFSMaxBufSize, int, 0); 72module_param(CIFSMaxBufSize, int, 0);
@@ -316,6 +314,7 @@ cifs_alloc_inode(struct super_block *sb)
316 cifs_inode->clientCanCacheAll = false; 314 cifs_inode->clientCanCacheAll = false;
317 cifs_inode->delete_pending = false; 315 cifs_inode->delete_pending = false;
318 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
317 cifs_inode->server_eof = 0;
319 318
320 /* Can not set i_flags here - they get immediately overwritten 319 /* Can not set i_flags here - they get immediately overwritten
321 to zero by the VFS */ 320 to zero by the VFS */
@@ -532,6 +531,7 @@ static void cifs_umount_begin(struct super_block *sb)
532 if (tcon == NULL) 531 if (tcon == NULL)
533 return; 532 return;
534 533
534 lock_kernel();
535 read_lock(&cifs_tcp_ses_lock); 535 read_lock(&cifs_tcp_ses_lock);
536 if (tcon->tc_count == 1) 536 if (tcon->tc_count == 1)
537 tcon->tidStatus = CifsExiting; 537 tcon->tidStatus = CifsExiting;
@@ -550,6 +550,7 @@ static void cifs_umount_begin(struct super_block *sb)
550 } 550 }
551/* BB FIXME - finish add checks for tidStatus BB */ 551/* BB FIXME - finish add checks for tidStatus BB */
552 552
553 unlock_kernel();
553 return; 554 return;
554} 555}
555 556
@@ -601,8 +602,7 @@ cifs_get_sb(struct file_system_type *fs_type,
601 602
602 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 603 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
603 if (rc) { 604 if (rc) {
604 up_write(&sb->s_umount); 605 deactivate_locked_super(sb);
605 deactivate_super(sb);
606 return rc; 606 return rc;
607 } 607 }
608 sb->s_flags |= MS_ACTIVE; 608 sb->s_flags |= MS_ACTIVE;
@@ -1040,34 +1040,6 @@ static int cifs_oplock_thread(void *dummyarg)
1040 return 0; 1040 return 0;
1041} 1041}
1042 1042
1043#ifdef CONFIG_CIFS_EXPERIMENTAL
1044static int cifs_dnotify_thread(void *dummyarg)
1045{
1046 struct list_head *tmp;
1047 struct TCP_Server_Info *server;
1048
1049 do {
1050 if (try_to_freeze())
1051 continue;
1052 set_current_state(TASK_INTERRUPTIBLE);
1053 schedule_timeout(15*HZ);
1054 /* check if any stuck requests that need
1055 to be woken up and wakeq so the
1056 thread can wake up and error out */
1057 read_lock(&cifs_tcp_ses_lock);
1058 list_for_each(tmp, &cifs_tcp_ses_list) {
1059 server = list_entry(tmp, struct TCP_Server_Info,
1060 tcp_ses_list);
1061 if (atomic_read(&server->inFlight))
1062 wake_up_all(&server->response_q);
1063 }
1064 read_unlock(&cifs_tcp_ses_lock);
1065 } while (!kthread_should_stop());
1066
1067 return 0;
1068}
1069#endif
1070
1071static int __init 1043static int __init
1072init_cifs(void) 1044init_cifs(void)
1073{ 1045{
@@ -1144,21 +1116,8 @@ init_cifs(void)
1144 goto out_unregister_dfs_key_type; 1116 goto out_unregister_dfs_key_type;
1145 } 1117 }
1146 1118
1147#ifdef CONFIG_CIFS_EXPERIMENTAL
1148 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
1149 if (IS_ERR(dnotifyThread)) {
1150 rc = PTR_ERR(dnotifyThread);
1151 cERROR(1, ("error %d create dnotify thread", rc));
1152 goto out_stop_oplock_thread;
1153 }
1154#endif
1155
1156 return 0; 1119 return 0;
1157 1120
1158#ifdef CONFIG_CIFS_EXPERIMENTAL
1159 out_stop_oplock_thread:
1160#endif
1161 kthread_stop(oplockThread);
1162 out_unregister_dfs_key_type: 1121 out_unregister_dfs_key_type:
1163#ifdef CONFIG_CIFS_DFS_UPCALL 1122#ifdef CONFIG_CIFS_DFS_UPCALL
1164 unregister_key_type(&key_type_dns_resolver); 1123 unregister_key_type(&key_type_dns_resolver);
@@ -1196,9 +1155,6 @@ exit_cifs(void)
1196 cifs_destroy_inodecache(); 1155 cifs_destroy_inodecache();
1197 cifs_destroy_mids(); 1156 cifs_destroy_mids();
1198 cifs_destroy_request_bufs(); 1157 cifs_destroy_request_bufs();
1199#ifdef CONFIG_CIFS_EXPERIMENTAL
1200 kthread_stop(dnotifyThread);
1201#endif
1202 kthread_stop(oplockThread); 1158 kthread_stop(oplockThread);
1203} 1159}
1204 1160
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 77e190dc2883..051b71cfdea9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
102 102
103#define CIFS_VERSION "1.57" 103#define CIFS_VERSION "1.58"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9fbf4dff5da6..a61ab772c6f6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -82,8 +82,8 @@ enum securityEnum {
82 LANMAN, /* Legacy LANMAN auth */ 82 LANMAN, /* Legacy LANMAN auth */
83 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 83 NTLM, /* Legacy NTLM012 auth with NTLM hash */
84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
85 RawNTLMSSP, /* NTLMSSP without SPNEGO */ 85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
86 NTLMSSP, /* NTLMSSP via SPNEGO */ 86 NTLMSSP, /* NTLMSSP via SPNEGO, NTLMv2 hash */
87 Kerberos, /* Kerberos via SPNEGO */ 87 Kerberos, /* Kerberos via SPNEGO */
88 MSKerberos, /* MS Kerberos via SPNEGO */ 88 MSKerberos, /* MS Kerberos via SPNEGO */
89}; 89};
@@ -350,7 +350,7 @@ struct cifsFileInfo {
350 bool invalidHandle:1; /* file closed via session abend */ 350 bool invalidHandle:1; /* file closed via session abend */
351 bool messageMode:1; /* for pipes: message vs byte mode */ 351 bool messageMode:1; /* for pipes: message vs byte mode */
352 atomic_t wrtPending; /* handle in use - defer close */ 352 atomic_t wrtPending; /* handle in use - defer close */
353 struct semaphore fh_sem; /* prevents reopen race after dead ses*/ 353 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
354 struct cifs_search_info srch_inf; 354 struct cifs_search_info srch_inf;
355}; 355};
356 356
@@ -370,6 +370,7 @@ struct cifsInodeInfo {
370 bool clientCanCacheAll:1; /* read and writebehind oplock */ 370 bool clientCanCacheAll:1; /* read and writebehind oplock */
371 bool oplockPending:1; 371 bool oplockPending:1;
372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
373 u64 server_eof; /* current file size on server */
373 struct inode vfs_inode; 374 struct inode vfs_inode;
374}; 375};
375 376
@@ -530,6 +531,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
530#define CIFSSEC_MAY_PLNTXT 0 531#define CIFSSEC_MAY_PLNTXT 0
531#endif /* weak passwords */ 532#endif /* weak passwords */
532#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ 533#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */
534#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */
533 535
534#define CIFSSEC_MUST_SIGN 0x01001 536#define CIFSSEC_MUST_SIGN 0x01001
535/* note that only one of the following can be set so the 537/* note that only one of the following can be set so the
@@ -542,22 +544,23 @@ require use of the stronger protocol */
542#define CIFSSEC_MUST_LANMAN 0x10010 544#define CIFSSEC_MUST_LANMAN 0x10010
543#define CIFSSEC_MUST_PLNTXT 0x20020 545#define CIFSSEC_MUST_PLNTXT 0x20020
544#ifdef CONFIG_CIFS_UPCALL 546#ifdef CONFIG_CIFS_UPCALL
545#define CIFSSEC_MASK 0x3F03F /* allows weak security but also krb5 */ 547#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */
546#else 548#else
547#define CIFSSEC_MASK 0x37037 /* current flags supported if weak */ 549#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */
548#endif /* UPCALL */ 550#endif /* UPCALL */
549#else /* do not allow weak pw hash */ 551#else /* do not allow weak pw hash */
550#ifdef CONFIG_CIFS_UPCALL 552#ifdef CONFIG_CIFS_UPCALL
551#define CIFSSEC_MASK 0x0F00F /* flags supported if no weak allowed */ 553#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
552#else 554#else
553#define CIFSSEC_MASK 0x07007 /* flags supported if no weak allowed */ 555#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */
554#endif /* UPCALL */ 556#endif /* UPCALL */
555#endif /* WEAK_PW_HASH */ 557#endif /* WEAK_PW_HASH */
556#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 558#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
559#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
557 560
558#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) 561#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
559#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 562#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
560#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) 563#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
561/* 564/*
562 ***************************************************************** 565 *****************************************************************
563 * All constants go here 566 * All constants go here
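The two new NTLMSSP flags follow this header's existing convention: each CIFSSEC_MUST_* value embeds the matching CIFSSEC_MAY_* bit plus a distinct high "must" bit, so a MUST test is a mask-and-compare rather than a plain AND. A minimal standalone sketch of that test, using only values visible in this hunk (the helper name is hypothetical):

    #include <stdio.h>

    #define CIFSSEC_MAY_NTLMSSP  0x00080  /* raw ntlmssp with ntlmv2 */
    #define CIFSSEC_MUST_NTLMSSP 0x80080  /* MAY bit 0x00080 | "must" bit 0x80000 */

    /* hypothetical helper mirroring the (flags & MUST) == MUST test that
     * the negotiate code in cifssmb.c performs on secFlags */
    static int must_ntlmssp(unsigned int secFlags)
    {
            return (secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP;
    }

    int main(void)
    {
            printf("%d\n", must_ntlmssp(CIFSSEC_MUST_NTLMSSP)); /* 1 */
            printf("%d\n", must_ntlmssp(CIFSSEC_MAY_NTLMSSP));  /* 0: MAY alone is not MUST */
            return 0;
    }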
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b370489c8da5..a785f69dbc9f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2163,7 +2163,7 @@ typedef struct {
2163 __le32 Type; 2163 __le32 Type;
2164 __le64 DevMajor; 2164 __le64 DevMajor;
2165 __le64 DevMinor; 2165 __le64 DevMinor;
2166 __u64 UniqueId; 2166 __le64 UniqueId;
2167 __le64 Permissions; 2167 __le64 Permissions;
2168 __le64 Nlinks; 2168 __le64 Nlinks;
2169} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ 2169} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */
@@ -2308,7 +2308,7 @@ struct unlink_psx_rq { /* level 0x20a SetPathInfo */
2308} __attribute__((packed)); 2308} __attribute__((packed));
2309 2309
2310struct file_internal_info { 2310struct file_internal_info {
2311 __u64 UniqueId; /* inode number */ 2311 __le64 UniqueId; /* inode number */
2312} __attribute__((packed)); /* level 0x3ee */ 2312} __attribute__((packed)); /* level 0x3ee */
2313 2313
2314struct file_mode_info { 2314struct file_mode_info {
@@ -2338,7 +2338,7 @@ typedef struct {
2338 __le32 Type; 2338 __le32 Type;
2339 __le64 DevMajor; 2339 __le64 DevMajor;
2340 __le64 DevMinor; 2340 __le64 DevMinor;
2341 __u64 UniqueId; 2341 __le64 UniqueId;
2342 __le64 Permissions; 2342 __le64 Permissions;
2343 __le64 Nlinks; 2343 __le64 Nlinks;
2344 char FileName[1]; 2344 char FileName[1];
@@ -2386,7 +2386,7 @@ typedef struct {
2386 __le32 FileNameLength; 2386 __le32 FileNameLength;
2387 __le32 EaSize; /* EA size */ 2387 __le32 EaSize; /* EA size */
2388 __le32 Reserved; 2388 __le32 Reserved;
2389 __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ 2389 __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
2390 char FileName[1]; 2390 char FileName[1];
2391} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ 2391} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
2392 2392
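The __u64 -> __le64 changes in this file are annotation fixes rather than layout changes: both types are 64 bits wide, but __le64 tells sparse that the field holds wire-format little-endian data, so a direct assignment to a host integer without le64_to_cpu() can be flagged (endian checking was opt-in at the time, roughly `make C=1 CHECKFLAGS="-D__CHECK_ENDIAN__"`). A userspace approximation of the idea:

    #include <stdint.h>

    /* userspace stand-in; the kernel's __le64 additionally carries a
     * sparse "__bitwise" tag that makes mixing it with plain integers
     * a checker warning */
    typedef uint64_t __le64;

    struct file_internal_info {
            __le64 UniqueId;              /* wire format: little-endian */
    } __attribute__((packed));

    /* wrong -- what the old __u64 declaration silently allowed:
     *         ino = info->UniqueId;
     * right -- what the annotation now steers every user toward:
     *         ino = le64_to_cpu(info->UniqueId);
     */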
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 4167716d32f2..fae083930eee 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -260,8 +260,7 @@ extern int CIFSUnixCreateSymLink(const int xid,
260 const struct nls_table *nls_codepage); 260 const struct nls_table *nls_codepage);
261extern int CIFSSMBUnixQuerySymLink(const int xid, 261extern int CIFSSMBUnixQuerySymLink(const int xid,
262 struct cifsTconInfo *tcon, 262 struct cifsTconInfo *tcon,
263 const unsigned char *searchName, 263 const unsigned char *searchName, char **syminfo,
264 char *syminfo, const int buflen,
265 const struct nls_table *nls_codepage); 264 const struct nls_table *nls_codepage);
266extern int CIFSSMBQueryReparseLinkInfo(const int xid, 265extern int CIFSSMBQueryReparseLinkInfo(const int xid,
267 struct cifsTconInfo *tcon, 266 struct cifsTconInfo *tcon,
@@ -307,8 +306,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
307 const unsigned char *searchName, __u64 *inode_number, 306 const unsigned char *searchName, __u64 *inode_number,
308 const struct nls_table *nls_codepage, 307 const struct nls_table *nls_codepage,
309 int remap_special_chars); 308 int remap_special_chars);
310extern int cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
311 const struct nls_table *codepage);
312extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, 309extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
313 const struct nls_table *cp, int mapChars); 310 const struct nls_table *cp, int mapChars);
314 311
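This prototype change flips CIFSSMBUnixQuerySymLink() from filling a caller-supplied fixed buffer (the buflen truncation bookkeeping is removed further down in cifssmb.c) to handing back a freshly allocated, NUL-terminated string through char **syminfo. A hedged sketch of a caller under the new contract, assuming the usual cifs locals (xid, tcon, full_path, cifs_sb):

    char *target = NULL;
    int rc;

    rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target,
                                 cifs_sb->local_nls);
    if (rc == 0) {
            /* use the NUL-terminated link target ... */
            kfree(target);        /* callee allocates, caller frees */
    }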
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bc09c998631f..5759ba53dc96 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -81,41 +81,6 @@ static struct {
81#endif /* CONFIG_CIFS_WEAK_PW_HASH */ 81#endif /* CONFIG_CIFS_WEAK_PW_HASH */
82#endif /* CIFS_POSIX */ 82#endif /* CIFS_POSIX */
83 83
84/* Allocates buffer into dst and copies smb string from src to it.
85 * caller is responsible for freeing dst if function returned 0.
86 * returns:
87 * on success - 0
88 * on failure - errno
89 */
90static int
91cifs_strncpy_to_host(char **dst, const char *src, const int maxlen,
92 const bool is_unicode, const struct nls_table *nls_codepage)
93{
94 int plen;
95
96 if (is_unicode) {
97 plen = UniStrnlen((wchar_t *)src, maxlen);
98 *dst = kmalloc(plen + 2, GFP_KERNEL);
99 if (!*dst)
100 goto cifs_strncpy_to_host_ErrExit;
101 cifs_strfromUCS_le(*dst, (__le16 *)src, plen, nls_codepage);
102 } else {
103 plen = strnlen(src, maxlen);
104 *dst = kmalloc(plen + 2, GFP_KERNEL);
105 if (!*dst)
106 goto cifs_strncpy_to_host_ErrExit;
107 strncpy(*dst, src, plen);
108 }
109 (*dst)[plen] = 0;
110 (*dst)[plen+1] = 0; /* harmless for ASCII case, needed for Unicode */
111 return 0;
112
113cifs_strncpy_to_host_ErrExit:
114 cERROR(1, ("Failed to allocate buffer for string\n"));
115 return -ENOMEM;
116}
117
118
119/* Mark as invalid, all open files on tree connections since they 84/* Mark as invalid, all open files on tree connections since they
120 were closed when session to server was lost */ 85 were closed when session to server was lost */
121static void mark_open_files_invalid(struct cifsTconInfo *pTcon) 86static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
@@ -484,6 +449,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
484 cFYI(1, ("Kerberos only mechanism, enable extended security")); 449 cFYI(1, ("Kerberos only mechanism, enable extended security"));
485 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 450 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
486 } 451 }
452#ifdef CONFIG_CIFS_EXPERIMENTAL
453 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
454 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
455 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
456 cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
457 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
458 }
459#endif
487 460
488 count = 0; 461 count = 0;
489 for (i = 0; i < CIFS_NUM_PROT; i++) { 462 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -620,6 +593,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 server->secType = NTLMv2; 593 server->secType = NTLMv2;
621 else if (secFlags & CIFSSEC_MAY_KRB5) 594 else if (secFlags & CIFSSEC_MAY_KRB5)
622 server->secType = Kerberos; 595 server->secType = Kerberos;
596 else if (secFlags & CIFSSEC_MAY_NTLMSSP)
597 server->secType = NTLMSSP;
623 else if (secFlags & CIFSSEC_MAY_LANMAN) 598 else if (secFlags & CIFSSEC_MAY_LANMAN)
624 server->secType = LANMAN; 599 server->secType = LANMAN;
625/* #ifdef CONFIG_CIFS_EXPERIMENTAL 600/* #ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -1626,6 +1601,8 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1626 int smb_hdr_len; 1601 int smb_hdr_len;
1627 int resp_buf_type = 0; 1602 int resp_buf_type = 0;
1628 1603
1604 *nbytes = 0;
1605
1629 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1606 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
1630 1607
1631 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1608 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
@@ -1682,11 +1659,9 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1682 cifs_stats_inc(&tcon->num_writes); 1659 cifs_stats_inc(&tcon->num_writes);
1683 if (rc) { 1660 if (rc) {
1684 cFYI(1, ("Send error Write2 = %d", rc)); 1661 cFYI(1, ("Send error Write2 = %d", rc));
1685 *nbytes = 0;
1686 } else if (resp_buf_type == 0) { 1662 } else if (resp_buf_type == 0) {
1687 /* presumably this can not happen, but best to be safe */ 1663 /* presumably this can not happen, but best to be safe */
1688 rc = -EIO; 1664 rc = -EIO;
1689 *nbytes = 0;
1690 } else { 1665 } else {
1691 WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; 1666 WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
1692 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1667 *nbytes = le16_to_cpu(pSMBr->CountHigh);
@@ -2417,8 +2392,7 @@ winCreateHardLinkRetry:
2417 2392
2418int 2393int
2419CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, 2394CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2420 const unsigned char *searchName, 2395 const unsigned char *searchName, char **symlinkinfo,
2421 char *symlinkinfo, const int buflen,
2422 const struct nls_table *nls_codepage) 2396 const struct nls_table *nls_codepage)
2423{ 2397{
2424/* SMB_QUERY_FILE_UNIX_LINK */ 2398/* SMB_QUERY_FILE_UNIX_LINK */
@@ -2428,6 +2402,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2428 int bytes_returned; 2402 int bytes_returned;
2429 int name_len; 2403 int name_len;
2430 __u16 params, byte_count; 2404 __u16 params, byte_count;
2405 char *data_start;
2431 2406
2432 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2407 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
2433 2408
@@ -2482,30 +2457,26 @@ querySymLinkRetry:
2482 /* decode response */ 2457 /* decode response */
2483 2458
2484 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 2459 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
2485 if (rc || (pSMBr->ByteCount < 2))
2486 /* BB also check enough total bytes returned */ 2460 /* BB also check enough total bytes returned */
2487 rc = -EIO; /* bad smb */ 2461 if (rc || (pSMBr->ByteCount < 2))
2462 rc = -EIO;
2488 else { 2463 else {
2489 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 2464 bool is_unicode;
2490 __u16 count = le16_to_cpu(pSMBr->t2.DataCount); 2465 u16 count = le16_to_cpu(pSMBr->t2.DataCount);
2466
2467 data_start = ((char *) &pSMBr->hdr.Protocol) +
2468 le16_to_cpu(pSMBr->t2.DataOffset);
2469
2470 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
2471 is_unicode = true;
2472 else
2473 is_unicode = false;
2491 2474
2492 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
2493 name_len = UniStrnlen((wchar_t *) ((char *)
2494 &pSMBr->hdr.Protocol + data_offset),
2495 min_t(const int, buflen, count) / 2);
2496 /* BB FIXME investigate remapping reserved chars here */ 2475 /* BB FIXME investigate remapping reserved chars here */
2497 cifs_strfromUCS_le(symlinkinfo, 2476 *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
2498 (__le16 *) ((char *)&pSMBr->hdr.Protocol 2477 is_unicode, nls_codepage);
2499 + data_offset), 2478 if (!symlinkinfo)
2500 name_len, nls_codepage); 2479 rc = -ENOMEM;
2501 } else {
2502 strncpy(symlinkinfo,
2503 (char *) &pSMBr->hdr.Protocol +
2504 data_offset,
2505 min_t(const int, buflen, count));
2506 }
2507 symlinkinfo[buflen] = 0;
2508 /* just in case so calling code does not go off the end of buffer */
2509 } 2480 }
2510 } 2481 }
2511 cifs_buf_release(pSMB); 2482 cifs_buf_release(pSMB);
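The rewritten decode above follows the usual shape for CIFS transact2 responses: validate the fixed part, locate the variable data at hdr.Protocol plus the little-endian DataOffset, record DataCount, and branch on SMBFLG2_UNICODE before touching the bytes. The same logic reassembled as straight-line code (field names as in this file; the bounds-check FIXME from the original still applies):

    u16 count = le16_to_cpu(pSMBr->t2.DataCount);
    char *data_start = (char *)&pSMBr->hdr.Protocol +
                       le16_to_cpu(pSMBr->t2.DataOffset);
    bool is_unicode = (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) != 0;

    /* one shared helper replaces the separate unicode/ASCII copies
     * and the caller-sized buffer */
    *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
                                         is_unicode, nls_codepage);
    if (!*symlinkinfo)
            rc = -ENOMEM;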
@@ -2603,7 +2574,6 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2603 *pparmlen = parm_count; 2574 *pparmlen = parm_count;
2604 return 0; 2575 return 0;
2605} 2576}
2606#endif /* CIFS_EXPERIMENTAL */
2607 2577
2608int 2578int
2609CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2579CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
@@ -2613,7 +2583,6 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2613{ 2583{
2614 int rc = 0; 2584 int rc = 0;
2615 int bytes_returned; 2585 int bytes_returned;
2616 int name_len;
2617 struct smb_com_transaction_ioctl_req *pSMB; 2586 struct smb_com_transaction_ioctl_req *pSMB;
2618 struct smb_com_transaction_ioctl_rsp *pSMBr; 2587 struct smb_com_transaction_ioctl_rsp *pSMBr;
2619 2588
@@ -2650,59 +2619,55 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2650 } else { /* decode response */ 2619 } else { /* decode response */
2651 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2620 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2652 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2621 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
2653 if ((pSMBr->ByteCount < 2) || (data_offset > 512)) 2622 if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
2654 /* BB also check enough total bytes returned */ 2623 /* BB also check enough total bytes returned */
2655 rc = -EIO; /* bad smb */ 2624 rc = -EIO; /* bad smb */
2656 else { 2625 goto qreparse_out;
2657 if (data_count && (data_count < 2048)) { 2626 }
2658 char *end_of_smb = 2 /* sizeof byte count */ + 2627 if (data_count && (data_count < 2048)) {
2659 pSMBr->ByteCount + 2628 char *end_of_smb = 2 /* sizeof byte count */ +
2660 (char *)&pSMBr->ByteCount; 2629 pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
2661 2630
2662 struct reparse_data *reparse_buf = 2631 struct reparse_data *reparse_buf =
2663 (struct reparse_data *) 2632 (struct reparse_data *)
2664 ((char *)&pSMBr->hdr.Protocol 2633 ((char *)&pSMBr->hdr.Protocol
2665 + data_offset); 2634 + data_offset);
2666 if ((char *)reparse_buf >= end_of_smb) { 2635 if ((char *)reparse_buf >= end_of_smb) {
2667 rc = -EIO; 2636 rc = -EIO;
2668 goto qreparse_out; 2637 goto qreparse_out;
2669 } 2638 }
2670 if ((reparse_buf->LinkNamesBuf + 2639 if ((reparse_buf->LinkNamesBuf +
2671 reparse_buf->TargetNameOffset + 2640 reparse_buf->TargetNameOffset +
2672 reparse_buf->TargetNameLen) > 2641 reparse_buf->TargetNameLen) > end_of_smb) {
2673 end_of_smb) { 2642 cFYI(1, ("reparse buf beyond SMB"));
2674 cFYI(1, ("reparse buf beyond SMB")); 2643 rc = -EIO;
2675 rc = -EIO; 2644 goto qreparse_out;
2676 goto qreparse_out; 2645 }
2677 }
2678 2646
2679 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { 2647 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
2680 name_len = UniStrnlen((wchar_t *) 2648 cifs_from_ucs2(symlinkinfo, (__le16 *)
2681 (reparse_buf->LinkNamesBuf + 2649 (reparse_buf->LinkNamesBuf +
2682 reparse_buf->TargetNameOffset), 2650 reparse_buf->TargetNameOffset),
2683 min(buflen/2, 2651 buflen,
2684 reparse_buf->TargetNameLen / 2)); 2652 reparse_buf->TargetNameLen,
2685 cifs_strfromUCS_le(symlinkinfo, 2653 nls_codepage, 0);
2686 (__le16 *) (reparse_buf->LinkNamesBuf + 2654 } else { /* ASCII names */
2687 reparse_buf->TargetNameOffset), 2655 strncpy(symlinkinfo,
2688 name_len, nls_codepage); 2656 reparse_buf->LinkNamesBuf +
2689 } else { /* ASCII names */ 2657 reparse_buf->TargetNameOffset,
2690 strncpy(symlinkinfo, 2658 min_t(const int, buflen,
2691 reparse_buf->LinkNamesBuf + 2659 reparse_buf->TargetNameLen));
2692 reparse_buf->TargetNameOffset,
2693 min_t(const int, buflen,
2694 reparse_buf->TargetNameLen));
2695 }
2696 } else {
2697 rc = -EIO;
2698 cFYI(1, ("Invalid return data count on "
2699 "get reparse info ioctl"));
2700 } 2660 }
2701 symlinkinfo[buflen] = 0; /* just in case so the caller 2661 } else {
2702 does not go off the end of the buffer */ 2662 rc = -EIO;
2703 cFYI(1, ("readlink result - %s", symlinkinfo)); 2663 cFYI(1, ("Invalid return data count on "
2664 "get reparse info ioctl"));
2704 } 2665 }
2666 symlinkinfo[buflen] = 0; /* just in case so the caller
2667 does not go off the end of the buffer */
2668 cFYI(1, ("readlink result - %s", symlinkinfo));
2705 } 2669 }
2670
2706qreparse_out: 2671qreparse_out:
2707 cifs_buf_release(pSMB); 2672 cifs_buf_release(pSMB);
2708 2673
@@ -2711,6 +2676,7 @@ qreparse_out:
2711 2676
2712 return rc; 2677 return rc;
2713} 2678}
2679#endif /* CIFS_EXPERIMENTAL */
2714 2680
2715#ifdef CONFIG_CIFS_POSIX 2681#ifdef CONFIG_CIFS_POSIX
2716 2682
@@ -3918,7 +3884,7 @@ GetInodeNumberRetry:
3918 } 3884 }
3919 pfinfo = (struct file_internal_info *) 3885 pfinfo = (struct file_internal_info *)
3920 (data_offset + (char *) &pSMBr->hdr.Protocol); 3886 (data_offset + (char *) &pSMBr->hdr.Protocol);
3921 *inode_number = pfinfo->UniqueId; 3887 *inode_number = le64_to_cpu(pfinfo->UniqueId);
3922 } 3888 }
3923 } 3889 }
3924GetInodeNumOut: 3890GetInodeNumOut:
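Before this hunk, `*inode_number = pfinfo->UniqueId;` copied the little-endian wire value straight into a host integer, which is correct only on little-endian CPUs; a big-endian client got byte-swapped inode numbers. A standalone demonstration of what le64_to_cpu() must do on such a host:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* decode a little-endian 64-bit wire field portably, which is all
     * le64_to_cpu() does (it compiles to a no-op on little-endian CPUs) */
    static uint64_t get_le64(const unsigned char *p)
    {
            uint64_t v = 0;
            for (int i = 7; i >= 0; i--)
                    v = (v << 8) | p[i];
            return v;
    }

    int main(void)
    {
            /* inode number 2 as it appears on the wire */
            unsigned char wire[8] = { 0x02, 0, 0, 0, 0, 0, 0, 0 };
            uint64_t ino;

            memcpy(&ino, wire, 8);  /* the old, unconverted copy */
            printf("raw copy:  %llu\n", (unsigned long long)ino); /* 2 only on LE hosts */
            printf("converted: %llu\n", (unsigned long long)get_le64(wire)); /* 2 always */
            return 0;
    }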
@@ -3928,27 +3894,6 @@ GetInodeNumOut:
3928 return rc; 3894 return rc;
3929} 3895}
3930 3896
3931/* computes length of UCS string converted to host codepage
3932 * @src: UCS string
3933 * @maxlen: length of the input string in UCS characters
3934 * (not in bytes)
3935 *
3936 * return: size of input string in host codepage
3937 */
3938static int hostlen_fromUCS(const __le16 *src, const int maxlen,
3939 const struct nls_table *nls_codepage) {
3940 int i;
3941 int hostlen = 0;
3942 char to[4];
3943 int charlen;
3944 for (i = 0; (i < maxlen) && src[i]; ++i) {
3945 charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
3946 to, NLS_MAX_CHARSET_SIZE);
3947 hostlen += charlen > 0 ? charlen : 1;
3948 }
3949 return hostlen;
3950}
3951
3952/* parses DFS referral V3 structure 3897
3953 * caller is responsible for freeing target_nodes 3898 * caller is responsible for freeing target_nodes
3954 * returns: 3899 * returns:
@@ -3994,7 +3939,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3994 3939
3995 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 3940 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
3996 *num_of_nodes, 3941 *num_of_nodes,
3997 le16_to_cpu(pSMBr->DFSFlags))); 3942 le32_to_cpu(pSMBr->DFSFlags)));
3998 3943
3999 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 3944 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4000 *num_of_nodes, GFP_KERNEL); 3945 *num_of_nodes, GFP_KERNEL);
@@ -4010,14 +3955,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4010 int max_len; 3955 int max_len;
4011 struct dfs_info3_param *node = (*target_nodes)+i; 3956 struct dfs_info3_param *node = (*target_nodes)+i;
4012 3957
4013 node->flags = le16_to_cpu(pSMBr->DFSFlags); 3958 node->flags = le32_to_cpu(pSMBr->DFSFlags);
4014 if (is_unicode) { 3959 if (is_unicode) {
4015 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, 3960 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
4016 GFP_KERNEL); 3961 GFP_KERNEL);
4017 cifsConvertToUCS((__le16 *) tmp, searchName, 3962 cifsConvertToUCS((__le16 *) tmp, searchName,
4018 PATH_MAX, nls_codepage, remap); 3963 PATH_MAX, nls_codepage, remap);
4019 node->path_consumed = hostlen_fromUCS(tmp, 3964 node->path_consumed = cifs_ucs2_bytes(tmp,
4020 le16_to_cpu(pSMBr->PathConsumed)/2, 3965 le16_to_cpu(pSMBr->PathConsumed),
4021 nls_codepage); 3966 nls_codepage);
4022 kfree(tmp); 3967 kfree(tmp);
4023 } else 3968 } else
@@ -4029,20 +3974,20 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4029 /* copy DfsPath */ 3974 /* copy DfsPath */
4030 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); 3975 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
4031 max_len = data_end - temp; 3976 max_len = data_end - temp;
4032 rc = cifs_strncpy_to_host(&(node->path_name), temp, 3977 node->path_name = cifs_strndup_from_ucs(temp, max_len,
4033 max_len, is_unicode, nls_codepage); 3978 is_unicode, nls_codepage);
4034 if (rc) 3979 if (!node->path_name) {
3980 rc = -ENOMEM;
4035 goto parse_DFS_referrals_exit; 3981 goto parse_DFS_referrals_exit;
3982 }
4036 3983
4037 /* copy link target UNC */ 3984 /* copy link target UNC */
4038 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); 3985 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
4039 max_len = data_end - temp; 3986 max_len = data_end - temp;
4040 rc = cifs_strncpy_to_host(&(node->node_name), temp, 3987 node->node_name = cifs_strndup_from_ucs(temp, max_len,
4041 max_len, is_unicode, nls_codepage); 3988 is_unicode, nls_codepage);
4042 if (rc) 3989 if (!node->node_name)
4043 goto parse_DFS_referrals_exit; 3990 rc = -ENOMEM;
4044
4045 ref += le16_to_cpu(ref->Size);
4046 } 3991 }
4047 3992
4048parse_DFS_referrals_exit: 3993parse_DFS_referrals_exit:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0de3b5615a22..4aa81a507b74 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/connect.c 2 * fs/cifs/connect.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/pagevec.h> 33#include <linux/pagevec.h>
34#include <linux/freezer.h> 34#include <linux/freezer.h>
35#include <linux/namei.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/processor.h> 37#include <asm/processor.h>
37#include <net/ipv6.h> 38#include <net/ipv6.h>
@@ -978,6 +979,13 @@ cifs_parse_mount_options(char *options, const char *devname,
978 return 1; 979 return 1;
979 } else if (strnicmp(value, "krb5", 4) == 0) { 980 } else if (strnicmp(value, "krb5", 4) == 0) {
980 vol->secFlg |= CIFSSEC_MAY_KRB5; 981 vol->secFlg |= CIFSSEC_MAY_KRB5;
982#ifdef CONFIG_CIFS_EXPERIMENTAL
983 } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
984 vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
985 CIFSSEC_MUST_SIGN;
986 } else if (strnicmp(value, "ntlmssp", 7) == 0) {
987 vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
988#endif
981 } else if (strnicmp(value, "ntlmv2i", 7) == 0) { 989 } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
982 vol->secFlg |= CIFSSEC_MAY_NTLMV2 | 990 vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
983 CIFSSEC_MUST_SIGN; 991 CIFSSEC_MUST_SIGN;
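Note the ordering of the two new cases: "ntlmsspi" must be tested before "ntlmssp", because strnicmp() with an explicit length is a prefix match and `sec=ntlmsspi` would otherwise be consumed by the shorter pattern, silently dropping CIFSSEC_MUST_SIGN. The pre-existing "ntlmv2i"/"ntlmv2" pair just below relies on the same ordering. A standalone illustration (strncasecmp() is the userspace spelling of strnicmp()):

    #include <stdio.h>
    #include <strings.h>

    int main(void)
    {
            const char *value = "ntlmsspi";

            /* wrong order: the 7-byte prefix test matches "ntlmsspi" too */
            if (strncasecmp(value, "ntlmssp", 7) == 0)
                    printf("matched ntlmssp first - signing flag lost\n");

            /* right order: test the longer, more specific option first */
            if (strncasecmp(value, "ntlmsspi", 8) == 0)
                    printf("matched ntlmsspi - sets MUST_SIGN as intended\n");
            return 0;
    }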
@@ -2214,9 +2222,58 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
2214 return rc; 2222 return rc;
2215} 2223}
2216 2224
2225static void
2226cleanup_volume_info(struct smb_vol **pvolume_info)
2227{
2228 struct smb_vol *volume_info;
2229
2230 if (!pvolume_info || !*pvolume_info)
2231 return;
2232
2233 volume_info = *pvolume_info;
2234 kzfree(volume_info->password);
2235 kfree(volume_info->UNC);
2236 kfree(volume_info->prepath);
2237 kfree(volume_info);
2238 *pvolume_info = NULL;
2239 return;
2240}
2241
2242#ifdef CONFIG_CIFS_DFS_UPCALL
2243/* build_unc_path_to_root returns full path to root when
2244 * we do not have an existing connection (tcon) */
2245static char *
2246build_unc_path_to_root(const struct smb_vol *volume_info,
2247 const struct cifs_sb_info *cifs_sb)
2248{
2249 char *full_path;
2250
2251 int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
2252 full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL);
2253 if (full_path == NULL)
2254 return ERR_PTR(-ENOMEM);
2255
2256 strncpy(full_path, volume_info->UNC, unc_len);
2257 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
2258 int i;
2259 for (i = 0; i < unc_len; i++) {
2260 if (full_path[i] == '\\')
2261 full_path[i] = '/';
2262 }
2263 }
2264
2265 if (cifs_sb->prepathlen)
2266 strncpy(full_path + unc_len, cifs_sb->prepath,
2267 cifs_sb->prepathlen);
2268
2269 full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
2270 return full_path;
2271}
2272#endif
2273
2217int 2274int
2218cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2275cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2219 char *mount_data, const char *devname) 2276 char *mount_data_global, const char *devname)
2220{ 2277{
2221 int rc = 0; 2278 int rc = 0;
2222 int xid; 2279 int xid;
@@ -2225,6 +2282,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2225 struct cifsTconInfo *tcon = NULL; 2282 struct cifsTconInfo *tcon = NULL;
2226 struct TCP_Server_Info *srvTcp = NULL; 2283 struct TCP_Server_Info *srvTcp = NULL;
2227 char *full_path; 2284 char *full_path;
2285 char *mount_data = mount_data_global;
2286#ifdef CONFIG_CIFS_DFS_UPCALL
2287 struct dfs_info3_param *referrals = NULL;
2288 unsigned int num_referrals = 0;
2289 int referral_walks_count = 0;
2290try_mount_again:
2291#endif
2292 full_path = NULL;
2228 2293
2229 xid = GetXid(); 2294 xid = GetXid();
2230 2295
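Within the new cleanup_volume_info() above, the password is released with kzfree() while the other strings use plain kfree(): kzfree() zeroes the allocation before returning it to the allocator, so mount credentials do not linger in reusable kernel memory. A hypothetical userspace analogue of the idiom:

    #include <stdlib.h>
    #include <string.h>

    /* hypothetical userspace analogue of kzfree(): scrub, then free */
    static void zfree(char *secret)
    {
            if (secret == NULL)
                    return;
            /* volatile pointer keeps the compiler from eliding the wipe */
            volatile char *p = secret;
            size_t n = strlen(secret);
            while (n--)
                    *p++ = 0;
            free(secret);
    }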
@@ -2371,11 +2436,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2371 } 2436 }
2372 } 2437 }
2373 2438
2374 /* check for null share name ie connect to dfs root */
2375 if ((strchr(volume_info->UNC + 3, '\\') == NULL) 2439 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2376 && (strchr(volume_info->UNC + 3, '/') == NULL)) { 2440 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2377 /* rc = connect_to_dfs_path(...) */ 2441 cERROR(1, ("Missing share name"));
2378 cFYI(1, ("DFS root not supported"));
2379 rc = -ENODEV; 2442 rc = -ENODEV;
2380 goto mount_fail_check; 2443 goto mount_fail_check;
2381 } else { 2444 } else {
@@ -2392,7 +2455,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2392 } 2455 }
2393 } 2456 }
2394 if (rc) 2457 if (rc)
2395 goto mount_fail_check; 2458 goto remote_path_check;
2396 tcon->seal = volume_info->seal; 2459 tcon->seal = volume_info->seal;
2397 write_lock(&cifs_tcp_ses_lock); 2460 write_lock(&cifs_tcp_ses_lock);
2398 list_add(&tcon->tcon_list, &pSesInfo->tcon_list); 2461 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
@@ -2417,19 +2480,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2417 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2480 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2418 sb->s_time_gran = 100; 2481 sb->s_time_gran = 100;
2419 2482
2420mount_fail_check: 2483 if (rc)
2421 /* on error free sesinfo and tcon struct if needed */ 2484 goto remote_path_check;
2422 if (rc) { 2485
2423 /* If find_unc succeeded then rc == 0 so we can not end */
2424 /* up accidently freeing someone elses tcon struct */
2425 if (tcon)
2426 cifs_put_tcon(tcon);
2427 else if (pSesInfo)
2428 cifs_put_smb_ses(pSesInfo);
2429 else
2430 cifs_put_tcp_session(srvTcp);
2431 goto out;
2432 }
2433 cifs_sb->tcon = tcon; 2486 cifs_sb->tcon = tcon;
2434 2487
2435 /* do not care if following two calls succeed - informational */ 2488 /* do not care if following two calls succeed - informational */
@@ -2461,7 +2514,9 @@ mount_fail_check:
2461 cifs_sb->rsize = min(cifs_sb->rsize, 2514 cifs_sb->rsize = min(cifs_sb->rsize,
2462 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); 2515 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
2463 2516
2464 if (!rc && cifs_sb->prepathlen) { 2517remote_path_check:
2518 /* check if a whole path (including prepath) is not remote */
2519 if (!rc && cifs_sb->prepathlen && tcon) {
2465 /* build_path_to_root works only when we have a valid tcon */ 2520 /* build_path_to_root works only when we have a valid tcon */
2466 full_path = cifs_build_path_to_root(cifs_sb); 2521 full_path = cifs_build_path_to_root(cifs_sb);
2467 if (full_path == NULL) { 2522 if (full_path == NULL) {
@@ -2469,1079 +2524,91 @@ mount_fail_check:
2469 goto mount_fail_check; 2524 goto mount_fail_check;
2470 } 2525 }
2471 rc = is_path_accessible(xid, tcon, cifs_sb, full_path); 2526 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2472 if (rc) { 2527 if (rc != -EREMOTE) {
2473 cERROR(1, ("Path %s is not accessible: %d",
2474 full_path, rc));
2475 kfree(full_path); 2528 kfree(full_path);
2476 goto mount_fail_check; 2529 goto mount_fail_check;
2477 } 2530 }
2478 kfree(full_path); 2531 kfree(full_path);
2479 } 2532 }
2480 2533
2481 /* volume_info->password is freed above when existing session found 2534 /* get referral if needed */
2482 (in which case it is not needed anymore) but when new sesion is created 2535 if (rc == -EREMOTE) {
2483 the password ptr is put in the new session structure (in which case the 2536#ifdef CONFIG_CIFS_DFS_UPCALL
2484 password will be freed at unmount time) */ 2537 if (referral_walks_count > MAX_NESTED_LINKS) {
2485out: 2538 /*
2486 /* zero out password before freeing */ 2539 * BB: when we implement proper loop detection,
2487 if (volume_info) { 2540 * we will remove this check. But now we need it
2488 if (volume_info->password != NULL) { 2541 * to prevent an indefinite loop if 'DFS tree' is
2489 memset(volume_info->password, 0, 2542 * misconfigured (i.e. has loops).
2490 strlen(volume_info->password)); 2543 */
2491 kfree(volume_info->password); 2544 rc = -ELOOP;
2492 } 2545 goto mount_fail_check;
2493 kfree(volume_info->UNC);
2494 kfree(volume_info->prepath);
2495 kfree(volume_info);
2496 }
2497 FreeXid(xid);
2498 return rc;
2499}
2500
2501static int
2502CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2503 char session_key[CIFS_SESS_KEY_SIZE],
2504 const struct nls_table *nls_codepage)
2505{
2506 struct smb_hdr *smb_buffer;
2507 struct smb_hdr *smb_buffer_response;
2508 SESSION_SETUP_ANDX *pSMB;
2509 SESSION_SETUP_ANDX *pSMBr;
2510 char *bcc_ptr;
2511 char *user;
2512 char *domain;
2513 int rc = 0;
2514 int remaining_words = 0;
2515 int bytes_returned = 0;
2516 int len;
2517 __u32 capabilities;
2518 __u16 count;
2519
2520 cFYI(1, ("In sesssetup"));
2521 if (ses == NULL)
2522 return -EINVAL;
2523 user = ses->userName;
2524 domain = ses->domainName;
2525 smb_buffer = cifs_buf_get();
2526
2527 if (smb_buffer == NULL)
2528 return -ENOMEM;
2529
2530 smb_buffer_response = smb_buffer;
2531 pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
2532
2533 /* send SMBsessionSetup here */
2534 header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
2535 NULL /* no tCon exists yet */ , 13 /* wct */ );
2536
2537 smb_buffer->Mid = GetNextMid(ses->server);
2538 pSMB->req_no_secext.AndXCommand = 0xFF;
2539 pSMB->req_no_secext.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
2540 pSMB->req_no_secext.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
2541
2542 if (ses->server->secMode &
2543 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
2544 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
2545
2546 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
2547 CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
2548 if (ses->capabilities & CAP_UNICODE) {
2549 smb_buffer->Flags2 |= SMBFLG2_UNICODE;
2550 capabilities |= CAP_UNICODE;
2551 }
2552 if (ses->capabilities & CAP_STATUS32) {
2553 smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
2554 capabilities |= CAP_STATUS32;
2555 }
2556 if (ses->capabilities & CAP_DFS) {
2557 smb_buffer->Flags2 |= SMBFLG2_DFS;
2558 capabilities |= CAP_DFS;
2559 }
2560 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
2561
2562 pSMB->req_no_secext.CaseInsensitivePasswordLength =
2563 cpu_to_le16(CIFS_SESS_KEY_SIZE);
2564
2565 pSMB->req_no_secext.CaseSensitivePasswordLength =
2566 cpu_to_le16(CIFS_SESS_KEY_SIZE);
2567 bcc_ptr = pByteArea(smb_buffer);
2568 memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
2569 bcc_ptr += CIFS_SESS_KEY_SIZE;
2570 memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
2571 bcc_ptr += CIFS_SESS_KEY_SIZE;
2572
2573 if (ses->capabilities & CAP_UNICODE) {
2574 if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */
2575 *bcc_ptr = 0;
2576 bcc_ptr++;
2577 }
2578 if (user == NULL)
2579 bytes_returned = 0; /* skip null user */
2580 else
2581 bytes_returned =
2582 cifs_strtoUCS((__le16 *) bcc_ptr, user, 100,
2583 nls_codepage);
2584 /* convert number of 16 bit words to bytes */
2585 bcc_ptr += 2 * bytes_returned;
2586 bcc_ptr += 2; /* trailing null */
2587 if (domain == NULL)
2588 bytes_returned =
2589 cifs_strtoUCS((__le16 *) bcc_ptr,
2590 "CIFS_LINUX_DOM", 32, nls_codepage);
2591 else
2592 bytes_returned =
2593 cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
2594 nls_codepage);
2595 bcc_ptr += 2 * bytes_returned;
2596 bcc_ptr += 2;
2597 bytes_returned =
2598 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
2599 32, nls_codepage);
2600 bcc_ptr += 2 * bytes_returned;
2601 bytes_returned =
2602 cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release,
2603 32, nls_codepage);
2604 bcc_ptr += 2 * bytes_returned;
2605 bcc_ptr += 2;
2606 bytes_returned =
2607 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
2608 64, nls_codepage);
2609 bcc_ptr += 2 * bytes_returned;
2610 bcc_ptr += 2;
2611 } else {
2612 if (user != NULL) {
2613 strncpy(bcc_ptr, user, 200);
2614 bcc_ptr += strnlen(user, 200);
2615 }
2616 *bcc_ptr = 0;
2617 bcc_ptr++;
2618 if (domain == NULL) {
2619 strcpy(bcc_ptr, "CIFS_LINUX_DOM");
2620 bcc_ptr += strlen("CIFS_LINUX_DOM") + 1;
2621 } else {
2622 strncpy(bcc_ptr, domain, 64);
2623 bcc_ptr += strnlen(domain, 64);
2624 *bcc_ptr = 0;
2625 bcc_ptr++;
2626 }
2627 strcpy(bcc_ptr, "Linux version ");
2628 bcc_ptr += strlen("Linux version ");
2629 strcpy(bcc_ptr, utsname()->release);
2630 bcc_ptr += strlen(utsname()->release) + 1;
2631 strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
2632 bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
2633 }
2634 count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
2635 smb_buffer->smb_buf_length += count;
2636 pSMB->req_no_secext.ByteCount = cpu_to_le16(count);
2637
2638 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
2639 &bytes_returned, CIFS_LONG_OP);
2640 if (rc) {
2641/* rc = map_smb_to_linux_error(smb_buffer_response); now done in SendReceive */
2642 } else if ((smb_buffer_response->WordCount == 3)
2643 || (smb_buffer_response->WordCount == 4)) {
2644 __u16 action = le16_to_cpu(pSMBr->resp.Action);
2645 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
2646 if (action & GUEST_LOGIN)
2647 cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
2648 ses->Suid = smb_buffer_response->Uid; /* UID left in wire format
2649 (little endian) */
2650 cFYI(1, ("UID = %d ", ses->Suid));
2651 /* response can have either 3 or 4 word count - Samba sends 3 */
2652 bcc_ptr = pByteArea(smb_buffer_response);
2653 if ((pSMBr->resp.hdr.WordCount == 3)
2654 || ((pSMBr->resp.hdr.WordCount == 4)
2655 && (blob_len < pSMBr->resp.ByteCount))) {
2656 if (pSMBr->resp.hdr.WordCount == 4)
2657 bcc_ptr += blob_len;
2658
2659 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
2660 if ((long) (bcc_ptr) % 2) {
2661 remaining_words =
2662 (BCC(smb_buffer_response) - 1) / 2;
2663 /* Unicode strings must be word
2664 aligned */
2665 bcc_ptr++;
2666 } else {
2667 remaining_words =
2668 BCC(smb_buffer_response) / 2;
2669 }
2670 len =
2671 UniStrnlen((wchar_t *) bcc_ptr,
2672 remaining_words - 1);
2673/* We look for obvious messed up bcc or strings in response so we do not go off
2674 the end since (at least) WIN2K and Windows XP have a major bug in not null
2675 terminating last Unicode string in response */
2676 if (ses->serverOS)
2677 kfree(ses->serverOS);
2678 ses->serverOS = kzalloc(2 * (len + 1),
2679 GFP_KERNEL);
2680 if (ses->serverOS == NULL)
2681 goto sesssetup_nomem;
2682 cifs_strfromUCS_le(ses->serverOS,
2683 (__le16 *)bcc_ptr,
2684 len, nls_codepage);
2685 bcc_ptr += 2 * (len + 1);
2686 remaining_words -= len + 1;
2687 ses->serverOS[2 * len] = 0;
2688 ses->serverOS[1 + (2 * len)] = 0;
2689 if (remaining_words > 0) {
2690 len = UniStrnlen((wchar_t *)bcc_ptr,
2691 remaining_words-1);
2692 kfree(ses->serverNOS);
2693 ses->serverNOS = kzalloc(2 * (len + 1),
2694 GFP_KERNEL);
2695 if (ses->serverNOS == NULL)
2696 goto sesssetup_nomem;
2697 cifs_strfromUCS_le(ses->serverNOS,
2698 (__le16 *)bcc_ptr,
2699 len, nls_codepage);
2700 bcc_ptr += 2 * (len + 1);
2701 ses->serverNOS[2 * len] = 0;
2702 ses->serverNOS[1 + (2 * len)] = 0;
2703 if (strncmp(ses->serverNOS,
2704 "NT LAN Manager 4", 16) == 0) {
2705 cFYI(1, ("NT4 server"));
2706 ses->flags |= CIFS_SES_NT4;
2707 }
2708 remaining_words -= len + 1;
2709 if (remaining_words > 0) {
2710 len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
2711 /* last string is not always null terminated
2712 (for e.g. for Windows XP & 2000) */
2713 if (ses->serverDomain)
2714 kfree(ses->serverDomain);
2715 ses->serverDomain =
2716 kzalloc(2*(len+1),
2717 GFP_KERNEL);
2718 if (ses->serverDomain == NULL)
2719 goto sesssetup_nomem;
2720 cifs_strfromUCS_le(ses->serverDomain,
2721 (__le16 *)bcc_ptr,
2722 len, nls_codepage);
2723 bcc_ptr += 2 * (len + 1);
2724 ses->serverDomain[2*len] = 0;
2725 ses->serverDomain[1+(2*len)] = 0;
2726 } else { /* else no more room so create
2727 dummy domain string */
2728 if (ses->serverDomain)
2729 kfree(ses->serverDomain);
2730 ses->serverDomain =
2731 kzalloc(2, GFP_KERNEL);
2732 }
2733 } else { /* no room so create dummy domain
2734 and NOS string */
2735
2736 /* if these kcallocs fail not much we
2737 can do, but better to not fail the
2738 sesssetup itself */
2739 kfree(ses->serverDomain);
2740 ses->serverDomain =
2741 kzalloc(2, GFP_KERNEL);
2742 kfree(ses->serverNOS);
2743 ses->serverNOS =
2744 kzalloc(2, GFP_KERNEL);
2745 }
2746 } else { /* ASCII */
2747 len = strnlen(bcc_ptr, 1024);
2748 if (((long) bcc_ptr + len) - (long)
2749 pByteArea(smb_buffer_response)
2750 <= BCC(smb_buffer_response)) {
2751 kfree(ses->serverOS);
2752 ses->serverOS = kzalloc(len + 1,
2753 GFP_KERNEL);
2754 if (ses->serverOS == NULL)
2755 goto sesssetup_nomem;
2756 strncpy(ses->serverOS, bcc_ptr, len);
2757
2758 bcc_ptr += len;
2759 /* null terminate the string */
2760 bcc_ptr[0] = 0;
2761 bcc_ptr++;
2762
2763 len = strnlen(bcc_ptr, 1024);
2764 kfree(ses->serverNOS);
2765 ses->serverNOS = kzalloc(len + 1,
2766 GFP_KERNEL);
2767 if (ses->serverNOS == NULL)
2768 goto sesssetup_nomem;
2769 strncpy(ses->serverNOS, bcc_ptr, len);
2770 bcc_ptr += len;
2771 bcc_ptr[0] = 0;
2772 bcc_ptr++;
2773
2774 len = strnlen(bcc_ptr, 1024);
2775 if (ses->serverDomain)
2776 kfree(ses->serverDomain);
2777 ses->serverDomain = kzalloc(len + 1,
2778 GFP_KERNEL);
2779 if (ses->serverDomain == NULL)
2780 goto sesssetup_nomem;
2781 strncpy(ses->serverDomain, bcc_ptr,
2782 len);
2783 bcc_ptr += len;
2784 bcc_ptr[0] = 0;
2785 bcc_ptr++;
2786 } else
2787 cFYI(1,
2788 ("Variable field of length %d "
2789 "extends beyond end of smb ",
2790 len));
2791 }
2792 } else {
2793 cERROR(1, ("Security Blob Length extends beyond "
2794 "end of SMB"));
2795 } 2546 }
2796 } else { 2547 /* convert forward to back slashes in prepath here if needed */
2797 cERROR(1, ("Invalid Word count %d: ", 2548 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
2798 smb_buffer_response->WordCount)); 2549 convert_delimiter(cifs_sb->prepath,
2799 rc = -EIO; 2550 CIFS_DIR_SEP(cifs_sb));
2800 } 2551 full_path = build_unc_path_to_root(volume_info, cifs_sb);
2801sesssetup_nomem: /* do not return an error on nomem for the info strings, 2552 if (IS_ERR(full_path)) {
2802 since that could make reconnection harder, and 2553 rc = PTR_ERR(full_path);
2803 reconnection might be needed to free memory */ 2554 goto mount_fail_check;
2804 cifs_buf_release(smb_buffer);
2805
2806 return rc;
2807}
2808
2809static int
2810CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2811 struct cifsSesInfo *ses, bool *pNTLMv2_flag,
2812 const struct nls_table *nls_codepage)
2813{
2814 struct smb_hdr *smb_buffer;
2815 struct smb_hdr *smb_buffer_response;
2816 SESSION_SETUP_ANDX *pSMB;
2817 SESSION_SETUP_ANDX *pSMBr;
2818 char *bcc_ptr;
2819 char *domain;
2820 int rc = 0;
2821 int remaining_words = 0;
2822 int bytes_returned = 0;
2823 int len;
2824 int SecurityBlobLength = sizeof(NEGOTIATE_MESSAGE);
2825 PNEGOTIATE_MESSAGE SecurityBlob;
2826 PCHALLENGE_MESSAGE SecurityBlob2;
2827 __u32 negotiate_flags, capabilities;
2828 __u16 count;
2829
2830 cFYI(1, ("In NTLMSSP sesssetup (negotiate)"));
2831 if (ses == NULL)
2832 return -EINVAL;
2833 domain = ses->domainName;
2834 *pNTLMv2_flag = false;
2835 smb_buffer = cifs_buf_get();
2836 if (smb_buffer == NULL) {
2837 return -ENOMEM;
2838 }
2839 smb_buffer_response = smb_buffer;
2840 pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
2841 pSMBr = (SESSION_SETUP_ANDX *) smb_buffer_response;
2842
2843 /* send SMBsessionSetup here */
2844 header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
2845 NULL /* no tCon exists yet */ , 12 /* wct */ );
2846
2847 smb_buffer->Mid = GetNextMid(ses->server);
2848 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
2849 pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT);
2850
2851 pSMB->req.AndXCommand = 0xFF;
2852 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
2853 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
2854
2855 if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
2856 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
2857
2858 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
2859 CAP_EXTENDED_SECURITY;
2860 if (ses->capabilities & CAP_UNICODE) {
2861 smb_buffer->Flags2 |= SMBFLG2_UNICODE;
2862 capabilities |= CAP_UNICODE;
2863 }
2864 if (ses->capabilities & CAP_STATUS32) {
2865 smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
2866 capabilities |= CAP_STATUS32;
2867 }
2868 if (ses->capabilities & CAP_DFS) {
2869 smb_buffer->Flags2 |= SMBFLG2_DFS;
2870 capabilities |= CAP_DFS;
2871 }
2872 pSMB->req.Capabilities = cpu_to_le32(capabilities);
2873
2874 bcc_ptr = (char *) &pSMB->req.SecurityBlob;
2875 SecurityBlob = (PNEGOTIATE_MESSAGE) bcc_ptr;
2876 strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8);
2877 SecurityBlob->MessageType = NtLmNegotiate;
2878 negotiate_flags =
2879 NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_OEM |
2880 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_NTLM |
2881 NTLMSSP_NEGOTIATE_56 |
2882 /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128;
2883 if (sign_CIFS_PDUs)
2884 negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN;
2885/* if (ntlmv2_support)
2886 negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/
2887 /* setup pointers to domain name and workstation name */
2888 bcc_ptr += SecurityBlobLength;
2889
2890 SecurityBlob->WorkstationName.Buffer = 0;
2891 SecurityBlob->WorkstationName.Length = 0;
2892 SecurityBlob->WorkstationName.MaximumLength = 0;
2893
2894 /* Domain not sent on first Sesssetup in NTLMSSP, instead it is sent
2895 along with username on auth request (ie the response to challenge) */
2896 SecurityBlob->DomainName.Buffer = 0;
2897 SecurityBlob->DomainName.Length = 0;
2898 SecurityBlob->DomainName.MaximumLength = 0;
2899 if (ses->capabilities & CAP_UNICODE) {
2900 if ((long) bcc_ptr % 2) {
2901 *bcc_ptr = 0;
2902 bcc_ptr++;
2903 } 2555 }
2904 2556
2905 bytes_returned = 2557 cFYI(1, ("Getting referral for: %s", full_path));
2906 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ", 2558 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2907 32, nls_codepage); 2559 cifs_sb->local_nls, &num_referrals, &referrals,
2908 bcc_ptr += 2 * bytes_returned; 2560 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
2909 bytes_returned = 2561 if (!rc && num_referrals > 0) {
2910 cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, 2562 char *fake_devname = NULL;
2911 nls_codepage); 2563
2912 bcc_ptr += 2 * bytes_returned; 2564 if (mount_data != mount_data_global)
2913 bcc_ptr += 2; /* null terminate Linux version */ 2565 kfree(mount_data);
2914 bytes_returned = 2566 mount_data = cifs_compose_mount_options(
2915 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, 2567 cifs_sb->mountdata, full_path + 1,
2916 64, nls_codepage); 2568 referrals, &fake_devname);
2917 bcc_ptr += 2 * bytes_returned; 2569 kfree(fake_devname);
2918 *(bcc_ptr + 1) = 0; 2570 free_dfs_info_array(referrals, num_referrals);
2919 *(bcc_ptr + 2) = 0; 2571
2920 bcc_ptr += 2; /* null terminate network opsys string */ 2572 if (tcon)
2921 *(bcc_ptr + 1) = 0; 2573 cifs_put_tcon(tcon);
2922 *(bcc_ptr + 2) = 0; 2574 else if (pSesInfo)
2923 bcc_ptr += 2; /* null domain */ 2575 cifs_put_smb_ses(pSesInfo);
2924 } else { /* ASCII */ 2576
2925 strcpy(bcc_ptr, "Linux version "); 2577 cleanup_volume_info(&volume_info);
2926 bcc_ptr += strlen("Linux version "); 2578 FreeXid(xid);
2927 strcpy(bcc_ptr, utsname()->release); 2579 kfree(full_path);
2928 bcc_ptr += strlen(utsname()->release) + 1; 2580 referral_walks_count++;
2929 strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); 2581 goto try_mount_again;
2930 bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
2931 bcc_ptr++; /* empty domain field */
2932 *bcc_ptr = 0;
2933 }
2934 SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags);
2935 pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
2936 count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
2937 smb_buffer->smb_buf_length += count;
2938 pSMB->req.ByteCount = cpu_to_le16(count);
2939
2940 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
2941 &bytes_returned, CIFS_LONG_OP);
2942
2943 if (smb_buffer_response->Status.CifsError ==
2944 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
2945 rc = 0;
2946
2947 if (rc) {
2948/* rc = map_smb_to_linux_error(smb_buffer_response); *//* done in SendReceive now */
2949 } else if ((smb_buffer_response->WordCount == 3)
2950 || (smb_buffer_response->WordCount == 4)) {
2951 __u16 action = le16_to_cpu(pSMBr->resp.Action);
2952 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
2953
2954 if (action & GUEST_LOGIN)
2955 cFYI(1, ("Guest login"));
2956 /* Do we want to set anything in SesInfo struct when guest login? */
2957
2958 bcc_ptr = pByteArea(smb_buffer_response);
2959 /* response can have either 3 or 4 word count - Samba sends 3 */
2960
2961 SecurityBlob2 = (PCHALLENGE_MESSAGE) bcc_ptr;
2962 if (SecurityBlob2->MessageType != NtLmChallenge) {
2963 cFYI(1, ("Unexpected NTLMSSP message type received %d",
2964 SecurityBlob2->MessageType));
2965 } else if (ses) {
2966 ses->Suid = smb_buffer_response->Uid; /* UID left in le format */
2967 cFYI(1, ("UID = %d", ses->Suid));
2968 if ((pSMBr->resp.hdr.WordCount == 3)
2969 || ((pSMBr->resp.hdr.WordCount == 4)
2970 && (blob_len <
2971 pSMBr->resp.ByteCount))) {
2972
2973 if (pSMBr->resp.hdr.WordCount == 4) {
2974 bcc_ptr += blob_len;
2975 cFYI(1, ("Security Blob Length %d",
2976 blob_len));
2977 }
2978
2979 cFYI(1, ("NTLMSSP Challenge rcvd"));
2980
2981 memcpy(ses->server->cryptKey,
2982 SecurityBlob2->Challenge,
2983 CIFS_CRYPTO_KEY_SIZE);
2984 if (SecurityBlob2->NegotiateFlags &
2985 cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2))
2986 *pNTLMv2_flag = true;
2987
2988 if ((SecurityBlob2->NegotiateFlags &
2989 cpu_to_le32(NTLMSSP_NEGOTIATE_ALWAYS_SIGN))
2990 || (sign_CIFS_PDUs > 1))
2991 ses->server->secMode |=
2992 SECMODE_SIGN_REQUIRED;
2993 if ((SecurityBlob2->NegotiateFlags &
2994 cpu_to_le32(NTLMSSP_NEGOTIATE_SIGN)) && (sign_CIFS_PDUs))
2995 ses->server->secMode |=
2996 SECMODE_SIGN_ENABLED;
2997
2998 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
2999 if ((long) (bcc_ptr) % 2) {
3000 remaining_words =
3001 (BCC(smb_buffer_response)
3002 - 1) / 2;
3003 /* Must word align unicode strings */
3004 bcc_ptr++;
3005 } else {
3006 remaining_words =
3007 BCC
3008 (smb_buffer_response) / 2;
3009 }
3010 len =
3011 UniStrnlen((wchar_t *) bcc_ptr,
3012 remaining_words - 1);
3013/* We look for obvious messed up bcc or strings in response so we do not go off
3014 the end since (at least) WIN2K and Windows XP have a major bug in not null
3015 terminating last Unicode string in response */
3016 if (ses->serverOS)
3017 kfree(ses->serverOS);
3018 ses->serverOS =
3019 kzalloc(2 * (len + 1), GFP_KERNEL);
3020 cifs_strfromUCS_le(ses->serverOS,
3021 (__le16 *)
3022 bcc_ptr, len,
3023 nls_codepage);
3024 bcc_ptr += 2 * (len + 1);
3025 remaining_words -= len + 1;
3026 ses->serverOS[2 * len] = 0;
3027 ses->serverOS[1 + (2 * len)] = 0;
3028 if (remaining_words > 0) {
3029 len = UniStrnlen((wchar_t *)
3030 bcc_ptr,
3031 remaining_words
3032 - 1);
3033 kfree(ses->serverNOS);
3034 ses->serverNOS =
3035 kzalloc(2 * (len + 1),
3036 GFP_KERNEL);
3037 cifs_strfromUCS_le(ses->
3038 serverNOS,
3039 (__le16 *)
3040 bcc_ptr,
3041 len,
3042 nls_codepage);
3043 bcc_ptr += 2 * (len + 1);
3044 ses->serverNOS[2 * len] = 0;
3045 ses->serverNOS[1 +
3046 (2 * len)] = 0;
3047 remaining_words -= len + 1;
3048 if (remaining_words > 0) {
3049 len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
3050 /* last string not always null terminated
3051 (for e.g. for Windows XP & 2000) */
3052 kfree(ses->serverDomain);
3053 ses->serverDomain =
3054 kzalloc(2 *
3055 (len +
3056 1),
3057 GFP_KERNEL);
3058 cifs_strfromUCS_le
3059 (ses->serverDomain,
3060 (__le16 *)bcc_ptr,
3061 len, nls_codepage);
3062 bcc_ptr +=
3063 2 * (len + 1);
3064 ses->serverDomain[2*len]
3065 = 0;
3066 ses->serverDomain
3067 [1 + (2 * len)]
3068 = 0;
3069 } /* else no more room so create dummy domain string */
3070 else {
3071 kfree(ses->serverDomain);
3072 ses->serverDomain =
3073 kzalloc(2,
3074 GFP_KERNEL);
3075 }
3076 } else { /* no room so create dummy domain and NOS string */
3077 kfree(ses->serverDomain);
3078 ses->serverDomain =
3079 kzalloc(2, GFP_KERNEL);
3080 kfree(ses->serverNOS);
3081 ses->serverNOS =
3082 kzalloc(2, GFP_KERNEL);
3083 }
3084 } else { /* ASCII */
3085 len = strnlen(bcc_ptr, 1024);
3086 if (((long) bcc_ptr + len) - (long)
3087 pByteArea(smb_buffer_response)
3088 <= BCC(smb_buffer_response)) {
3089 if (ses->serverOS)
3090 kfree(ses->serverOS);
3091 ses->serverOS =
3092 kzalloc(len + 1,
3093 GFP_KERNEL);
3094 strncpy(ses->serverOS,
3095 bcc_ptr, len);
3096
3097 bcc_ptr += len;
3098 bcc_ptr[0] = 0; /* null terminate string */
3099 bcc_ptr++;
3100
3101 len = strnlen(bcc_ptr, 1024);
3102 kfree(ses->serverNOS);
3103 ses->serverNOS =
3104 kzalloc(len + 1,
3105 GFP_KERNEL);
3106 strncpy(ses->serverNOS, bcc_ptr, len);
3107 bcc_ptr += len;
3108 bcc_ptr[0] = 0;
3109 bcc_ptr++;
3110
3111 len = strnlen(bcc_ptr, 1024);
3112 kfree(ses->serverDomain);
3113 ses->serverDomain =
3114 kzalloc(len + 1,
3115 GFP_KERNEL);
3116 strncpy(ses->serverDomain,
3117 bcc_ptr, len);
3118 bcc_ptr += len;
3119 bcc_ptr[0] = 0;
3120 bcc_ptr++;
3121 } else
3122 cFYI(1,
3123 ("field of length %d "
3124 "extends beyond end of smb",
3125 len));
3126 }
3127 } else {
3128 cERROR(1, ("Security Blob Length extends beyond"
3129 " end of SMB"));
3130 }
3131 } else {
3132 cERROR(1, ("No session structure passed in."));
3133 } 2582 }
3134 } else { 2583#else /* No DFS support, return error on mount */
3135 cERROR(1, ("Invalid Word count %d:", 2584 rc = -EOPNOTSUPP;
3136 smb_buffer_response->WordCount)); 2585#endif
3137 rc = -EIO;
3138 }
3139
3140 cifs_buf_release(smb_buffer);
3141
3142 return rc;
3143}
3144static int
3145CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
3146 char *ntlm_session_key, bool ntlmv2_flag,
3147 const struct nls_table *nls_codepage)
3148{
3149 struct smb_hdr *smb_buffer;
3150 struct smb_hdr *smb_buffer_response;
3151 SESSION_SETUP_ANDX *pSMB;
3152 SESSION_SETUP_ANDX *pSMBr;
3153 char *bcc_ptr;
3154 char *user;
3155 char *domain;
3156 int rc = 0;
3157 int remaining_words = 0;
3158 int bytes_returned = 0;
3159 int len;
3160 int SecurityBlobLength = sizeof(AUTHENTICATE_MESSAGE);
3161 PAUTHENTICATE_MESSAGE SecurityBlob;
3162 __u32 negotiate_flags, capabilities;
3163 __u16 count;
3164
3165 cFYI(1, ("In NTLMSSPSessSetup (Authenticate)"));
3166 if (ses == NULL)
3167 return -EINVAL;
3168 user = ses->userName;
3169 domain = ses->domainName;
3170 smb_buffer = cifs_buf_get();
3171 if (smb_buffer == NULL) {
3172 return -ENOMEM;
3173 }
3174 smb_buffer_response = smb_buffer;
3175 pSMB = (SESSION_SETUP_ANDX *)smb_buffer;
3176 pSMBr = (SESSION_SETUP_ANDX *)smb_buffer_response;
3177
3178 /* send SMBsessionSetup here */
3179 header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
3180 NULL /* no tCon exists yet */ , 12 /* wct */ );
3181
3182 smb_buffer->Mid = GetNextMid(ses->server);
3183 pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT);
3184 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
3185 pSMB->req.AndXCommand = 0xFF;
3186 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
3187 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
3188
3189 pSMB->req.hdr.Uid = ses->Suid;
3190
3191 if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
3192 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
3193
3194 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
3195 CAP_EXTENDED_SECURITY;
3196 if (ses->capabilities & CAP_UNICODE) {
3197 smb_buffer->Flags2 |= SMBFLG2_UNICODE;
3198 capabilities |= CAP_UNICODE;
3199 }
3200 if (ses->capabilities & CAP_STATUS32) {
3201 smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
3202 capabilities |= CAP_STATUS32;
3203 } 2586 }
3204 if (ses->capabilities & CAP_DFS) {
3205 smb_buffer->Flags2 |= SMBFLG2_DFS;
3206 capabilities |= CAP_DFS;
3207 }
3208 pSMB->req.Capabilities = cpu_to_le32(capabilities);
3209
3210 bcc_ptr = (char *)&pSMB->req.SecurityBlob;
3211 SecurityBlob = (PAUTHENTICATE_MESSAGE)bcc_ptr;
3212 strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8);
3213 SecurityBlob->MessageType = NtLmAuthenticate;
3214 bcc_ptr += SecurityBlobLength;
3215 negotiate_flags = NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_REQUEST_TARGET |
3216 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_TARGET_INFO |
3217 0x80000000 | NTLMSSP_NEGOTIATE_128;
3218 if (sign_CIFS_PDUs)
3219 negotiate_flags |= /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN |*/ NTLMSSP_NEGOTIATE_SIGN;
3220 if (ntlmv2_flag)
3221 negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;
3222
3223/* setup pointers to domain name and workstation name */
3224
3225 SecurityBlob->WorkstationName.Buffer = 0;
3226 SecurityBlob->WorkstationName.Length = 0;
3227 SecurityBlob->WorkstationName.MaximumLength = 0;
3228 SecurityBlob->SessionKey.Length = 0;
3229 SecurityBlob->SessionKey.MaximumLength = 0;
3230 SecurityBlob->SessionKey.Buffer = 0;
3231
3232 SecurityBlob->LmChallengeResponse.Length = 0;
3233 SecurityBlob->LmChallengeResponse.MaximumLength = 0;
3234 SecurityBlob->LmChallengeResponse.Buffer = 0;
3235
3236 SecurityBlob->NtChallengeResponse.Length =
3237 cpu_to_le16(CIFS_SESS_KEY_SIZE);
3238 SecurityBlob->NtChallengeResponse.MaximumLength =
3239 cpu_to_le16(CIFS_SESS_KEY_SIZE);
3240 memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE);
3241 SecurityBlob->NtChallengeResponse.Buffer =
3242 cpu_to_le32(SecurityBlobLength);
3243 SecurityBlobLength += CIFS_SESS_KEY_SIZE;
3244 bcc_ptr += CIFS_SESS_KEY_SIZE;
3245 2587
3246 if (ses->capabilities & CAP_UNICODE) { 2588mount_fail_check:
3247 if (domain == NULL) { 2589 /* on error free sesinfo and tcon struct if needed */
3248 SecurityBlob->DomainName.Buffer = 0;
3249 SecurityBlob->DomainName.Length = 0;
3250 SecurityBlob->DomainName.MaximumLength = 0;
3251 } else {
3252 __u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
3253 nls_codepage);
3254 ln *= 2;
3255 SecurityBlob->DomainName.MaximumLength =
3256 cpu_to_le16(ln);
3257 SecurityBlob->DomainName.Buffer =
3258 cpu_to_le32(SecurityBlobLength);
3259 bcc_ptr += ln;
3260 SecurityBlobLength += ln;
3261 SecurityBlob->DomainName.Length = cpu_to_le16(ln);
3262 }
3263 if (user == NULL) {
3264 SecurityBlob->UserName.Buffer = 0;
3265 SecurityBlob->UserName.Length = 0;
3266 SecurityBlob->UserName.MaximumLength = 0;
3267 } else {
3268 __u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, user, 64,
3269 nls_codepage);
3270 ln *= 2;
3271 SecurityBlob->UserName.MaximumLength =
3272 cpu_to_le16(ln);
3273 SecurityBlob->UserName.Buffer =
3274 cpu_to_le32(SecurityBlobLength);
3275 bcc_ptr += ln;
3276 SecurityBlobLength += ln;
3277 SecurityBlob->UserName.Length = cpu_to_le16(ln);
3278 }
3279
3280 /* SecurityBlob->WorkstationName.Length =
3281 cifs_strtoUCS((__le16 *) bcc_ptr, "AMACHINE",64, nls_codepage);
3282 SecurityBlob->WorkstationName.Length *= 2;
3283 SecurityBlob->WorkstationName.MaximumLength =
3284 cpu_to_le16(SecurityBlob->WorkstationName.Length);
3285 SecurityBlob->WorkstationName.Buffer =
3286 cpu_to_le32(SecurityBlobLength);
3287 bcc_ptr += SecurityBlob->WorkstationName.Length;
3288 SecurityBlobLength += SecurityBlob->WorkstationName.Length;
3289 SecurityBlob->WorkstationName.Length =
3290 cpu_to_le16(SecurityBlob->WorkstationName.Length); */
3291
3292 if ((long) bcc_ptr % 2) {
3293 *bcc_ptr = 0;
3294 bcc_ptr++;
3295 }
3296 bytes_returned =
3297 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
3298 32, nls_codepage);
3299 bcc_ptr += 2 * bytes_returned;
3300 bytes_returned =
3301 cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
3302 nls_codepage);
3303 bcc_ptr += 2 * bytes_returned;
3304 bcc_ptr += 2; /* null term version string */
3305 bytes_returned =
3306 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
3307 64, nls_codepage);
3308 bcc_ptr += 2 * bytes_returned;
3309 *(bcc_ptr + 1) = 0;
3310 *(bcc_ptr + 2) = 0;
3311 bcc_ptr += 2; /* null terminate network opsys string */
3312 *(bcc_ptr + 1) = 0;
3313 *(bcc_ptr + 2) = 0;
3314 bcc_ptr += 2; /* null domain */
3315 } else { /* ASCII */
3316 if (domain == NULL) {
3317 SecurityBlob->DomainName.Buffer = 0;
3318 SecurityBlob->DomainName.Length = 0;
3319 SecurityBlob->DomainName.MaximumLength = 0;
3320 } else {
3321 __u16 ln;
3322 negotiate_flags |= NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED;
3323 strncpy(bcc_ptr, domain, 63);
3324 ln = strnlen(domain, 64);
3325 SecurityBlob->DomainName.MaximumLength =
3326 cpu_to_le16(ln);
3327 SecurityBlob->DomainName.Buffer =
3328 cpu_to_le32(SecurityBlobLength);
3329 bcc_ptr += ln;
3330 SecurityBlobLength += ln;
3331 SecurityBlob->DomainName.Length = cpu_to_le16(ln);
3332 }
3333 if (user == NULL) {
3334 SecurityBlob->UserName.Buffer = 0;
3335 SecurityBlob->UserName.Length = 0;
3336 SecurityBlob->UserName.MaximumLength = 0;
3337 } else {
3338 __u16 ln;
3339 strncpy(bcc_ptr, user, 63);
3340 ln = strnlen(user, 64);
3341 SecurityBlob->UserName.MaximumLength = cpu_to_le16(ln);
3342 SecurityBlob->UserName.Buffer =
3343 cpu_to_le32(SecurityBlobLength);
3344 bcc_ptr += ln;
3345 SecurityBlobLength += ln;
3346 SecurityBlob->UserName.Length = cpu_to_le16(ln);
3347 }
3348 /* BB fill in our workstation name if known BB */
3349
3350 strcpy(bcc_ptr, "Linux version ");
3351 bcc_ptr += strlen("Linux version ");
3352 strcpy(bcc_ptr, utsname()->release);
3353 bcc_ptr += strlen(utsname()->release) + 1;
3354 strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
3355 bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
3356 bcc_ptr++; /* null domain */
3357 *bcc_ptr = 0;
3358 }
3359 SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags);
3360 pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
3361 count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
3362 smb_buffer->smb_buf_length += count;
3363 pSMB->req.ByteCount = cpu_to_le16(count);
3364
3365 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
3366 &bytes_returned, CIFS_LONG_OP);
3367	if (rc) {
3368/* rc = map_smb_to_linux_error(smb_buffer_response) done in SendReceive now */
3369	} else if ((smb_buffer_response->WordCount == 3) ||
3370		   (smb_buffer_response->WordCount == 4)) {
3371		__u16 action = le16_to_cpu(pSMBr->resp.Action);
3372		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
3373		if (action & GUEST_LOGIN)
3374			cFYI(1, ("Guest login")); /* BB Should we set anything
3375							 in SesInfo struct ? */
3376/*		if (SecurityBlob2->MessageType != NtLm??) {
3377			cFYI("Unexpected message type on auth response is %d"));
3378		} */
3379
3380 if (ses) {
3381 cFYI(1,
3382 ("Check challenge UID %d vs auth response UID %d",
3383 ses->Suid, smb_buffer_response->Uid));
3384 /* UID left in wire format */
3385 ses->Suid = smb_buffer_response->Uid;
3386 bcc_ptr = pByteArea(smb_buffer_response);
3387 /* response can have either 3 or 4 word count - Samba sends 3 */
3388 if ((pSMBr->resp.hdr.WordCount == 3)
3389 || ((pSMBr->resp.hdr.WordCount == 4)
3390 && (blob_len <
3391 pSMBr->resp.ByteCount))) {
3392 if (pSMBr->resp.hdr.WordCount == 4) {
3393 bcc_ptr +=
3394 blob_len;
3395 cFYI(1,
3396 ("Security Blob Length %d ",
3397 blob_len));
3398 }
3399
3400 cFYI(1,
3401 ("NTLMSSP response to Authenticate "));
3402
3403 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
3404 if ((long) (bcc_ptr) % 2) {
3405 remaining_words =
3406 (BCC(smb_buffer_response)
3407 - 1) / 2;
3408 bcc_ptr++; /* Unicode strings must be word aligned */
3409 } else {
3410 remaining_words = BCC(smb_buffer_response) / 2;
3411 }
3412 len = UniStrnlen((wchar_t *) bcc_ptr,
3413 remaining_words - 1);
3414/* We look for obvious messed up bcc or strings in response so we do not go off
3415 the end since (at least) WIN2K and Windows XP have a major bug in not null
3416 terminating last Unicode string in response */
3417 if (ses->serverOS)
3418 kfree(ses->serverOS);
3419 ses->serverOS =
3420 kzalloc(2 * (len + 1), GFP_KERNEL);
3421 cifs_strfromUCS_le(ses->serverOS,
3422 (__le16 *)
3423 bcc_ptr, len,
3424 nls_codepage);
3425 bcc_ptr += 2 * (len + 1);
3426 remaining_words -= len + 1;
3427 ses->serverOS[2 * len] = 0;
3428 ses->serverOS[1 + (2 * len)] = 0;
3429 if (remaining_words > 0) {
3430 len = UniStrnlen((wchar_t *)
3431 bcc_ptr,
3432 remaining_words
3433 - 1);
3434 kfree(ses->serverNOS);
3435 ses->serverNOS =
3436 kzalloc(2 * (len + 1),
3437 GFP_KERNEL);
3438 cifs_strfromUCS_le(ses->
3439 serverNOS,
3440 (__le16 *)
3441 bcc_ptr,
3442 len,
3443 nls_codepage);
3444 bcc_ptr += 2 * (len + 1);
3445 ses->serverNOS[2 * len] = 0;
3446 ses->serverNOS[1+(2*len)] = 0;
3447 remaining_words -= len + 1;
3448 if (remaining_words > 0) {
3449 len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
3450 /* last string not always null terminated (e.g. for Windows XP & 2000) */
3451 if (ses->serverDomain)
3452 kfree(ses->serverDomain);
3453 ses->serverDomain =
3454 kzalloc(2 *
3455 (len +
3456 1),
3457 GFP_KERNEL);
3458 cifs_strfromUCS_le
3459 (ses->
3460 serverDomain,
3461 (__le16 *)
3462 bcc_ptr, len,
3463 nls_codepage);
3464 bcc_ptr +=
3465 2 * (len + 1);
3466 ses->
3467 serverDomain[2
3468 * len]
3469 = 0;
3470 ses->
3471 serverDomain[1
3472 +
3473 (2
3474 *
3475 len)]
3476 = 0;
3477 } /* else no more room so create dummy domain string */
3478 else {
3479 if (ses->serverDomain)
3480 kfree(ses->serverDomain);
3481 ses->serverDomain = kzalloc(2,GFP_KERNEL);
3482 }
3483 } else { /* no room so create dummy domain and NOS string */
3484 if (ses->serverDomain)
3485 kfree(ses->serverDomain);
3486 ses->serverDomain = kzalloc(2, GFP_KERNEL);
3487 kfree(ses->serverNOS);
3488 ses->serverNOS = kzalloc(2, GFP_KERNEL);
3489 }
3490 } else { /* ASCII */
3491 len = strnlen(bcc_ptr, 1024);
3492 if (((long) bcc_ptr + len) -
3493 (long) pByteArea(smb_buffer_response)
3494 <= BCC(smb_buffer_response)) {
3495 if (ses->serverOS)
3496 kfree(ses->serverOS);
3497 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
3498 strncpy(ses->serverOS,bcc_ptr, len);
3499
3500 bcc_ptr += len;
3501 bcc_ptr[0] = 0; /* null terminate the string */
3502 bcc_ptr++;
3503
3504 len = strnlen(bcc_ptr, 1024);
3505 kfree(ses->serverNOS);
3506 ses->serverNOS = kzalloc(len+1,
3507 GFP_KERNEL);
3508 strncpy(ses->serverNOS,
3509 bcc_ptr, len);
3510 bcc_ptr += len;
3511 bcc_ptr[0] = 0;
3512 bcc_ptr++;
3513
3514 len = strnlen(bcc_ptr, 1024);
3515 if (ses->serverDomain)
3516 kfree(ses->serverDomain);
3517 ses->serverDomain =
3518 kzalloc(len+1,
3519 GFP_KERNEL);
3520 strncpy(ses->serverDomain,
3521 bcc_ptr, len);
3522 bcc_ptr += len;
3523 bcc_ptr[0] = 0;
3524 bcc_ptr++;
3525 } else
3526 cFYI(1, ("field of length %d "
3527 "extends beyond end of smb ",
3528 len));
3529 }
3530 } else {
3531 cERROR(1, ("Security Blob extends beyond end "
3532 "of SMB"));
3533 }
3534 } else {
3535 cERROR(1, ("No session structure passed in."));
3536 }
3537 } else {
3538 cERROR(1, ("Invalid Word count %d: ",
3539 smb_buffer_response->WordCount));
3540 rc = -EIO;
3541	}
3542
3543	cifs_buf_release(smb_buffer);
3544
3545	return rc;
3546}
3547
 2587
 2588mount_fail_check:
 2589	/* on error free sesinfo and tcon struct if needed */
 2590	if (rc) {
 2591		if (mount_data != mount_data_global)
 2592			kfree(mount_data);
 2593		/* If find_unc succeeded then rc == 0 so we cannot end */
 2594		/* up accidentally freeing someone else's tcon struct */
 2595		if (tcon)
 2596			cifs_put_tcon(tcon);
 2597		else if (pSesInfo)
 2598			cifs_put_smb_ses(pSesInfo);
 2599		else
 2600			cifs_put_tcp_session(srvTcp);
 2601		goto out;
 2602	}
 2603
 2604	/* volume_info->password is freed above when existing session found
 2605	(in which case it is not needed anymore) but when new session is created
 2606	the password ptr is put in the new session structure (in which case the
 2607	password will be freed at unmount time) */
 2608out:
 2609	/* zero out password before freeing */
 2610	cleanup_volume_info(&volume_info);
 2611	FreeXid(xid);
 2612	return rc;
 2613}
 2614
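The mount_fail_check block above releases only the newest object in the chain: cifs_put_tcon() drops its embedded session reference itself, which in turn drops the TCP session, so putting more than one of them would over-release. A minimal user-space sketch of that cascading-refcount idea, with all names hypothetical rather than the real CIFS API:

    #include <stdlib.h>

    /* Hypothetical stand-ins for TCP_Server_Info, cifsSesInfo and
     * cifsTconInfo, each holding one reference on its parent. */
    struct server  { int refs; };
    struct session { int refs; struct server *srv; };
    struct tcon    { int refs; struct session *ses; };

    static void put_server(struct server *s)
    {
        if (--s->refs == 0)
            free(s);
    }

    static void put_session(struct session *s)
    {
        if (--s->refs == 0) {
            put_server(s->srv);   /* dropping a child drops its parent */
            free(s);
        }
    }

    static void put_tcon(struct tcon *t)
    {
        if (--t->refs == 0) {
            put_session(t->ses);
            free(t);
        }
    }

    /* Error path: put only the most derived object that exists; the
     * puts beneath it happen exactly once via the cascade above. */
    static void mount_fail(struct tcon *t, struct session *ses,
                           struct server *srv)
    {
        if (t)
            put_tcon(t);
        else if (ses)
            put_session(ses);
        else if (srv)
            put_server(srv);
    }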
@@ -3556,7 +2623,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3556 TCONX_RSP *pSMBr; 2623 TCONX_RSP *pSMBr;
3557 unsigned char *bcc_ptr; 2624 unsigned char *bcc_ptr;
3558 int rc = 0; 2625 int rc = 0;
3559 int length; 2626 int length, bytes_left;
3560 __u16 count; 2627 __u16 count;
3561 2628
3562 if (ses == NULL) 2629 if (ses == NULL)
@@ -3644,14 +2711,22 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3644 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 2711 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
3645 CIFS_STD_OP); 2712 CIFS_STD_OP);
3646 2713
3647 /* if (rc) rc = map_smb_to_linux_error(smb_buffer_response); */
3648 /* above now done in SendReceive */ 2714 /* above now done in SendReceive */
3649 if ((rc == 0) && (tcon != NULL)) { 2715 if ((rc == 0) && (tcon != NULL)) {
2716 bool is_unicode;
2717
3650 tcon->tidStatus = CifsGood; 2718 tcon->tidStatus = CifsGood;
3651 tcon->need_reconnect = false; 2719 tcon->need_reconnect = false;
3652 tcon->tid = smb_buffer_response->Tid; 2720 tcon->tid = smb_buffer_response->Tid;
3653 bcc_ptr = pByteArea(smb_buffer_response); 2721 bcc_ptr = pByteArea(smb_buffer_response);
3654 length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2); 2722 bytes_left = BCC(smb_buffer_response);
2723 length = strnlen(bcc_ptr, bytes_left - 2);
2724 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
2725 is_unicode = true;
2726 else
2727 is_unicode = false;
2728
2729
3655 /* skip service field (NB: this field is always ASCII) */ 2730 /* skip service field (NB: this field is always ASCII) */
3656 if (length == 3) { 2731 if (length == 3) {
3657 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2732 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
@@ -3666,40 +2741,16 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3666 } 2741 }
3667 } 2742 }
3668 bcc_ptr += length + 1; 2743 bcc_ptr += length + 1;
2744 bytes_left -= (length + 1);
3669 strncpy(tcon->treeName, tree, MAX_TREE_SIZE); 2745 strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
3670 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) { 2746
3671 length = UniStrnlen((wchar_t *) bcc_ptr, 512); 2747 /* mostly informational -- no need to fail on error here */
3672 if ((bcc_ptr + (2 * length)) - 2748 tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
3673 pByteArea(smb_buffer_response) <= 2749 bytes_left, is_unicode,
3674 BCC(smb_buffer_response)) { 2750 nls_codepage);
3675 kfree(tcon->nativeFileSystem); 2751
3676 tcon->nativeFileSystem = 2752 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem));
3677 kzalloc(2*(length + 1), GFP_KERNEL); 2753
3678 if (tcon->nativeFileSystem)
3679 cifs_strfromUCS_le(
3680 tcon->nativeFileSystem,
3681 (__le16 *) bcc_ptr,
3682 length, nls_codepage);
3683 bcc_ptr += 2 * length;
3684 bcc_ptr[0] = 0; /* null terminate the string */
3685 bcc_ptr[1] = 0;
3686 bcc_ptr += 2;
3687 }
3688 /* else do not bother copying these information fields*/
3689 } else {
3690 length = strnlen(bcc_ptr, 1024);
3691 if ((bcc_ptr + length) -
3692 pByteArea(smb_buffer_response) <=
3693 BCC(smb_buffer_response)) {
3694 kfree(tcon->nativeFileSystem);
3695 tcon->nativeFileSystem =
3696 kzalloc(length + 1, GFP_KERNEL);
3697 if (tcon->nativeFileSystem)
3698 strncpy(tcon->nativeFileSystem, bcc_ptr,
3699 length);
3700 }
3701 /* else do not bother copying these information fields*/
3702 }
3703 if ((smb_buffer_response->WordCount == 3) || 2754 if ((smb_buffer_response->WordCount == 3) ||
3704 (smb_buffer_response->WordCount == 7)) 2755 (smb_buffer_response->WordCount == 7))
3705 /* field is in same location */ 2756 /* field is in same location */
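The hunk above replaces two hand-rolled copy loops with a single cifs_strndup_from_ucs() call that bounds-checks, converts and allocates in one step; the real helper is added to cifs_unicode.c elsewhere in this series and converts through the mount's NLS codepage. A simplified user-space analogue, handling only the ASCII subset of UTF-16LE:

    #include <stdlib.h>
    #include <string.h>

    /* Duplicate a string from a wire buffer into a freshly allocated C
     * string. When is_unicode is set the source is little-endian UTF-16;
     * this sketch keeps just the low byte of each code unit. */
    static char *strndup_from_ucs(const unsigned char *src, size_t maxlen,
                                  int is_unicode)
    {
        size_t n, i;
        char *out;

        if (!is_unicode)
            return strndup((const char *)src, maxlen);

        n = 0;
        while (2 * n + 1 < maxlen && (src[2 * n] || src[2 * n + 1]))
            n++;   /* stop at the 16-bit NUL or the buffer end */

        out = malloc(n + 1);
        if (!out)
            return NULL;
        for (i = 0; i < n; i++)
            out[i] = (char)src[2 * i];
        out[n] = '\0';
        return out;
    }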
@@ -3738,8 +2789,6 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3738 struct nls_table *nls_info) 2789 struct nls_table *nls_info)
3739{ 2790{
3740 int rc = 0; 2791 int rc = 0;
3741 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
3742 bool ntlmv2_flag = false;
3743 int first_time = 0; 2792 int first_time = 0;
3744 struct TCP_Server_Info *server = pSesInfo->server; 2793 struct TCP_Server_Info *server = pSesInfo->server;
3745 2794
@@ -3771,83 +2820,19 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3771 pSesInfo->capabilities = server->capabilities; 2820 pSesInfo->capabilities = server->capabilities;
3772 if (linuxExtEnabled == 0) 2821 if (linuxExtEnabled == 0)
3773 pSesInfo->capabilities &= (~CAP_UNIX); 2822 pSesInfo->capabilities &= (~CAP_UNIX);
3774 /* pSesInfo->sequence_number = 0;*/ 2823
3775 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2824 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
3776 server->secMode, server->capabilities, server->timeAdj)); 2825 server->secMode, server->capabilities, server->timeAdj));
3777 2826
3778 if (experimEnabled < 2) 2827 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
3779 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
3780 else if (extended_security
3781 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3782 && (server->secType == NTLMSSP)) {
3783 rc = -EOPNOTSUPP;
3784 } else if (extended_security
3785 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3786 && (server->secType == RawNTLMSSP)) {
3787 cFYI(1, ("NTLMSSP sesssetup"));
3788 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
3789 nls_info);
3790 if (!rc) {
3791 if (ntlmv2_flag) {
3792 char *v2_response;
3793 cFYI(1, ("more secure NTLM ver2 hash"));
3794 if (CalcNTLMv2_partial_mac_key(pSesInfo,
3795 nls_info)) {
3796 rc = -ENOMEM;
3797 goto ss_err_exit;
3798 } else
3799 v2_response = kmalloc(16 + 64 /* blob*/,
3800 GFP_KERNEL);
3801 if (v2_response) {
3802 CalcNTLMv2_response(pSesInfo,
3803 v2_response);
3804 /* if (first_time)
3805 cifs_calculate_ntlmv2_mac_key */
3806 kfree(v2_response);
3807 /* BB Put dummy sig in SessSetup PDU? */
3808 } else {
3809 rc = -ENOMEM;
3810 goto ss_err_exit;
3811 }
3812
3813 } else {
3814 SMBNTencrypt(pSesInfo->password,
3815 server->cryptKey,
3816 ntlm_session_key);
3817
3818 if (first_time)
3819 cifs_calculate_mac_key(
3820 &server->mac_signing_key,
3821 ntlm_session_key,
3822 pSesInfo->password);
3823 }
3824 /* for better security the weaker lanman hash not sent
3825 in AuthSessSetup so we no longer calculate it */
3826
3827 rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo,
3828 ntlm_session_key,
3829 ntlmv2_flag,
3830 nls_info);
3831 }
3832 } else { /* old style NTLM 0.12 session setup */
3833 SMBNTencrypt(pSesInfo->password, server->cryptKey,
3834 ntlm_session_key);
3835
3836 if (first_time)
3837 cifs_calculate_mac_key(&server->mac_signing_key,
3838 ntlm_session_key,
3839 pSesInfo->password);
3840
3841 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
3842 }
3843 if (rc) { 2828 if (rc) {
3844 cERROR(1, ("Send error in SessSetup = %d", rc)); 2829 cERROR(1, ("Send error in SessSetup = %d", rc));
3845 } else { 2830 } else {
3846 cFYI(1, ("CIFS Session Established successfully")); 2831 cFYI(1, ("CIFS Session Established successfully"));
3847 spin_lock(&GlobalMid_Lock); 2832 spin_lock(&GlobalMid_Lock);
3848 pSesInfo->status = CifsGood; 2833 pSesInfo->status = CifsGood;
3849 pSesInfo->need_reconnect = false; 2834 pSesInfo->need_reconnect = false;
3850 spin_unlock(&GlobalMid_Lock); 2835 spin_unlock(&GlobalMid_Lock);
3851 } 2836 }
3852 2837
3853ss_err_exit: 2838ss_err_exit:
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 54dce78fbb73..11431ed72a7f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,12 +129,62 @@ cifs_bp_rename_retry:
129 return full_path; 129 return full_path;
130} 130}
131 131
132static void
133cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
134 struct cifsTconInfo *tcon, bool write_only)
135{
136 int oplock = 0;
137 struct cifsFileInfo *pCifsFile;
138 struct cifsInodeInfo *pCifsInode;
139
140 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
141
142 if (pCifsFile == NULL)
143 return;
144
145 if (oplockEnabled)
146 oplock = REQ_OPLOCK;
147
148 pCifsFile->netfid = fileHandle;
149 pCifsFile->pid = current->tgid;
150 pCifsFile->pInode = newinode;
151 pCifsFile->invalidHandle = false;
152 pCifsFile->closePend = false;
153 mutex_init(&pCifsFile->fh_mutex);
154 mutex_init(&pCifsFile->lock_mutex);
155 INIT_LIST_HEAD(&pCifsFile->llist);
156 atomic_set(&pCifsFile->wrtPending, 0);
157
158 /* set the following in open now
159 pCifsFile->pfile = file; */
160 write_lock(&GlobalSMBSeslock);
161 list_add(&pCifsFile->tlist, &tcon->openFileList);
162 pCifsInode = CIFS_I(newinode);
163 if (pCifsInode) {
164 /* if readable file instance put first in list*/
165 if (write_only)
166 list_add_tail(&pCifsFile->flist,
167 &pCifsInode->openFileList);
168 else
169 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
170
171 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
172 pCifsInode->clientCanCacheAll = true;
173 pCifsInode->clientCanCacheRead = true;
174 cFYI(1, ("Exclusive Oplock inode %p", newinode));
175 } else if ((oplock & 0xF) == OPLOCK_READ)
176 pCifsInode->clientCanCacheRead = true;
177 }
178 write_unlock(&GlobalSMBSeslock);
179}
180
132int cifs_posix_open(char *full_path, struct inode **pinode, 181int cifs_posix_open(char *full_path, struct inode **pinode,
133 struct super_block *sb, int mode, int oflags, 182 struct super_block *sb, int mode, int oflags,
134 int *poplock, __u16 *pnetfid, int xid) 183 int *poplock, __u16 *pnetfid, int xid)
135{ 184{
136 int rc; 185 int rc;
137 __u32 oplock; 186 __u32 oplock;
187 bool write_only = false;
138 FILE_UNIX_BASIC_INFO *presp_data; 188 FILE_UNIX_BASIC_INFO *presp_data;
139 __u32 posix_flags = 0; 189 __u32 posix_flags = 0;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 190 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -172,6 +222,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
172 if (oflags & O_DIRECT) 222 if (oflags & O_DIRECT)
173 posix_flags |= SMB_O_DIRECT; 223 posix_flags |= SMB_O_DIRECT;
174 224
225 if (!(oflags & FMODE_READ))
226 write_only = true;
175 227
176 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, 228 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
177 pnetfid, presp_data, &oplock, full_path, 229 pnetfid, presp_data, &oplock, full_path,
@@ -187,8 +239,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
187 if (!pinode) 239 if (!pinode)
188 goto posix_open_ret; /* caller does not need info */ 240 goto posix_open_ret; /* caller does not need info */
189 241
190 if (*pinode == NULL) 242 if (*pinode == NULL) {
191 *pinode = cifs_new_inode(sb, &presp_data->UniqueId); 243 __u64 unique_id = le64_to_cpu(presp_data->UniqueId);
244 *pinode = cifs_new_inode(sb, &unique_id);
245 }
192 /* else an inode was passed in. Update its info, don't create one */ 246 /* else an inode was passed in. Update its info, don't create one */
193 247
194 /* We do not need to close the file if new_inode fails since 248 /* We do not need to close the file if new_inode fails since
@@ -198,6 +252,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
198 252
199 posix_fill_in_inode(*pinode, presp_data, 1); 253 posix_fill_in_inode(*pinode, presp_data, 1);
200 254
255 cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
256
201posix_open_ret: 257posix_open_ret:
202 kfree(presp_data); 258 kfree(presp_data);
203 return rc; 259 return rc;
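The oplock handling at the end of cifs_fill_fileinfo() above is the usual mapping from the server-granted oplock level to local caching rights: an exclusive oplock lets the client cache both reads and writes, a level II oplock only reads. A sketch of that mapping in isolation; the enum values here are illustrative, not the on-wire constants:

    enum oplock { OPLOCK_NONE = 0, OPLOCK_EXCLUSIVE = 1, OPLOCK_READ = 2 };

    struct cache_rights {
        int can_cache_read;
        int can_cache_all;
    };

    static struct cache_rights oplock_to_cache(int oplock)
    {
        struct cache_rights r = { 0, 0 };

        switch (oplock & 0xF) {   /* the low nibble carries the level */
        case OPLOCK_EXCLUSIVE:    /* sole opener: cache reads and writes */
            r.can_cache_all = 1;
            /* fall through */
        case OPLOCK_READ:         /* shared read oplock: cache reads only */
            r.can_cache_read = 1;
            break;
        default:                  /* no oplock: ask the server every time */
            break;
        }
        return r;
    }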
@@ -225,6 +281,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
225 int create_options = CREATE_NOT_DIR; 281 int create_options = CREATE_NOT_DIR;
226 int oplock = 0; 282 int oplock = 0;
227 int oflags; 283 int oflags;
284 bool posix_create = false;
228 /* 285 /*
229 * BB below access is probably too much for mknod to request 286 * BB below access is probably too much for mknod to request
230 * but we have to do query and setpathinfo so requesting 287 * but we have to do query and setpathinfo so requesting
@@ -239,7 +296,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
239 char *full_path = NULL; 296 char *full_path = NULL;
240 FILE_ALL_INFO *buf = NULL; 297 FILE_ALL_INFO *buf = NULL;
241 struct inode *newinode = NULL; 298 struct inode *newinode = NULL;
242 struct cifsInodeInfo *pCifsInode;
243 int disposition = FILE_OVERWRITE_IF; 299 int disposition = FILE_OVERWRITE_IF;
244 bool write_only = false; 300 bool write_only = false;
245 301
@@ -273,11 +329,13 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
273	   negotiation. EREMOTE indicates DFS junction, which is not 329	   negotiation. EREMOTE indicates DFS junction, which is not
274 handled in posix open */ 330 handled in posix open */
275 331
276 if ((rc == 0) && (newinode == NULL)) 332 if (rc == 0) {
277 goto cifs_create_get_file_info; /* query inode info */ 333 posix_create = true;
278 else if (rc == 0) /* success, no need to query */ 334 if (newinode == NULL) /* query inode info */
279 goto cifs_create_set_dentry; 335 goto cifs_create_get_file_info;
280 else if ((rc != -EIO) && (rc != -EREMOTE) && 336 else /* success, no need to query */
337 goto cifs_create_set_dentry;
338 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
281 (rc != -EOPNOTSUPP)) /* path not found or net err */ 339 (rc != -EOPNOTSUPP)) /* path not found or net err */
282 goto cifs_create_out; 340 goto cifs_create_out;
283 /* else fallthrough to retry, using older open call, this is 341 /* else fallthrough to retry, using older open call, this is
@@ -409,45 +467,9 @@ cifs_create_set_dentry:
409 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 467 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
410 /* mknod case - do not leave file open */ 468 /* mknod case - do not leave file open */
411 CIFSSMBClose(xid, tcon, fileHandle); 469 CIFSSMBClose(xid, tcon, fileHandle);
412 } else if (newinode) { 470 } else if (!(posix_create) && (newinode)) {
413 struct cifsFileInfo *pCifsFile = 471 cifs_fill_fileinfo(newinode, fileHandle,
414 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 472 cifs_sb->tcon, write_only);
415
416 if (pCifsFile == NULL)
417 goto cifs_create_out;
418 pCifsFile->netfid = fileHandle;
419 pCifsFile->pid = current->tgid;
420 pCifsFile->pInode = newinode;
421 pCifsFile->invalidHandle = false;
422 pCifsFile->closePend = false;
423 init_MUTEX(&pCifsFile->fh_sem);
424 mutex_init(&pCifsFile->lock_mutex);
425 INIT_LIST_HEAD(&pCifsFile->llist);
426 atomic_set(&pCifsFile->wrtPending, 0);
427
428 /* set the following in open now
429 pCifsFile->pfile = file; */
430 write_lock(&GlobalSMBSeslock);
431 list_add(&pCifsFile->tlist, &tcon->openFileList);
432 pCifsInode = CIFS_I(newinode);
433 if (pCifsInode) {
434 /* if readable file instance put first in list*/
435 if (write_only) {
436 list_add_tail(&pCifsFile->flist,
437 &pCifsInode->openFileList);
438 } else {
439 list_add(&pCifsFile->flist,
440 &pCifsInode->openFileList);
441 }
442 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
443 pCifsInode->clientCanCacheAll = true;
444 pCifsInode->clientCanCacheRead = true;
445 cFYI(1, ("Exclusive Oplock inode %p",
446 newinode));
447 } else if ((oplock & 0xF) == OPLOCK_READ)
448 pCifsInode->clientCanCacheRead = true;
449 }
450 write_unlock(&GlobalSMBSeslock);
451 } 473 }
452cifs_create_out: 474cifs_create_out:
453 kfree(buf); 475 kfree(buf);
@@ -580,17 +602,21 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
580 return rc; 602 return rc;
581} 603}
582 604
583
584struct dentry * 605struct dentry *
585cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, 606cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
586 struct nameidata *nd) 607 struct nameidata *nd)
587{ 608{
588 int xid; 609 int xid;
589 int rc = 0; /* to get around spurious gcc warning, set to zero here */ 610 int rc = 0; /* to get around spurious gcc warning, set to zero here */
611 int oplock = 0;
612 int mode;
613 __u16 fileHandle = 0;
614 bool posix_open = false;
590 struct cifs_sb_info *cifs_sb; 615 struct cifs_sb_info *cifs_sb;
591 struct cifsTconInfo *pTcon; 616 struct cifsTconInfo *pTcon;
592 struct inode *newInode = NULL; 617 struct inode *newInode = NULL;
593 char *full_path = NULL; 618 char *full_path = NULL;
619 struct file *filp;
594 620
595 xid = GetXid(); 621 xid = GetXid();
596 622
@@ -632,12 +658,37 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 } 658 }
633 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 659 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
634 660
635 if (pTcon->unix_ext) 661 if (pTcon->unix_ext) {
636 rc = cifs_get_inode_info_unix(&newInode, full_path, 662 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
637 parent_dir_inode->i_sb, xid); 663 (nd->flags & LOOKUP_OPEN)) {
638 else 664 if (!((nd->intent.open.flags & O_CREAT) &&
665 (nd->intent.open.flags & O_EXCL))) {
666 mode = nd->intent.open.create_mode &
667 ~current_umask();
668 rc = cifs_posix_open(full_path, &newInode,
669 parent_dir_inode->i_sb, mode,
670 nd->intent.open.flags, &oplock,
671 &fileHandle, xid);
672 /*
673 * This code works around a bug in
674 * samba posix open in samba versions 3.3.1
675 * and earlier where create works
676 * but open fails with invalid parameter.
 677				 * If either of these error codes is
678 * returned, follow the normal lookup.
679 * Otherwise, the error during posix open
680 * is handled.
681 */
682 if ((rc != -EINVAL) && (rc != -EOPNOTSUPP))
683 posix_open = true;
684 }
685 }
686 if (!posix_open)
687 rc = cifs_get_inode_info_unix(&newInode, full_path,
688 parent_dir_inode->i_sb, xid);
689 } else
639 rc = cifs_get_inode_info(&newInode, full_path, NULL, 690 rc = cifs_get_inode_info(&newInode, full_path, NULL,
640 parent_dir_inode->i_sb, xid, NULL); 691 parent_dir_inode->i_sb, xid, NULL);
641 692
642 if ((rc == 0) && (newInode != NULL)) { 693 if ((rc == 0) && (newInode != NULL)) {
643 if (pTcon->nocase) 694 if (pTcon->nocase)
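The workaround comment above amounts to a three-way split on the posix open result: EINVAL or EOPNOTSUPP mean the server cannot do this, so fall back to an ordinary lookup; success means the open already produced the inode; anything else is a genuine error. A compact sketch of that decision, where posix_open() and regular_lookup() are hypothetical stand-ins for cifs_posix_open() and cifs_get_inode_info_unix():

    #include <errno.h>

    int posix_open(const char *path);      /* hypothetical */
    int regular_lookup(const char *path);  /* hypothetical */

    /* Returns 0 on success, a negative errno otherwise. */
    static int lookup_with_posix_open(const char *path)
    {
        int rc = posix_open(path);

        /* Samba 3.3.1 and earlier can create the file but then fail
         * the open with EINVAL; treat that, and plain "not supported",
         * as a cue to retry the old way. */
        if (rc == -EINVAL || rc == -EOPNOTSUPP)
            return regular_lookup(path);

        return rc;   /* success, or a real posix-open error */
    }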
@@ -645,7 +696,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
645 else 696 else
646 direntry->d_op = &cifs_dentry_ops; 697 direntry->d_op = &cifs_dentry_ops;
647 d_add(direntry, newInode); 698 d_add(direntry, newInode);
648 699 if (posix_open)
700 filp = lookup_instantiate_filp(nd, direntry, NULL);
649 /* since paths are not looked up by component - the parent 701 /* since paths are not looked up by component - the parent
650 directories are presumed to be good here */ 702 directories are presumed to be good here */
651 renew_parental_timestamps(direntry); 703 renew_parental_timestamps(direntry);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1e0c1bd8f2e4..df4a306f697e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -78,7 +78,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
78 } 78 }
79 79
80 key->type_data.x[0] = datalen; 80 key->type_data.x[0] = datalen;
81 rcu_assign_pointer(key->payload.data, ip); 81 key->payload.data = ip;
82 82
83 return rc; 83 return rc;
84} 84}
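Dropping rcu_assign_pointer() here is correct because the payload is being set during key instantiation, before the key is visible to any RCU reader; the publication barrier only matters at the moment a pointer becomes reachable by other threads. A user-space sketch of the distinction using C11 atomics:

    #include <stdatomic.h>

    struct payload { int datalen; char data[64]; };

    /* Not yet published: no other thread can reach p, so a plain store
     * is enough (this is the dns_resolver_instantiate() situation). */
    static void fill(struct payload *p, int len)
    {
        p->datalen = len;
    }

    static _Atomic(struct payload *) published;

    /* Publication is where ordering matters; the release store plays
     * the role rcu_assign_pointer() plays in the kernel. */
    static void publish(struct payload *p)
    {
        atomic_store_explicit(&published, p, memory_order_release);
    }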
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 81747acca4c4..38c06f826575 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -46,7 +46,7 @@ static inline struct cifsFileInfo *cifs_init_private(
46 memset(private_data, 0, sizeof(struct cifsFileInfo)); 46 memset(private_data, 0, sizeof(struct cifsFileInfo));
47 private_data->netfid = netfid; 47 private_data->netfid = netfid;
48 private_data->pid = current->tgid; 48 private_data->pid = current->tgid;
49 init_MUTEX(&private_data->fh_sem); 49 mutex_init(&private_data->fh_mutex);
50 mutex_init(&private_data->lock_mutex); 50 mutex_init(&private_data->lock_mutex);
51 INIT_LIST_HEAD(&private_data->llist); 51 INIT_LIST_HEAD(&private_data->llist);
52 private_data->pfile = file; /* needed for writepage */ 52 private_data->pfile = file; /* needed for writepage */
@@ -129,15 +129,12 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
129 struct file *file, struct cifsInodeInfo *pCifsInode, 129 struct file *file, struct cifsInodeInfo *pCifsInode,
130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) 130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
131{ 131{
132 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
133/* struct timespec temp; */ /* BB REMOVEME BB */
134 132
135 file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 133 file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
136 if (file->private_data == NULL) 134 if (file->private_data == NULL)
137 return -ENOMEM; 135 return -ENOMEM;
138 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); 136 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
139 write_lock(&GlobalSMBSeslock); 137 write_lock(&GlobalSMBSeslock);
140 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
141 138
142 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 139 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
143 if (pCifsInode == NULL) { 140 if (pCifsInode == NULL) {
@@ -145,17 +142,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
145 return -EINVAL; 142 return -EINVAL;
146 } 143 }
147 144
148 /* want handles we can use to read with first
149 in the list so we do not have to walk the
150 list to search for one in write_begin */
151 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
152 list_add_tail(&pCifsFile->flist,
153 &pCifsInode->openFileList);
154 } else {
155 list_add(&pCifsFile->flist,
156 &pCifsInode->openFileList);
157 }
158
159 if (pCifsInode->clientCanCacheRead) { 145 if (pCifsInode->clientCanCacheRead) {
160 /* we have the inode open somewhere else 146 /* we have the inode open somewhere else
161 no need to discard cache data */ 147 no need to discard cache data */
@@ -284,35 +270,32 @@ int cifs_open(struct inode *inode, struct file *file)
284 cifs_sb = CIFS_SB(inode->i_sb); 270 cifs_sb = CIFS_SB(inode->i_sb);
285 tcon = cifs_sb->tcon; 271 tcon = cifs_sb->tcon;
286 272
287 if (file->f_flags & O_CREAT) { 273 /* search inode for this file and fill in file->private_data */
288 /* search inode for this file and fill in file->private_data */ 274 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
289 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 275 read_lock(&GlobalSMBSeslock);
290 read_lock(&GlobalSMBSeslock); 276 list_for_each(tmp, &pCifsInode->openFileList) {
291 list_for_each(tmp, &pCifsInode->openFileList) { 277 pCifsFile = list_entry(tmp, struct cifsFileInfo,
292 pCifsFile = list_entry(tmp, struct cifsFileInfo, 278 flist);
293 flist); 279 if ((pCifsFile->pfile == NULL) &&
294 if ((pCifsFile->pfile == NULL) && 280 (pCifsFile->pid == current->tgid)) {
295 (pCifsFile->pid == current->tgid)) { 281 /* mode set in cifs_create */
296 /* mode set in cifs_create */ 282
297 283 /* needed for writepage */
298 /* needed for writepage */ 284 pCifsFile->pfile = file;
299 pCifsFile->pfile = file; 285
300 286 file->private_data = pCifsFile;
301 file->private_data = pCifsFile; 287 break;
302 break;
303 }
304 }
305 read_unlock(&GlobalSMBSeslock);
306 if (file->private_data != NULL) {
307 rc = 0;
308 FreeXid(xid);
309 return rc;
310 } else {
311 if (file->f_flags & O_EXCL)
312 cERROR(1, ("could not find file instance for "
313 "new file %p", file));
314 } 288 }
315 } 289 }
290 read_unlock(&GlobalSMBSeslock);
291
292 if (file->private_data != NULL) {
293 rc = 0;
294 FreeXid(xid);
295 return rc;
296 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
297 cERROR(1, ("could not find file instance for "
298 "new file %p", file));
316 299
317 full_path = build_path_from_dentry(file->f_path.dentry); 300 full_path = build_path_from_dentry(file->f_path.dentry);
318 if (full_path == NULL) { 301 if (full_path == NULL) {
@@ -500,9 +483,9 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
500 return -EBADF; 483 return -EBADF;
501 484
502 xid = GetXid(); 485 xid = GetXid();
503	down(&pCifsFile->fh_sem); 486	mutex_lock(&pCifsFile->fh_mutex);
504 if (!pCifsFile->invalidHandle) { 487 if (!pCifsFile->invalidHandle) {
505		up(&pCifsFile->fh_sem); 488		mutex_unlock(&pCifsFile->fh_mutex);
506 FreeXid(xid); 489 FreeXid(xid);
507 return 0; 490 return 0;
508 } 491 }
@@ -533,7 +516,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
533 if (full_path == NULL) { 516 if (full_path == NULL) {
534 rc = -ENOMEM; 517 rc = -ENOMEM;
535reopen_error_exit: 518reopen_error_exit:
536		up(&pCifsFile->fh_sem); 519		mutex_unlock(&pCifsFile->fh_mutex);
537 FreeXid(xid); 520 FreeXid(xid);
538 return rc; 521 return rc;
539 } 522 }
@@ -575,14 +558,14 @@ reopen_error_exit:
575 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 558 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
576 CIFS_MOUNT_MAP_SPECIAL_CHR); 559 CIFS_MOUNT_MAP_SPECIAL_CHR);
577 if (rc) { 560 if (rc) {
578		up(&pCifsFile->fh_sem); 561		mutex_unlock(&pCifsFile->fh_mutex);
579 cFYI(1, ("cifs_open returned 0x%x", rc)); 562 cFYI(1, ("cifs_open returned 0x%x", rc));
580 cFYI(1, ("oplock: %d", oplock)); 563 cFYI(1, ("oplock: %d", oplock));
581 } else { 564 } else {
582reopen_success: 565reopen_success:
583 pCifsFile->netfid = netfid; 566 pCifsFile->netfid = netfid;
584 pCifsFile->invalidHandle = false; 567 pCifsFile->invalidHandle = false;
585		up(&pCifsFile->fh_sem); 568		mutex_unlock(&pCifsFile->fh_mutex);
586 pCifsInode = CIFS_I(inode); 569 pCifsInode = CIFS_I(inode);
587 if (pCifsInode) { 570 if (pCifsInode) {
588 if (can_flush) { 571 if (can_flush) {
@@ -971,6 +954,40 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
971 return rc; 954 return rc;
972} 955}
973 956
957/*
958 * Set the timeout on write requests past EOF. For some servers (Windows)
959 * these calls can be very long.
960 *
961 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
962 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
963 * The 10M cutoff is totally arbitrary. A better scheme for this would be
964 * welcome if someone wants to suggest one.
965 *
966 * We may be able to do a better job with this if there were some way to
967 * declare that a file should be sparse.
968 */
969static int
970cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
971{
972 if (offset <= cifsi->server_eof)
973 return CIFS_STD_OP;
974 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
975 return CIFS_VLONG_OP;
976 else
977 return CIFS_LONG_OP;
978}
979
980/* update the file size (if needed) after a write */
981static void
982cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
983 unsigned int bytes_written)
984{
985 loff_t end_of_write = offset + bytes_written;
986
987 if (end_of_write > cifsi->server_eof)
988 cifsi->server_eof = end_of_write;
989}
990
974ssize_t cifs_user_write(struct file *file, const char __user *write_data, 991ssize_t cifs_user_write(struct file *file, const char __user *write_data,
975 size_t write_size, loff_t *poffset) 992 size_t write_size, loff_t *poffset)
976{ 993{
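Taken together the two helpers bracket every write: pick the timeout from how far past the server's known EOF the write starts, then advance the cached EOF by what was actually written, as cifs_user_write() does below. A sketch of the calling pattern; write_chunk() is a hypothetical stand-in for CIFSSMBWrite():

    #include <stddef.h>

    extern long long server_eof;   /* cached server end-of-file */

    int write_chunk(int fd, const char *buf, size_t len, long long off,
                    int timeout_s, size_t *written);   /* hypothetical */

    static int timeout_for(long long off)
    {
        if (off <= server_eof)
            return 15;                            /* CIFS_STD_OP */
        if (off > server_eof + 10LL * 1024 * 1024)
            return 180;                           /* CIFS_VLONG_OP */
        return 45;                                /* CIFS_LONG_OP */
    }

    static long long do_write(int fd, const char *buf, size_t len,
                              long long off)
    {
        size_t done = 0, n;
        int timeout = timeout_for(off);

        while (done < len) {
            if (write_chunk(fd, buf + done, len - done, off + done,
                            timeout, &n) < 0)
                break;
            if (off + done + n > server_eof)      /* cifs_update_eof() */
                server_eof = off + done + n;
            done += n;
            timeout = 15;   /* later chunks extend the file a bit at a time */
        }
        return (long long)done;
    }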
@@ -981,6 +998,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
981 struct cifsTconInfo *pTcon; 998 struct cifsTconInfo *pTcon;
982 int xid, long_op; 999 int xid, long_op;
983 struct cifsFileInfo *open_file; 1000 struct cifsFileInfo *open_file;
1001 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
984 1002
985 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1003 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
986 1004
@@ -1000,11 +1018,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1000 1018
1001 xid = GetXid(); 1019 xid = GetXid();
1002 1020
1003 if (*poffset > file->f_path.dentry->d_inode->i_size) 1021 long_op = cifs_write_timeout(cifsi, *poffset);
1004 long_op = CIFS_VLONG_OP; /* writes past EOF take long time */
1005 else
1006 long_op = CIFS_LONG_OP;
1007
1008 for (total_written = 0; write_size > total_written; 1022 for (total_written = 0; write_size > total_written;
1009 total_written += bytes_written) { 1023 total_written += bytes_written) {
1010 rc = -EAGAIN; 1024 rc = -EAGAIN;
@@ -1048,8 +1062,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1048 FreeXid(xid); 1062 FreeXid(xid);
1049 return rc; 1063 return rc;
1050 } 1064 }
1051 } else 1065 } else {
1066 cifs_update_eof(cifsi, *poffset, bytes_written);
1052 *poffset += bytes_written; 1067 *poffset += bytes_written;
1068 }
1053 long_op = CIFS_STD_OP; /* subsequent writes fast - 1069 long_op = CIFS_STD_OP; /* subsequent writes fast -
1054 15 seconds is plenty */ 1070 15 seconds is plenty */
1055 } 1071 }
@@ -1085,6 +1101,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1085 struct cifsTconInfo *pTcon; 1101 struct cifsTconInfo *pTcon;
1086 int xid, long_op; 1102 int xid, long_op;
1087 struct cifsFileInfo *open_file; 1103 struct cifsFileInfo *open_file;
1104 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
1088 1105
1089 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1106 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1090 1107
@@ -1099,11 +1116,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1099 1116
1100 xid = GetXid(); 1117 xid = GetXid();
1101 1118
1102 if (*poffset > file->f_path.dentry->d_inode->i_size) 1119 long_op = cifs_write_timeout(cifsi, *poffset);
1103 long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */
1104 else
1105 long_op = CIFS_LONG_OP;
1106
1107 for (total_written = 0; write_size > total_written; 1120 for (total_written = 0; write_size > total_written;
1108 total_written += bytes_written) { 1121 total_written += bytes_written) {
1109 rc = -EAGAIN; 1122 rc = -EAGAIN;
@@ -1166,8 +1179,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1166 FreeXid(xid); 1179 FreeXid(xid);
1167 return rc; 1180 return rc;
1168 } 1181 }
1169 } else 1182 } else {
1183 cifs_update_eof(cifsi, *poffset, bytes_written);
1170 *poffset += bytes_written; 1184 *poffset += bytes_written;
1185 }
1171 long_op = CIFS_STD_OP; /* subsequent writes fast - 1186 long_op = CIFS_STD_OP; /* subsequent writes fast -
1172 15 seconds is plenty */ 1187 15 seconds is plenty */
1173 } 1188 }
@@ -1380,11 +1395,12 @@ static int cifs_writepages(struct address_space *mapping,
1380 int nr_pages; 1395 int nr_pages;
1381 __u64 offset = 0; 1396 __u64 offset = 0;
1382 struct cifsFileInfo *open_file; 1397 struct cifsFileInfo *open_file;
1398 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1383 struct page *page; 1399 struct page *page;
1384 struct pagevec pvec; 1400 struct pagevec pvec;
1385 int rc = 0; 1401 int rc = 0;
1386 int scanned = 0; 1402 int scanned = 0;
1387 int xid; 1403 int xid, long_op;
1388 1404
1389 cifs_sb = CIFS_SB(mapping->host->i_sb); 1405 cifs_sb = CIFS_SB(mapping->host->i_sb);
1390 1406
@@ -1528,12 +1544,15 @@ retry:
1528 cERROR(1, ("No writable handles for inode")); 1544 cERROR(1, ("No writable handles for inode"));
1529 rc = -EBADF; 1545 rc = -EBADF;
1530 } else { 1546 } else {
1547 long_op = cifs_write_timeout(cifsi, offset);
1531 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1548 rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
1532 open_file->netfid, 1549 open_file->netfid,
1533 bytes_to_write, offset, 1550 bytes_to_write, offset,
1534 &bytes_written, iov, n_iov, 1551 &bytes_written, iov, n_iov,
1535 CIFS_LONG_OP); 1552 long_op);
1536 atomic_dec(&open_file->wrtPending); 1553 atomic_dec(&open_file->wrtPending);
1554 cifs_update_eof(cifsi, offset, bytes_written);
1555
1537 if (rc || bytes_written < bytes_to_write) { 1556 if (rc || bytes_written < bytes_to_write) {
1538 cERROR(1, ("Write2 ret %d, wrote %d", 1557 cERROR(1, ("Write2 ret %d, wrote %d",
1539 rc, bytes_written)); 1558 rc, bytes_written));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f121a80fdd6f..9c869a6dcba1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -143,6 +143,7 @@ static void cifs_unix_info_to_inode(struct inode *inode,
143 143
144 inode->i_nlink = le64_to_cpu(info->Nlinks); 144 inode->i_nlink = le64_to_cpu(info->Nlinks);
145 145
146 cifsInfo->server_eof = end_of_file;
146 spin_lock(&inode->i_lock); 147 spin_lock(&inode->i_lock);
147 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 148 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
148 /* 149 /*
@@ -276,7 +277,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
276 277
277 /* get new inode */ 278 /* get new inode */
278 if (*pinode == NULL) { 279 if (*pinode == NULL) {
279 *pinode = cifs_new_inode(sb, &find_data.UniqueId); 280 __u64 unique_id = le64_to_cpu(find_data.UniqueId);
281 *pinode = cifs_new_inode(sb, &unique_id);
280 if (*pinode == NULL) { 282 if (*pinode == NULL) {
281 rc = -ENOMEM; 283 rc = -ENOMEM;
282 goto cgiiu_exit; 284 goto cgiiu_exit;
@@ -605,12 +607,12 @@ int cifs_get_inode_info(struct inode **pinode,
605 inode->i_mode |= S_IFREG; 607 inode->i_mode |= S_IFREG;
606 } 608 }
607 609
610 cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile);
608 spin_lock(&inode->i_lock); 611 spin_lock(&inode->i_lock);
609 if (is_size_safe_to_change(cifsInfo, 612 if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) {
610 le64_to_cpu(pfindData->EndOfFile))) {
611 /* can not safely shrink the file size here if the 613 /* can not safely shrink the file size here if the
612 client is writing to it due to potential races */ 614 client is writing to it due to potential races */
613 i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); 615 i_size_write(inode, cifsInfo->server_eof);
614 616
615 /* 512 bytes (2**9) is the fake blocksize that must be 617 /* 512 bytes (2**9) is the fake blocksize that must be
616 used for this calculation */ 618 used for this calculation */
@@ -960,13 +962,21 @@ undo_setattr:
960 goto out_close; 962 goto out_close;
961} 963}
962 964
965
966/*
967 * If dentry->d_inode is null (usually meaning the cached dentry
968 * is a negative dentry) then we would attempt a standard SMB delete, but
 969 * if that fails we cannot attempt the fallback mechanisms on EACCES
 970 * but will return the EACCES to the caller. Note that the VFS does not call
971 * unlink on negative dentries currently.
972 */
963int cifs_unlink(struct inode *dir, struct dentry *dentry) 973int cifs_unlink(struct inode *dir, struct dentry *dentry)
964{ 974{
965 int rc = 0; 975 int rc = 0;
966 int xid; 976 int xid;
967 char *full_path = NULL; 977 char *full_path = NULL;
968 struct inode *inode = dentry->d_inode; 978 struct inode *inode = dentry->d_inode;
969 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 979 struct cifsInodeInfo *cifs_inode;
970 struct super_block *sb = dir->i_sb; 980 struct super_block *sb = dir->i_sb;
971 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 981 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
972 struct cifsTconInfo *tcon = cifs_sb->tcon; 982 struct cifsTconInfo *tcon = cifs_sb->tcon;
@@ -1010,7 +1020,7 @@ psx_del_no_retry:
1010 rc = cifs_rename_pending_delete(full_path, dentry, xid); 1020 rc = cifs_rename_pending_delete(full_path, dentry, xid);
1011 if (rc == 0) 1021 if (rc == 0)
1012 drop_nlink(inode); 1022 drop_nlink(inode);
1013 } else if (rc == -EACCES && dosattr == 0) { 1023 } else if ((rc == -EACCES) && (dosattr == 0) && inode) {
1014 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 1024 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
1015 if (attrs == NULL) { 1025 if (attrs == NULL) {
1016 rc = -ENOMEM; 1026 rc = -ENOMEM;
@@ -1018,7 +1028,8 @@ psx_del_no_retry:
1018 } 1028 }
1019 1029
1020 /* try to reset dos attributes */ 1030 /* try to reset dos attributes */
1021 origattr = cifsInode->cifsAttrs; 1031 cifs_inode = CIFS_I(inode);
1032 origattr = cifs_inode->cifsAttrs;
1022 if (origattr == 0) 1033 if (origattr == 0)
1023 origattr |= ATTR_NORMAL; 1034 origattr |= ATTR_NORMAL;
1024 dosattr = origattr & ~ATTR_READONLY; 1035 dosattr = origattr & ~ATTR_READONLY;
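The EACCES branch above is a retry-after-attribute-reset: if the server refuses the delete and no attributes have been touched yet, clear ATTR_READONLY and try again, restoring the original attributes if the retry fails too (the restore lives just past this hunk). A sketch of the pattern with hypothetical helpers:

    #include <errno.h>

    #define ATTR_READONLY 0x0001
    #define ATTR_NORMAL   0x0080

    int smb_delete(const char *path);                    /* hypothetical */
    int smb_set_attrs(const char *path, unsigned attrs); /* hypothetical */

    static int delete_with_retry(const char *path, unsigned cur_attrs)
    {
        unsigned orig = cur_attrs ? cur_attrs : ATTR_NORMAL;
        int rc = smb_delete(path);

        if (rc != -EACCES)
            return rc;

        /* The server may be refusing because the file is read-only:
         * drop the bit and retry the delete. */
        if (smb_set_attrs(path, orig & ~ATTR_READONLY))
            return -EACCES;   /* could not even change the attributes */

        rc = smb_delete(path);
        if (rc)
            smb_set_attrs(path, orig);   /* undo on failure */
        return rc;
    }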
@@ -1039,13 +1050,13 @@ psx_del_no_retry:
1039 1050
1040out_reval: 1051out_reval:
1041 if (inode) { 1052 if (inode) {
1042 cifsInode = CIFS_I(inode); 1053 cifs_inode = CIFS_I(inode);
1043 cifsInode->time = 0; /* will force revalidate to get info 1054 cifs_inode->time = 0; /* will force revalidate to get info
1044 when needed */ 1055 when needed */
1045 inode->i_ctime = current_fs_time(sb); 1056 inode->i_ctime = current_fs_time(sb);
1046 } 1057 }
1047 dir->i_ctime = dir->i_mtime = current_fs_time(sb); 1058 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
1048 cifsInode = CIFS_I(dir); 1059 cifs_inode = CIFS_I(dir);
1049 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ 1060 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
1050 1061
1051 kfree(full_path); 1062 kfree(full_path);
@@ -1138,6 +1149,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1138 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1149 cFYI(1, ("posix mkdir returned 0x%x", rc));
1139 d_drop(direntry); 1150 d_drop(direntry);
1140 } else { 1151 } else {
1152 __u64 unique_id;
1141 if (pInfo->Type == cpu_to_le32(-1)) { 1153 if (pInfo->Type == cpu_to_le32(-1)) {
1142 /* no return info, go query for it */ 1154 /* no return info, go query for it */
1143 kfree(pInfo); 1155 kfree(pInfo);
@@ -1151,8 +1163,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1151 else 1163 else
1152 direntry->d_op = &cifs_dentry_ops; 1164 direntry->d_op = &cifs_dentry_ops;
1153 1165
1154 newinode = cifs_new_inode(inode->i_sb, 1166 unique_id = le64_to_cpu(pInfo->UniqueId);
1155 &pInfo->UniqueId); 1167 newinode = cifs_new_inode(inode->i_sb, &unique_id);
1156 if (newinode == NULL) { 1168 if (newinode == NULL) {
1157 kfree(pInfo); 1169 kfree(pInfo);
1158 goto mkdir_get_info; 1170 goto mkdir_get_info;
@@ -1450,7 +1462,8 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1450 checking the UniqueId via FILE_INTERNAL_INFO */ 1462 checking the UniqueId via FILE_INTERNAL_INFO */
1451 1463
1452unlink_target: 1464unlink_target:
1453 if ((rc == -EACCES) || (rc == -EEXIST)) { 1465 /* Try unlinking the target dentry if it's not negative */
1466 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1454 tmprc = cifs_unlink(target_dir, target_dentry); 1467 tmprc = cifs_unlink(target_dir, target_dentry);
1455 if (tmprc) 1468 if (tmprc)
1456 goto cifs_rename_exit; 1469 goto cifs_rename_exit;
@@ -1753,6 +1766,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1753 } 1766 }
1754 1767
1755 if (rc == 0) { 1768 if (rc == 0) {
1769 cifsInode->server_eof = attrs->ia_size;
1756 rc = cifs_vmtruncate(inode, attrs->ia_size); 1770 rc = cifs_vmtruncate(inode, attrs->ia_size);
1757 cifs_truncate_page(inode->i_mapping, inode->i_size); 1771 cifs_truncate_page(inode->i_mapping, inode->i_size);
1758 } 1772 }
@@ -1792,20 +1806,21 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1792 goto out; 1806 goto out;
1793 } 1807 }
1794 1808
1795 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1809 /*
1796 /* 1810 * Attempt to flush data before changing attributes. We need to do
1797 Flush data before changing file size or changing the last 1811 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1798 write time of the file on the server. If the 1812 * ownership or mode then we may also need to do this. Here, we take
1799 flush returns error, store it to report later and continue. 1813 * the safe way out and just do the flush on all setattr requests. If
1800 BB: This should be smarter. Why bother flushing pages that 1814 * the flush returns error, store it to report later and continue.
1801 will be truncated anyway? Also, should we error out here if 1815 *
1802 the flush returns error? 1816 * BB: This should be smarter. Why bother flushing pages that
1803 */ 1817 * will be truncated anyway? Also, should we error out here if
1804 rc = filemap_write_and_wait(inode->i_mapping); 1818 * the flush returns error?
1805 if (rc != 0) { 1819 */
1806 cifsInode->write_behind_rc = rc; 1820 rc = filemap_write_and_wait(inode->i_mapping);
1807 rc = 0; 1821 if (rc != 0) {
1808 } 1822 cifsInode->write_behind_rc = rc;
1823 rc = 0;
1809 } 1824 }
1810 1825
1811 if (attrs->ia_valid & ATTR_SIZE) { 1826 if (attrs->ia_valid & ATTR_SIZE) {
@@ -1903,20 +1918,21 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1903 return -ENOMEM; 1918 return -ENOMEM;
1904 } 1919 }
1905 1920
1906 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1921 /*
1907 /* 1922 * Attempt to flush data before changing attributes. We need to do
1908 Flush data before changing file size or changing the last 1923 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1909 write time of the file on the server. If the 1924 * ownership or mode then we may also need to do this. Here, we take
1910 flush returns error, store it to report later and continue. 1925 * the safe way out and just do the flush on all setattr requests. If
1911 BB: This should be smarter. Why bother flushing pages that 1926 * the flush returns error, store it to report later and continue.
1912 will be truncated anyway? Also, should we error out here if 1927 *
1913 the flush returns error? 1928 * BB: This should be smarter. Why bother flushing pages that
1914 */ 1929 * will be truncated anyway? Also, should we error out here if
1915 rc = filemap_write_and_wait(inode->i_mapping); 1930 * the flush returns error?
1916 if (rc != 0) { 1931 */
1917 cifsInode->write_behind_rc = rc; 1932 rc = filemap_write_and_wait(inode->i_mapping);
1918 rc = 0; 1933 if (rc != 0) {
1919 } 1934 cifsInode->write_behind_rc = rc;
1935 rc = 0;
1920 } 1936 }
1921 1937
1922 if (attrs->ia_valid & ATTR_SIZE) { 1938 if (attrs->ia_valid & ATTR_SIZE) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 63f644000ce5..ea9d11e3dcbb 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -119,16 +119,11 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
119 full_path = build_path_from_dentry(direntry); 119 full_path = build_path_from_dentry(direntry);
120 120
121 if (!full_path) 121 if (!full_path)
122 goto out_no_free; 122 goto out;
123 123
124 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 124 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode));
125 cifs_sb = CIFS_SB(inode->i_sb); 125 cifs_sb = CIFS_SB(inode->i_sb);
126 pTcon = cifs_sb->tcon; 126 pTcon = cifs_sb->tcon;
127 target_path = kmalloc(PATH_MAX, GFP_KERNEL);
128 if (!target_path) {
129 target_path = ERR_PTR(-ENOMEM);
130 goto out;
131 }
132 127
133 /* We could change this to: 128 /* We could change this to:
134 if (pTcon->unix_ext) 129 if (pTcon->unix_ext)
@@ -138,8 +133,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
138 133
139 if (pTcon->ses->capabilities & CAP_UNIX) 134 if (pTcon->ses->capabilities & CAP_UNIX)
140 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path, 135 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
141 target_path, 136 &target_path,
142 PATH_MAX-1,
143 cifs_sb->local_nls); 137 cifs_sb->local_nls);
144 else { 138 else {
145 /* BB add read reparse point symlink code here */ 139 /* BB add read reparse point symlink code here */
@@ -148,22 +142,16 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
148 /* BB Add MAC style xsymlink check here if enabled */ 142 /* BB Add MAC style xsymlink check here if enabled */
149 } 143 }
150 144
151 if (rc == 0) { 145 if (rc != 0) {
152
153/* BB Add special case check for Samba DFS symlinks */
154
155 target_path[PATH_MAX-1] = 0;
156 } else {
157 kfree(target_path); 146 kfree(target_path);
158 target_path = ERR_PTR(rc); 147 target_path = ERR_PTR(rc);
159 } 148 }
160 149
161out:
162 kfree(full_path); 150 kfree(full_path);
163out_no_free: 151out:
164 FreeXid(xid); 152 FreeXid(xid);
165 nd_set_link(nd, target_path); 153 nd_set_link(nd, target_path);
166 return NULL; /* No cookie */ 154 return NULL;
167} 155}
168 156
169int 157int
@@ -224,98 +212,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
224 return rc; 212 return rc;
225} 213}
226 214
227int
228cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
229{
230 struct inode *inode = direntry->d_inode;
231 int rc = -EACCES;
232 int xid;
233 int oplock = 0;
234 struct cifs_sb_info *cifs_sb;
235 struct cifsTconInfo *pTcon;
236 char *full_path = NULL;
237 char *tmpbuffer;
238 int len;
239 __u16 fid;
240
241 xid = GetXid();
242 cifs_sb = CIFS_SB(inode->i_sb);
243 pTcon = cifs_sb->tcon;
244
245/* BB would it be safe against deadlock to grab this sem
246 even though rename itself grabs the sem and calls lookup? */
247/* mutex_lock(&inode->i_sb->s_vfs_rename_mutex);*/
248 full_path = build_path_from_dentry(direntry);
249/* mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);*/
250
251 if (full_path == NULL) {
252 FreeXid(xid);
253 return -ENOMEM;
254 }
255
256 cFYI(1,
257 ("Full path: %s inode = 0x%p pBuffer = 0x%p buflen = %d",
258 full_path, inode, pBuffer, buflen));
259 if (buflen > PATH_MAX)
260 len = PATH_MAX;
261 else
262 len = buflen;
263 tmpbuffer = kmalloc(len, GFP_KERNEL);
264 if (tmpbuffer == NULL) {
265 kfree(full_path);
266 FreeXid(xid);
267 return -ENOMEM;
268 }
269
270/* BB add read reparse point symlink code and
271 Unix extensions symlink code here BB */
272/* We could disable this based on pTcon->unix_ext flag instead ... but why? */
273 if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
274 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
275 tmpbuffer,
276 len - 1,
277 cifs_sb->local_nls);
278 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
279 cERROR(1, ("SFU style symlinks not implemented yet"));
280 /* add open and read as in fs/cifs/inode.c */
281 } else {
282 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
283 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
284 cifs_sb->local_nls,
285 cifs_sb->mnt_cifs_flags &
286 CIFS_MOUNT_MAP_SPECIAL_CHR);
287 if (!rc) {
288 rc = CIFSSMBQueryReparseLinkInfo(xid, pTcon, full_path,
289 tmpbuffer,
290 len - 1,
291 fid,
292 cifs_sb->local_nls);
293 if (CIFSSMBClose(xid, pTcon, fid)) {
294 cFYI(1, ("Error closing junction point "
295 "(open for ioctl)"));
296 }
297 /* If it is a DFS junction earlier we would have gotten
298 PATH_NOT_COVERED returned from server so we do
299 not need to request the DFS info here */
300 }
301 }
302 /* BB Anything else to do to handle recursive links? */
303 /* BB Should we be using page ops here? */
304
305 /* BB null terminate returned string in pBuffer? BB */
306 if (rc == 0) {
307 rc = vfs_readlink(direntry, pBuffer, len, tmpbuffer);
308 cFYI(1,
309 ("vfs_readlink called from cifs_readlink returned %d",
310 rc));
311 }
312
313 kfree(tmpbuffer);
314 kfree(full_path);
315 FreeXid(xid);
316 return rc;
317}
318
319void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) 215void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie)
320{ 216{
321 char *p = nd_get_link(nd); 217 char *p = nd_get_link(nd);
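With the buffer allocation moved into CIFSSMBUnixQuerySymLink(), cifs_follow_link() above now hands nd_set_link() either the callee-allocated target string or an error encoded in the pointer itself, and cifs_put_link() must free it only in the non-error case. A user-space sketch of that ERR_PTR convention; in the kernel the macros come from linux/err.h:

    #include <errno.h>
    #include <stdlib.h>

    #define MAX_ERRNO    4095
    #define ERR_PTR(err) ((void *)(long)(err))
    #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    char *query_symlink(const char *path);   /* hypothetical: allocates */

    static char *follow_link(const char *path)
    {
        char *target = query_symlink(path);

        if (!target)
            return ERR_PTR(-ENOMEM);   /* error travels in the pointer */
        return target;                 /* caller owns and frees this */
    }

    static void put_link(char *p)
    {
        if (!IS_ERR(p))   /* never free an encoded error value */
            free(p);
    }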
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4c89c572891a..e079a9190ec4 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -635,77 +635,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
635 return; 635 return;
636} 636}
637 637
638/* Windows maps these to the user defined 16 bit Unicode range since they are
639 reserved symbols (along with \ and /), otherwise illegal to store
640 in filenames in NTFS */
641#define UNI_ASTERIK (__u16) ('*' + 0xF000)
642#define UNI_QUESTION (__u16) ('?' + 0xF000)
643#define UNI_COLON (__u16) (':' + 0xF000)
644#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
645#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
646#define UNI_PIPE (__u16) ('|' + 0xF000)
647#define UNI_SLASH (__u16) ('\\' + 0xF000)
648
649/* Convert 16 bit Unicode pathname from wire format to string in current code
650 page. Conversion may involve remapping up the seven characters that are
651 only legal in POSIX-like OS (if they are present in the string). Path
652 names are little endian 16 bit Unicode on the wire */
653int
654cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
655 const struct nls_table *cp)
656{
657 int i, j, len;
658 __u16 src_char;
659
660 for (i = 0, j = 0; i < maxlen; i++) {
661 src_char = le16_to_cpu(source[i]);
662 switch (src_char) {
663 case 0:
664 goto cUCS_out; /* BB check this BB */
665 case UNI_COLON:
666 target[j] = ':';
667 break;
668 case UNI_ASTERIK:
669 target[j] = '*';
670 break;
671 case UNI_QUESTION:
672 target[j] = '?';
673 break;
674 /* BB We can not handle remapping slash until
675 all the calls to build_path_from_dentry
676 are modified, as they use slash as separator BB */
677 /* case UNI_SLASH:
678 target[j] = '\\';
679 break;*/
680 case UNI_PIPE:
681 target[j] = '|';
682 break;
683 case UNI_GRTRTHAN:
684 target[j] = '>';
685 break;
686 case UNI_LESSTHAN:
687 target[j] = '<';
688 break;
689 default:
690 len = cp->uni2char(src_char, &target[j],
691 NLS_MAX_CHARSET_SIZE);
692 if (len > 0) {
693 j += len;
694 continue;
695 } else {
696 target[j] = '?';
697 }
698 }
699 j++;
700		/* make sure we do not overrun caller's allocated temp buffer */
701 if (j >= (2 * NAME_MAX))
702 break;
703 }
704cUCS_out:
705 target[j] = 0;
706 return j;
707}
708
709/* Convert 16 bit Unicode pathname to wire format from string in current code 638/* Convert 16 bit Unicode pathname to wire format from string in current code
710 page. Conversion may involve remapping up the seven characters that are 639 page. Conversion may involve remapping up the seven characters that are
711 only legal in POSIX-like OS (if they are present in the string). Path 640 only legal in POSIX-like OS (if they are present in the string). Path
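[Editor's note] The function removed above unmapped the seven NTFS-reserved characters out of the 0xF000 private-use range that Windows stores them in. A standalone sketch of the arithmetic; the 0xF020..0xF080 window check is an illustrative assumption, not taken from the patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned short wire = ':' + 0xF000;	/* 0xF03A as stored by Windows */

		/* assumed window covering remapped '*' '?' ':' '>' '<' '|' '\\' */
		if (wire >= 0xF020 && wire < 0xF080)
			printf("0x%04x unmaps to '%c'\n", wire, wire - 0xF000);
		return 0;
	}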
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8703d68f5b20..e2fe998989a3 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -79,6 +79,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
 	{ErrQuota, -EDQUOT},
 	{ErrNotALink, -ENOLINK},
 	{ERRnetlogonNotStarted, -ENOPROTOOPT},
+	{ERRsymlink, -EOPNOTSUPP},
 	{ErrTooManyLinks, -EMLINK},
 	{0, 0}
 };
@@ -714,6 +715,7 @@ static const struct {
 	ERRDOS, ERRnoaccess, 0xc000028f}, {
 	ERRDOS, ERRnoaccess, 0xc0000290}, {
 	ERRDOS, ERRbadfunc, 0xc000029c}, {
+	ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
 	ERRDOS, ERRinvlevel, 0x007c0001}, };
 
/*****************************************************************************
diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h
index 588abbb9d08c..257267367d41 100644
--- a/fs/cifs/nterr.h
+++ b/fs/cifs/nterr.h
@@ -35,8 +35,6 @@ struct nt_err_code_struct {
 extern const struct nt_err_code_struct nt_errs[];
 
 /* Win32 Status codes. */
-
-#define STATUS_BUFFER_OVERFLOW 0x80000005
 #define STATUS_MORE_ENTRIES 0x0105
 #define ERROR_INVALID_PARAMETER 0x0057
 #define ERROR_INSUFFICIENT_BUFFER 0x007a
@@ -50,6 +48,13 @@ extern const struct nt_err_code_struct nt_errs[];
 #define STATUS_SOME_UNMAPPED 0x0107
 #define STATUS_BUFFER_OVERFLOW 0x80000005
 #define NT_STATUS_NO_MORE_ENTRIES 0x8000001a
+#define NT_STATUS_MEDIA_CHANGED 0x8000001c
+#define NT_STATUS_END_OF_MEDIA 0x8000001e
+#define NT_STATUS_MEDIA_CHECK 0x80000020
+#define NT_STATUS_NO_DATA_DETECTED 0x8000001c
+#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d
+#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288
+#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288
 #define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001
 #define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002
 #define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003
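[Editor's note] The 0x8xxxxxxx codes added here sit in the NT status "warning" severity class, while the codes OR'd with 0xC0000000 below are "error" class. A small sketch of reading the severity field out (illustrative, not part of the patch):

	#include <stdio.h>

	#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d

	int main(void)
	{
		unsigned int status = NT_STATUS_STOPPED_ON_SYMLINK;

		/* bits 31-30: 0 success, 1 informational, 2 warning, 3 error */
		printf("severity %u, code 0x%04x\n", status >> 30, status & 0xffff);
		return 0;
	}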
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index c377d8065d99..49c9a4e75319 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -27,29 +27,39 @@
 #define UnknownMessage cpu_to_le32(8)
 
 /* Negotiate Flags */
-#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are in unicode */
-#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */
-#define NTLMSSP_REQUEST_TARGET 0x04 /* Server return its auth realm */
-#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signature capability */
-#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */
-#define NTLMSSP_NEGOTIATE_DGRAM 0x0040
-#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Sign/seal use LM session key */
-#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */
-#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000
-#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000
-#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server on same machine */
-#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign for all security levels */
-#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000
-#define NTLMSSP_TARGET_TYPE_SERVER 0x20000
-#define NTLMSSP_TARGET_TYPE_SHARE 0x40000
-#define NTLMSSP_NEGOTIATE_NTLMV2 0x80000
-#define NTLMSSP_REQUEST_INIT_RESP 0x100000
-#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000
-#define NTLMSSP_REQUEST_NOT_NT_KEY 0x400000
-#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
-#define NTLMSSP_NEGOTIATE_128 0x20000000
-#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
-#define NTLMSSP_NEGOTIATE_56 0x80000000
+#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */
+#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */
+#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */
+/* define reserved9 0x08 */
+#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */
+#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */
+#define NTLMSSP_NEGOTIATE_DGRAM 0x0040
+#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */
+/* defined reserved 8 0x0100 */
+#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */
+#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */
+#define NTLMSSP_ANONYMOUS 0x0800
+#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */
+#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000
+#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */
+#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */
+#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000
+#define NTLMSSP_TARGET_TYPE_SERVER 0x20000
+#define NTLMSSP_TARGET_TYPE_SHARE 0x40000
+#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/
+/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */
+#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000
+#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */
+#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000
+#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
+/* #define reserved4 0x1000000 */
+#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */
+/* #define reserved3 0x4000000 */
+/* #define reserved2 0x8000000 */
+/* #define reserved1 0x10000000 */
+#define NTLMSSP_NEGOTIATE_128 0x20000000
+#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
+#define NTLMSSP_NEGOTIATE_56 0x80000000
 
 /* Although typedefs are not commonly used for structure definitions */
 /* in the Linux kernel, in this particular case they are useful */
@@ -60,32 +70,36 @@
 typedef struct _SECURITY_BUFFER {
 	__le16 Length;
 	__le16 MaximumLength;
-	__le32 Buffer;		/* offset to buffer */
+	__le32 BufferOffset;	/* offset to buffer */
 } __attribute__((packed)) SECURITY_BUFFER;
 
 typedef struct _NEGOTIATE_MESSAGE {
 	__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
-	__le32 MessageType;     /* 1 */
+	__le32 MessageType;     /* NtLmNegotiate = 1 */
 	__le32 NegotiateFlags;
 	SECURITY_BUFFER DomainName;	/* RFC 1001 style and ASCII */
 	SECURITY_BUFFER WorkstationName;	/* RFC 1001 and ASCII */
+	/* SECURITY_BUFFER for version info not present since we
+	   do not set the version is present flag */
 	char DomainString[0];
 	/* followed by WorkstationString */
 } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
 
 typedef struct _CHALLENGE_MESSAGE {
 	__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
-	__le32 MessageType;   /* 2 */
+	__le32 MessageType;   /* NtLmChallenge = 2 */
 	SECURITY_BUFFER TargetName;
 	__le32 NegotiateFlags;
 	__u8 Challenge[CIFS_CRYPTO_KEY_SIZE];
 	__u8 Reserved[8];
 	SECURITY_BUFFER TargetInfoArray;
+	/* SECURITY_BUFFER for version info not present since we
+	   do not set the version is present flag */
 } __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE;
 
 typedef struct _AUTHENTICATE_MESSAGE {
-	__u8 Signature[sizeof (NTLMSSP_SIGNATURE)];
-	__le32 MessageType;  /* 3 */
+	__u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+	__le32 MessageType;  /* NtLmsAuthenticate = 3 */
 	SECURITY_BUFFER LmChallengeResponse;
 	SECURITY_BUFFER NtChallengeResponse;
 	SECURITY_BUFFER DomainName;
@@ -93,5 +107,7 @@ typedef struct _AUTHENTICATE_MESSAGE {
 	SECURITY_BUFFER WorkstationName;
 	SECURITY_BUFFER SessionKey;
 	__le32 NegotiateFlags;
+	/* SECURITY_BUFFER for version info not present since we
+	   do not set the version is present flag */
 	char UserString[0];
 } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
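[Editor's note] The renamed constants above are plain bit positions in the 32-bit NegotiateFlags word; the session-setup code later in this merge ORs a handful of them together. A minimal sketch of the resulting mask, using the values from this hunk:

	#include <stdio.h>

	#define NTLMSSP_NEGOTIATE_UNICODE 0x01
	#define NTLMSSP_REQUEST_TARGET    0x04
	#define NTLMSSP_NEGOTIATE_NTLM    0x0200
	#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400
	#define NTLMSSP_NEGOTIATE_128     0x20000000
	#define NTLMSSP_NEGOTIATE_56      0x80000000

	int main(void)
	{
		unsigned int flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
				     NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
				     NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;

		printf("negotiate flags = 0x%08x\n", flags);	/* 0xa0000605 */
		return 0;
	}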
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c2c01ff4c32c..964e097c8203 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -31,6 +31,13 @@
31#include "cifs_fs_sb.h" 31#include "cifs_fs_sb.h"
32#include "cifsfs.h" 32#include "cifsfs.h"
33 33
34/*
35 * To be safe - for UCS to UTF-8 with strings loaded with the rare long
36 * characters alloc more to account for such multibyte target UTF-8
37 * characters.
38 */
39#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)
40
34#ifdef CONFIG_CIFS_DEBUG2 41#ifdef CONFIG_CIFS_DEBUG2
35static void dump_cifs_file_struct(struct file *file, char *label) 42static void dump_cifs_file_struct(struct file *file, char *label)
36{ 43{
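[Editor's note] The sizing above allows for the worst case in which every 2-byte UCS-2 code unit of a NAME_MAX-unit name expands to four UTF-8 bytes, plus a terminator. A toy computation (the NAME_MAX value of 255 is assumed, as on most Linux systems):

	#include <stdio.h>

	#define NAME_MAX 255			/* assumed value for illustration */
	#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)

	int main(void)
	{
		/* 255 UCS-2 units * 4 UTF-8 bytes each, plus terminator room */
		printf("UNICODE_NAME_MAX = %d bytes\n", UNICODE_NAME_MAX);
		return 0;
	}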
@@ -239,6 +246,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
239 if (atomic_read(&cifsInfo->inUse) == 0) 246 if (atomic_read(&cifsInfo->inUse) == 0)
240 atomic_set(&cifsInfo->inUse, 1); 247 atomic_set(&cifsInfo->inUse, 1);
241 248
249 cifsInfo->server_eof = end_of_file;
242 spin_lock(&tmp_inode->i_lock); 250 spin_lock(&tmp_inode->i_lock);
243 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 251 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
244 /* can not safely change the file size here if the 252 /* can not safely change the file size here if the
@@ -375,6 +383,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
375 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); 383 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
376 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); 384 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
377 385
386 cifsInfo->server_eof = end_of_file;
378 spin_lock(&tmp_inode->i_lock); 387 spin_lock(&tmp_inode->i_lock);
379 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 388 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
380 /* can not safely change the file size here if the 389 /* can not safely change the file size here if the
@@ -436,6 +445,38 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
436 } 445 }
437} 446}
438 447
448/* BB eventually need to add the following helper function to
449 resolve NT_STATUS_STOPPED_ON_SYMLINK return code when
450 we try to do FindFirst on (NTFS) directory symlinks */
451/*
452int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
453 int xid)
454{
455 __u16 fid;
456 int len;
457 int oplock = 0;
458 int rc;
459 struct cifsTconInfo *ptcon = cifs_sb->tcon;
460 char *tmpbuffer;
461
462 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
463 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
464 cifs_sb->local_nls,
465 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
466 if (!rc) {
467 tmpbuffer = kmalloc(maxpath);
468 rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
469 tmpbuffer,
470 maxpath -1,
471 fid,
472 cifs_sb->local_nls);
473 if (CIFSSMBClose(xid, ptcon, fid)) {
474 cFYI(1, ("Error closing temporary reparsepoint open)"));
475 }
476 }
477}
478 */
479
439static int initiate_cifs_search(const int xid, struct file *file) 480static int initiate_cifs_search(const int xid, struct file *file)
440{ 481{
441 int rc = 0; 482 int rc = 0;
@@ -491,7 +532,10 @@ ffirst_retry:
491 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); 532 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
492 if (rc == 0) 533 if (rc == 0)
493 cifsFile->invalidHandle = false; 534 cifsFile->invalidHandle = false;
494 if ((rc == -EOPNOTSUPP) && 535 /* BB add following call to handle readdir on new NTFS symlink errors
536 else if STATUS_STOPPED_ON_SYMLINK
537 call get_symlink_reparse_path and retry with new path */
538 else if ((rc == -EOPNOTSUPP) &&
495 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { 539 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
496 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 540 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
497 goto ffirst_retry; 541 goto ffirst_retry;
@@ -820,7 +864,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
820/* inode num, inode type and filename returned */ 864/* inode num, inode type and filename returned */
821static int cifs_get_name_from_search_buf(struct qstr *pqst, 865static int cifs_get_name_from_search_buf(struct qstr *pqst,
822 char *current_entry, __u16 level, unsigned int unicode, 866 char *current_entry, __u16 level, unsigned int unicode,
823 struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) 867 struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum)
824{ 868{
825 int rc = 0; 869 int rc = 0;
826 unsigned int len = 0; 870 unsigned int len = 0;
@@ -840,7 +884,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
840 len = strnlen(filename, PATH_MAX); 884 len = strnlen(filename, PATH_MAX);
841 } 885 }
842 886
843 *pinum = pFindData->UniqueId; 887 *pinum = le64_to_cpu(pFindData->UniqueId);
844 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 888 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
845 FILE_DIRECTORY_INFO *pFindData = 889 FILE_DIRECTORY_INFO *pFindData =
846 (FILE_DIRECTORY_INFO *)current_entry; 890 (FILE_DIRECTORY_INFO *)current_entry;
@@ -856,7 +900,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
856 (SEARCH_ID_FULL_DIR_INFO *)current_entry; 900 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
857 filename = &pFindData->FileName[0]; 901 filename = &pFindData->FileName[0];
858 len = le32_to_cpu(pFindData->FileNameLength); 902 len = le32_to_cpu(pFindData->FileNameLength);
859 *pinum = pFindData->UniqueId; 903 *pinum = le64_to_cpu(pFindData->UniqueId);
860 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { 904 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
861 FILE_BOTH_DIRECTORY_INFO *pFindData = 905 FILE_BOTH_DIRECTORY_INFO *pFindData =
862 (FILE_BOTH_DIRECTORY_INFO *)current_entry; 906 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
@@ -879,14 +923,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
879 } 923 }
880 924
881 if (unicode) { 925 if (unicode) {
882 /* BB fixme - test with long names */ 926 pqst->len = cifs_from_ucs2((char *) pqst->name,
883 /* Note converted filename can be longer than in unicode */ 927 (__le16 *) filename,
884 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 928 UNICODE_NAME_MAX,
885 pqst->len = cifs_convertUCSpath((char *)pqst->name, 929 min(len, max_len), nlt,
886 (__le16 *)filename, len/2, nlt); 930 cifs_sb->mnt_cifs_flags &
887 else 931 CIFS_MOUNT_MAP_SPECIAL_CHR);
888 pqst->len = cifs_strfromUCS_le((char *)pqst->name,
889 (__le16 *)filename, len/2, nlt);
890 } else { 932 } else {
891 pqst->name = filename; 933 pqst->name = filename;
892 pqst->len = len; 934 pqst->len = len;
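[Editor's note] The two UniqueId fixes above only matter on big-endian hosts: the on-the-wire value is little-endian, so storing it raw would byte-swap the inode number. A userspace sketch of what le64_to_cpu() guarantees (read_le64 is an illustrative helper, not the kernel's):

	#include <stdint.h>
	#include <stdio.h>

	/* Interpret 8 wire bytes as a little-endian 64-bit value,
	 * independent of host byte order. */
	static uint64_t read_le64(const unsigned char *p)
	{
		uint64_t v = 0;

		for (int i = 7; i >= 0; i--)
			v = (v << 8) | p[i];
		return v;
	}

	int main(void)
	{
		unsigned char wire[8] = { 0x2a, 0, 0, 0, 0, 0, 0, 0 };	/* 42, LE */

		printf("UniqueId = %llu\n", (unsigned long long)read_le64(wire));
		return 0;
	}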
@@ -896,8 +938,8 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 	return rc;
 }
 
-static int cifs_filldir(char *pfindEntry, struct file *file,
-	filldir_t filldir, void *direntry, char *scratch_buf, int max_len)
+static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
+		void *direntry, char *scratch_buf, unsigned int max_len)
 {
 	int rc = 0;
 	struct qstr qstring;
@@ -994,7 +1036,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	int num_to_fill = 0;
 	char *tmp_buf = NULL;
 	char *end_of_smb;
-	int max_len;
+	unsigned int max_len;
 
 	xid = GetXid();
 
@@ -1068,11 +1110,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			cifsFile->srch_inf.ntwrk_buf_start);
 	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
 
-	/* To be safe - for UCS to UTF-8 with strings loaded
-	   with the rare long characters alloc more to account for
-	   such multibyte target UTF-8 characters. cifs_unicode.c,
-	   which actually does the conversion, has the same limit */
-	tmp_buf = kmalloc((2 * NAME_MAX) + 4, GFP_KERNEL);
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
 	for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
 		if (current_entry == NULL) {
 			/* evaluate whether this case is an error */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5c68b4282be9..897a052270f9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -3,7 +3,7 @@
  *
  *   SMB/CIFS session setup handling routines
  *
- *   Copyright (c) International Business Machines  Corp., 2006, 2007
+ *   Copyright (c) International Business Machines  Corp., 2006, 2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -111,7 +111,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
 get_vc_num_exit:
 	write_unlock(&cifs_tcp_ses_lock);
 
-	return le16_to_cpu(vcnum);
+	return cpu_to_le16(vcnum);
 }
 
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
@@ -277,85 +277,51 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 	*pbcc_area = bcc_ptr;
 }
 
-static int decode_unicode_ssetup(char **pbcc_area, int bleft,
-				 struct cifsSesInfo *ses,
-				 const struct nls_table *nls_cp)
+static void
+decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
+		      const struct nls_table *nls_cp)
 {
-	int rc = 0;
-	int words_left, len;
+	int len;
 	char *data = *pbcc_area;
 
-
-
 	cFYI(1, ("bleft %d", bleft));
 
-
-	/* SMB header is unaligned, so cifs servers word align start of
-	   Unicode strings */
-	data++;
-	bleft--; /* Windows servers do not always double null terminate
-		    their final Unicode string - in which case we
-		    now will not attempt to decode the byte of junk
-		    which follows it */
-
-	words_left = bleft / 2;
-
-	/* save off server operating system */
-	len = UniStrnlen((wchar_t *) data, words_left);
-
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-	if (len >= words_left)
-		return rc;
+	/*
+	 * Windows servers do not always double null terminate their final
+	 * Unicode string. Check to see if there are an uneven number of bytes
+	 * left. If so, then add an extra NULL pad byte to the end of the
+	 * response.
+	 *
+	 * See section 2.7.2 in "Implementing CIFS" for details
+	 */
+	if (bleft % 2) {
+		data[bleft] = 0;
+		++bleft;
+	}
 
 	kfree(ses->serverOS);
-	/* UTF-8 string will not grow more than four times as big as UCS-16 */
-	ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
-	if (ses->serverOS != NULL)
-		cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
-	data += 2 * (len + 1);
-	words_left -= len + 1;
-
-	/* save off server network operating system */
-	len = UniStrnlen((wchar_t *) data, words_left);
-
-	if (len >= words_left)
-		return rc;
+	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	cFYI(1, ("serverOS=%s", ses->serverOS));
+	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
+	data += len;
+	bleft -= len;
+	if (bleft <= 0)
+		return;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
-	if (ses->serverNOS != NULL) {
-		cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
-				   nls_cp);
-		if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) {
-			cFYI(1, ("NT4 server"));
-			ses->flags |= CIFS_SES_NT4;
-		}
-	}
-	data += 2 * (len + 1);
-	words_left -= len + 1;
-
-	/* save off server domain */
-	len = UniStrnlen((wchar_t *) data, words_left);
-
-	if (len > words_left)
-		return rc;
+	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	cFYI(1, ("serverNOS=%s", ses->serverNOS));
+	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
+	data += len;
+	bleft -= len;
+	if (bleft <= 0)
+		return;
 
 	kfree(ses->serverDomain);
-	ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */
-	if (ses->serverDomain != NULL) {
-		cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len,
-				   nls_cp);
-		ses->serverDomain[2*len] = 0;
-		ses->serverDomain[(2*len) + 1] = 0;
-	}
-	data += 2 * (len + 1);
-	words_left -= len + 1;
+	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	cFYI(1, ("serverDomain=%s", ses->serverDomain));
 
-	cFYI(1, ("words left: %d", words_left));
-
-	return rc;
+	return;
 }
 
 static int decode_ascii_ssetup(char **pbcc_area, int bleft,
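[Editor's note] The rewritten decode_unicode_ssetup() above replaces manual words-left bookkeeping with a pad-then-walk scheme: if the server dropped the final NUL pad byte, one is appended, and each UCS-2 string plus its 2-byte terminator is then consumed in turn. A self-contained toy version of that loop (all names here are illustrative):

	#include <stdio.h>

	/* Consume one NUL-terminated UCS-2LE string; return its length in
	 * 16-bit units and advance past the terminator. */
	static int consume_ucs2(unsigned char **data, int *bleft)
	{
		unsigned char *p = *data;
		int units = 0;

		while (units < *bleft / 2 && (p[2 * units] | p[2 * units + 1]))
			units++;
		*data += (units * 2) + 2;
		*bleft -= (units * 2) + 2;
		return units;
	}

	int main(void)
	{
		/* "OS", "NOS" in UCS-2LE; final pad byte missing (13 bytes, odd) */
		unsigned char buf[16] = { 'O',0,'S',0, 0,0, 'N',0,'O',0,'S',0, 0 };
		unsigned char *data = buf;
		int bleft = 13;

		if (bleft % 2)
			buf[bleft++] = 0;	/* the patch's pad-byte fix */
		while (bleft > 0)
			printf("string of %d units\n", consume_ucs2(&data, &bleft));
		return 0;
	}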
@@ -412,6 +378,186 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	return rc;
 }
 
+static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
+				    struct cifsSesInfo *ses)
+{
+	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
+
+	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
+		cERROR(1, ("challenge blob len %d too small", blob_len));
+		return -EINVAL;
+	}
+
+	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
+		cERROR(1, ("blob signature incorrect %s", pblob->Signature));
+		return -EINVAL;
+	}
+	if (pblob->MessageType != NtLmChallenge) {
+		cERROR(1, ("Incorrect message type %d", pblob->MessageType));
+		return -EINVAL;
+	}
+
+	memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+	/* BB we could decode pblob->NegotiateFlags; some may be useful */
+	/* In particular we can examine sign flags */
+	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
+	   we must set the MIC field of the AUTHENTICATE_MESSAGE */
+
+	return 0;
+}
+
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+/* BB Move to ntlmssp.c eventually */
+
+/* We do not malloc the blob, it is passed in pbuffer, because
+   it is fixed size, and small, making this approach cleaner */
+static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
+					 struct cifsSesInfo *ses)
+{
+	NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
+	__u32 flags;
+
+	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
+	sec_blob->MessageType = NtLmNegotiate;
+
+	/* BB is NTLMV2 session security format easier to use here? */
+	flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
+		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
+		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+	if (ses->server->secMode &
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+		flags |= NTLMSSP_NEGOTIATE_SIGN;
+	if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
+		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+
+	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+
+	sec_blob->WorkstationName.BufferOffset = 0;
+	sec_blob->WorkstationName.Length = 0;
+	sec_blob->WorkstationName.MaximumLength = 0;
+
+	/* Domain name is sent on the Challenge not Negotiate NTLMSSP request */
+	sec_blob->DomainName.BufferOffset = 0;
+	sec_blob->DomainName.Length = 0;
+	sec_blob->DomainName.MaximumLength = 0;
+}
+
+/* We do not malloc the blob, it is passed in pbuffer, because its
+   maximum possible size is fixed and small, making this approach cleaner.
+   This function returns the length of the data in the blob */
+static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+				   struct cifsSesInfo *ses,
+				   const struct nls_table *nls_cp, int first)
+{
+	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
+	__u32 flags;
+	unsigned char *tmp;
+	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
+
+	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
+	sec_blob->MessageType = NtLmAuthenticate;
+
+	flags = NTLMSSP_NEGOTIATE_56 |
+		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
+		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
+		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+	if (ses->server->secMode &
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+		flags |= NTLMSSP_NEGOTIATE_SIGN;
+	if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
+		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+
+	tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
+	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+
+	sec_blob->LmChallengeResponse.BufferOffset =
+				cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
+	sec_blob->LmChallengeResponse.Length = 0;
+	sec_blob->LmChallengeResponse.MaximumLength = 0;
+
+	/* calculate session key, BB what about adding similar ntlmv2 path? */
+	SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
+	if (first)
+		cifs_calculate_mac_key(&ses->server->mac_signing_key,
+				       ntlm_session_key, ses->password);
+
+	memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
+	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
+	sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+	sec_blob->NtChallengeResponse.MaximumLength =
+				cpu_to_le16(CIFS_SESS_KEY_SIZE);
+
+	tmp += CIFS_SESS_KEY_SIZE;
+
+	if (ses->domainName == NULL) {
+		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->DomainName.Length = 0;
+		sec_blob->DomainName.MaximumLength = 0;
+		tmp += 2;
+	} else {
+		int len;
+		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
+				    MAX_USERNAME_SIZE, nls_cp);
+		len *= 2; /* unicode is 2 bytes each */
+		len += 2; /* trailing null */
+		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->DomainName.Length = cpu_to_le16(len);
+		sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
+		tmp += len;
+	}
+
+	if (ses->userName == NULL) {
+		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->UserName.Length = 0;
+		sec_blob->UserName.MaximumLength = 0;
+		tmp += 2;
+	} else {
+		int len;
+		len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
+				    MAX_USERNAME_SIZE, nls_cp);
+		len *= 2; /* unicode is 2 bytes each */
+		len += 2; /* trailing null */
+		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->UserName.Length = cpu_to_le16(len);
+		sec_blob->UserName.MaximumLength = cpu_to_le16(len);
+		tmp += len;
+	}
+
+	sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer);
+	sec_blob->WorkstationName.Length = 0;
+	sec_blob->WorkstationName.MaximumLength = 0;
+	tmp += 2;
+
+	sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+	sec_blob->SessionKey.Length = 0;
+	sec_blob->SessionKey.MaximumLength = 0;
+	return tmp - pbuffer;
+}
+
+
+static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
+				  struct cifsSesInfo *ses)
+{
+	build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
+	pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+
+	return;
+}
+
+static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
+				  struct cifsSesInfo *ses,
+				  const struct nls_table *nls, int first_time)
+{
+	int bloblen;
+
+	bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
+					  first_time);
+	pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
+
+	return bloblen;
+}
+#endif
+
 int
 CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	       const struct nls_table *nls_cp)
@@ -430,6 +576,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	__u16 action;
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
+	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 
 	if (ses == NULL)
 		return -EINVAL;
@@ -437,6 +584,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	type = ses->server->secType;
 
 	cFYI(1, ("sess setup type %d", type));
+ssetup_ntlmssp_authenticate:
+	if (phase == NtLmChallenge)
+		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
+
 	if (type == LANMAN) {
 #ifndef CONFIG_CIFS_WEAK_PW_HASH
 	/* LANMAN and plaintext are less secure and off by default.
@@ -650,9 +801,53 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
 	} else {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		if ((experimEnabled > 1) && (type == RawNTLMSSP)) {
+			if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+				cERROR(1, ("NTLMSSP requires Unicode support"));
+				rc = -ENOSYS;
+				goto ssetup_exit;
+			}
+
+			cFYI(1, ("ntlmssp session setup phase %d", phase));
+			pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+			capabilities |= CAP_EXTENDED_SECURITY;
+			pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+			if (phase == NtLmNegotiate) {
+				setup_ntlmssp_neg_req(pSMB, ses);
+				iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+			} else if (phase == NtLmAuthenticate) {
+				int blob_len;
+				blob_len = setup_ntlmssp_auth_req(pSMB, ses,
+								  nls_cp,
+								  first_time);
+				iov[1].iov_len = blob_len;
+				/* Make sure that we tell the server that we
+				   are using the uid that it just gave us back
+				   on the response (challenge) */
+				smb_buf->Uid = ses->Suid;
+			} else {
+				cERROR(1, ("invalid phase %d", phase));
+				rc = -ENOSYS;
+				goto ssetup_exit;
+			}
+			iov[1].iov_base = &pSMB->req.SecurityBlob[0];
+			/* unicode strings must be word aligned */
+			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+				*bcc_ptr = 0;
+				bcc_ptr++;
+			}
+			unicode_oslm_strings(&bcc_ptr, nls_cp);
+		} else {
+			cERROR(1, ("secType %d not supported!", type));
+			rc = -ENOSYS;
+			goto ssetup_exit;
+		}
+#else
 		cERROR(1, ("secType %d not supported!", type));
 		rc = -ENOSYS;
 		goto ssetup_exit;
+#endif
 	}
 
 	iov[2].iov_base = str_area;
@@ -668,12 +863,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	/* SMB request buf freed in SendReceive2 */
 
 	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
-	if (rc)
-		goto ssetup_exit;
 
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
 
+	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
+			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
+		if (phase != NtLmNegotiate) {
+			cERROR(1, ("Unexpected more processing error"));
+			goto ssetup_exit;
+		}
+		/* NTLMSSP Negotiate sent now processing challenge (response) */
+		phase = NtLmChallenge; /* process ntlmssp challenge */
+		rc = 0; /* MORE_PROC rc is not an error here, but expected */
+	}
+	if (rc)
+		goto ssetup_exit;
+
 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
 		cERROR(1, ("bad word count %d", smb_buf->WordCount));
@@ -692,22 +898,33 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	if (smb_buf->WordCount == 4) {
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
-		bcc_ptr += blob_len;
 		if (blob_len > bytes_remaining) {
 			cERROR(1, ("bad security blob length %d", blob_len));
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
+		if (phase == NtLmChallenge) {
+			rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
+			/* now goto beginning for ntlmssp authenticate phase */
+			if (rc)
+				goto ssetup_exit;
+		}
+		bcc_ptr += blob_len;
 		bytes_remaining -= blob_len;
 	}
 
 	/* BB check if Unicode and decode strings */
-	if (smb_buf->Flags2 & SMBFLG2_UNICODE)
-		rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining,
-					   ses, nls_cp);
-	else
+	if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+		/* unicode string area must be word-aligned */
+		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+			++bcc_ptr;
+			--bytes_remaining;
+		}
+		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
+	} else {
 		rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
 					 ses, nls_cp);
+	}
 
 ssetup_exit:
 	if (spnego_key) {
@@ -721,5 +938,9 @@ ssetup_exit:
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
 
+	/* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
+	if ((phase == NtLmChallenge) && (rc == 0))
+		goto ssetup_ntlmssp_authenticate;
+
 	return rc;
 }
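[Editor's note] The control flow added to CIFS_SessSetup() is a small state machine: send Negotiate, treat NT_STATUS_MORE_PROCESSING_REQUIRED as the expected carrier of the Challenge, then loop back exactly once to send Authenticate. A toy userspace model of that loop (the transport stub and the status constant are hypothetical stand-ins):

	#include <stdio.h>

	enum phase { NtLmNegotiate = 1, NtLmChallenge = 2, NtLmAuthenticate = 3 };

	#define MORE_PROCESSING_REQUIRED 22	/* stand-in, not the real code */

	/* Hypothetical transport: the first round returns "more processing
	 * required" with the server challenge; the second round succeeds. */
	static int send_setup(enum phase p)
	{
		return (p == NtLmNegotiate) ? MORE_PROCESSING_REQUIRED : 0;
	}

	int main(void)
	{
		enum phase phase = NtLmNegotiate;
		int rc;

	again:
		rc = send_setup(phase);
		if (phase == NtLmNegotiate && rc == MORE_PROCESSING_REQUIRED) {
			phase = NtLmChallenge;	/* decode the challenge blob here */
			rc = 0;			/* expected, not an error */
		}
		if (rc == 0 && phase == NtLmChallenge) {
			phase = NtLmAuthenticate;
			goto again;		/* one extra round, then done */
		}
		printf("session setup %s\n", rc ? "failed" : "complete");
		return rc;
	}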
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index 7f50e8577c1c..c5084d27db7c 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -110,6 +110,7 @@
 
 /* Below errors are used internally (do not come over the wire) for passthrough
    from STATUS codes to POSIX only */
+#define ERRsymlink 0xFFFD
 #define ErrTooManyLinks 0xFFFE
 
 /* Following error codes may be generated with the ERRSRV error class.*/
diff --git a/fs/compat.c b/fs/compat.c
index 1c859dae758f..681ed81e6be0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename,
 			      struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
-	return error;
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 
 asmlinkage long compat_sys_newlstat(char __user * filename,
 			      struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
-	return error;
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 
 #ifndef __ARCH_WANT_STAT64
@@ -204,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 #endif
 
@@ -1236,7 +1229,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
 
 asmlinkage ssize_t
 compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
-		  unsigned long vlen, u32 pos_high, u32 pos_low)
+		  unsigned long vlen, u32 pos_low, u32 pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	struct file *file;
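[Editor's note] The compat fix above only swaps the declared order of pos_low/pos_high so it matches the order 32-bit userspace actually passes the halves in; the 64-bit offset is then rebuilt as shown in the unchanged line. A standalone illustration of that reassembly:

	#include <stdint.h>
	#include <stdio.h>

	static int64_t make_pos(uint32_t pos_low, uint32_t pos_high)
	{
		return ((int64_t)pos_high << 32) | pos_low;
	}

	int main(void)
	{
		/* a 6 GiB offset splits into high word 1, low word 0x80000000 */
		printf("pos = %lld\n", (long long)make_pos(0x80000000u, 1));
		return 0;
	}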
@@ -1293,7 +1286,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
 
 asmlinkage ssize_t
 compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
-		   unsigned long vlen, u32 pos_high, u32 pos_low)
+		   unsigned long vlen, u32 pos_low, u32 pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	struct file *file;
@@ -1483,6 +1476,7 @@ int compat_do_execve(char * filename,
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
+	bool clear_in_exec;
 	int retval;
 
 	retval = unshare_files(&displaced);
@@ -1505,8 +1499,9 @@ int compat_do_execve(char * filename,
 		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
-	if (retval)
+	if (retval < 0)
 		goto out_unlock;
+	clear_in_exec = retval;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1553,9 +1548,7 @@ int compat_do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
-	write_lock(&current->fs->lock);
 	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
@@ -1575,9 +1568,8 @@ out_file:
 	}
 
 out_unmark:
-	write_lock(&current->fs->lock);
-	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
+	if (clear_in_exec)
+		current->fs->in_exec = 0;
 
 out_unlock:
 	current->in_execve = 0;
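[Editor's note] The check_unsafe_exec() change turns its result into a tri-state: negative means error, otherwise it reports whether this caller set fs->in_exec and must therefore clear it on the failure path. A compact model of that convention (all names here are illustrative stubs):

	#include <stdio.h>

	/* returns <0 on error, else 1 if the caller now owns fs->in_exec */
	static int check_unsafe_exec_stub(int ok, int took_in_exec)
	{
		return ok ? took_in_exec : -1;
	}

	int main(void)
	{
		int in_exec = 1;	/* pretend the check set the flag */
		int retval = check_unsafe_exec_stub(1, 1);

		if (retval < 0)
			return 1;
		int clear_in_exec = retval;

		/* ... exec fails somewhere later ... */
		if (clear_in_exec)
			in_exec = 0;	/* only the setter clears the flag */
		printf("in_exec = %d\n", in_exec);
		return 0;
	}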
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3e87ce443ea2..b83f6bcfa51a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,7 +58,6 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
-#include <linux/loop.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
@@ -68,6 +67,7 @@
 #include <linux/gigaset_dev.h>
 
 #ifdef CONFIG_BLOCK
+#include <linux/loop.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/sg.h>
@@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
 #ifdef CONFIG_BLOCK
+/* loop */
+IGNORE_IOCTL(LOOP_CLR_FD)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
@@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
 IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
 IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
 
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
-
 #ifdef CONFIG_SPARC
 /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
 IGNORE_IOCTL(FBIOGTYPE)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 932a92b31483..c8afa6b1d91d 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 	struct path path;
 	struct configfs_dirent *sd;
 	struct config_item *parent_item;
-	struct config_item *target_item;
+	struct config_item *target_item = NULL;
 	struct config_item_type *type;
 
 	ret = -EPERM;  /* What lack-of-symlink returns */
diff --git a/fs/dcache.c b/fs/dcache.c
index 761d30be2683..75659a6fd1f8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -481,7 +481,7 @@ restart:
481 if ((flags & DCACHE_REFERENCED) 481 if ((flags & DCACHE_REFERENCED)
482 && (dentry->d_flags & DCACHE_REFERENCED)) { 482 && (dentry->d_flags & DCACHE_REFERENCED)) {
483 dentry->d_flags &= ~DCACHE_REFERENCED; 483 dentry->d_flags &= ~DCACHE_REFERENCED;
484 list_move_tail(&dentry->d_lru, &referenced); 484 list_move(&dentry->d_lru, &referenced);
485 spin_unlock(&dentry->d_lock); 485 spin_unlock(&dentry->d_lock);
486 } else { 486 } else {
487 list_move_tail(&dentry->d_lru, &tmp); 487 list_move_tail(&dentry->d_lru, &tmp);
@@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2149 int result; 2149 int result;
2150 unsigned long seq; 2150 unsigned long seq;
2151 2151
2152 /* FIXME: This is old behavior, needed? Please check callers. */
2153 if (new_dentry == old_dentry) 2152 if (new_dentry == old_dentry)
2154 return 1; 2153 return 1;
2155 2154
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
+static bool debugfs_registered;
 
 static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 {
@@ -496,6 +497,16 @@ exit:
 }
 EXPORT_SYMBOL_GPL(debugfs_rename);
 
+/**
+ * debugfs_initialized - Tells whether debugfs has been registered
+ */
+bool debugfs_initialized(void)
+{
+	return debugfs_registered;
+}
+EXPORT_SYMBOL_GPL(debugfs_initialized);
+
+
 static struct kobject *debug_kobj;
 
 static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
 	retval = register_filesystem(&debug_fs_type);
 	if (retval)
 		kobject_put(debug_kobj);
+	else
+		debugfs_registered = true;
+
 	return retval;
 }
 
 static void __exit debugfs_exit(void)
 {
+	debugfs_registered = false;
+
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	unregister_filesystem(&debug_fs_type);
 	kobject_put(debug_kobj);
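[Editor's note] debugfs_initialized() gives other subsystems a cheap way to skip debugfs setup when the filesystem never registered. A userspace model of the guard (illustrative only):

	#include <stdbool.h>
	#include <stdio.h>

	static bool debugfs_registered;

	static bool debugfs_initialized(void)
	{
		return debugfs_registered;
	}

	int main(void)
	{
		if (!debugfs_initialized())
			printf("debugfs absent, skipping instrumentation\n");

		debugfs_registered = true;	/* as debugfs_init() now does */
		if (debugfs_initialized())
			printf("safe to create debugfs entries\n");
		return 0;
	}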
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 63a4a59e4148..c68edb969441 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -90,6 +90,15 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 #define PARSE_MOUNT	0
 #define PARSE_REMOUNT	1
 
+/*
+ * parse_mount_options():
+ *	Set @opts to mount options specified in @data. If an option is not
+ *	specified in @data, set it to its default value. The exception is
+ *	'newinstance' option which can only be set/cleared on a mount (i.e.
+ *	cannot be changed during remount).
+ *
+ * Note: @data may be NULL (in which case all options are set to default).
+ */
 static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
 {
 	char *p;
@@ -355,12 +364,9 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 	struct pts_mount_opts opts;
 	struct super_block *s;
 
-	memset(&opts, 0, sizeof(opts));
-	if (data) {
-		error = parse_mount_options(data, PARSE_MOUNT, &opts);
-		if (error)
-			return error;
-	}
+	error = parse_mount_options(data, PARSE_MOUNT, &opts);
+	if (error)
+		return error;
 
 	if (opts.newinstance)
 		s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -389,11 +395,10 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 	return 0;
 
 out_dput:
-	dput(s->s_root);
+	dput(s->s_root);	/* undo dget() in simple_set_mnt() */
 
 out_undo_sget:
-	up_write(&s->s_umount);
-	deactivate_super(s);
+	deactivate_locked_super(s);
 	return error;
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..05763bbc2050 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 	struct bio *bio;
 
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
-	if (bio == NULL)
-		return -ENOMEM;
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = first_sector;
@@ -1126,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_SYNC;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 8b65f289ee00..b91851f1cda3 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page)
 	ecryptfs_inode = page->mapping->host;
 	crypt_stat =
 		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
-	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page,
-						       0, PAGE_CACHE_SIZE);
-		if (rc)
-			printk(KERN_ERR "%s: Error attempting to copy "
-			       "page at index [%ld]\n", __func__,
-			       page->index);
-		goto out;
-	}
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 	enc_extent_page = alloc_page(GFP_USER);
 	if (!enc_extent_page) {
 		rc = -ENOMEM;
@@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page)
 	ecryptfs_inode = page->mapping->host;
 	crypt_stat =
 		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
-	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-		rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
-						      PAGE_CACHE_SIZE,
-						      ecryptfs_inode);
-		if (rc)
-			printk(KERN_ERR "%s: Error attempting to copy "
-			       "page at index [%ld]\n", __func__,
-			       page->index);
-		goto out;
-	}
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 	enc_extent_page = alloc_page(GFP_USER);
 	if (!enc_extent_page) {
 		rc = -ENOMEM;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 064c5820e4e5..00b30a2d5466 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat {
 #define ECRYPTFS_ENCRYPT_FILENAMES    0x00000800
 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
 #define ECRYPTFS_ENCFN_USE_FEK        0x00002000
+#define ECRYPTFS_UNLINK_SIGS          0x00004000
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 55b3145b8072..2f0945d63297 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
379 goto out_d_drop; 379 goto out_d_drop;
380 } 380 }
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
 	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
 				      lower_dir_dentry,
 				      ecryptfs_dentry->d_name.len);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out_d_drop;
 	}
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
 	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
 				      lower_dir_dentry,
 				      encrypted_and_encoded_name_size - 1);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -636,8 +640,9 @@ static int
 ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 {
 	char *lower_buf;
+	size_t lower_bufsiz;
 	struct dentry *lower_dentry;
-	struct ecryptfs_crypt_stat *crypt_stat;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
 	char *plaintext_name;
 	size_t plaintext_name_size;
 	mm_segment_t old_fs;
@@ -648,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 		rc = -EINVAL;
 		goto out;
 	}
-	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						dentry->d_sb)->mount_crypt_stat;
+	/*
+	 * If the lower filename is encrypted, it will result in a significantly
+	 * longer name. If needed, truncate the name after decode and decrypt.
+	 */
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+		lower_bufsiz = PATH_MAX;
+	else
+		lower_bufsiz = bufsiz;
 	/* Released in this function */
-	lower_buf = kmalloc(bufsiz, GFP_KERNEL);
+	lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
 	if (lower_buf == NULL) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%d] bytes\n", __func__, bufsiz);
+		       "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -661,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 	set_fs(get_ds());
 	rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
 						   (char __user *)lower_buf,
-						   bufsiz);
+						   lower_bufsiz);
 	set_fs(old_fs);
 	if (rc >= 0) {
 		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
@@ -674,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 				rc);
 			goto out_free_lower_buf;
 		}
-		rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
+		/* Check for bufsiz <= 0 done in sys_readlinkat() */
+		rc = copy_to_user(buf, plaintext_name,
+				  min((size_t) bufsiz, plaintext_name_size));
 		if (rc)
 			rc = -EFAULT;
 		else
@@ -814,6 +830,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 		size_t num_zeros = (PAGE_CACHE_SIZE
 				    - (new_length & ~PAGE_CACHE_MASK));
 
+		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+			rc = vmtruncate(inode, new_length);
+			if (rc)
+				goto out_free;
+			rc = vmtruncate(lower_dentry->d_inode, new_length);
+			goto out_free;
+		}
 		if (num_zeros) {
 			char *zeros_virt;
 
@@ -915,8 +938,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 			}
 			rc = 0;
 			crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
-			mutex_unlock(&crypt_stat->cs_mutex);
-			goto out;
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
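
Editor's note: the ecryptfs_readlink() hunks above allocate a PATH_MAX lower buffer because an encoded-and-encrypted lower name can be much longer than the caller's buffer, then clamp the final copy_to_user() to the caller's bufsiz; readlink(2) permits silent truncation. A minimal userspace sketch of the same clamp, under the stated assumptions (the helper name is hypothetical, not from the patch):

#include <stdio.h>
#include <string.h>

/* Hypothetical helper mirroring min((size_t)bufsiz, plaintext_name_size)
 * from the hunk above: never copy past the caller's buffer. */
static size_t copy_link_target(char *dst, size_t bufsiz,
                               const char *name, size_t name_size)
{
        size_t n = bufsiz < name_size ? bufsiz : name_size;

        memcpy(dst, name, n);   /* readlink() output is not NUL-terminated */
        return n;
}

int main(void)
{
        char buf[8];
        size_t n = copy_link_target(buf, sizeof(buf),
                                    "a-rather-long-decrypted-name", 28);

        printf("copied %zu bytes: %.*s\n", n, (int)n, buf);
        return 0;
}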
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index aed56c25539b..9f0aa9883c28 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 		init_special_inode(inode, lower_inode->i_mode,
 				   lower_inode->i_rdev);
 	dentry->d_op = &ecryptfs_dops;
-	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
-		d_add(dentry, inode);
-	else
-		d_instantiate(dentry, inode);
 	fsstack_copy_attr_all(inode, lower_inode, NULL);
 	/* This size will be overwritten for real files w/ headers and
 	 * other metadata */
 	fsstack_copy_inode_size(inode, lower_inode);
+	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
+		d_add(dentry, inode);
+	else
+		d_instantiate(dentry, inode);
 out:
 	return rc;
 }
@@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
        ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
        ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
-       ecryptfs_opt_err };
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
 
 static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
@@ -222,6 +222,7 @@ static const match_table_t tokens = {
 	{ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
 	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
 	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
+	{ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
 	{ecryptfs_opt_err, NULL}
 };
 
@@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 				fn_cipher_key_bytes;
 			fn_cipher_key_bytes_set = 1;
 			break;
+		case ecryptfs_opt_unlink_sigs:
+			mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
+			break;
 		case ecryptfs_opt_err:
 		default:
 			printk(KERN_WARNING
@@ -610,9 +614,8 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 	goto out;
out_abort:
-	dput(sb->s_root);
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
+	dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
+	deactivate_locked_super(sb);
out:
 	return rc;
 }
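
Editor's note: the new ecryptfs_unlink_sigs option above follows the usual three-step mount-option pattern: add an enum token, a match_table_t entry, and a switch case that sets a flag. A toy userspace analogue of the flag-setting step, as a sketch only (the flag value is illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define ECRYPTFS_UNLINK_SIGS 0x1    /* illustrative value only */

/* Toy stand-in for ecryptfs_parse_options(): walk a comma-separated
 * option string and set a flag when the bare token is present. */
static unsigned parse_flags(char *options)
{
        unsigned flags = 0;
        char *p;

        for (p = strtok(options, ","); p; p = strtok(NULL, ","))
                if (strcmp(p, "ecryptfs_unlink_sigs") == 0)
                        flags |= ECRYPTFS_UNLINK_SIGS;
        return flags;
}

int main(void)
{
        char opts[] = "sig=deadbeef,ecryptfs_unlink_sigs";

        printf("flags = %#x\n", parse_flags(opts));
        return 0;
}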
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 295e7fa56755..f1c17e87c5fb 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -133,45 +133,6 @@ out:
 	return rc;
 }
 
-static int
-ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
-			     struct ecryptfs_msg_ctx **msg_ctx);
-
-/**
- * ecryptfs_send_raw_message
- * @msg_type: Message type
- * @daemon: Daemon struct for recipient of message
- *
- * A raw message is one that does not include an ecryptfs_message
- * struct. It simply has a type.
- *
- * Must be called with ecryptfs_daemon_hash_mux held.
- *
- * Returns zero on success; non-zero otherwise
- */
-static int ecryptfs_send_raw_message(u8 msg_type,
-				     struct ecryptfs_daemon *daemon)
-{
-	struct ecryptfs_msg_ctx *msg_ctx;
-	int rc;
-
-	rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
-	if (rc) {
-		printk(KERN_ERR "%s: Error whilst attempting to send "
-		       "message to ecryptfsd; rc = [%d]\n", __func__, rc);
-		goto out;
-	}
-	/* Raw messages are logically context-free (e.g., no
-	 * reply is expected), so we set the state of the
-	 * ecryptfs_msg_ctx object to indicate that it should
-	 * be freed as soon as the message is sent. */
-	mutex_lock(&msg_ctx->mux);
-	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
-	mutex_unlock(&msg_ctx->mux);
-out:
-	return rc;
-}
-
 /**
  * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
  * @daemon: Pointer to set to newly allocated daemon struct
@@ -212,49 +173,6 @@ out:
 }
 
 /**
- * ecryptfs_process_helo
- * @euid: The user ID owner of the message
- * @user_ns: The namespace in which @euid applies
- * @pid: The process ID for the userspace program that sent the
- *       message
- *
- * Adds the euid and pid values to the daemon euid hash. If an euid
- * already has a daemon pid registered, the daemon will be
- * unregistered before the new daemon is put into the hash list.
- * Returns zero after adding a new daemon to the hash list;
- * non-zero otherwise.
- */
-int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
-			  struct pid *pid)
-{
-	struct ecryptfs_daemon *new_daemon;
-	struct ecryptfs_daemon *old_daemon;
-	int rc;
-
-	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns);
-	if (rc != 0) {
-		printk(KERN_WARNING "Received request from user [%d] "
-		       "to register daemon [0x%p]; unregistering daemon "
-		       "[0x%p]\n", euid, pid, old_daemon->pid);
-		rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
-		if (rc)
-			printk(KERN_WARNING "Failed to send QUIT "
-			       "message to daemon [0x%p]; rc = [%d]\n",
-			       old_daemon->pid, rc);
-		hlist_del(&old_daemon->euid_chain);
-		kfree(old_daemon);
-	}
-	rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid);
-	if (rc)
-		printk(KERN_ERR "%s: The gods are displeased with this attempt "
-		       "to create a new daemon object for euid [%d]; pid "
-		       "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc);
-	mutex_unlock(&ecryptfs_daemon_hash_mux);
-	return rc;
-}
-
-/**
  * ecryptfs_exorcise_daemon - Destroy the daemon struct
  *
  * Must be called ceremoniously while in possession of
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a67fea655f49..4ec8f61ccf5a 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 	int rc = 0;
 
 	mutex_lock(&msg_ctx->mux);
-	if (data) {
-		msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
-				       GFP_KERNEL);
-		if (!msg_ctx->msg) {
-			rc = -ENOMEM;
-			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
-			       (sizeof(*msg_ctx->msg) + data_size));
-			goto out_unlock;
-		}
-	} else
-		msg_ctx->msg = NULL;
+	msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
+			       GFP_KERNEL);
+	if (!msg_ctx->msg) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Out of memory whilst attempting "
+		       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
+		       (sizeof(*msg_ctx->msg) + data_size));
+		goto out_unlock;
+	}
 	msg_ctx->msg->index = msg_ctx->index;
 	msg_ctx->msg->data_len = data_size;
 	msg_ctx->type = msg_type;
-	if (data) {
-		memcpy(msg_ctx->msg->data, data, data_size);
-		msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
-	} else
-		msg_ctx->msg_size = 0;
+	memcpy(msg_ctx->msg->data, data, data_size);
+	msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
 	mutex_lock(&daemon->mux);
 	list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
 	daemon->num_queued_msg_ctx++;
@@ -418,18 +412,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 
 	if (count == 0)
 		goto out;
-	data = kmalloc(count, GFP_KERNEL);
-	if (!data) {
-		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
+
+	data = memdup_user(buf, count);
+	if (IS_ERR(data)) {
+		printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
+		       __func__, PTR_ERR(data));
 		goto out;
 	}
-	rc = copy_from_user(data, buf, count);
-	if (rc) {
-		printk(KERN_ERR "%s: copy_from_user returned error [%d]\n",
-		       __func__, rc);
-		goto out_free;
-	}
 	sz = count;
 	i = 0;
 	switch (data[i++]) {
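
Editor's note: the ecryptfs_miscdev_write() hunk replaces an open-coded kmalloc() + copy_from_user() pair with memdup_user(), which allocates and copies in one step and returns an ERR_PTR on failure. A userspace analogue of the same shape, as a sketch (errno conventions stand in for ERR_PTR):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace analogue of memdup_user(): allocate-and-copy in one step.
 * In the kernel the helper also validates the user pointer while copying. */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (!p) {
                errno = ENOMEM;
                return NULL;
        }
        memcpy(p, src, len);
        return p;
}

int main(void)
{
        const char msg[] = "miscdev payload";
        char *copy = memdup(msg, sizeof(msg));

        if (!copy)
                return 1;
        puts(copy);
        free(copy);
        return 0;
}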
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 46cec2b69796..5c6bab9786e3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
 	struct ecryptfs_crypt_stat *crypt_stat;
 
 	crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
 		return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode);
 	else
@@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file,
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
 			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
+						       to);
+		if (!rc) {
+			rc = copied;
+			fsstack_copy_inode_size(ecryptfs_inode,
+				ecryptfs_inode_to_lower(ecryptfs_inode));
+		}
+		goto out;
+	}
 	/* Fills in zeros if 'to' goes beyond inode size */
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 75c2ea9fee35..a137c6ea2fee 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		   size_t size)
 {
 	struct page *ecryptfs_page;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	char *ecryptfs_page_virt;
-	loff_t ecryptfs_file_size =
-		i_size_read(ecryptfs_file->f_dentry->d_inode);
+	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
 	loff_t pos;
 	int rc = 0;
 
+	crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
 	/*
 	 * if we are writing beyond current size, then start pos
 	 * at the current size - we'll fill in zeros from there.
@@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		flush_dcache_page(ecryptfs_page);
 		SetPageUptodate(ecryptfs_page);
 		unlock_page(ecryptfs_page);
-		rc = ecryptfs_encrypt_page(ecryptfs_page);
+		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
+			rc = ecryptfs_encrypt_page(ecryptfs_page);
+		else
+			rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
+						ecryptfs_page,
+						start_offset_in_page,
+						data_offset);
 		page_cache_release(ecryptfs_page);
 		if (rc) {
 			printk(KERN_ERR "%s: Error encrypting "
@@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		pos += num_bytes;
 	}
 	if ((offset + size) > ecryptfs_file_size) {
-		i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size));
-		rc = ecryptfs_write_inode_size_to_metadata(
-			ecryptfs_file->f_dentry->d_inode);
-		if (rc) {
-			printk(KERN_ERR "Problem with "
-			       "ecryptfs_write_inode_size_to_metadata; "
-			       "rc = [%d]\n", rc);
-			goto out;
+		i_size_write(ecryptfs_inode, (offset + size));
+		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
+			rc = ecryptfs_write_inode_size_to_metadata(
+								ecryptfs_inode);
+			if (rc) {
+				printk(KERN_ERR "Problem with "
+				       "ecryptfs_write_inode_size_to_metadata; "
+				       "rc = [%d]\n", rc);
+				goto out;
+			}
 		}
 	}
out:
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index c27ac2b358a1..fa4c7e7d15d9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	list_for_each_entry(walker,
 			    &mount_crypt_stat->global_auth_tok_list,
 			    mount_crypt_stat_list) {
-		seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
+		if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK)
+			seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig);
+		else
+			seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
 	}
 	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
 
@@ -186,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_printf(m, ",ecryptfs_xattr_metadata");
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
 		seq_printf(m, ",ecryptfs_encrypted_view");
+	if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
+		seq_printf(m, ",ecryptfs_unlink_sigs");
 
 	return 0;
 }
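
Editor's note: the show_options change keeps /proc/mounts round-trippable: every active feature is emitted as a ",name" or ",name=value" fragment that the option parser will accept back. A toy userspace version of the same emit logic, as a sketch with illustrative flag values:

#include <stdio.h>

#define ECRYPTFS_UNLINK_SIGS   0x1   /* illustrative values only */
#define ECRYPTFS_AUTH_TOK_FNEK 0x2

static void show_options(FILE *m, unsigned flags, const char *sig)
{
        if (flags & ECRYPTFS_AUTH_TOK_FNEK)
                fprintf(m, ",ecryptfs_fnek_sig=%s", sig);
        else
                fprintf(m, ",ecryptfs_sig=%s", sig);
        if (flags & ECRYPTFS_UNLINK_SIGS)
                fprintf(m, ",ecryptfs_unlink_sigs");
}

int main(void)
{
        show_options(stdout, ECRYPTFS_AUTH_TOK_FNEK | ECRYPTFS_UNLINK_SIGS,
                     "0123456789abcdef");
        putchar('\n');
        return 0;
}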
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a89f370fadb5..5458e80fc558 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1212,7 +1212,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 
 SYSCALL_DEFINE1(epoll_create, int, size)
 {
-	if (size < 0)
+	if (size <= 0)
 		return -EINVAL;
 
 	return sys_epoll_create1(0);
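
Editor's note: with the change above, a size of zero is rejected like a negative one; the value is otherwise only a historical hint and is ignored. Quick userspace check (Linux only):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>

int main(void)
{
        int fd = epoll_create(0);

        if (fd < 0)
                printf("epoll_create(0): %s (expected EINVAL)\n",
                       strerror(errno));
        return 0;
}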
diff --git a/fs/exec.c b/fs/exec.c
index 052a961e41aa..895823d0149d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -69,17 +69,18 @@ int suid_dumpable = 0;
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
 
-int register_binfmt(struct linux_binfmt * fmt)
+int __register_binfmt(struct linux_binfmt * fmt, int insert)
 {
 	if (!fmt)
 		return -EINVAL;
 	write_lock(&binfmt_lock);
-	list_add(&fmt->lh, &formats);
+	insert ? list_add(&fmt->lh, &formats) :
+		 list_add_tail(&fmt->lh, &formats);
 	write_unlock(&binfmt_lock);
 	return 0;
 }
 
-EXPORT_SYMBOL(register_binfmt);
+EXPORT_SYMBOL(__register_binfmt);
 
 void unregister_binfmt(struct linux_binfmt * fmt)
 {
@@ -104,40 +105,28 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
SYSCALL_DEFINE1(uselib, const char __user *, library)
 {
 	struct file *file;
-	struct nameidata nd;
 	char *tmp = getname(library);
 	int error = PTR_ERR(tmp);
 
-	if (!IS_ERR(tmp)) {
-		error = path_lookup_open(AT_FDCWD, tmp,
-					 LOOKUP_FOLLOW, &nd,
-					 FMODE_READ|FMODE_EXEC);
-		putname(tmp);
-	}
-	if (error)
+	if (IS_ERR(tmp))
+		goto out;
+
+	file = do_filp_open(AT_FDCWD, tmp,
+				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				MAY_READ | MAY_EXEC | MAY_OPEN);
+	putname(tmp);
+	error = PTR_ERR(file);
+	if (IS_ERR(file))
 		goto out;
 
 	error = -EINVAL;
-	if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
+	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
 		goto exit;
 
 	error = -EACCES;
-	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
+	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	error = inode_permission(nd.path.dentry->d_inode,
-				 MAY_READ | MAY_EXEC | MAY_OPEN);
-	if (error)
-		goto exit;
-	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
-	if (error)
-		goto exit;
-
-	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
-	error = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-
 	fsnotify_open(file->f_path.dentry);
 
 	error = -ENOEXEC;
@@ -159,13 +148,10 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		}
 		read_unlock(&binfmt_lock);
 	}
+exit:
 	fput(file);
out:
 	return error;
-exit:
-	release_open_intent(&nd);
-	path_put(&nd.path);
-	goto out;
 }
 
#ifdef CONFIG_MMU
@@ -660,47 +646,33 @@ EXPORT_SYMBOL(setup_arg_pages);
 
struct file *open_exec(const char *name)
 {
-	struct nameidata nd;
 	struct file *file;
 	int err;
 
-	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd,
-				FMODE_READ|FMODE_EXEC);
-	if (err)
+	file = do_filp_open(AT_FDCWD, name,
+				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				MAY_EXEC | MAY_OPEN);
+	if (IS_ERR(file))
 		goto out;
 
 	err = -EACCES;
-	if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
-		goto out_path_put;
-
-	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
-		goto out_path_put;
-
-	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
-	if (err)
-		goto out_path_put;
-	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
-	if (err)
-		goto out_path_put;
+	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
+		goto exit;
 
-	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
-	if (IS_ERR(file))
-		return file;
+	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+		goto exit;
 
 	fsnotify_open(file->f_path.dentry);
 
 	err = deny_write_access(file);
-	if (err) {
-		fput(file);
-		goto out;
-	}
+	if (err)
+		goto exit;
 
+out:
 	return file;
 
- out_path_put:
-	release_open_intent(&nd);
-	path_put(&nd.path);
- out:
+exit:
+	fput(file);
 	return ERR_PTR(err);
 }
EXPORT_SYMBOL(open_exec);
@@ -1060,7 +1032,6 @@ EXPORT_SYMBOL(install_exec_creds);
int check_unsafe_exec(struct linux_binprm *bprm)
 {
 	struct task_struct *p = current, *t;
-	unsigned long flags;
 	unsigned n_fs;
 	int res = 0;
 
@@ -1068,21 +1039,22 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 
 	n_fs = 1;
 	write_lock(&p->fs->lock);
-	lock_task_sighand(p, &flags);
+	rcu_read_lock();
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
 			n_fs++;
 	}
+	rcu_read_unlock();
 
 	if (p->fs->users > n_fs) {
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	} else {
-		if (p->fs->in_exec)
-			res = -EAGAIN;
-		p->fs->in_exec = 1;
+		res = -EAGAIN;
+		if (!p->fs->in_exec) {
+			p->fs->in_exec = 1;
+			res = 1;
+		}
 	}
-
-	unlock_task_sighand(p, &flags);
 	write_unlock(&p->fs->lock);
 
 	return res;
@@ -1284,6 +1256,7 @@ int do_execve(char * filename,
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
+	bool clear_in_exec;
 	int retval;
 
 	retval = unshare_files(&displaced);
@@ -1306,8 +1279,9 @@ int do_execve(char * filename,
 		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
-	if (retval)
+	if (retval < 0)
 		goto out_unlock;
+	clear_in_exec = retval;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1355,9 +1329,7 @@ int do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
-	write_lock(&current->fs->lock);
 	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
@@ -1377,9 +1349,8 @@ out_file:
 	}
 
out_unmark:
-	write_lock(&current->fs->lock);
-	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
+	if (clear_in_exec)
+		current->fs->in_exec = 0;
 
out_unlock:
 	current->in_execve = 0;
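
Editor's note: the check_unsafe_exec() changes above turn the return value into a tri-state: negative means error, 0 means the fs struct is shared and in_exec was not taken, 1 means in_exec was taken and the caller must clear it on failure paths. A compilable sketch of that caller contract, under the stated assumptions (the stub and its parameters are illustrative, not kernel code):

#include <stdio.h>

/* Illustrative stand-in for check_unsafe_exec():
 *   < 0 -> error; 0 -> shared fs, do not clear in_exec; 1 -> we own in_exec */
static int check_unsafe_exec_stub(int fs_shared, int already_in_exec)
{
        if (fs_shared)
                return 0;
        if (already_in_exec)
                return -11;     /* -EAGAIN */
        return 1;               /* we set in_exec */
}

int main(void)
{
        int retval = check_unsafe_exec_stub(0, 0);
        int clear_in_exec;

        if (retval < 0)
                return 1;
        clear_in_exec = retval; /* remember whether we own in_exec */
        printf("clear_in_exec = %d\n", clear_in_exec);
        return 0;
}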
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b95563663..acf678831103 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
 
 	if (depth == 0)
 		return (err);
-reread:
-	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
 
+	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
 		while (count < maxblocks && count <= blocks_to_boundary) {
 			ext2_fsblk_t blk;
 
-			if (!verify_chain(chain, partial)) {
+			if (!verify_chain(chain, chain + depth - 1)) {
 				/*
 				 * Indirect block might be removed by
 				 * truncate while we were reading it.
 				 * Handling of that case: forget what we've
 				 * got now, go to reread.
 				 */
+				err = -EAGAIN;
 				count = 0;
-				goto changed;
+				break;
 			}
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
 			if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
 			else
 				break;
 		}
-		goto got_it;
+		if (err != -EAGAIN)
+			goto got_it;
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
 		goto cleanup;
 
 	mutex_lock(&ei->truncate_mutex);
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain(ext3_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab the semaphore,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
 
 	/*
 	 * Okay, we need to do block allocation. Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
 		partial--;
 	}
 	return err;
-changed:
-	while (partial > chain) {
-		brelse(partial->bh);
-		partial--;
-	}
-	goto reread;
 }
 
int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
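
Editor's note: the ext2_get_blocks() rework above replaces an unbounded goto-reread retry with the optimistic-read pattern: read the indirect chain locklessly, and if it went stale, re-read it once after taking truncate_mutex, which excludes concurrent truncates. A compilable sketch of that shape, under the stated assumptions (all names here are illustrative, not the ext2 code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t truncate_mutex = PTHREAD_MUTEX_INITIALIZER;
static int generation;

static int read_chain(int *snap) { *snap = generation; return generation; }
static int chain_is_stale(int snap) { return snap != generation; }

int main(void)
{
        int snap, val = read_chain(&snap);   /* optimistic, lockless read */

        pthread_mutex_lock(&truncate_mutex);
        if (chain_is_stale(snap))
                val = read_chain(&snap);     /* re-grab the current copy */
        /* ... allocation would proceed here with a stable chain ... */
        pthread_mutex_unlock(&truncate_mutex);
        printf("val = %d\n", val);
        return 0;
}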
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f983225266dc..5c4afe652245 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 466a332e0bd1..fcfa24361856 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page,
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
-		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
-		return block_write_full_page(page, NULL, wbc);
+		page_bufs = page_buffers(page);
+	} else {
+		page_bufs = page_buffers(page);
+		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+				       NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
 	}
-	page_bufs = page_buffers(page);
-
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (page_has_buffers(page)) {
+		if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
#include "acl.h"
#include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
			     unsigned long journal_devnum);
static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			   cope, else JOURNAL_DATA */
 			if (journal_check_available_features
 			    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-				set_opt(sbi->s_mount_opt, ORDERED_DATA);
+				set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 			else
 				set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 			break;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ac77d8b8251d..e3a55eb8b26a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,11 +326,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext);
+	ext4_fsblk_t block = ext_pblock(ext), valid_block;
 	int len = ext4_ext_get_actual_len(ext);
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-			((block + len) > ext4_blocks_count(es))))
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     ((block + len) > ext4_blocks_count(es))))
 		return 0;
 	else
 		return 1;
@@ -339,10 +342,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx);
+	ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-			(block > ext4_blocks_count(es))))
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     (block >= ext4_blocks_count(es))))
 		return 0;
 	else
 		return 1;
@@ -1835,11 +1841,13 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
 {
 	struct ext4_ext_cache *cex;
 	BUG_ON(len == 0);
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
 	cex->ec_type = type;
 	cex->ec_block = block;
 	cex->ec_len = len;
 	cex->ec_start = start;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
 
 /*
@@ -1896,12 +1904,17 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 			struct ext4_extent *ex)
 {
 	struct ext4_ext_cache *cex;
+	int ret = EXT4_EXT_CACHE_NO;
 
+	/*
+	 * We borrow i_block_reservation_lock to protect i_cached_extent
+	 */
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
 
 	/* has cache valid data? */
 	if (cex->ec_type == EXT4_EXT_CACHE_NO)
-		return EXT4_EXT_CACHE_NO;
+		goto errout;
 
 	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
 			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
@@ -1912,11 +1925,11 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
-		return cex->ec_type;
+		ret = cex->ec_type;
 	}
-
-	/* not in cache */
-	return EXT4_EXT_CACHE_NO;
+errout:
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+	return ret;
 }
 
 /*
@@ -2416,8 +2429,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 		len = ee_len;
 
 		bio = bio_alloc(GFP_NOIO, len);
-		if (!bio)
-			return -ENOMEM;
 		bio->bi_sector = ee_pblock;
 		bio->bi_bdev = inode->i_sb->s_bdev;
 
@@ -2871,6 +2882,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		if (allocated > max_blocks)
 			allocated = max_blocks;
 		set_buffer_unwritten(bh_result);
+		bh_result->b_bdev = inode->i_sb->s_bdev;
+		bh_result->b_blocknr = newblock;
 		goto out2;
 	}
 
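
Editor's note: the i_cached_extent hunks above protect a single-slot cache with one spinlock on both the write and the read side, and the reader returns a copy so it never observes a half-updated entry. A compilable sketch of that shape (pthread spinlock standing in for the kernel spinlock; all names illustrative):

#include <pthread.h>
#include <stdio.h>

struct ext_cache { unsigned block, len, type; };

static pthread_spinlock_t cache_lock;
static struct ext_cache cached;

static void cache_put(unsigned block, unsigned len, unsigned type)
{
        pthread_spin_lock(&cache_lock);
        cached.block = block;
        cached.len = len;
        cached.type = type;
        pthread_spin_unlock(&cache_lock);
}

static struct ext_cache cache_get(void)
{
        struct ext_cache snap;

        pthread_spin_lock(&cache_lock);
        snap = cached;          /* copy out under the lock */
        pthread_spin_unlock(&cache_lock);
        return snap;
}

int main(void)
{
        pthread_spin_init(&cache_lock, PTHREAD_PROCESS_PRIVATE);
        cache_put(100, 8, 1);
        struct ext_cache c = cache_get();
        printf("cached: %u+%u type %u\n", c.block, c.len, c.type);
        return 0;
}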
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 47b84e8df568..f18e0a08a6b5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -585,6 +585,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
fallback:
 	ngroups = sbi->s_groups_count;
 	avefreei = freei / ngroups;
+fallback_retry:
 	parent_group = EXT4_I(parent)->i_block_group;
 	for (i = 0; i < ngroups; i++) {
 		grp = (parent_group + i) % ngroups;
@@ -602,7 +603,7 @@ fallback:
 	 * filesystems the above test can fail to find any blockgroups
 	 */
 		avefreei = 0;
-		goto fallback;
+		goto fallback_retry;
 	}
 
 	return -1;
@@ -831,11 +832,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		ret2 = find_group_flex(sb, dir, &group);
 		if (ret2 == -1) {
 			ret2 = find_group_other(sb, dir, &group, mode);
-			if (ret2 == 0 && once)
+			if (ret2 == 0 && once) {
 				once = 0;
 				printk(KERN_NOTICE "ext4: find_group_flex "
 				       "failed, fallback succeeded dir %lu\n",
 				       dir->i_ino);
+			}
 		}
 		goto got_group;
 	}
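
Editor's note: the ext4_new_inode() hunk is a classic missing-braces fix: without the braces only `once = 0;` was guarded, so the printk ran on every fallback instead of once. A minimal demonstration of the difference:

#include <stdio.h>

int main(void)
{
        int once, i, unguarded = 0, guarded = 0;

        /* Buggy shape: only the first statement is conditional. */
        for (once = 1, i = 0; i < 3; i++) {
                if (once)
                        once = 0;
                unguarded++;    /* ran every iteration, like the printk */
        }
        /* Fixed shape: braces guard both statements. */
        for (once = 1, i = 0; i < 3; i++) {
                if (once) {
                        once = 0;
                        guarded++;
                }
        }
        printf("unguarded ran %d times, guarded ran %d time\n",
               unguarded, guarded);
        return 0;
}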
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a2e7952bc5f9..2a9ffd528dd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode,
}
 
static int __ext4_check_blockref(const char *function, struct inode *inode,
-				 unsigned int *p, unsigned int max) {
+				 __le32 *p, unsigned int max) {
 
 	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
-	unsigned int *bref = p;
+	__le32 *bref = p;
 	while (bref < p+max) {
-		if (unlikely(*bref >= maxblocks)) {
+		if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
 			ext4_error(inode->i_sb, function,
 				   "block reference %u >= max (%u) "
 				   "in inode #%lu, offset=%d",
-				   *bref, maxblocks,
+				   le32_to_cpu(*bref), maxblocks,
 				   inode->i_ino, (int)(bref-p));
 			return -EIO;
 		}
@@ -1149,6 +1149,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 	int retval;
 
 	clear_buffer_mapped(bh);
+	clear_buffer_unwritten(bh);
 
 	/*
 	 * Try to see if we can get the block without requesting
@@ -1179,6 +1180,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		return retval;
 
 	/*
+	 * When we call get_blocks without the create flag, the
+	 * BH_Unwritten flag could have gotten set if the blocks
+	 * requested were part of a uninitialized extent. We need to
+	 * clear this flag now that we are committed to convert all or
+	 * part of the uninitialized extent to be an initialized
+	 * extent. This is because we need to avoid the combination
+	 * of BH_Unwritten and BH_Mapped flags being simultaneously
+	 * set on the buffer_head.
+	 */
+	clear_buffer_unwritten(bh);
+
+	/*
 	 * New blocks allocate and/or writing to uninitialized extent
 	 * will possibly result in updating i_data, so we take
 	 * the write lock of i_data_sem, and call get_blocks()
@@ -2297,6 +2310,10 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh_result, int create)
 {
 	int ret = 0;
+	sector_t invalid_block = ~((sector_t) 0xffff);
+
+	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		invalid_block = ~0;
 
 	BUG_ON(create == 0);
 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
@@ -2318,11 +2335,18 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 			/* not enough space to reserve */
 			return ret;
 
-		map_bh(bh_result, inode->i_sb, 0);
+		map_bh(bh_result, inode->i_sb, invalid_block);
 		set_buffer_new(bh_result);
 		set_buffer_delay(bh_result);
 	} else if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
+		/*
+		 * With sub-block writes into unwritten extents
+		 * we also need to mark the buffer as new so that
+		 * the unwritten parts of the buffer gets correctly zeroed.
+		 */
+		if (buffer_unwritten(bh_result))
+			set_buffer_new(bh_result);
 		ret = 0;
 	}
 
@@ -4357,11 +4381,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
-	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-	}
 	inode->i_size = ext4_isize(raw_inode);
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
@@ -4409,9 +4431,23 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
-	if (ei->i_flags & EXT4_EXTENTS_FL) {
-		/* Validate extent which is part of inode */
-		ret = ext4_ext_check_inode(inode);
+	ret = 0;
+	if (ei->i_file_acl &&
+	    ((ei->i_file_acl <
+	      (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+	       EXT4_SB(sb)->s_gdb_count)) ||
+	     (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+		ext4_error(sb, __func__,
+			   "bad extended attribute block %llu in inode #%lu",
+			   ei->i_file_acl, inode->i_ino);
+		ret = -EIO;
+		goto bad_inode;
+	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
+		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		    (S_ISLNK(inode->i_mode) &&
+		     !ext4_inode_is_fast_symlink(inode)))
+			/* Validate extent which is part of inode */
+			ret = ext4_ext_check_inode(inode);
 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		   (S_ISLNK(inode->i_mode) &&
 		    !ext4_inode_is_fast_symlink(inode))) {
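
Editor's note: the __ext4_check_blockref() hunk is an endianness fix: on-disk block references are little-endian (__le32) and must pass through le32_to_cpu() before being compared as host integers, otherwise the range check is wrong on big-endian machines. A portable userspace analogue of the conversion (the helper here is a local illustration, not the kernel macro):

#include <stdio.h>
#include <stdint.h>

/* Decode a little-endian 32-bit on-disk value byte by byte, which is
 * correct regardless of host endianness. */
static uint32_t decode_le32(const uint8_t b[4])
{
        return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
               (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

int main(void)
{
        const uint8_t on_disk[4] = { 0x2a, 0x00, 0x00, 0x00 }; /* 42 LE */
        uint32_t maxblocks = 1000;

        if (decode_le32(on_disk) >= maxblocks)
                puts("bad block reference");
        else
                printf("block %u is in range\n", decode_le32(on_disk));
        return 0;
}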
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9987bba99db3..2958f4e6f222 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
 
+	/* check blocks count against device size */
+	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
+		       "exceeds size of device (%llu blocks)\n",
+		       ext4_blocks_count(es), blocks_count);
+		goto failed_mount;
+	}
+
 	/*
 	 * It makes no sense for the first data block to be beyond the end
 	 * of the filesystem.
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index d0a69ff25375..182f9ffe2b51 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET
 	  Note that "utf8" is not recommended for FAT filesystems.
 	  If unsure, you shouldn't set "utf8" here.
 	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+	  Enable any character sets you need in File Systems/Native Language
+	  Support.
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cc8e4de2fee5..1ad703150dee 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -117,11 +117,13 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
+		int retval = oldfd;
+
 		rcu_read_lock();
 		if (!fcheck_files(files, oldfd))
-			oldfd = -EBADF;
+			retval = -EBADF;
 		rcu_read_unlock();
-		return oldfd;
+		return retval;
 	}
 	return sys_dup3(oldfd, newfd, 0);
 }
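
Editor's note: the dup2 hunk covers the oldfd == newfd corner case, which must report EBADF for a closed descriptor instead of echoing the descriptor number back. Quick POSIX check of the visible behavior:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd = 99;            /* assume descriptor 99 is not open */

        close(fd);
        if (dup2(fd, fd) < 0)
                printf("dup2(%d, %d): %s (expected EBADF)\n",
                       fd, fd, strerror(errno));
        return 0;
}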
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 1aa70260e6d1..a24c58e181db 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 	return retval;
 }
 
-int get_filesystem_list(char * buf)
+int __init get_filesystem_list(char *buf)
 {
 	int len = 0;
 	struct file_system_type * tmp;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2b25133524a3..06f30e965676 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -938,9 +938,9 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
 
static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned *nbytesp, int write)
+			       size_t *nbytesp, int write)
 {
-	unsigned nbytes = *nbytesp;
+	size_t nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
@@ -955,7 +955,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 		return 0;
 	}
 
-	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
@@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_flags & VM_MAYSHARE)
 		return -ENODEV;
 
+	invalidate_inode_pages2(file->f_mapping);
+
 	return generic_file_mmap(file, vma);
 }
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 459b73dd45e1..91f7c85f1ffd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,6 +19,7 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/exportfs.h>
+#include <linux/smp_lock.h>
 
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -259,7 +260,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
static void fuse_umount_begin(struct super_block *sb)
 {
+	lock_kernel();
 	fuse_abort_conn(get_fuse_conn_super(sb));
+	unlock_kernel();
 }
 
static void fuse_send_destroy(struct fuse_conn *fc)
@@ -908,6 +911,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 err_put_root:
 	dput(root_dentry);
 err_put_conn:
+	bdi_destroy(&fc->bdi);
 	fuse_conn_put(fc);
 err_fput:
 	fput(file);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3984e47d1d33..ff4981090489 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -597,7 +597,6 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
-	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
@@ -614,15 +613,14 @@ __acquires(&gl->gl_spin)
 		if (ret == 0)
 			goto out_unlock;
 		if (ret == 2)
-			goto out_sem;
+			goto out;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
-out_sem:
-	up_read(&gfs2_umount_flush_sem);
+out:
 	return;
 
out_sched:
@@ -631,7 +629,7 @@ out_sched:
 	gfs2_glock_put(gl);
out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	goto out_sem;
+	goto out;
 }
 
static void glock_work_func(struct work_struct *work)
@@ -641,6 +639,7 @@ static void glock_work_func(struct work_struct *work)
 
 	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
 		finish_xmote(gl, gl->gl_reply);
+	down_read(&gfs2_umount_flush_sem);
 	spin_lock(&gl->gl_spin);
 	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_state != LM_ST_UNLOCKED &&
@@ -653,6 +652,7 @@ static void glock_work_func(struct work_struct *work)
 	}
 	run_queue(gl, 0);
 	spin_unlock(&gl->gl_spin);
+	up_read(&gfs2_umount_flush_sem);
 	if (!delay ||
 	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
@@ -1304,6 +1304,7 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 			nr--;
 			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 				gfs2_glock_put(gl);
+			got_ref = 0;
 		}
 		spin_lock(&lru_lock);
 		if (may_demote)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index bf23a62aa925..70f87f43afa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl)
 		error = filemap_fdatawait(metamapping);
 		mapping_set_error(metamapping, error);
 	gfs2_ail_empty_gl(gl);
+	/*
+	 * Writeback of the data mapping may cause the dirty flag to be set
+	 * so we have to clear it again here.
+	 */
+	smp_mb__before_clear_bit();
+	clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7b277d449155..5a31d426116f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -137,15 +137,15 @@ void gfs2_set_iop(struct inode *inode)
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_file_fops_nolock;
+			inode->i_fop = &gfs2_file_fops_nolock;
 		else
-			inode->i_fop = gfs2_file_fops;
+			inode->i_fop = &gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_dir_fops_nolock;
+			inode->i_fop = &gfs2_dir_fops_nolock;
 		else
-			inode->i_fop = gfs2_dir_fops;
+			inode->i_fop = &gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index dca4fee3078b..c30be2b66580 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,21 +101,23 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations *gfs2_file_fops_nolock;
-extern const struct file_operations *gfs2_dir_fops_nolock;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
 
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
-extern const struct file_operations *gfs2_file_fops;
-extern const struct file_operations *gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return sdp->sd_args.ar_localflocks;
 }
 #else /* Single node only */
-#define gfs2_file_fops NULL
-#define gfs2_dir_fops NULL
+#define gfs2_file_fops gfs2_file_fops_nolock
+#define gfs2_dir_fops gfs2_dir_fops_nolock
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return 1;
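These declarations drop a level of indirection: the file_operations tables become plain const structs rather than pointers, and in the single-node (no CONFIG_GFS2_FS_LOCKING_DLM) case the macros now alias the _nolock tables instead of expanding to NULL, so taking the address of gfs2_file_fops always yields a real operations table. A sketch of the same pattern with hypothetical names:

extern const struct file_operations my_fops_nolock;

#ifdef CONFIG_MY_CLUSTER_LOCKING
extern const struct file_operations my_fops;
#else
/* single node: alias, so &my_fops still names a real table, never NULL */
#define my_fops my_fops_nolock
#endif

static void my_set_fops(struct inode *inode, int localflocks)
{
	inode->i_fop = localflocks ? &my_fops_nolock : &my_fops;
}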
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 70b9b8548945..5d82e91887e3 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -413,7 +413,9 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
-	if (ret)
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
@@ -705,7 +707,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	}
 }
 
-const struct file_operations *gfs2_file_fops = &(const struct file_operations){
+const struct file_operations gfs2_file_fops = {
 	.llseek = gfs2_llseek,
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
@@ -723,7 +725,7 @@ const struct file_operations *gfs2_file_fops = &(const struct file_operations){
 	.setlease = gfs2_setlease,
 };
 
-const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
+const struct file_operations gfs2_dir_fops = {
 	.readdir = gfs2_readdir,
 	.unlocked_ioctl = gfs2_ioctl,
 	.open = gfs2_open,
@@ -735,7 +737,7 @@ const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
 
 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
 
-const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){
+const struct file_operations gfs2_file_fops_nolock = {
 	.llseek = gfs2_llseek,
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
@@ -751,7 +753,7 @@ const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operat
 	.setlease = generic_setlease,
 };
 
-const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){
+const struct file_operations gfs2_dir_fops_nolock = {
 	.readdir = gfs2_readdir,
 	.unlocked_ioctl = gfs2_ioctl,
 	.open = gfs2_open,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 51883b3ad89c..1ff9473ea753 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	lock_page(page);
 
 	bio = bio_alloc(GFP_NOFS, 1);
-	if (unlikely(!bio)) {
-		__free_page(page);
-		return -ENOBUFS;
-	}
-
 	bio->bi_sector = sector * (sb->s_blocksize >> 9);
 	bio->bi_bdev = sb->s_bdev;
 	bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -1287,21 +1282,21 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 static struct super_block *get_gfs2_sb(const char *dev_name)
 {
 	struct super_block *sb;
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
 		return NULL;
 	}
-	sb = nd.path.dentry->d_inode->i_sb;
+	sb = path.dentry->d_inode->i_sb;
 	if (sb && (sb->s_type == &gfs2_fs_type))
 		atomic_inc(&sb->s_active);
 	else
 		sb = NULL;
-	path_put(&nd.path);
+	path_put(&path);
 	return sb;
 }
 
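Two independent cleanups here: the removed NULL check suggests bio_alloc() is being treated as unable to fail for small, waitable (GFP_NOFS) allocations, and the struct nameidata based lookup is replaced with the slimmer kern_path()/struct path pair. A minimal sketch of the kern_path() idiom (error handling abbreviated, device name illustrative):

	struct path path;
	int err;

	err = kern_path("/dev/sda1", LOOKUP_FOLLOW, &path);
	if (!err) {
		struct super_block *sb = path.dentry->d_inode->i_sb;
		/* ... use sb ... */
		path_put(&path);	/* drop the dentry + vfsmount refs */
	}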
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index abd5429ae285..1c70fa5168d6 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -371,6 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_disksize = size;
+	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8d53f66b5bcc..152e6c4a0dca 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -81,7 +81,7 @@ struct gfs2_quota_change_host {
 
 static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qd_lru_lock);
 
 int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
 {
@@ -1364,7 +1364,7 @@ int gfs2_quotad(void *data)
 			refrigerator();
 		t = min(quotad_timeo, statfs_timeo);
 
-		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
 		spin_lock(&sdp->sd_trunc_lock);
 		empty = list_empty(&sdp->sd_trunc_list);
 		spin_unlock(&sdp->sd_trunc_lock);
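Both quota.c changes are small correctness fixes: SPIN_LOCK_UNLOCKED was being phased out because a shared static initializer cannot give each lock its own lockdep class, and quotad's periodic sleep becomes TASK_INTERRUPTIBLE, presumably so an idle kthread no longer counts toward load average. The two declaration forms side by side:

static spinlock_t old_lock = SPIN_LOCK_UNLOCKED;	/* deprecated form */
static DEFINE_SPINLOCK(new_lock);			/* preferred form  */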
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f03d024038ea..565038243fa2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 	if (tmp == 0)
 		return BFITNOENT;
 	ptr--;
-	bit = fls64(tmp);
-	bit--;	/* fls64 always adds one to the bit count */
+	bit = __ffs64(tmp);
 	bit /= 2;	/* two bits per entry in the bitmap */
 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
@@ -1445,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
+	int error;
 
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
@@ -1461,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
-
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error == 0) {
+		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
+		brelse(dibh);
+	}
 	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
 	rgd->rd_free -= *n;
 
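In gfs2_bitfit() each block takes two bits, so a matching entry shows up as a set bit in tmp and the entry index is bit/2. fls64() finds the most significant set bit, so the old code (even after its off-by-one correction) would appear to return the last matching entry in the word; __ffs64() returns the least significant set bit, i.e. the first match. A runnable illustration, with GCC builtins standing in for the kernel helpers:

#include <stdio.h>
#include <stdint.h>

static int my_ffs64(uint64_t x) { return __builtin_ctzll(x); }      /* lowest set bit */
static int my_fls64(uint64_t x) { return 64 - __builtin_clzll(x); } /* highest + 1    */

int main(void)
{
	/* two bits per entry: entries 1 and 5 match -> bits 2 and 10 set */
	uint64_t tmp = (1ULL << 2) | (1ULL << 10);

	printf("first match: entry %d\n", my_ffs64(tmp) / 2);	    /* 1 */
	printf("last match:  entry %d\n", (my_fls64(tmp) - 1) / 2); /* 5 */
	return 0;
}

The second allocator hunk additionally writes the updated allocation goal back into the on-disk dinode inside the transaction.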
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9435dda8f1e0..a1cbff2b4d99 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		BUG();
 		return 0;
 	}
+
+	if (!tree)
+		return 0;
+
 	if (tree->node_size >= PAGE_CACHE_SIZE) {
 		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
 		spin_lock(&tree->hash_lock);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 36ca2e1a4fa3..7b6165f25fbe 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb)
 	if (HFS_SB(sb)->nls_disk)
 		unload_nls(HFS_SB(sb)->nls_disk);
 
+	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
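The added free_pages() call plugs a leak of the volume bitmap buffer on unmount. The order argument mirrors the matching allocation: an 8 KiB buffer is an order-1 allocation when PAGE_SIZE is 4 KiB and order-0 once PAGE_SIZE is already at least 8 KiB, and alloc and free orders must agree. A sketch of the pairing (the allocation site is assumed, not shown in this hunk):

	int order = PAGE_SIZE < 8192 ? 1 : 0;	/* 8 KiB buffer */
	unsigned long buf = __get_free_pages(GFP_KERNEL, order);

	if (!buf)
		return -ENOMEM;
	/* ... use the buffer ... */
	free_pages(buf, order);			/* same order as the alloc */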
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fecf402d7b8a..fc77965be841 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -423,8 +423,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	if (!(*flags & MS_RDONLY)) mark_dirty(s);
 
-	kfree(s->s_options);
-	s->s_options = new_opts;
+	replace_mount_options(s, new_opts);
 
 	return 0;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 23a3c76711e0..c1462d43e721 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/pagevec.h>
 #include <linux/parser.h>
 #include <linux/mman.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
@@ -313,16 +312,6 @@ out:
 	return retval;
 }
 
-/*
- * Read a page. Again trivial. If it didn't already exist
- * in the page cache, it is zero-filled.
- */
-static int hugetlbfs_readpage(struct file *file, struct page * page)
-{
-	unlock_page(page);
-	return -EINVAL;
-}
-
 static int hugetlbfs_write_begin(struct file *file,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
@@ -702,7 +691,6 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
 }
 
 static const struct address_space_operations hugetlbfs_aops = {
-	.readpage = hugetlbfs_readpage,
 	.write_begin = hugetlbfs_write_begin,
 	.write_end = hugetlbfs_write_end,
 	.set_page_dirty = hugetlbfs_set_page_dirty,
@@ -842,7 +830,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 bad_val:
 	printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
 	       args[0].from, p);
-	return 1;
+	return -EINVAL;
 }
 
 static int
diff --git a/fs/inode.c b/fs/inode.c
index d06d6d268de9..0571983755dc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -99,7 +99,7 @@ static DEFINE_MUTEX(iprune_mutex);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct kmem_cache * inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __read_mostly;
 
 static void wake_up_inode(struct inode *inode)
 {
@@ -124,7 +124,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 
-	struct address_space * const mapping = &inode->i_data;
+	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
 	inode->i_blkbits = sb->s_blocksize_bits;
@@ -216,7 +216,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 	return NULL;
 }
 
-void destroy_inode(struct inode *inode) 
+void destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
@@ -252,12 +252,11 @@ void inode_init_once(struct inode *inode)
 	mutex_init(&inode->inotify_mutex);
 #endif
 }
-
 EXPORT_SYMBOL(inode_init_once);
 
 static void init_once(void *foo)
 {
-	struct inode * inode = (struct inode *) foo;
+	struct inode *inode = (struct inode *) foo;
 
 	inode_init_once(inode);
 }
@@ -265,7 +264,7 @@ static void init_once(void *foo)
 /*
  * inode_lock must be held
  */
-void __iget(struct inode * inode)
+void __iget(struct inode *inode)
 {
 	if (atomic_read(&inode->i_count)) {
 		atomic_inc(&inode->i_count);
@@ -289,7 +288,7 @@ void clear_inode(struct inode *inode)
 {
 	might_sleep();
 	invalidate_inode_buffers(inode);
-	
+
 	BUG_ON(inode->i_data.nrpages);
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
@@ -303,7 +302,6 @@ void clear_inode(struct inode *inode)
 		cd_forget(inode);
 	inode->i_state = I_CLEAR;
 }
-
 EXPORT_SYMBOL(clear_inode);
 
 /*
@@ -351,8 +349,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 
 	next = head->next;
 	for (;;) {
-		struct list_head * tmp = next;
-		struct inode * inode;
+		struct list_head *tmp = next;
+		struct inode *inode;
 
 		/*
 		 * We can reschedule here without worrying about the list's
@@ -391,7 +389,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 * fails because there are busy inodes then a non zero value is returned.
 * If the discard is successful all the inodes have been discarded.
 */
-int invalidate_inodes(struct super_block * sb)
+int invalidate_inodes(struct super_block *sb)
 {
 	int busy;
 	LIST_HEAD(throw_away);
@@ -407,7 +405,6 @@ int invalidate_inodes(struct super_block * sb)
 
 	return busy;
 }
-
 EXPORT_SYMBOL(invalidate_inodes);
 
 static int can_unuse(struct inode *inode)
@@ -504,7 +501,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
 	 * Nasty deadlock avoidance. We may hold various FS locks,
 	 * and we don't want to recurse into the FS that called us
 	 * in clear_inode() and friends..
-	 */ 
+	 */
 	if (!(gfp_mask & __GFP_FS))
 		return -1;
 	prune_icache(nr);
@@ -524,10 +521,13 @@ static void __wait_on_freeing_inode(struct inode *inode);
 * by hand after calling find_inode now! This simplifies iunique and won't
 * add any additional branch in the common code.
 */
-static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+static struct inode *find_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				void *data)
 {
 	struct hlist_node *node;
-	struct inode * inode = NULL;
+	struct inode *inode = NULL;
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
@@ -548,10 +548,11 @@ repeat:
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
-static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+static struct inode *find_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
 {
 	struct hlist_node *node;
-	struct inode * inode = NULL;
+	struct inode *inode = NULL;
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
@@ -631,10 +632,10 @@ struct inode *new_inode(struct super_block *sb)
 	 * here to attempt to avoid that.
 	 */
 	static unsigned int last_ino;
-	struct inode * inode;
+	struct inode *inode;
 
 	spin_lock_prefetch(&inode_lock);
-	
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
@@ -645,7 +646,6 @@ struct inode *new_inode(struct super_block *sb)
 	}
 	return inode;
 }
-
 EXPORT_SYMBOL(new_inode);
 
 void unlock_new_inode(struct inode *inode)
@@ -674,7 +674,6 @@ void unlock_new_inode(struct inode *inode)
 	inode->i_state &= ~(I_LOCK|I_NEW);
 	wake_up_inode(inode);
 }
-
 EXPORT_SYMBOL(unlock_new_inode);
 
 /*
@@ -683,13 +682,17 @@ EXPORT_SYMBOL(unlock_new_inode);
 * We no longer cache the sb_flags in i_flags - see fs.h
 *	-- rmk@arm.uk.linux.org
 */
-static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+static struct inode *get_new_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				int (*set)(struct inode *, void *),
+				void *data)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		struct inode * old;
+		struct inode *old;
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
@@ -731,13 +734,14 @@ set_failed:
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
-static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+static struct inode *get_new_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		struct inode * old;
+		struct inode *old;
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
@@ -823,7 +827,6 @@ struct inode *igrab(struct inode *inode)
 	spin_unlock(&inode_lock);
 	return inode;
 }
-
 EXPORT_SYMBOL(igrab);
 
 /**
@@ -924,7 +927,6 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 
 	return ifind(sb, head, test, data, 0);
 }
-
 EXPORT_SYMBOL(ilookup5_nowait);
 
 /**
@@ -953,7 +955,6 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 
 	return ifind(sb, head, test, data, 1);
 }
-
 EXPORT_SYMBOL(ilookup5);
 
 /**
@@ -976,7 +977,6 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 
 	return ifind_fast(sb, head, ino);
 }
-
 EXPORT_SYMBOL(ilookup);
 
 /**
@@ -1015,7 +1015,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
 	 */
 	return get_new_inode(sb, head, test, set, data);
 }
-
 EXPORT_SYMBOL(iget5_locked);
 
 /**
@@ -1047,7 +1046,6 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 	 */
 	return get_new_inode_fast(sb, head, ino);
 }
-
 EXPORT_SYMBOL(iget_locked);
 
 int insert_inode_locked(struct inode *inode)
@@ -1076,7 +1074,6 @@ int insert_inode_locked(struct inode *inode)
 		iput(old);
 	}
 }
-
 EXPORT_SYMBOL(insert_inode_locked);
 
 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
@@ -1106,7 +1103,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		iput(old);
 	}
 }
-
 EXPORT_SYMBOL(insert_inode_locked4);
 
 /**
@@ -1124,7 +1120,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 	hlist_add_head(&inode->i_hash, head);
 	spin_unlock(&inode_lock);
 }
-
 EXPORT_SYMBOL(__insert_inode_hash);
 
 /**
@@ -1139,7 +1134,6 @@ void remove_inode_hash(struct inode *inode)
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
 }
-
 EXPORT_SYMBOL(remove_inode_hash);
 
 /*
@@ -1187,7 +1181,6 @@ void generic_delete_inode(struct inode *inode)
 	BUG_ON(inode->i_state != I_CLEAR);
 	destroy_inode(inode);
 }
-
 EXPORT_SYMBOL(generic_delete_inode);
 
 static void generic_forget_inode(struct inode *inode)
@@ -1237,12 +1230,11 @@ void generic_drop_inode(struct inode *inode)
 	else
 		generic_forget_inode(inode);
 }
-
 EXPORT_SYMBOL_GPL(generic_drop_inode);
 
 /*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop()" function, defaulting to
 * the legacy UNIX filesystem behaviour..
@@ -1262,7 +1254,7 @@ static inline void iput_final(struct inode *inode)
 }
 
 /**
- *	iput	- put an inode 
+ *	iput	- put an inode
 * @inode: inode to put
 *
 * Puts an inode, dropping its usage count. If the inode use count hits
@@ -1279,7 +1271,6 @@ void iput(struct inode *inode)
 		iput_final(inode);
 	}
 }
-
 EXPORT_SYMBOL(iput);
 
 /**
@@ -1290,10 +1281,10 @@ EXPORT_SYMBOL(iput);
 *	Returns the block number on the device holding the inode that
 *	is the disk block number for the block of the file requested.
 *	That is, asked for block 4 of inode 1 the function will return the
- *	disk block relative to the disk start that holds that block of the 
- *	file. 
+ *	disk block relative to the disk start that holds that block of the
+ *	file.
 */
-sector_t bmap(struct inode * inode, sector_t block)
+sector_t bmap(struct inode *inode, sector_t block)
 {
 	sector_t res = 0;
 	if (inode->i_mapping->a_ops->bmap)
@@ -1425,7 +1416,6 @@ void file_update_time(struct file *file)
 	mark_inode_dirty_sync(inode);
 	mnt_drop_write(file->f_path.mnt);
 }
-
 EXPORT_SYMBOL(file_update_time);
 
 int inode_needs_sync(struct inode *inode)
@@ -1436,7 +1426,6 @@ int inode_needs_sync(struct inode *inode)
 		return 1;
 	return 0;
 }
-
 EXPORT_SYMBOL(inode_needs_sync);
 
 int inode_wait(void *word)
@@ -1470,42 +1459,6 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	spin_lock(&inode_lock);
 }
 
-/*
- * We rarely want to lock two inodes that do not have a parent/child
- * relationship (such as directory, child inode) simultaneously. The
- * vast majority of file systems should be able to get along fine
- * without this. Do not use these functions except as a last resort.
- */
-void inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
-	}
-
-	if (inode1 < inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
-	}
-}
-EXPORT_SYMBOL(inode_double_lock);
-
-void inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1)
-		mutex_unlock(&inode1->i_mutex);
-
-	if (inode2 && inode2 != inode1)
-		mutex_unlock(&inode2->i_mutex);
-}
-EXPORT_SYMBOL(inode_double_unlock);
-
 static __initdata unsigned long ihash_entries;
 static int __init set_ihash_entries(char *str)
 {
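The fs/inode.c hunks are a coding-style pass (plus the removal of the now-unused inode_double_lock()/inode_double_unlock() pair): asterisks bind to the name, long prototypes wrap one parameter per line, trailing whitespace goes away, and EXPORT_SYMBOL() moves flush against the function it exports. The two conventions in one hypothetical snippet:

struct inode *example_ilookup(struct super_block *sb, unsigned long ino)
{
	return ilookup(sb, ino);	/* '*' binds to the name, not the type */
}
EXPORT_SYMBOL(example_ilookup);		/* no blank line before the export */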
diff --git a/fs/ioctl.c b/fs/ioctl.c
index ac2d47e43926..82d9c42b8bac 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -258,7 +258,7 @@ int __generic_block_fiemap(struct inode *inode,
 	long long length = 0, map_len = 0;
 	u64 logical = 0, phys = 0, size = 0;
 	u32 flags = FIEMAP_EXTENT_MERGED;
-	int ret = 0;
+	int ret = 0, past_eof = 0, whole_file = 0;
 
 	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
 		return ret;
@@ -266,6 +266,9 @@ int __generic_block_fiemap(struct inode *inode,
 	start_blk = logical_to_blk(inode, start);
 
 	length = (long long)min_t(u64, len, i_size_read(inode));
+	if (length < len)
+		whole_file = 1;
+
 	map_len = length;
 
 	do {
@@ -282,11 +285,26 @@ int __generic_block_fiemap(struct inode *inode,
 
 		/* HOLE */
 		if (!buffer_mapped(&tmp)) {
+			length -= blk_to_logical(inode, 1);
+			start_blk++;
+
+			/*
+			 * we want to handle the case where there is an
+			 * allocated block at the front of the file, and then
+			 * nothing but holes up to the end of the file properly,
+			 * to make sure that extent at the front gets properly
+			 * marked with FIEMAP_EXTENT_LAST
+			 */
+			if (!past_eof &&
+			    blk_to_logical(inode, start_blk) >=
+			    blk_to_logical(inode, 0)+i_size_read(inode))
+				past_eof = 1;
+
 			/*
 			 * first hole after going past the EOF, this is our
 			 * last extent
 			 */
-			if (length <= 0) {
+			if (past_eof && size) {
 				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 						phys, size,
@@ -294,15 +312,37 @@ int __generic_block_fiemap(struct inode *inode,
 				break;
 			}
 
-			length -= blk_to_logical(inode, 1);
-
 			/* if we have holes up to/past EOF then we're done */
-			if (length <= 0)
+			if (length <= 0 || past_eof)
 				break;
-
-			start_blk++;
 		} else {
-			if (length <= 0 && size) {
+			/*
+			 * we have gone over the length of what we wanted to
+			 * map, and it wasn't the entire file, so add the extent
+			 * we got last time and exit.
+			 *
+			 * This is for the case where say we want to map all the
+			 * way up to the second to the last block in a file, but
+			 * the last block is a hole, making the second to last
+			 * block FIEMAP_EXTENT_LAST. In this case we want to
+			 * see if there is a hole after the second to last block
+			 * so we can mark it properly. If we found data after
+			 * we exceeded the length we were requesting, then we
+			 * are good to go, just add the extent to the fieinfo
+			 * and break
+			 */
+			if (length <= 0 && !whole_file) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+						phys, size,
+						flags);
+				break;
+			}
+
+			/*
+			 * if size != 0 then we know we already have an extent
+			 * to add, so add it.
+			 */
+			if (size) {
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 						phys, size,
 						flags);
@@ -319,19 +359,14 @@ int __generic_block_fiemap(struct inode *inode,
 			start_blk += logical_to_blk(inode, size);
 
 			/*
-			 * if we are past the EOF we need to loop again to see
-			 * if there is a hole so we can mark this extent as the
-			 * last one, and if not keep mapping things until we
-			 * find a hole, or we run out of slots in the extent
-			 * array
+			 * If we are past the EOF, then we need to make sure as
+			 * soon as we find a hole that the last extent we found
+			 * is marked with FIEMAP_EXTENT_LAST
 			 */
-			if (length <= 0)
-				continue;
-
-			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
-						      size, flags);
-			if (ret)
-				break;
+			if (!past_eof &&
+			    logical+size >=
+			    blk_to_logical(inode, 0)+i_size_read(inode))
+				past_eof = 1;
 		}
 		cond_resched();
 	} while (1);
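The fiemap rework tracks two extra states — whole_file (the request covers the entire file) and past_eof (the walk has crossed i_size) — so that FIEMAP_EXTENT_LAST lands on the final allocated extent even when the file ends in holes or the mapping stops short of EOF. A small userspace program is a convenient way to observe the flag; this sketch maps a file's extents through the FIEMAP ioctl and marks the extent carrying FIEMAP_EXTENT_LAST:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* room for up to 32 extents in a single call */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush before mapping */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];
		printf("logical %llu phys %llu len %llu%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " [LAST]" : "");
	}
	free(fm);
	close(fd);
	return 0;
}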
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9c8981..06560c520f49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -497,7 +502,7 @@ void journal_commit_transaction(journal_t *journal)
 		err = 0;
 	}
 
-	journal_write_revoke_records(journal, commit_transaction);
+	journal_write_revoke_records(journal, commit_transaction, write_op);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
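WRITE_SYNC unplugs the block device queue after every submission; the WRITE_SYNC_PLUG variant keeps the queue plugged so a batch of journal buffers can be submitted back to back, with the eventual wait (via sync_buffer()) performing the single unplug. A conceptual sketch only, assuming an array of already-prepared buffer heads:

	for (i = 0; i < nr; i++)
		submit_bh(WRITE_SYNC_PLUG, bhs[i]);	/* no per-buffer unplug */

	for (i = 0; i < nr; i++)
		wait_on_buffer(bhs[i]);			/* wait side unplugs */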
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649bbbdc..da6cd9bdaabc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -55,6 +55,25 @@
 *			need do nothing.
 * RevokeValid set, Revoked set:
 *			buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table. Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment no one
+ * else can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
 */
 
 #ifndef __KERNEL__
@@ -67,6 +86,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 #endif
 #include <linux/log2.h>
 
@@ -99,8 +119,8 @@ struct jbd_revoke_table_s
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
 				    struct journal_head **, int *,
-				    struct jbd_revoke_record_s *);
-static void flush_descriptor(journal_t *, struct journal_head *, int);
+				    struct jbd_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -402,8 +422,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 * the second time we would still have a pending revoke to cancel. So,
 * do not trust the Revoked bit on buffers unless RevokeValid is also
 * set.
- *
- * The caller must have the journal locked.
 */
 int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -481,12 +499,9 @@ void journal_switch_revoke_table(journal_t *journal)
 /*
 * Write revoke records to the journal for all entries in the current
 * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
 */
-
 void journal_write_revoke_records(journal_t *journal,
-				  transaction_t *transaction)
+				  transaction_t *transaction, int write_op)
 {
 	struct journal_head *descriptor;
 	struct jbd_revoke_record_s *record;
@@ -510,14 +525,14 @@ void journal_write_revoke_records(journal_t *journal,
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
 						&descriptor, &offset,
-						record);
+						record, write_op);
 			count++;
 			list_del(&record->hash);
 			kmem_cache_free(revoke_record_cache, record);
 		}
 	}
 	if (descriptor)
-		flush_descriptor(journal, descriptor, offset);
+		flush_descriptor(journal, descriptor, offset, write_op);
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
@@ -530,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
 				    struct journal_head **descriptorp,
 				    int *offsetp,
-				    struct jbd_revoke_record_s *record)
+				    struct jbd_revoke_record_s *record,
+				    int write_op)
 {
 	struct journal_head *descriptor;
 	int offset;
@@ -549,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
 	/* Make sure we have a descriptor with space left for the record */
 	if (descriptor) {
 		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset);
+			flush_descriptor(journal, descriptor, offset, write_op);
 			descriptor = NULL;
 		}
 	}
@@ -586,7 +602,7 @@ static void write_one_revoke_record(journal_t *journal,
 
 static void flush_descriptor(journal_t *journal,
 			     struct journal_head *descriptor,
-			     int offset)
+			     int offset, int write_op)
 {
 	journal_revoke_header_t *header;
 	struct buffer_head *bh = jh2bh(descriptor);
@@ -601,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(SWRITE, 1, &bh);
+	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
 }
 #endif
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..0b7d3b8226fd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;
 
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (ret) {
 		unlock_buffer(bh);
 		return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -501,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd2_journal_write_revoke_records(journal, commit_transaction);
+	jbd2_journal_write_revoke_records(journal, commit_transaction,
+					  write_op);
 
 	jbd_debug(3, "JBD: commit phase 2\n");
 
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index bbe6d592d8b3..a360b06af2e3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -86,6 +86,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 #endif
 #include <linux/log2.h>
 
@@ -118,8 +119,8 @@ struct jbd2_revoke_table_s
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
 				    struct journal_head **, int *,
-				    struct jbd2_revoke_record_s *);
-static void flush_descriptor(journal_t *, struct journal_head *, int);
+				    struct jbd2_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
 * revoke hash, deleting the entries as we go.
 */
 void jbd2_journal_write_revoke_records(journal_t *journal,
-				       transaction_t *transaction)
+				       transaction_t *transaction,
+				       int write_op)
 {
 	struct journal_head *descriptor;
 	struct jbd2_revoke_record_s *record;
@@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
 						&descriptor, &offset,
-						record);
+						record, write_op);
 			count++;
 			list_del(&record->hash);
 			kmem_cache_free(jbd2_revoke_record_cache, record);
 		}
 	}
 	if (descriptor)
-		flush_descriptor(journal, descriptor, offset);
+		flush_descriptor(journal, descriptor, offset, write_op);
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
@@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
 				    struct journal_head **descriptorp,
 				    int *offsetp,
-				    struct jbd2_revoke_record_s *record)
+				    struct jbd2_revoke_record_s *record,
+				    int write_op)
 {
 	struct journal_head *descriptor;
 	int offset;
@@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
 	/* Make sure we have a descriptor with space left for the record */
 	if (descriptor) {
 		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset);
+			flush_descriptor(journal, descriptor, offset, write_op);
 			descriptor = NULL;
 		}
 	}
@@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal,
 
 static void flush_descriptor(journal_t *journal,
 			     struct journal_head *descriptor,
-			     int offset)
+			     int offset, int write_op)
 {
 	jbd2_journal_revoke_header_t *header;
 	struct buffer_head *bh = jh2bh(descriptor);
@@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(SWRITE, 1, &bh);
+	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
 }
 #endif
 
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 77ccf8cb0823..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
 	size_t s;
 
 	size -= sizeof(struct jffs2_acl_header);
-	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
-	if (s < 0) {
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
 		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
 		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
 		return s / sizeof(struct jffs2_acl_entry) + 4;
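The old jffs2_acl_count() stored the subtraction in a size_t and then tested it for negativity, but size_t is unsigned, so the `if (s < 0)` branch could never fire: a too-small size simply wrapped to a huge value and took the wrong branch. The fix compares before subtracting. A runnable demonstration of the trap:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t size = 8;
	size_t s = size - 16;	/* wraps: unsigned arithmetic is modular */

	if (s < 0)		/* always false for an unsigned type */
		printf("underflow caught\n");
	else
		printf("s = %zu (wrapped; the check never fires)\n", s);
	return 0;
}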
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 {
 	struct jffs2_xattr_datum *xd;
-	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", xd);
 
-	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
 	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
 struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 {
 	struct jffs2_xattr_ref *ref;
-	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", ref);
 
-	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
 	ref->node = (void *)ref;
 	return ref;
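kmem_cache_zalloc() folds the allocate-then-memset pair into a single call. Beyond brevity, the old sequence ran memset() before any NULL check, so a failed allocation would oops inside memset() rather than being reportable. A sketch of the before/after shape (the NULL handling shown is illustrative; the hunks above keep the callers' existing flow):

	/* before: zeroing happens even if the allocation failed */
	xd = kmem_cache_alloc(cache, GFP_KERNEL);
	memset(xd, 0, sizeof(*xd));

	/* after: returned memory is already zeroed, NULL is checkable first */
	xd = kmem_cache_zalloc(cache, GFP_KERNEL);
	if (!xd)
		return NULL;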
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..80046ddf5063 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -246,8 +246,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	return 0;
 
 Enomem:
-	up_write(&s->s_umount);
-	deactivate_super(s);
+	deactivate_locked_super(s);
 	return -ENOMEM;
 }
 
@@ -575,6 +574,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
 * possibly a read which collects the result - which is stored in a
 * file-local buffer.
 */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
 	struct simple_transaction_argresp *ar;
@@ -820,6 +834,7 @@ EXPORT_SYMBOL(simple_sync_file);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
 EXPORT_SYMBOL(simple_transaction_read);
 EXPORT_SYMBOL(simple_transaction_release);
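The smp_mb() in the new simple_transaction_set() orders the response-buffer stores before the store that publishes its size, so a concurrent reader that observes a non-zero ar->size is guaranteed to see completed data. Full barriers normally work in pairs; the reader-side ordering sketched below is an assumption about how such a consumer would be written, not code from this patch:

	/* writer (as in simple_transaction_set) */
	memcpy(ar->data, result, n);	/* 1: fill the buffer           */
	smp_mb();			/* 2: data strictly before size */
	ar->size = n;			/* 3: publish                   */

	/* matching reader (assumed pairing) */
	n = ar->size;			/* 1: observe a published size  */
	smp_rmb();			/* 2: pair with the writer      */
	memcpy(out, ar->data, n);	/* 3: data is now safe to read  */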
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index abf83881f68a..1a54ae14a192 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -104,6 +104,16 @@ static void set_grace_period(void)
 	schedule_delayed_work(&grace_period_end, grace_period);
 }
 
+static void restart_grace(void)
+{
+	if (nlmsvc_ops) {
+		cancel_delayed_work_sync(&grace_period_end);
+		locks_end_grace(&lockd_manager);
+		nlmsvc_invalidate_all();
+		set_grace_period();
+	}
+}
+
 /*
 * This is the lockd kernel thread
 */
@@ -149,10 +159,7 @@ lockd(void *vrqstp)
 
 		if (signalled()) {
 			flush_signals(current);
-			if (nlmsvc_ops) {
-				nlmsvc_invalidate_all();
-				set_grace_period();
-			}
+			restart_grace();
 			continue;
 		}
 
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
426 ret = nlm_granted; 426 ret = nlm_granted;
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 /*
430 * If this is a blocking request for an
431 * already pending lock request then we need
432 * to put it back on lockd's block list
433 */
434 if (wait)
435 break;
429 ret = nlm_lck_denied; 436 ret = nlm_lck_denied;
430 break; 437 goto out;
431 case FILE_LOCK_DEFERRED: 438 case FILE_LOCK_DEFERRED:
432 if (wait) 439 if (wait)
433 break; 440 break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 450 goto out;
444 } 451 }
445 452
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
450 ret = nlm_lck_blocked; 453 ret = nlm_lck_blocked;
451 454
452 /* Append to list of blocked */ 455 /* Append to list of blocked */
diff --git a/fs/namei.c b/fs/namei.c
index b8433ebfae05..967c3db92724 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1130,8 +1130,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1130 * @nd: pointer to nameidata 1130 * @nd: pointer to nameidata
1131 * @open_flags: open intent flags 1131 * @open_flags: open intent flags
1132 */ 1132 */
1133int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, 1133static int path_lookup_open(int dfd, const char *name,
1134 struct nameidata *nd, int open_flags) 1134 unsigned int lookup_flags, struct nameidata *nd, int open_flags)
1135{ 1135{
1136 struct file *filp = get_empty_filp(); 1136 struct file *filp = get_empty_filp();
1137 int err; 1137 int err;
@@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1248 int err; 1248 int err;
1249 struct qstr this; 1249 struct qstr this;
1250 1250
1251 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1252
1251 err = __lookup_one_len(name, &this, base, len); 1253 err = __lookup_one_len(name, &this, base, len);
1252 if (err) 1254 if (err)
1253 return ERR_PTR(err); 1255 return ERR_PTR(err);
@@ -1635,18 +1637,19 @@ static int open_will_write_to_fs(int flag, struct inode *inode)
1635 * open_to_namei_flags() for more details. 1637 * open_to_namei_flags() for more details.
1636 */ 1638 */
1637struct file *do_filp_open(int dfd, const char *pathname, 1639struct file *do_filp_open(int dfd, const char *pathname,
1638 int open_flag, int mode) 1640 int open_flag, int mode, int acc_mode)
1639{ 1641{
1640 struct file *filp; 1642 struct file *filp;
1641 struct nameidata nd; 1643 struct nameidata nd;
1642 int acc_mode, error; 1644 int error;
1643 struct path path; 1645 struct path path;
1644 struct dentry *dir; 1646 struct dentry *dir;
1645 int count = 0; 1647 int count = 0;
1646 int will_write; 1648 int will_write;
1647 int flag = open_to_namei_flags(open_flag); 1649 int flag = open_to_namei_flags(open_flag);
1648 1650
1649 acc_mode = MAY_OPEN | ACC_MODE(flag); 1651 if (!acc_mode)
1652 acc_mode = MAY_OPEN | ACC_MODE(flag);
1650 1653
1651 /* O_TRUNC implies we need access checks for write permissions */ 1654 /* O_TRUNC implies we need access checks for write permissions */
1652 if (flag & O_TRUNC) 1655 if (flag & O_TRUNC)
@@ -1867,7 +1870,7 @@ do_link:
1867 */ 1870 */
1868struct file *filp_open(const char *filename, int flags, int mode) 1871struct file *filp_open(const char *filename, int flags, int mode)
1869{ 1872{
1870 return do_filp_open(AT_FDCWD, filename, flags, mode); 1873 return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
1871} 1874}
1872EXPORT_SYMBOL(filp_open); 1875EXPORT_SYMBOL(filp_open);
1873 1876
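
do_filp_open() now takes an explicit acc_mode. Passing 0 preserves the old behaviour of deriving the permission mask from the open flags; a nonzero value lets a caller that has already computed MAY_* bits supply them directly. A hedged usage sketch (the path is a placeholder):

        /* old behaviour: mask derived from O_RDONLY */
        filp = do_filp_open(AT_FDCWD, "/etc/fstab", O_RDONLY, 0, 0);

        /* hypothetical caller supplying an explicit mask instead */
        filp = do_filp_open(AT_FDCWD, "/etc/fstab", O_RDONLY, 0,
                            MAY_OPEN | MAY_READ);
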
diff --git a/fs/namespace.c b/fs/namespace.c
index c6f54e4c4290..134d494158d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -695,12 +695,16 @@ static inline void mangle(struct seq_file *m, const char *s)
695 */ 695 */
696int generic_show_options(struct seq_file *m, struct vfsmount *mnt) 696int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
697{ 697{
698 const char *options = mnt->mnt_sb->s_options; 698 const char *options;
699
700 rcu_read_lock();
701 options = rcu_dereference(mnt->mnt_sb->s_options);
699 702
700 if (options != NULL && options[0]) { 703 if (options != NULL && options[0]) {
701 seq_putc(m, ','); 704 seq_putc(m, ',');
702 mangle(m, options); 705 mangle(m, options);
703 } 706 }
707 rcu_read_unlock();
704 708
705 return 0; 709 return 0;
706} 710}
@@ -721,11 +725,22 @@ EXPORT_SYMBOL(generic_show_options);
721 */ 725 */
722void save_mount_options(struct super_block *sb, char *options) 726void save_mount_options(struct super_block *sb, char *options)
723{ 727{
724 kfree(sb->s_options); 728 BUG_ON(sb->s_options);
725 sb->s_options = kstrdup(options, GFP_KERNEL); 729 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
726} 730}
727EXPORT_SYMBOL(save_mount_options); 731EXPORT_SYMBOL(save_mount_options);
728 732
733void replace_mount_options(struct super_block *sb, char *options)
734{
735 char *old = sb->s_options;
736 rcu_assign_pointer(sb->s_options, options);
737 if (old) {
738 synchronize_rcu();
739 kfree(old);
740 }
741}
742EXPORT_SYMBOL(replace_mount_options);
743
729#ifdef CONFIG_PROC_FS 744#ifdef CONFIG_PROC_FS
730/* iterator */ 745/* iterator */
731static void *m_start(struct seq_file *m, loff_t *pos) 746static void *m_start(struct seq_file *m, loff_t *pos)
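
The generic_show_options()/replace_mount_options() pair above is the classic RCU publish-and-retire pattern: readers dereference under rcu_read_lock() and never block, while the updater publishes the new string first and frees the old one only after a grace period. A minimal self-contained sketch, with opts standing in for sb->s_options:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static char *opts;                      /* stand-in for sb->s_options */

static void opts_show(void)
{
        char *o;

        rcu_read_lock();
        o = rcu_dereference(opts);      /* readers never block */
        if (o)
                printk(KERN_INFO "%s\n", o);
        rcu_read_unlock();
}

static void opts_replace(char *new)
{
        char *old = opts;

        rcu_assign_pointer(opts, new);  /* publish the new string */
        if (old) {
                synchronize_rcu();      /* wait out current readers */
                kfree(old);             /* then retire the old copy */
        }
}
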
@@ -1073,9 +1088,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1073 */ 1088 */
1074 1089
1075 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1090 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1076 lock_kernel();
1077 sb->s_op->umount_begin(sb); 1091 sb->s_op->umount_begin(sb);
1078 unlock_kernel();
1079 } 1092 }
1080 1093
1081 /* 1094 /*
@@ -1377,7 +1390,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1377 if (parent_path) { 1390 if (parent_path) {
1378 detach_mnt(source_mnt, parent_path); 1391 detach_mnt(source_mnt, parent_path);
1379 attach_mnt(source_mnt, path); 1392 attach_mnt(source_mnt, path);
1380 touch_mnt_namespace(current->nsproxy->mnt_ns); 1393 touch_mnt_namespace(parent_path->mnt->mnt_ns);
1381 } else { 1394 } else {
1382 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); 1395 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
1383 commit_tree(source_mnt); 1396 commit_tree(source_mnt);
@@ -1920,8 +1933,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1920 if (data_page) 1933 if (data_page)
1921 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1934 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1922 1935
1923 /* Default to relatime */ 1936 /* Default to relatime unless overridden */
1924 mnt_flags |= MNT_RELATIME; 1937 if (!(flags & MS_NOATIME))
1938 mnt_flags |= MNT_RELATIME;
1925 1939
1926 /* Separate the per-mountpoint flags */ 1940 /* Separate the per-mountpoint flags */
1927 if (flags & MS_NOSUID) 1941 if (flags & MS_NOSUID)
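
With the hunk above, relatime becomes the kernel-side default, but an explicit MS_NOATIME from userspace still wins. A hedged userspace sketch (device, mountpoint and fs type are placeholders):

#include <sys/mount.h>

int main(void)
{
        /* suppresses atime updates entirely; the kernel skips the
         * relatime default for this mount */
        mount("/dev/sda1", "/mnt", "ext3", MS_NOATIME, NULL);

        /* no atime flags at all: this mount now defaults to relatime */
        return mount("/dev/sda1", "/mnt", "ext3", 0, NULL);
}
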
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index f54360f50a9c..fa038df63ac8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -660,13 +660,10 @@ outrel:
660 if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) 660 if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
661 return -ENOMEM; 661 return -ENOMEM;
662 if (user.object_name_len) { 662 if (user.object_name_len) {
663 newname = kmalloc(user.object_name_len, GFP_USER); 663 newname = memdup_user(user.object_name,
664 if (!newname) 664 user.object_name_len);
665 return -ENOMEM; 665 if (IS_ERR(newname))
666 if (copy_from_user(newname, user.object_name, user.object_name_len)) { 666 return PTR_ERR(newname);
667 kfree(newname);
668 return -EFAULT;
669 }
670 } else { 667 } else {
671 newname = NULL; 668 newname = NULL;
672 } 669 }
@@ -760,13 +757,9 @@ outrel:
760 if (user.len > NCP_PRIVATE_DATA_MAX_LEN) 757 if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
761 return -ENOMEM; 758 return -ENOMEM;
762 if (user.len) { 759 if (user.len) {
763 new = kmalloc(user.len, GFP_USER); 760 new = memdup_user(user.data, user.len);
764 if (!new) 761 if (IS_ERR(new))
765 return -ENOMEM; 762 return PTR_ERR(new);
766 if (copy_from_user(new, user.data, user.len)) {
767 kfree(new);
768 return -EFAULT;
769 }
770 } else { 763 } else {
771 new = NULL; 764 new = NULL;
772 } 765 }
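
Both ncpfs hunks replace the kmalloc()/copy_from_user()/kfree() dance with memdup_user(), which returns either the copied buffer or an ERR_PTR. Roughly, it behaves like the sketch below; one behavioural nuance is that the old ncpfs code allocated with GFP_USER while memdup_user() uses GFP_KERNEL:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

static void *memdup_user_sketch(const void __user *src, size_t len)
{
        void *p = kmalloc(len, GFP_KERNEL);

        if (!p)
                return ERR_PTR(-ENOMEM);
        if (copy_from_user(p, src, len)) {
                kfree(p);
                return ERR_PTR(-EFAULT);
        }
        return p;
}
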
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 370b190a09d1..89f98e9a024b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1943,7 +1943,8 @@ int nfs_permission(struct inode *inode, int mask)
1943 case S_IFREG: 1943 case S_IFREG:
1944 /* NFSv4 has atomic_open... */ 1944 /* NFSv4 has atomic_open... */
1945 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) 1945 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
1946 && (mask & MAY_OPEN)) 1946 && (mask & MAY_OPEN)
1947 && !(mask & MAY_EXEC))
1947 goto out; 1948 goto out;
1948 break; 1949 break;
1949 case S_IFDIR: 1950 case S_IFDIR:
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3523b895eb4b..ec7e27d00bc6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -516,13 +516,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
516 goto out_unlock; 516 goto out_unlock;
517 517
518 ret = nfs_updatepage(filp, page, 0, pagelen); 518 ret = nfs_updatepage(filp, page, 0, pagelen);
519 if (ret == 0)
520 ret = pagelen;
521out_unlock: 519out_unlock:
520 if (!ret)
521 return VM_FAULT_LOCKED;
522 unlock_page(page); 522 unlock_page(page);
523 if (ret) 523 return VM_FAULT_SIGBUS;
524 ret = VM_FAULT_SIGBUS;
525 return ret;
526} 524}
527 525
528static struct vm_operations_struct nfs_file_vm_ops = { 526static struct vm_operations_struct nfs_file_vm_ops = {
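
The nfs_vm_page_mkwrite() cleanup tracks a VM API change: ->page_mkwrite() handlers now return VM_FAULT_* codes, and VM_FAULT_LOCKED hands the still-locked page back to the fault path instead of a byte count. A minimal sketch of the convention; prepare_write_somehow() is a hypothetical helper:

#include <linux/mm.h>
#include <linux/pagemap.h>

static int demo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct page *page = vmf->page;

        lock_page(page);
        if (prepare_write_somehow(page)) {      /* hypothetical */
                unlock_page(page);
                return VM_FAULT_SIGBUS;         /* failure: page unlocked */
        }
        return VM_FAULT_LOCKED;                 /* success: page stays locked */
}
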
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e6a1932c7110..35869a4921f1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
713 if (args->npages != 0) 713 if (args->npages != 0)
714 xdr_encode_pages(buf, args->pages, 0, args->len); 714 xdr_encode_pages(buf, args->pages, 0, args->len);
715 else 715 else
716 req->rq_slen += args->len; 716 req->rq_slen = xdr_adjust_iovec(req->rq_svec,
717 p + XDR_QUADLEN(args->len));
717 718
718 err = nfsacl_encode(buf, base, args->inode, 719 err = nfsacl_encode(buf, base, args->inode,
719 (args->mask & NFS_ACL) ? 720 (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82eaadbff408..d2d67781c579 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -683,9 +683,12 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
683 */ 683 */
684static void nfs_umount_begin(struct super_block *sb) 684static void nfs_umount_begin(struct super_block *sb)
685{ 685{
686 struct nfs_server *server = NFS_SB(sb); 686 struct nfs_server *server;
687 struct rpc_clnt *rpc; 687 struct rpc_clnt *rpc;
688 688
689 lock_kernel();
690
691 server = NFS_SB(sb);
689 /* -EIO all pending I/O */ 692 /* -EIO all pending I/O */
690 rpc = server->client_acl; 693 rpc = server->client_acl;
691 if (!IS_ERR(rpc)) 694 if (!IS_ERR(rpc))
@@ -693,6 +696,8 @@ static void nfs_umount_begin(struct super_block *sb)
693 rpc = server->client; 696 rpc = server->client;
694 if (!IS_ERR(rpc)) 697 if (!IS_ERR(rpc))
695 rpc_killall_tasks(rpc); 698 rpc_killall_tasks(rpc);
699
700 unlock_kernel();
696} 701}
697 702
698/* 703/*
@@ -1228,7 +1233,6 @@ static int nfs_parse_mount_options(char *raw,
1228 goto out_nomem; 1233 goto out_nomem;
1229 token = match_token(string, 1234 token = match_token(string,
1230 nfs_xprt_protocol_tokens, args); 1235 nfs_xprt_protocol_tokens, args);
1231 kfree(string);
1232 1236
1233 switch (token) { 1237 switch (token) {
1234 case Opt_xprt_udp: 1238 case Opt_xprt_udp:
@@ -1258,6 +1262,7 @@ static int nfs_parse_mount_options(char *raw,
1258 goto out_nomem; 1262 goto out_nomem;
1259 token = match_token(string, 1263 token = match_token(string,
1260 nfs_xprt_protocol_tokens, args); 1264 nfs_xprt_protocol_tokens, args);
1265 kfree(string);
1261 1266
1262 switch (token) { 1267 switch (token) {
1263 case Opt_xprt_udp: 1268 case Opt_xprt_udp:
@@ -2106,8 +2111,7 @@ out_err_nosb:
2106error_splat_root: 2111error_splat_root:
2107 dput(mntroot); 2112 dput(mntroot);
2108error_splat_super: 2113error_splat_super:
2109 up_write(&s->s_umount); 2114 deactivate_locked_super(s);
2110 deactivate_super(s);
2111 goto out; 2115 goto out;
2112} 2116}
2113 2117
@@ -2203,8 +2207,7 @@ out_err_noserver:
2203 return error; 2207 return error;
2204 2208
2205error_splat_super: 2209error_splat_super:
2206 up_write(&s->s_umount); 2210 deactivate_locked_super(s);
2207 deactivate_super(s);
2208 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2211 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2209 return error; 2212 return error;
2210} 2213}
@@ -2464,8 +2467,7 @@ out_free:
2464error_splat_root: 2467error_splat_root:
2465 dput(mntroot); 2468 dput(mntroot);
2466error_splat_super: 2469error_splat_super:
2467 up_write(&s->s_umount); 2470 deactivate_locked_super(s);
2468 deactivate_super(s);
2469 goto out; 2471 goto out;
2470} 2472}
2471 2473
@@ -2559,8 +2561,7 @@ out_err_noserver:
2559 return error; 2561 return error;
2560 2562
2561error_splat_super: 2563error_splat_super:
2562 up_write(&s->s_umount); 2564 deactivate_locked_super(s);
2563 deactivate_super(s);
2564 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2565 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2565 return error; 2566 return error;
2566} 2567}
@@ -2644,8 +2645,7 @@ out_err_noserver:
2644 return error; 2645 return error;
2645 2646
2646error_splat_super: 2647error_splat_super:
2647 up_write(&s->s_umount); 2648 deactivate_locked_super(s);
2648 deactivate_super(s);
2649 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2649 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2650 return error; 2650 return error;
2651} 2651}
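
Every error_splat_super label above collapses the up_write(&s->s_umount); deactivate_super(s); pair into deactivate_locked_super(), which drops the still-held s_umount rwsem and the active reference in one call. A hedged error-path sketch; fill_super_somehow() is hypothetical:

#include <linux/fs.h>

static int demo_get_sb(struct file_system_type *fs_type)
{
        struct super_block *s;
        int error;

        s = sget(fs_type, NULL, set_anon_super, NULL); /* s_umount held */
        if (IS_ERR(s))
                return PTR_ERR(s);

        error = fill_super_somehow(s);          /* hypothetical */
        if (error) {
                deactivate_locked_super(s);     /* unlock + drop in one go */
                return error;
        }
        s->s_flags |= MS_ACTIVE;
        return 0;
}
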
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
1config NFSD 1config NFSD
2 tristate "NFS server support" 2 tristate "NFS server support"
3 depends on INET 3 depends on INET
4 depends on FILE_LOCKING
4 select LOCKD 5 select LOCKD
5 select SUNRPC 6 select SUNRPC
6 select EXPORTFS 7 select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/major.h> 20#include <linux/major.h>
21#include <linux/magic.h>
21 22
22#include <linux/sunrpc/svc.h> 23#include <linux/sunrpc/svc.h>
23#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
202 struct nfsd3_writeres *resp) 203 struct nfsd3_writeres *resp)
203{ 204{
204 __be32 nfserr; 205 __be32 nfserr;
206 unsigned long cnt = argp->len;
205 207
206 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 208 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
207 SVCFH_fmt(&argp->fh), 209 SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
214 nfserr = nfsd_write(rqstp, &resp->fh, NULL, 216 nfserr = nfsd_write(rqstp, &resp->fh, NULL,
215 argp->offset, 217 argp->offset,
216 rqstp->rq_vec, argp->vlen, 218 rqstp->rq_vec, argp->vlen,
217 argp->len, 219 &cnt,
218 &resp->committed); 220 &resp->committed);
219 resp->count = argp->count; 221 resp->count = cnt;
220 RETURN_STATUS(nfserr); 222 RETURN_STATUS(nfserr);
221} 223}
222 224
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
569 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; 571 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
570 572
571 /* Note that we don't care for remote fs's here */ 573 /* Note that we don't care for remote fs's here */
572 if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { 574 if (sb->s_magic == MSDOS_SUPER_MAGIC) {
573 resp->f_properties = NFS3_FSF_BILLYBOY; 575 resp->f_properties = NFS3_FSF_BILLYBOY;
574 } 576 }
575 resp->f_maxfilesize = sb->s_maxbytes; 577 resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
610 resp->p_link_max = EXT2_LINK_MAX; 612 resp->p_link_max = EXT2_LINK_MAX;
611 resp->p_name_max = EXT2_NAME_LEN; 613 resp->p_name_max = EXT2_NAME_LEN;
612 break; 614 break;
613 case 0x4d44: /* MSDOS_SUPER_MAGIC */ 615 case MSDOS_SUPER_MAGIC:
614 resp->p_case_insensitive = 1; 616 resp->p_case_insensitive = 1;
615 resp->p_case_preserving = 0; 617 resp->p_case_preserving = 0;
616 break; 618 break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
219{ 219{
220 __be32 *p; 220 __be32 *p;
221 int len = cb_rec->cbr_fhlen; 221 int len = cb_rec->cbr_fh.fh_size;
222 222
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
224 WRITE32(OP_CB_RECALL); 224 WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 227 WRITE32(cb_rec->cbr_trunc);
228 WRITE32(len); 228 WRITE32(len);
229 WRITEMEM(cb_rec->cbr_fhval, len); 229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
230 return 0; 230 return 0;
231} 231}
232 232
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
361/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 362 * And why is cb_set an atomic? */
363 363
364static int do_probe_callback(void *data) 364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
365{ 365{
366 struct nfs4_client *clp = data;
367 struct sockaddr_in addr; 366 struct sockaddr_in addr;
368 struct nfs4_callback *cb = &clp->cl_callback; 367 struct nfs4_callback *cb = &clp->cl_callback;
369 struct rpc_timeout timeparms = { 368 struct rpc_timeout timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 383 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal, 384 .client_name = clp->cl_principal,
386 }; 385 };
387 struct rpc_message msg = {
388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
389 .rpc_argp = clp,
390 };
391 struct rpc_clnt *client; 386 struct rpc_clnt *client;
392 int status;
393 387
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { 388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
395 status = nfserr_cb_path_down; 389 return ERR_PTR(-EINVAL);
396 goto out_err;
397 }
398 390
399 /* Initialize address */ 391 /* Initialize address */
400 memset(&addr, 0, sizeof(addr)); 392 memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
404 396
405 /* Create RPC client */ 397 /* Create RPC client */
406 client = rpc_create(&args); 398 client = rpc_create(&args);
399 if (IS_ERR(client))
400 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client));
402 return client;
403
404}
405
406static int do_probe_callback(void *data)
407{
408 struct nfs4_client *clp = data;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp,
413 };
414 struct rpc_clnt *client;
415 int status;
416
417 client = setup_callback_client(clp);
407 if (IS_ERR(client)) { 418 if (IS_ERR(client)) {
408 dprintk("NFSD: couldn't create callback client\n");
409 status = PTR_ERR(client); 419 status = PTR_ERR(client);
420 dprintk("NFSD: couldn't create callback client: %d\n",
421 status);
410 goto out_err; 422 goto out_err;
411 } 423 }
412 424
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
422out_release_client: 434out_release_client:
423 rpc_shutdown_client(client); 435 rpc_shutdown_client(client);
424out_err: 436out_err:
425 dprintk("NFSD: warning: no callback path to client %.*s\n", 437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
426 (int)clp->cl_name.len, clp->cl_name.data); 438 (int)clp->cl_name.len, clp->cl_name.data, status);
427 put_nfs4_client(clp); 439 put_nfs4_client(clp);
428 return status; 440 return 0;
429} 441}
430 442
431/* 443/*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
451 463
452/* 464/*
453 * called with dp->dl_count inc'ed. 465 * called with dp->dl_count inc'ed.
454 * nfs4_lock_state() may or may not have been called.
455 */ 466 */
456void 467void
457nfsd4_cb_recall(struct nfs4_delegation *dp) 468nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
93 open->op_truncate = 0; 93 open->op_truncate = 0;
94 94
95 if (open->op_create) { 95 if (open->op_create) {
96 /* FIXME: check session persistence and pnfs flags.
97 * The nfsv4.1 spec requires the following semantics:
98 *
99 * Persistent | pNFS | Server REQUIRED | Client Allowed
100 * Reply Cache | server | |
101 * -------------+--------+-----------------+--------------------
102 * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
103 * | | | (SHOULD)
104 * | | and EXCLUSIVE4 | or EXCLUSIVE4
105 * | | | (SHOULD NOT)
106 * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
107 * yes | no | GUARDED4 | GUARDED4
108 * yes | yes | GUARDED4 | GUARDED4
109 */
110
96 /* 111 /*
97 * Note: create modes (UNCHECKED,GUARDED...) are the same 112 * Note: create modes (UNCHECKED,GUARDED...) are the same
98 * in NFSv4 as in v3. 113 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
103 (u32 *)open->op_verf.data, 118 (u32 *)open->op_verf.data,
104 &open->op_truncate, &created); 119 &open->op_truncate, &created);
105 120
106 /* If we ever decide to use different attrs to store the 121 /*
107 * verifier in nfsd_create_v3, then we'll need to change this 122 * Following rfc 3530 14.2.16, use the returned bitmask
123 * to indicate which attributes we used to store the
124 * verifier:
108 */ 125 */
109 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) 126 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
110 open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | 127 open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
111 FATTR4_WORD1_TIME_MODIFY); 128 FATTR4_WORD1_TIME_MODIFY);
112 } else { 129 } else {
113 status = nfsd_lookup(rqstp, current_fh, 130 status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
118 goto out; 135 goto out;
119 136
120 set_change_info(&open->op_cinfo, current_fh); 137 set_change_info(&open->op_cinfo, current_fh);
121
122 /* set reply cache */
123 fh_dup2(current_fh, &resfh); 138 fh_dup2(current_fh, &resfh);
124 open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
125 memcpy(open->op_stateowner->so_replay.rp_openfh,
126 &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
127 139
140 /* set reply cache */
141 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
142 &resfh.fh_handle);
128 if (!created) 143 if (!created)
129 status = do_open_permission(rqstp, current_fh, open, 144 status = do_open_permission(rqstp, current_fh, open,
130 NFSD_MAY_NOP); 145 NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
150 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); 165 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
151 166
152 /* set replay cache */ 167 /* set replay cache */
153 open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; 168 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
154 memcpy(open->op_stateowner->so_replay.rp_openfh, 169 &current_fh->fh_handle);
155 &current_fh->fh_handle.fh_base,
156 current_fh->fh_handle.fh_size);
157 170
158 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 171 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
159 (open->op_iattr.ia_size == 0); 172 (open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
164 return status; 177 return status;
165} 178}
166 179
180static void
181copy_clientid(clientid_t *clid, struct nfsd4_session *session)
182{
183 struct nfsd4_sessionid *sid =
184 (struct nfsd4_sessionid *)session->se_sessionid.data;
185
186 clid->cl_boot = sid->clientid.cl_boot;
187 clid->cl_id = sid->clientid.cl_id;
188}
167 189
168static __be32 190static __be32
169nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 191nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
170 struct nfsd4_open *open) 192 struct nfsd4_open *open)
171{ 193{
172 __be32 status; 194 __be32 status;
195 struct nfsd4_compoundres *resp;
196
173 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 197 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
174 (int)open->op_fname.len, open->op_fname.data, 198 (int)open->op_fname.len, open->op_fname.data,
175 open->op_stateowner); 199 open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
178 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) 202 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
179 return nfserr_inval; 203 return nfserr_inval;
180 204
205 if (nfsd4_has_session(cstate))
206 copy_clientid(&open->op_clientid, cstate->session);
207
181 nfs4_lock_state(); 208 nfs4_lock_state();
182 209
183 /* check seqid for replay. set nfs4_owner */ 210 /* check seqid for replay. set nfs4_owner */
184 status = nfsd4_process_open1(open); 211 resp = rqstp->rq_resp;
212 status = nfsd4_process_open1(&resp->cstate, open);
185 if (status == nfserr_replay_me) { 213 if (status == nfserr_replay_me) {
186 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 214 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
187 fh_put(&cstate->current_fh); 215 fh_put(&cstate->current_fh);
188 cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; 216 fh_copy_shallow(&cstate->current_fh.fh_handle,
189 memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, 217 &rp->rp_openfh);
190 rp->rp_openfh_len);
191 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 218 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
192 if (status) 219 if (status)
193 dprintk("nfsd4_open: replay failed" 220 dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
209 236
210 switch (open->op_claim_type) { 237 switch (open->op_claim_type) {
211 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 238 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
212 status = nfserr_inval;
213 if (open->op_create)
214 goto out;
215 /* fall through */
216 case NFS4_OPEN_CLAIM_NULL: 239 case NFS4_OPEN_CLAIM_NULL:
217 /* 240 /*
218 * (1) set CURRENT_FH to the file being opened, 241 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
455 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 478 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
456 return nfserr_inval; 479 return nfserr_inval;
457 480
458 getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 481 getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
459 getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 482 getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
483 getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
460 484
461 getattr->ga_fhp = &cstate->current_fh; 485 getattr->ga_fhp = &cstate->current_fh;
462 return nfs_ok; 486 return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
520 544
521 nfs4_lock_state(); 545 nfs4_lock_state();
522 /* check stateid */ 546 /* check stateid */
523 if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, 547 if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
524 &read->rd_stateid, 548 RD_STATE, &read->rd_filp))) {
525 CHECK_FH | RD_STATE, &read->rd_filp))) {
526 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); 549 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
527 goto out; 550 goto out;
528 } 551 }
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
548 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 571 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
549 return nfserr_inval; 572 return nfserr_inval;
550 573
551 readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 574 readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
552 readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 575 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
576 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
553 577
554 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || 578 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
555 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) 579 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
653 677
654 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 678 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
655 nfs4_lock_state(); 679 nfs4_lock_state();
656 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 680 status = nfs4_preprocess_stateid_op(cstate,
657 &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); 681 &setattr->sa_stateid, WR_STATE, NULL);
658 nfs4_unlock_state(); 682 nfs4_unlock_state();
659 if (status) { 683 if (status) {
660 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); 684 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
685 struct file *filp = NULL; 709 struct file *filp = NULL;
686 u32 *p; 710 u32 *p;
687 __be32 status = nfs_ok; 711 __be32 status = nfs_ok;
712 unsigned long cnt;
688 713
689 /* no need to check permission - this will be done in nfsd_write() */ 714 /* no need to check permission - this will be done in nfsd_write() */
690 715
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
692 return nfserr_inval; 717 return nfserr_inval;
693 718
694 nfs4_lock_state(); 719 nfs4_lock_state();
695 status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, 720 status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
696 CHECK_FH | WR_STATE, &filp);
697 if (filp) 721 if (filp)
698 get_file(filp); 722 get_file(filp);
699 nfs4_unlock_state(); 723 nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
703 return status; 727 return status;
704 } 728 }
705 729
706 write->wr_bytes_written = write->wr_buflen; 730 cnt = write->wr_buflen;
707 write->wr_how_written = write->wr_stable_how; 731 write->wr_how_written = write->wr_stable_how;
708 p = (u32 *)write->wr_verifier.data; 732 p = (u32 *)write->wr_verifier.data;
709 *p++ = nfssvc_boot.tv_sec; 733 *p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
711 735
712 status = nfsd_write(rqstp, &cstate->current_fh, filp, 736 status = nfsd_write(rqstp, &cstate->current_fh, filp,
713 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 737 write->wr_offset, rqstp->rq_vec, write->wr_vlen,
714 write->wr_buflen, &write->wr_how_written); 738 &cnt, &write->wr_how_written);
715 if (filp) 739 if (filp)
716 fput(filp); 740 fput(filp);
717 741
742 write->wr_bytes_written = cnt;
743
718 if (status == nfserr_symlink) 744 if (status == nfserr_symlink)
719 status = nfserr_inval; 745 status = nfserr_inval;
720 return status; 746 return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
737 if (status) 763 if (status)
738 return status; 764 return status;
739 765
740 if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) 766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
741 || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
742 return nfserr_attrnotsupp; 769 return nfserr_attrnotsupp;
743 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
744 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
766 if (status) 793 if (status)
767 goto out_kfree; 794 goto out_kfree;
768 795
769 p = buf + 3; 796 /* skip bitmap */
797 p = buf + 1 + ntohl(buf[0]);
770 status = nfserr_not_same; 798 status = nfserr_not_same;
771 if (ntohl(*p++) != verify->ve_attrlen) 799 if (ntohl(*p++) != verify->ve_attrlen)
772 goto out_kfree; 800 goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
813 nfsdstats.nfs4_opcount[opnum]++; 841 nfsdstats.nfs4_opcount[opnum]++;
814} 842}
815 843
816static void cstate_free(struct nfsd4_compound_state *cstate)
817{
818 if (cstate == NULL)
819 return;
820 fh_put(&cstate->current_fh);
821 fh_put(&cstate->save_fh);
822 BUG_ON(cstate->replay_owner);
823 kfree(cstate);
824}
825
826static struct nfsd4_compound_state *cstate_alloc(void)
827{
828 struct nfsd4_compound_state *cstate;
829
830 cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
831 if (cstate == NULL)
832 return NULL;
833 fh_init(&cstate->current_fh, NFS4_FHSIZE);
834 fh_init(&cstate->save_fh, NFS4_FHSIZE);
835 cstate->replay_owner = NULL;
836 return cstate;
837}
838
839typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, 844typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
840 void *); 845 void *);
846enum nfsd4_op_flags {
847 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
848 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */
849 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops required first in compound */
850};
841 851
842struct nfsd4_operation { 852struct nfsd4_operation {
843 nfsd4op_func op_func; 853 nfsd4op_func op_func;
844 u32 op_flags; 854 u32 op_flags;
845/* Most ops require a valid current filehandle; a few don't: */
846#define ALLOWED_WITHOUT_FH 1
847/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
848#define ALLOWED_ON_ABSENT_FS 2
849 char *op_name; 855 char *op_name;
850}; 856};
851 857
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
854static const char *nfsd4_op_name(unsigned opnum); 860static const char *nfsd4_op_name(unsigned opnum);
855 861
856/* 862/*
863 * This is a replay of a compound for which no cache entry pages
864 * were used. Encode the sequence operation, and if cachethis is FALSE
865 * encode the uncache rep error on the next operation.
866 */
867static __be32
868nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
869 struct nfsd4_compoundres *resp)
870{
871 struct nfsd4_op *op;
872
873 dprintk("--> %s resp->opcnt %d ce_cachethis %u\n", __func__,
874 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
875
876 /* Encode the replayed sequence operation */
877 BUG_ON(resp->opcnt != 1);
878 op = &args->ops[resp->opcnt - 1];
879 nfsd4_encode_operation(resp, op);
880
881 /* Return nfserr_retry_uncached_rep in the next operation. */
882 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
883 op = &args->ops[resp->opcnt++];
884 op->status = nfserr_retry_uncached_rep;
885 nfsd4_encode_operation(resp, op);
886 }
887 return op->status;
888}
889
890/*
891 * Enforce NFSv4.1 COMPOUND ordering rules.
892 *
893 * TODO:
894 * - enforce NFS4ERR_NOT_ONLY_OP,
895 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
896 */
897static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
898{
899 if (args->minorversion && args->opcnt > 0) {
900 struct nfsd4_op *op = &args->ops[0];
901 return (op->status == nfserr_op_illegal) ||
902 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
903 }
904 return true;
905}
906
907/*
857 * COMPOUND call. 908 * COMPOUND call.
858 */ 909 */
859static __be32 910static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
863{ 914{
864 struct nfsd4_op *op; 915 struct nfsd4_op *op;
865 struct nfsd4_operation *opdesc; 916 struct nfsd4_operation *opdesc;
866 struct nfsd4_compound_state *cstate = NULL; 917 struct nfsd4_compound_state *cstate = &resp->cstate;
867 int slack_bytes; 918 int slack_bytes;
868 __be32 status; 919 __be32 status;
869 920
870 resp->xbuf = &rqstp->rq_res; 921 resp->xbuf = &rqstp->rq_res;
871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 922 resp->p = rqstp->rq_res.head[0].iov_base +
923 rqstp->rq_res.head[0].iov_len;
872 resp->tagp = resp->p; 924 resp->tagp = resp->p;
873 /* reserve space for: taglen, tag, and opcnt */ 925 /* reserve space for: taglen, tag, and opcnt */
874 resp->p += 2 + XDR_QUADLEN(args->taglen); 926 resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
877 resp->tag = args->tag; 929 resp->tag = args->tag;
878 resp->opcnt = 0; 930 resp->opcnt = 0;
879 resp->rqstp = rqstp; 931 resp->rqstp = rqstp;
932 resp->cstate.minorversion = args->minorversion;
933 resp->cstate.replay_owner = NULL;
934 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
935 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
936 /* Use the deferral mechanism only for NFSv4.0 compounds */
937 rqstp->rq_usedeferral = (args->minorversion == 0);
880 938
881 /* 939 /*
882 * According to RFC3010, this takes precedence over all other errors. 940 * According to RFC3010, this takes precedence over all other errors.
883 */ 941 */
884 status = nfserr_minor_vers_mismatch; 942 status = nfserr_minor_vers_mismatch;
885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 943 if (args->minorversion > nfsd_supported_minorversion)
886 goto out; 944 goto out;
887 945
888 status = nfserr_resource; 946 if (!nfs41_op_ordering_ok(args)) {
889 cstate = cstate_alloc(); 947 op = &args->ops[0];
890 if (cstate == NULL) 948 op->status = nfserr_sequence_pos;
891 goto out; 949 goto encode_op;
950 }
892 951
893 status = nfs_ok; 952 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 953 while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
897 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", 956 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
898 resp->opcnt, args->opcnt, op->opnum, 957 resp->opcnt, args->opcnt, op->opnum,
899 nfsd4_op_name(op->opnum)); 958 nfsd4_op_name(op->opnum));
900
901 /* 959 /*
902 * The XDR decode routines may have pre-set op->status; 960 * The XDR decode routines may have pre-set op->status;
903 * for example, if there is a miscellaneous XDR error 961 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
938 BUG_ON(op->status == nfs_ok); 996 BUG_ON(op->status == nfs_ok);
939 997
940encode_op: 998encode_op:
999 /* Only from SEQUENCE or CREATE_SESSION */
1000 if (resp->cstate.status == nfserr_replay_cache) {
1001 dprintk("%s NFS4.1 replay from cache\n", __func__);
1002 if (nfsd4_not_cached(resp))
1003 status = nfsd4_enc_uncached_replay(args, resp);
1004 else
1005 status = op->status;
1006 goto out;
1007 }
941 if (op->status == nfserr_replay_me) { 1008 if (op->status == nfserr_replay_me) {
942 op->replay = &cstate->replay_owner->so_replay; 1009 op->replay = &cstate->replay_owner->so_replay;
943 nfsd4_encode_replay(resp, op); 1010 nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
961 1028
962 nfsd4_increment_op_stats(op->opnum); 1029 nfsd4_increment_op_stats(op->opnum);
963 } 1030 }
1031 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1032 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1033 status = nfserr_jukebox;
1034 }
964 1035
965 cstate_free(cstate); 1036 resp->cstate.status = status;
1037 fh_put(&resp->cstate.current_fh);
1038 fh_put(&resp->cstate.save_fh);
1039 BUG_ON(resp->cstate.replay_owner);
966out: 1040out:
967 nfsd4_release_compoundargs(args); 1041 nfsd4_release_compoundargs(args);
1042 /* Reset deferral mechanism for RPC deferrals */
1043 rqstp->rq_usedeferral = 1;
968 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1044 dprintk("nfsv4 compound returned %d\n", ntohl(status));
969 return status; 1045 return status;
970} 1046}
971 1047
972static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { 1048static struct nfsd4_operation nfsd4_ops[] = {
973 [OP_ACCESS] = { 1049 [OP_ACCESS] = {
974 .op_func = (nfsd4op_func)nfsd4_access, 1050 .op_func = (nfsd4op_func)nfsd4_access,
975 .op_name = "OP_ACCESS", 1051 .op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1045 .op_name = "OP_PUTFH", 1121 .op_name = "OP_PUTFH",
1046 }, 1122 },
1047 [OP_PUTPUBFH] = { 1123 [OP_PUTPUBFH] = {
1048 /* unsupported, just for future reference: */ 1124 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1049 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1125 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1050 .op_name = "OP_PUTPUBFH", 1126 .op_name = "OP_PUTPUBFH",
1051 }, 1127 },
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1119 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1195 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1120 .op_name = "OP_RELEASE_LOCKOWNER", 1196 .op_name = "OP_RELEASE_LOCKOWNER",
1121 }, 1197 },
1198
1199 /* NFSv4.1 operations */
1200 [OP_EXCHANGE_ID] = {
1201 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1202 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1203 .op_name = "OP_EXCHANGE_ID",
1204 },
1205 [OP_CREATE_SESSION] = {
1206 .op_func = (nfsd4op_func)nfsd4_create_session,
1207 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1208 .op_name = "OP_CREATE_SESSION",
1209 },
1210 [OP_DESTROY_SESSION] = {
1211 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1212 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1213 .op_name = "OP_DESTROY_SESSION",
1214 },
1215 [OP_SEQUENCE] = {
1216 .op_func = (nfsd4op_func)nfsd4_sequence,
1217 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1218 .op_name = "OP_SEQUENCE",
1219 },
1122}; 1220};
1123 1221
1124static const char *nfsd4_op_name(unsigned opnum) 1222static const char *nfsd4_op_name(unsigned opnum)
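
A quiet change above: the nfsd4_ops declaration loses its explicit [OP_RELEASE_LOCKOWNER+1] bound. With C99 designated initializers the compiler sizes the array from the highest index used, so appending the NFSv4.1 opcodes simply grows it. A tiny illustration:

static const char *demo[] = {
        [3]  = "three",
        [40] = "forty",         /* sizeof(demo)/sizeof(demo[0]) == 41 */
};
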
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..b5348405046b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
182 182
183typedef int (recdir_func)(struct dentry *, struct dentry *); 183typedef int (recdir_func)(struct dentry *, struct dentry *);
184 184
185struct dentry_list { 185struct name_list {
186 struct dentry *dentry; 186 char name[HEXDIR_LEN];
187 struct list_head list; 187 struct list_head list;
188}; 188};
189 189
190struct dentry_list_arg {
191 struct list_head dentries;
192 struct dentry *parent;
193};
194
195static int 190static int
196nfsd4_build_dentrylist(void *arg, const char *name, int namlen, 191nfsd4_build_namelist(void *arg, const char *name, int namlen,
197 loff_t offset, u64 ino, unsigned int d_type) 192 loff_t offset, u64 ino, unsigned int d_type)
198{ 193{
199 struct dentry_list_arg *dla = arg; 194 struct list_head *names = arg;
200 struct list_head *dentries = &dla->dentries; 195 struct name_list *entry;
201 struct dentry *parent = dla->parent;
202 struct dentry *dentry;
203 struct dentry_list *child;
204 196
205 if (name && isdotent(name, namlen)) 197 if (namlen != HEXDIR_LEN - 1)
206 return 0; 198 return 0;
207 dentry = lookup_one_len(name, parent, namlen); 199 entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
208 if (IS_ERR(dentry)) 200 if (entry == NULL)
209 return PTR_ERR(dentry);
210 child = kmalloc(sizeof(*child), GFP_KERNEL);
211 if (child == NULL)
212 return -ENOMEM; 201 return -ENOMEM;
213 child->dentry = dentry; 202 memcpy(entry->name, name, HEXDIR_LEN - 1);
214 list_add(&child->list, dentries); 203 entry->name[HEXDIR_LEN - 1] = '\0';
204 list_add(&entry->list, names);
215 return 0; 205 return 0;
216} 206}
217 207
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
220{ 210{
221 const struct cred *original_cred; 211 const struct cred *original_cred;
222 struct file *filp; 212 struct file *filp;
223 struct dentry_list_arg dla = { 213 LIST_HEAD(names);
224 .parent = dir, 214 struct name_list *entry;
225 }; 215 struct dentry *dentry;
226 struct list_head *dentries = &dla.dentries;
227 struct dentry_list *child;
228 int status; 216 int status;
229 217
230 if (!rec_dir_init) 218 if (!rec_dir_init)
@@ -233,67 +221,42 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
233 status = nfs4_save_creds(&original_cred); 221 status = nfs4_save_creds(&original_cred);
234 if (status < 0) 222 if (status < 0)
235 return status; 223 return status;
236 INIT_LIST_HEAD(dentries);
237 224
238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 225 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred()); 226 current_cred());
240 status = PTR_ERR(filp); 227 status = PTR_ERR(filp);
241 if (IS_ERR(filp)) 228 if (IS_ERR(filp))
242 goto out; 229 goto out;
243 INIT_LIST_HEAD(dentries); 230 status = vfs_readdir(filp, nfsd4_build_namelist, &names);
244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
245 fput(filp); 231 fput(filp);
246 while (!list_empty(dentries)) { 232 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
247 child = list_entry(dentries->next, struct dentry_list, list); 233 while (!list_empty(&names)) {
248 status = f(dir, child->dentry); 234 entry = list_entry(names.next, struct name_list, list);
235
236 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
237 if (IS_ERR(dentry)) {
238 status = PTR_ERR(dentry);
239 break;
240 }
241 status = f(dir, dentry);
242 dput(dentry);
249 if (status) 243 if (status)
250 goto out; 244 break;
251 list_del(&child->list); 245 list_del(&entry->list);
252 dput(child->dentry); 246 kfree(entry);
253 kfree(child);
254 } 247 }
248 mutex_unlock(&dir->d_inode->i_mutex);
255out: 249out:
256 while (!list_empty(dentries)) { 250 while (!list_empty(&names)) {
257 child = list_entry(dentries->next, struct dentry_list, list); 251 entry = list_entry(names.next, struct name_list, list);
258 list_del(&child->list); 252 list_del(&entry->list);
259 dput(child->dentry); 253 kfree(entry);
260 kfree(child);
261 } 254 }
262 nfs4_reset_creds(original_cred); 255 nfs4_reset_creds(original_cred);
263 return status; 256 return status;
264} 257}
265 258
266static int 259static int
267nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
268{
269 int status;
270
271 if (!S_ISREG(dir->d_inode->i_mode)) {
272 printk("nfsd4: non-file found in client recovery directory\n");
273 return -EINVAL;
274 }
275 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
276 status = vfs_unlink(dir->d_inode, dentry);
277 mutex_unlock(&dir->d_inode->i_mutex);
278 return status;
279}
280
281static int
282nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
283{
284 int status;
285
286 /* For now this directory should already be empty, but we empty it of
287 * any regular files anyway, just in case the directory was created by
288 * a kernel from the future.... */
289 nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
290 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
291 status = vfs_rmdir(dir->d_inode, dentry);
292 mutex_unlock(&dir->d_inode->i_mutex);
293 return status;
294}
295
296static int
297nfsd4_unlink_clid_dir(char *name, int namlen) 260nfsd4_unlink_clid_dir(char *name, int namlen)
298{ 261{
299 struct dentry *dentry; 262 struct dentry *dentry;
@@ -301,20 +264,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
301 264
302 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 265 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
303 266
304 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 267 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
305 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 268 dentry = lookup_one_len(name, rec_dir.dentry, namlen);
306 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
307 if (IS_ERR(dentry)) { 269 if (IS_ERR(dentry)) {
308 status = PTR_ERR(dentry); 270 status = PTR_ERR(dentry);
309 return status; 271 goto out_unlock;
310 } 272 }
311 status = -ENOENT; 273 status = -ENOENT;
312 if (!dentry->d_inode) 274 if (!dentry->d_inode)
313 goto out; 275 goto out;
314 276 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
315 status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
316out: 277out:
317 dput(dentry); 278 dput(dentry);
279out_unlock:
280 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
318 return status; 281 return status;
319} 282}
320 283
@@ -353,10 +316,11 @@ purge_old(struct dentry *parent, struct dentry *child)
353{ 316{
354 int status; 317 int status;
355 318
356 if (nfs4_has_reclaimed_state(child->d_name.name)) 319 /* note: we currently use this path only for minorversion 0 */
320 if (nfs4_has_reclaimed_state(child->d_name.name, false))
357 return 0; 321 return 0;
358 322
359 status = nfsd4_clear_clid_dir(parent, child); 323 status = vfs_rmdir(parent->d_inode, child);
360 if (status) 324 if (status)
361 printk("failed to remove client recovery directory %s\n", 325 printk("failed to remove client recovery directory %s\n",
362 child->d_name.name); 326 child->d_name.name);
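
The recovery-directory rework above follows a standard VFS rule: never call lookup_one_len() from inside a vfs_readdir() callback. Names are snapshotted first, then resolved with the parent's i_mutex held, which is exactly what the new WARN_ON_ONCE() in lookup_one_len() (namei.c hunk earlier) polices. The shape of the second phase, reduced to a hedged sketch with a hypothetical process() callback:

        struct name_list *entry, *next;

        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
        list_for_each_entry_safe(entry, next, &names, list) {
                struct dentry *child = lookup_one_len(entry->name, dir,
                                                      strlen(entry->name));
                if (!IS_ERR(child)) {
                        process(dir, child);    /* hypothetical callback */
                        dput(child);
                }
                list_del(&entry->list);
                kfree(entry);
        }
        mutex_unlock(&dir->d_inode->i_mutex);
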
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..3b711f5147a7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
68static u32 nfs4_init; 68static u32 nfs4_init;
69static stateid_t zerostateid; /* bits all 0 */ 69static stateid_t zerostateid; /* bits all 0 */
70static stateid_t onestateid; /* bits all 1 */ 70static stateid_t onestateid; /* bits all 1 */
71static u64 current_sessionid = 1;
71 72
72#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 73#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
73#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 74#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
75/* forward declarations */ 76/* forward declarations */
76static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 77static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
77static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); 78static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
78static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
80static void nfs4_set_recdir(char *recdir); 80static void nfs4_set_recdir(char *recdir);
81 81
82/* Locking: 82/* Locking: */
83 * 83
84 * client_mutex: 84/* Currently used for almost all code touching nfsv4 state: */
85 * protects clientid_hashtbl[], clientstr_hashtbl[],
86 * unconfstr_hashtbl[], uncofid_hashtbl[].
87 */
88static DEFINE_MUTEX(client_mutex); 85static DEFINE_MUTEX(client_mutex);
89 86
87/*
88 * Currently used for the del_recall_lru and file hash table. In an
89 * effort to decrease the scope of the client_mutex, this spinlock may
90 * eventually cover more:
91 */
92static DEFINE_SPINLOCK(recall_lock);
93
90static struct kmem_cache *stateowner_slab = NULL; 94static struct kmem_cache *stateowner_slab = NULL;
91static struct kmem_cache *file_slab = NULL; 95static struct kmem_cache *file_slab = NULL;
92static struct kmem_cache *stateid_slab = NULL; 96static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
117 return x; 121 return x;
118} 122}
119 123
120/* forward declarations */
121static void release_stateowner(struct nfs4_stateowner *sop);
122static void release_stateid(struct nfs4_stateid *stp, int flags);
123
124/*
125 * Delegation state
126 */
127
128/* recall_lock protects the del_recall_lru */
129static DEFINE_SPINLOCK(recall_lock);
130static struct list_head del_recall_lru; 124static struct list_head del_recall_lru;
131 125
132static void
133free_nfs4_file(struct kref *kref)
134{
135 struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
136 list_del(&fp->fi_hash);
137 iput(fp->fi_inode);
138 kmem_cache_free(file_slab, fp);
139}
140
141static inline void 126static inline void
142put_nfs4_file(struct nfs4_file *fi) 127put_nfs4_file(struct nfs4_file *fi)
143{ 128{
144 kref_put(&fi->fi_ref, free_nfs4_file); 129 if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
130 list_del(&fi->fi_hash);
131 spin_unlock(&recall_lock);
132 iput(fi->fi_inode);
133 kmem_cache_free(file_slab, fi);
134 }
145} 135}
146 136
147static inline void 137static inline void
148get_nfs4_file(struct nfs4_file *fi) 138get_nfs4_file(struct nfs4_file *fi)
149{ 139{
150 kref_get(&fi->fi_ref); 140 atomic_inc(&fi->fi_ref);
151} 141}
152 142
153static int num_delegations; 143static int num_delegations;
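
put_nfs4_file() above trades a kref for the atomic_dec_and_lock() idiom: the refcount drop stays lock-free except on the final put, where the spinlock is acquired atomically with the 1->0 transition so the object can be unhashed without racing a concurrent get. The general shape, with obj, obj_lock and free_obj() as hypothetical stand-ins:

static void put_obj(struct obj *o)
{
        if (!atomic_dec_and_lock(&o->ref, &obj_lock))
                return;                 /* fast path: others still hold refs */
        list_del(&o->hash);             /* unhash while holding obj_lock */
        spin_unlock(&obj_lock);
        free_obj(o);                    /* hypothetical destructor */
}
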
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
220 dp->dl_stateid.si_stateownerid = current_delegid++; 210 dp->dl_stateid.si_stateownerid = current_delegid++;
221 dp->dl_stateid.si_fileid = 0; 211 dp->dl_stateid.si_fileid = 0;
222 dp->dl_stateid.si_generation = 0; 212 dp->dl_stateid.si_generation = 0;
223 dp->dl_fhlen = current_fh->fh_handle.fh_size; 213 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
224 memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
225 current_fh->fh_handle.fh_size);
226 dp->dl_time = 0; 214 dp->dl_time = 0;
227 atomic_set(&dp->dl_count, 1); 215 atomic_set(&dp->dl_count, 1);
228 list_add(&dp->dl_perfile, &fp->fi_delegations); 216 list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,290 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
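
gen_sessionid() and hash_sessionid() depend on the server-private layout of the opaque 16-byte session id: the 8-byte clientid, a monotonically increasing 4-byte sequence number that doubles as the hash key, and 4 reserved bytes. A sketch of that layout and the bucket computation (field types assumed from their use here):

#include <stdint.h>
#include <stdio.h>

#define SESSION_HASH_SIZE 512

/* Server-side view of the opaque sessionid sent to the client. */
struct sessionid {
        uint64_t clientid;      /* boot time + counter */
        uint32_t sequence;      /* current_sessionid++ at CREATE_SESSION */
        uint32_t reserved;
};

static int hash_sessionid(const struct sessionid *sid)
{
        /* The sequence number alone picks the bucket. */
        return sid->sequence % SESSION_HASH_SIZE;
}

int main(void)
{
        struct sessionid sid = { .clientid = 0x12345678abcdef00ULL,
                                 .sequence = 1037, .reserved = 0 };
        printf("bucket %d\n", hash_sessionid(&sid));    /* 1037 % 512 = 13 */
        return 0;
}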
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
423 * slot usage to make room for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
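
set_forechannel_maxreqs() converts the requested slot count to DRC pages, claims at most what remains of the server-wide budget (under nfsd_serv->sv_lock in the patch; locking elided here), and converts the claim back to whole slots. The arithmetic in isolation, with illustrative constants; the kernel derives the real ones from PAGE_SIZE and the configured DRC size:

#include <stdio.h>

#define PAGES_PER_SLOT  3       /* illustrative, not the NFSD value */
#define DRC_MAX_PAGES   96      /* illustrative server-wide budget */

static int drc_pages_used;

/* Returns the number of slots actually granted, 0 if the DRC is full. */
static int set_maxreqs(int wanted_slots)
{
        int np = wanted_slots * PAGES_PER_SLOT;

        if (np + drc_pages_used > DRC_MAX_PAGES)
                np = DRC_MAX_PAGES - drc_pages_used;    /* take what is left */
        drc_pages_used += np;
        if (np <= 0)
                return 0;                       /* nfserr_resource */
        return np / PAGES_PER_SLOT;             /* round down to whole slots */
}

int main(void)
{
        printf("%d\n", set_maxreqs(16));        /* 16: fits entirely */
        printf("%d\n", set_maxreqs(16));        /* 16: budget now exhausted */
        printf("%d\n", set_maxreqs(16));        /* 0: nothing left to grant */
        return 0;
}

Note the rounding: a partial page grant can strand up to PAGES_PER_SLOT - 1 pages, which matches the kernel's behavior here.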
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
466 /* Set the max response cached size to our default, which is
467 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached;
470
471 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops;
475
476 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
478 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
479
480 /* FIXME: Error means no more DRC pages so the server should
481 * recover pages from existing sessions. For now fail session
482 * creation.
483 */
484 status = set_forechannel_maxreqs(fchan);
485
486 session->se_fnumslots = fchan->maxreqs;
487 return status;
488}
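
init_forechannel_attrs() resolves every forechannel attribute the same way: take the client's request, clamp it to the server's ceiling, and record the agreed value in both the session and the reply. The negotiation in miniature (ceilings here are illustrative, not the NFSD constants):

#include <stdio.h>

static unsigned negotiate(unsigned wanted, unsigned server_cap)
{
        return wanted < server_cap ? wanted : server_cap;
}

int main(void)
{
        unsigned max_payload = 1048576;         /* svc_max_payload() stand-in */

        printf("maxreq_sz %u\n", negotiate(8 * 1048576, max_payload));
        printf("maxops    %u\n", negotiate(16, 8));     /* server cap wins */
        printf("maxreqs   %u\n", negotiate(64, 160));   /* client asks less */
        return 0;
}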
489
490static int
491alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
492 struct nfsd4_create_session *cses)
493{
494 struct nfsd4_session *new, tmp;
495 int idx, status = nfserr_resource, slotsize;
496
497 memset(&tmp, 0, sizeof(tmp));
498
499 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
501 if (status)
502 goto out;
503
504 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new)
508 goto out;
509
510 memcpy(new, &tmp, sizeof(*new));
511
512 new->se_client = clp;
513 gen_sessionid(new);
514 idx = hash_sessionid(&new->se_sessionid);
515 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
516 NFS4_MAX_SESSIONID_LEN);
517
518 new->se_flags = cses->flags;
519 kref_init(&new->se_ref);
520 spin_lock(&sessionid_lock);
521 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
522 list_add(&new->se_perclnt, &clp->cl_sessions);
523 spin_unlock(&sessionid_lock);
524
525 status = nfs_ok;
526out:
527 return status;
528}
529
530/* caller must hold sessionid_lock */
531static struct nfsd4_session *
532find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
533{
534 struct nfsd4_session *elem;
535 int idx;
536
537 dump_sessionid(__func__, sessionid);
538 idx = hash_sessionid(sessionid);
539 dprintk("%s: idx is %d\n", __func__, idx);
540 /* Search in the appropriate list */
541 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
542 dump_sessionid("list traversal", &elem->se_sessionid);
543 if (!memcmp(elem->se_sessionid.data, sessionid->data,
544 NFS4_MAX_SESSIONID_LEN)) {
545 return elem;
546 }
547 }
548
549 dprintk("%s: session not found\n", __func__);
550 return NULL;
551}
552
553/* caller must hold sessionid_lock */
554static void
555unhash_session(struct nfsd4_session *ses)
556{
557 list_del(&ses->se_hash);
558 list_del(&ses->se_perclnt);
559}
560
561static void
562release_session(struct nfsd4_session *ses)
563{
564 spin_lock(&sessionid_lock);
565 unhash_session(ses);
566 spin_unlock(&sessionid_lock);
567 nfsd4_put_session(ses);
568}
569
570static void nfsd4_release_respages(struct page **respages, short resused);
571
572void
573free_session(struct kref *kref)
574{
575 struct nfsd4_session *ses;
576 int i;
577
578 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 }
583 kfree(ses);
584}
585
314static inline void 586static inline void
315renew_client(struct nfs4_client *clp) 587renew_client(struct nfs4_client *clp)
316{ 588{
@@ -330,8 +602,8 @@ STALE_CLIENTID(clientid_t *clid)
330{ 602{
331 if (clid->cl_boot == boot_time) 603 if (clid->cl_boot == boot_time)
332 return 0; 604 return 0;
333 dprintk("NFSD stale clientid (%08x/%08x)\n", 605 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
334 clid->cl_boot, clid->cl_id); 606 clid->cl_boot, clid->cl_id, boot_time);
335 return 1; 607 return 1;
336} 608}
337 609
@@ -376,6 +648,8 @@ static inline void
376free_client(struct nfs4_client *clp) 648free_client(struct nfs4_client *clp)
377{ 649{
378 shutdown_callback_client(clp); 650 shutdown_callback_client(clp);
651 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
652 clp->cl_slot.sl_cache_entry.ce_resused);
379 if (clp->cl_cred.cr_group_info) 653 if (clp->cl_cred.cr_group_info)
380 put_group_info(clp->cl_cred.cr_group_info); 654 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal); 655 kfree(clp->cl_principal);
@@ -420,7 +694,13 @@ expire_client(struct nfs4_client *clp)
420 list_del(&clp->cl_lru); 694 list_del(&clp->cl_lru);
421 while (!list_empty(&clp->cl_openowners)) { 695 while (!list_empty(&clp->cl_openowners)) {
422 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 696 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
423 release_stateowner(sop); 697 release_openowner(sop);
698 }
699 while (!list_empty(&clp->cl_sessions)) {
700 struct nfsd4_session *ses;
701 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
702 se_perclnt);
703 release_session(ses);
424 } 704 }
425 put_nfs4_client(clp); 705 put_nfs4_client(clp);
426} 706}
@@ -439,6 +719,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
439 INIT_LIST_HEAD(&clp->cl_strhash); 719 INIT_LIST_HEAD(&clp->cl_strhash);
440 INIT_LIST_HEAD(&clp->cl_openowners); 720 INIT_LIST_HEAD(&clp->cl_openowners);
441 INIT_LIST_HEAD(&clp->cl_delegations); 721 INIT_LIST_HEAD(&clp->cl_delegations);
722 INIT_LIST_HEAD(&clp->cl_sessions);
442 INIT_LIST_HEAD(&clp->cl_lru); 723 INIT_LIST_HEAD(&clp->cl_lru);
443 return clp; 724 return clp;
444} 725}
@@ -568,25 +849,45 @@ find_unconfirmed_client(clientid_t *clid)
568 return NULL; 849 return NULL;
569} 850}
570 851
852/*
853 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
854 * parameter. Matching is based on the fact that at least one of the
855 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
856 *
857 * FIXME: we need to unify the clientid namespaces for nfsv4.x
858 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
859 * and SET_CLIENTID{,_CONFIRM}
860 */
861static inline int
862match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
863{
864 bool has_exchange_flags = (clp->cl_exchange_flags != 0);
865 return use_exchange_id == has_exchange_flags;
866}
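
match_clientid_establishment() leans on the invariant that EXCHANGE_ID always leaves at least one EXCHGID4_FLAG_USE_* bit set, so a non-zero cl_exchange_flags marks a client established via NFSv4.1. A sketch of the resulting truth table:

#include <stdbool.h>
#include <stdio.h>

/* cl_exchange_flags is zero for SETCLIENTID (v4.0) clients and
 * non-zero for EXCHANGE_ID (v4.1) clients. */
static bool match_establishment(unsigned cl_exchange_flags, bool use_exchange_id)
{
        return use_exchange_id == (cl_exchange_flags != 0);
}

int main(void)
{
        printf("%d\n", match_establishment(0, false));   /* 1: v4.0 lookup, v4.0 client */
        printf("%d\n", match_establishment(0x1, true));  /* 1: v4.1 lookup, v4.1 client */
        printf("%d\n", match_establishment(0x1, false)); /* 0: namespaces kept apart */
        return 0;
}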
867
571static struct nfs4_client * 868static struct nfs4_client *
572find_confirmed_client_by_str(const char *dname, unsigned int hashval) 869find_confirmed_client_by_str(const char *dname, unsigned int hashval,
870 bool use_exchange_id)
573{ 871{
574 struct nfs4_client *clp; 872 struct nfs4_client *clp;
575 873
576 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 874 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
577 if (same_name(clp->cl_recdir, dname)) 875 if (same_name(clp->cl_recdir, dname) &&
876 match_clientid_establishment(clp, use_exchange_id))
578 return clp; 877 return clp;
579 } 878 }
580 return NULL; 879 return NULL;
581} 880}
582 881
583static struct nfs4_client * 882static struct nfs4_client *
584find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 883find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
884 bool use_exchange_id)
585{ 885{
586 struct nfs4_client *clp; 886 struct nfs4_client *clp;
587 887
588 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 888 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
589 if (same_name(clp->cl_recdir, dname)) 889 if (same_name(clp->cl_recdir, dname) &&
890 match_clientid_establishment(clp, use_exchange_id))
590 return clp; 891 return clp;
591 } 892 }
592 return NULL; 893 return NULL;
@@ -685,6 +986,534 @@ out_err:
685 return; 986 return;
686} 987}
687 988
989void
990nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
991{
992 struct nfsd4_compoundres *resp = rqstp->rq_resp;
993
994 resp->cstate.statp = statp;
995}
996
997/*
998 * Dereference the result pages.
999 */
1000static void
1001nfsd4_release_respages(struct page **respages, short resused)
1002{
1003 int i;
1004
1005 dprintk("--> %s\n", __func__);
1006 for (i = 0; i < resused; i++) {
1007 if (!respages[i])
1008 continue;
1009 put_page(respages[i]);
1010 respages[i] = NULL;
1011 }
1012}
1013
1014static void
1015nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
1016{
1017 int i;
1018
1019 for (i = 0; i < count; i++) {
1020 topages[i] = frompages[i];
1021 if (!topages[i])
1022 continue;
1023 get_page(topages[i]);
1024 }
1025}
1026
1027/*
1028 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
1029 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
1030 * length of the XDR response is less than se_fmaxresp_cached
1031 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for a
1032 * larger part of the reply (e.g. readdir).
1033 *
1034 * Store the base and length of the rq_res.head[0] page
1035 * of the NFSv4.1 data, just past the rpc header.
1036 */
1037void
1038nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1039{
1040 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1041 struct svc_rqst *rqstp = resp->rqstp;
1042 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1043 struct nfsd4_op *op = &args->ops[resp->opcnt];
1044 struct kvec *resv = &rqstp->rq_res.head[0];
1045
1046 dprintk("--> %s entry %p\n", __func__, entry);
1047
1048 /* Don't cache a failed OP_SEQUENCE. */
1049 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1050 return;
1051
1052 nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
1053 entry->ce_opcnt = resp->opcnt;
1054 entry->ce_status = resp->cstate.status;
1055
1056 /*
1057 * Don't need a page to cache just the sequence operation - the slot
1058 * does this for us!
1059 */
1060
1061 if (nfsd4_not_cached(resp)) {
1062 entry->ce_resused = 0;
1063 entry->ce_rpchdrlen = 0;
1064 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
1065 resp->cstate.slot->sl_cache_entry.ce_cachethis);
1066 return;
1067 }
1068 entry->ce_resused = rqstp->rq_resused;
1069 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1070 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1071 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1072 entry->ce_resused);
1073 entry->ce_datav.iov_base = resp->cstate.statp;
1074 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1075 (char *)page_address(rqstp->rq_respages[0]));
1076 /* Current request rpc header length */
1077 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1078 (char *)page_address(rqstp->rq_respages[0]);
1079}
1080
1081/*
1082 * We keep the rpc header, but take the nfs reply from the replycache.
1083 */
1084static int
1085nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1086 struct nfsd4_cache_entry *entry)
1087{
1088 struct svc_rqst *rqstp = resp->rqstp;
1089 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1090 int len;
1091
1092 /* Current request rpc header length */
1093 len = (char *)resp->cstate.statp -
1094 (char *)page_address(rqstp->rq_respages[0]);
1095 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1096 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1097 entry->ce_datav.iov_len);
1098 return 0;
1099 }
1100 /* copy the cached reply nfsd data past the current rpc header */
1101 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1102 entry->ce_datav.iov_len);
1103 resv->iov_len = len + entry->ce_datav.iov_len;
1104 return 1;
1105}
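
nfsd41_copy_replay_data() splices the RPC header just built for this retransmission together with the NFS body cached from the first execution; only two byte counts matter. A buffer-level sketch under that assumption (names are illustrative):

#include <assert.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Overlay a cached NFS reply body after the current RPC header.
 * Returns 1 on success, 0 if header + body would overflow the page. */
static int copy_replay_data(char *page, size_t hdrlen,
                            const char *cached, size_t cachedlen,
                            size_t *resplen)
{
        if (hdrlen + cachedlen > PAGE_SIZE)
                return 0;       /* fall back to resending cached pages whole */
        memcpy(page + hdrlen, cached, cachedlen);
        *resplen = hdrlen + cachedlen;
        return 1;
}

int main(void)
{
        char page[PAGE_SIZE] = "RPCHDR..";      /* pretend 8-byte rpc header */
        size_t resplen;

        assert(copy_replay_data(page, 8, "NFSBODY", 7, &resplen));
        assert(resplen == 15);
        return 0;
}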
1106
1107/*
1108 * Keep the first page of the replay. Copy the NFSv4.1 data from the first
1109 * cached page. Replace any further replay pages from the cache.
1110 */
1111__be32
1112nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1113 struct nfsd4_sequence *seq)
1114{
1115 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1116 __be32 status;
1117
1118 dprintk("--> %s entry %p\n", __func__, entry);
1119
1120 /*
1121 * If this is just the sequence operation, we did not keep
1122 * a page in the cache entry because we can just use the
1123 * slot info stored in struct nfsd4_sequence that was checked
1124 * against the slot in nfsd4_sequence().
1125 *
1126 * This occurs when seq->cachethis is FALSE, or when the client
1127 * session inactivity timer fires and a solo sequence operation
1128 * is sent (lease renewal).
1129 */
1130 if (seq && nfsd4_not_cached(resp)) {
1131 seq->maxslots = resp->cstate.session->se_fnumslots;
1132 return nfs_ok;
1133 }
1134
1135 if (!nfsd41_copy_replay_data(resp, entry)) {
1136 /*
1137 * Not enough room to use the replay rpc header, send the
1138 * cached header. Release all the allocated result pages.
1139 */
1140 svc_free_res_pages(resp->rqstp);
1141 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1142 entry->ce_resused);
1143 } else {
1144 /* Release all but the first allocated result page */
1145
1146 resp->rqstp->rq_resused--;
1147 svc_free_res_pages(resp->rqstp);
1148
1149 nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
1150 &entry->ce_respages[1],
1151 entry->ce_resused - 1);
1152 }
1153
1154 resp->rqstp->rq_resused = entry->ce_resused;
1155 resp->opcnt = entry->ce_opcnt;
1156 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
1157 status = entry->ce_status;
1158
1159 return status;
1160}
1161
1162/*
1163 * Set the exchange_id flags returned by the server.
1164 */
1165static void
1166nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1167{
1168 /* pNFS is not supported */
1169 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
1170
1171 /* Referrals are supported, Migration is not. */
1172 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
1173
1174 /* set the wire flags to return to client. */
1175 clid->flags = new->cl_exchange_flags;
1176}
1177
1178__be32
1179nfsd4_exchange_id(struct svc_rqst *rqstp,
1180 struct nfsd4_compound_state *cstate,
1181 struct nfsd4_exchange_id *exid)
1182{
1183 struct nfs4_client *unconf, *conf, *new;
1184 int status;
1185 unsigned int strhashval;
1186 char dname[HEXDIR_LEN];
1187 nfs4_verifier verf = exid->verifier;
1188 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1189
1190 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1191 " ip_addr=%u flags %x, spa_how %d\n",
1192 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1193 ip_addr, exid->flags, exid->spa_how);
1194
1195 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1196 return nfserr_inval;
1197
1198 /* Currently only support SP4_NONE */
1199 switch (exid->spa_how) {
1200 case SP4_NONE:
1201 break;
1202 case SP4_SSV:
1203 return nfserr_encr_alg_unsupp;
1204 default:
1205 BUG(); /* checked by xdr code */
1206 case SP4_MACH_CRED:
1207 return nfserr_serverfault; /* no excuse :-/ */
1208 }
1209
1210 status = nfs4_make_rec_clidname(dname, &exid->clname);
1211
1212 if (status)
1213 goto error;
1214
1215 strhashval = clientstr_hashval(dname);
1216
1217 nfs4_lock_state();
1218 status = nfs_ok;
1219
1220 conf = find_confirmed_client_by_str(dname, strhashval, true);
1221 if (conf) {
1222 if (!same_verf(&verf, &conf->cl_verifier)) {
1223 /* 18.35.4 case 8 */
1224 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1225 status = nfserr_not_same;
1226 goto out;
1227 }
1228 /* Client reboot: destroy old state */
1229 expire_client(conf);
1230 goto out_new;
1231 }
1232 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1233 /* 18.35.4 case 9 */
1234 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1235 status = nfserr_perm;
1236 goto out;
1237 }
1238 expire_client(conf);
1239 goto out_new;
1240 }
1241 if (ip_addr != conf->cl_addr &&
1242 !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
1243 /* Client collision. 18.35.4 case 3 */
1244 status = nfserr_clid_inuse;
1245 goto out;
1246 }
1247 /*
1248 * Set bit when the owner id and verifier map to an already
1249 * confirmed client id (18.35.3).
1250 */
1251 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1252
1253 /*
1254 * Falling into 18.35.4 case 2, possible router replay.
1255 * Leave confirmed record intact and return same result.
1256 */
1257 copy_verf(conf, &verf);
1258 new = conf;
1259 goto out_copy;
1260 } else {
1261 /* 18.35.4 case 7 */
1262 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1263 status = nfserr_noent;
1264 goto out;
1265 }
1266 }
1267
1268 unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
1269 if (unconf) {
1270 /*
1271 * Possible retry or client restart. Per 18.35.4 case 4,
1272 * a new unconfirmed record should be generated regardless
1273 * of whether any properties have changed.
1274 */
1275 expire_client(unconf);
1276 }
1277
1278out_new:
1279 /* Normal case */
1280 new = create_client(exid->clname, dname);
1281 if (new == NULL) {
1282 status = nfserr_resource;
1283 goto out;
1284 }
1285
1286 copy_verf(new, &verf);
1287 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1288 new->cl_addr = ip_addr;
1289 gen_clid(new);
1290 gen_confirm(new);
1291 add_to_unconfirmed(new, strhashval);
1292out_copy:
1293 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1294 exid->clientid.cl_id = new->cl_clientid.cl_id;
1295
1296 new->cl_slot.sl_seqid = 0;
1297 exid->seqid = 1;
1298 nfsd4_set_ex_flags(new, exid);
1299
1300 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1301 new->cl_slot.sl_seqid, new->cl_exchange_flags);
1302 status = nfs_ok;
1303
1304out:
1305 nfs4_unlock_state();
1306error:
1307 dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
1308 return status;
1309}
1310
1311static int
1312check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
1313{
1314 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
1315 slot->sl_seqid);
1316
1317 /* The slot is in use, and no response has been sent. */
1318 if (slot->sl_inuse) {
1319 if (seqid == slot->sl_seqid)
1320 return nfserr_jukebox;
1321 else
1322 return nfserr_seq_misordered;
1323 }
1324 /* Normal */
1325 if (likely(seqid == slot->sl_seqid + 1))
1326 return nfs_ok;
1327 /* Replay */
1328 if (seqid == slot->sl_seqid)
1329 return nfserr_replay_cache;
1330 /* Wraparound */
1331 if (seqid == 1 && (slot->sl_seqid + 1) == 0)
1332 return nfs_ok;
1333 /* Misordered replay or misordered new request */
1334 return nfserr_seq_misordered;
1335}
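
check_slot_seqid() is a four-way decision over (sl_inuse, sl_seqid): a duplicate of an in-flight request is delayed, seqid + 1 is the normal advance, an equal seqid on an idle slot replays from the cache, and 0xffffffff wraps to 1. The same decisions as a standalone sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum verdict { OK, REPLAY, DELAY, MISORDERED };

static enum verdict check_slot_seqid(uint32_t seqid, uint32_t sl_seqid,
                                     bool sl_inuse)
{
        if (sl_inuse)                           /* request still executing */
                return seqid == sl_seqid ? DELAY : MISORDERED;
        if (seqid == sl_seqid + 1)              /* normal advance */
                return OK;
        if (seqid == sl_seqid)                  /* idle duplicate: replay cache */
                return REPLAY;
        if (seqid == 1 && sl_seqid + 1 == 0)    /* 0xffffffff wraps to 1 */
                return OK;
        return MISORDERED;
}

int main(void)
{
        printf("%d %d %d %d\n",
               check_slot_seqid(6, 5, false),           /* OK */
               check_slot_seqid(5, 5, false),           /* REPLAY */
               check_slot_seqid(5, 5, true),            /* DELAY */
               check_slot_seqid(1, UINT32_MAX, false)); /* OK: wraparound */
        return 0;
}

Both nfsd4_create_session and nfsd4_sequence below drive this machine; only the replay verdict routes into nfsd4_replay_cache_entry().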
1336
1337__be32
1338nfsd4_create_session(struct svc_rqst *rqstp,
1339 struct nfsd4_compound_state *cstate,
1340 struct nfsd4_create_session *cr_ses)
1341{
1342 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1343 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1344 struct nfs4_client *conf, *unconf;
1345 struct nfsd4_slot *slot = NULL;
1346 int status = 0;
1347
1348 nfs4_lock_state();
1349 unconf = find_unconfirmed_client(&cr_ses->clientid);
1350 conf = find_confirmed_client(&cr_ses->clientid);
1351
1352 if (conf) {
1353 slot = &conf->cl_slot;
1354 status = check_slot_seqid(cr_ses->seqid, slot);
1355 if (status == nfserr_replay_cache) {
1356 dprintk("Got a create_session replay! seqid= %d\n",
1357 slot->sl_seqid);
1358 cstate->slot = slot;
1359 cstate->status = status;
1360 /* Return the cached reply status */
1361 status = nfsd4_replay_cache_entry(resp, NULL);
1362 goto out;
1363 } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
1364 status = nfserr_seq_misordered;
1365 dprintk("Sequence misordered!\n");
1366 dprintk("Expected seqid= %d but got seqid= %d\n",
1367 slot->sl_seqid, cr_ses->seqid);
1368 goto out;
1369 }
1370 conf->cl_slot.sl_seqid++;
1371 } else if (unconf) {
1372 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1373 (ip_addr != unconf->cl_addr)) {
1374 status = nfserr_clid_inuse;
1375 goto out;
1376 }
1377
1378 slot = &unconf->cl_slot;
1379 status = check_slot_seqid(cr_ses->seqid, slot);
1380 if (status) {
1381 /* an unconfirmed replay returns misordered */
1382 status = nfserr_seq_misordered;
1383 goto out;
1384 }
1385
1386 slot->sl_seqid++; /* from 0 to 1 */
1387 move_to_confirmed(unconf);
1388
1389 /*
1390 * We do not support RDMA or persistent sessions
1391 */
1392 cr_ses->flags &= ~SESSION4_PERSIST;
1393 cr_ses->flags &= ~SESSION4_RDMA;
1394
1395 conf = unconf;
1396 } else {
1397 status = nfserr_stale_clientid;
1398 goto out;
1399 }
1400
1401 status = alloc_init_session(rqstp, conf, cr_ses);
1402 if (status)
1403 goto out;
1404
1405 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1406 NFS4_MAX_SESSIONID_LEN);
1407 cr_ses->seqid = slot->sl_seqid;
1408
1409 slot->sl_inuse = true;
1410 cstate->slot = slot;
1411 /* Ensure a page is used for the cache */
1412 slot->sl_cache_entry.ce_cachethis = 1;
1413out:
1414 nfs4_unlock_state();
1415 dprintk("%s returns %d\n", __func__, ntohl(status));
1416 return status;
1417}
1418
1419__be32
1420nfsd4_destroy_session(struct svc_rqst *r,
1421 struct nfsd4_compound_state *cstate,
1422 struct nfsd4_destroy_session *sessionid)
1423{
1424 struct nfsd4_session *ses;
1425 u32 status = nfserr_badsession;
1426
1427 /* Notes:
1428 * - The confirmed nfs4_client->cl_sessionid holds the destroyed sessionid
1429 * - Should we return nfserr_back_chan_busy if waiting for
1430 * callbacks on to-be-destroyed session?
1431 * - Do we need to clear any callback info from previous session?
1432 */
1433
1434 dump_sessionid(__func__, &sessionid->sessionid);
1435 spin_lock(&sessionid_lock);
1436 ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
1437 if (!ses) {
1438 spin_unlock(&sessionid_lock);
1439 goto out;
1440 }
1441
1442 unhash_session(ses);
1443 spin_unlock(&sessionid_lock);
1444
1445 /* wait for callbacks */
1446 shutdown_callback_client(ses->se_client);
1447 nfsd4_put_session(ses);
1448 status = nfs_ok;
1449out:
1450 dprintk("%s returns %d\n", __func__, ntohl(status));
1451 return status;
1452}
1453
1454__be32
1455nfsd4_sequence(struct svc_rqst *rqstp,
1456 struct nfsd4_compound_state *cstate,
1457 struct nfsd4_sequence *seq)
1458{
1459 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1460 struct nfsd4_session *session;
1461 struct nfsd4_slot *slot;
1462 int status;
1463
1464 if (resp->opcnt != 1)
1465 return nfserr_sequence_pos;
1466
1467 spin_lock(&sessionid_lock);
1468 status = nfserr_badsession;
1469 session = find_in_sessionid_hashtbl(&seq->sessionid);
1470 if (!session)
1471 goto out;
1472
1473 status = nfserr_badslot;
1474 if (seq->slotid >= session->se_fnumslots)
1475 goto out;
1476
1477 slot = &session->se_slots[seq->slotid];
1478 dprintk("%s: slotid %d\n", __func__, seq->slotid);
1479
1480 status = check_slot_seqid(seq->seqid, slot);
1481 if (status == nfserr_replay_cache) {
1482 cstate->slot = slot;
1483 cstate->session = session;
1484 /* Return the cached reply status and set cstate->status
1485 * for nfsd4_svc_encode_compoundres processing */
1486 status = nfsd4_replay_cache_entry(resp, seq);
1487 cstate->status = nfserr_replay_cache;
1488 goto replay_cache;
1489 }
1490 if (status)
1491 goto out;
1492
1493 /* Success! bump slot seqid */
1494 slot->sl_inuse = true;
1495 slot->sl_seqid = seq->seqid;
1496 slot->sl_cache_entry.ce_cachethis = seq->cachethis;
1497 /* Always set the cache entry cachethis for solo sequence */
1498 if (nfsd4_is_solo_sequence(resp))
1499 slot->sl_cache_entry.ce_cachethis = 1;
1500
1501 cstate->slot = slot;
1502 cstate->session = session;
1503
1504replay_cache:
1505 /* Renew the clientid on success and on replay.
1506 * Hold a session reference until done processing the compound:
1507 * nfsd4_put_session called only if the cstate slot is set.
1508 */
1509 renew_client(session->se_client);
1510 nfsd4_get_session(session);
1511out:
1512 spin_unlock(&sessionid_lock);
1513 dprintk("%s: return %d\n", __func__, ntohl(status));
1514 return status;
1515}
1516
688__be32 1517__be32
689nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1518nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
690 struct nfsd4_setclientid *setclid) 1519 struct nfsd4_setclientid *setclid)
@@ -716,14 +1545,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
716 strhashval = clientstr_hashval(dname); 1545 strhashval = clientstr_hashval(dname);
717 1546
718 nfs4_lock_state(); 1547 nfs4_lock_state();
719 conf = find_confirmed_client_by_str(dname, strhashval); 1548 conf = find_confirmed_client_by_str(dname, strhashval, false);
720 if (conf) { 1549 if (conf) {
721 /* RFC 3530 14.2.33 CASE 0: */ 1550 /* RFC 3530 14.2.33 CASE 0: */
722 status = nfserr_clid_inuse; 1551 status = nfserr_clid_inuse;
723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 1552 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
724 || conf->cl_addr != sin->sin_addr.s_addr) { 1553 dprintk("NFSD: setclientid: string in use by client"
725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n", 1554 " at %pI4\n", &conf->cl_addr);
726 &conf->cl_addr);
727 goto out; 1555 goto out;
728 } 1556 }
729 } 1557 }
@@ -732,7 +1560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
732 * has a description of SETCLIENTID request processing consisting 1560 * has a description of SETCLIENTID request processing consisting
733 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1561 * of 5 bullet points, labeled as CASE0 - CASE4 below.
734 */ 1562 */
735 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1563 unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
736 status = nfserr_resource; 1564 status = nfserr_resource;
737 if (!conf) { 1565 if (!conf) {
738 /* 1566 /*
@@ -887,7 +1715,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
887 unsigned int hash = 1715 unsigned int hash =
888 clientstr_hashval(unconf->cl_recdir); 1716 clientstr_hashval(unconf->cl_recdir);
889 conf = find_confirmed_client_by_str(unconf->cl_recdir, 1717 conf = find_confirmed_client_by_str(unconf->cl_recdir,
890 hash); 1718 hash, false);
891 if (conf) { 1719 if (conf) {
892 nfsd4_remove_clid_dir(conf); 1720 nfsd4_remove_clid_dir(conf);
893 expire_client(conf); 1721 expire_client(conf);
@@ -923,11 +1751,13 @@ alloc_init_file(struct inode *ino)
923 1751
924 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 1752 fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
925 if (fp) { 1753 if (fp) {
926 kref_init(&fp->fi_ref); 1754 atomic_set(&fp->fi_ref, 1);
927 INIT_LIST_HEAD(&fp->fi_hash); 1755 INIT_LIST_HEAD(&fp->fi_hash);
928 INIT_LIST_HEAD(&fp->fi_stateids); 1756 INIT_LIST_HEAD(&fp->fi_stateids);
929 INIT_LIST_HEAD(&fp->fi_delegations); 1757 INIT_LIST_HEAD(&fp->fi_delegations);
1758 spin_lock(&recall_lock);
930 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1759 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1760 spin_unlock(&recall_lock);
931 fp->fi_inode = igrab(ino); 1761 fp->fi_inode = igrab(ino);
932 fp->fi_id = current_fileid++; 1762 fp->fi_id = current_fileid++;
933 fp->fi_had_conflict = false; 1763 fp->fi_had_conflict = false;
@@ -1037,48 +1867,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
1037 return sop; 1867 return sop;
1038} 1868}
1039 1869
1040static void
1041release_stateid_lockowners(struct nfs4_stateid *open_stp)
1042{
1043 struct nfs4_stateowner *lock_sop;
1044
1045 while (!list_empty(&open_stp->st_lockowners)) {
1046 lock_sop = list_entry(open_stp->st_lockowners.next,
1047 struct nfs4_stateowner, so_perstateid);
1048 /* list_del(&open_stp->st_lockowners); */
1049 BUG_ON(lock_sop->so_is_open_owner);
1050 release_stateowner(lock_sop);
1051 }
1052}
1053
1054static void
1055unhash_stateowner(struct nfs4_stateowner *sop)
1056{
1057 struct nfs4_stateid *stp;
1058
1059 list_del(&sop->so_idhash);
1060 list_del(&sop->so_strhash);
1061 if (sop->so_is_open_owner)
1062 list_del(&sop->so_perclient);
1063 list_del(&sop->so_perstateid);
1064 while (!list_empty(&sop->so_stateids)) {
1065 stp = list_entry(sop->so_stateids.next,
1066 struct nfs4_stateid, st_perstateowner);
1067 if (sop->so_is_open_owner)
1068 release_stateid(stp, OPEN_STATE);
1069 else
1070 release_stateid(stp, LOCK_STATE);
1071 }
1072}
1073
1074static void
1075release_stateowner(struct nfs4_stateowner *sop)
1076{
1077 unhash_stateowner(sop);
1078 list_del(&sop->so_close_lru);
1079 nfs4_put_stateowner(sop);
1080}
1081
1082static inline void 1870static inline void
1083init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 1871init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
1084 struct nfs4_stateowner *sop = open->op_stateowner; 1872 struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1888,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1100 stp->st_stateid.si_generation = 0; 1888 stp->st_stateid.si_generation = 0;
1101 stp->st_access_bmap = 0; 1889 stp->st_access_bmap = 0;
1102 stp->st_deny_bmap = 0; 1890 stp->st_deny_bmap = 0;
1103 __set_bit(open->op_share_access, &stp->st_access_bmap); 1891 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
1892 &stp->st_access_bmap);
1104 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 1893 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
1105 stp->st_openstp = NULL; 1894 stp->st_openstp = NULL;
1106} 1895}
1107 1896
1108static void 1897static void
1109release_stateid(struct nfs4_stateid *stp, int flags)
1110{
1111 struct file *filp = stp->st_vfs_file;
1112
1113 list_del(&stp->st_hash);
1114 list_del(&stp->st_perfile);
1115 list_del(&stp->st_perstateowner);
1116 if (flags & OPEN_STATE) {
1117 release_stateid_lockowners(stp);
1118 stp->st_vfs_file = NULL;
1119 nfsd_close(filp);
1120 } else if (flags & LOCK_STATE)
1121 locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
1122 put_nfs4_file(stp->st_file);
1123 kmem_cache_free(stateid_slab, stp);
1124}
1125
1126static void
1127move_to_close_lru(struct nfs4_stateowner *sop) 1898move_to_close_lru(struct nfs4_stateowner *sop)
1128{ 1899{
1129 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 1900 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1931,33 @@ find_file(struct inode *ino)
1160 unsigned int hashval = file_hashval(ino); 1931 unsigned int hashval = file_hashval(ino);
1161 struct nfs4_file *fp; 1932 struct nfs4_file *fp;
1162 1933
1934 spin_lock(&recall_lock);
1163 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 1935 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
1164 if (fp->fi_inode == ino) { 1936 if (fp->fi_inode == ino) {
1165 get_nfs4_file(fp); 1937 get_nfs4_file(fp);
1938 spin_unlock(&recall_lock);
1166 return fp; 1939 return fp;
1167 } 1940 }
1168 } 1941 }
1942 spin_unlock(&recall_lock);
1169 return NULL; 1943 return NULL;
1170} 1944}
1171 1945
1172static inline int access_valid(u32 x) 1946static inline int access_valid(u32 x, u32 minorversion)
1173{ 1947{
1174 if (x < NFS4_SHARE_ACCESS_READ) 1948 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
1175 return 0; 1949 return 0;
1176 if (x > NFS4_SHARE_ACCESS_BOTH) 1950 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
1951 return 0;
1952 x &= ~NFS4_SHARE_ACCESS_MASK;
1953 if (minorversion && x) {
1954 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
1955 return 0;
1956 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
1957 return 0;
1958 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
1959 }
1960 if (x)
1177 return 0; 1961 return 0;
1178 return 1; 1962 return 1;
1179} 1963}
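
The reworked access_valid() reads op_share_access as packed fields: mandatory low ACCESS bits (READ, WRITE or BOTH), and, only for minorversion 1, bounded WANT and WHEN hint bits. A sketch of the decomposition with illustrative mask values (the real NFS4_SHARE_* constants live in the nfs4 headers):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative field layout, not the literal header values. */
#define ACCESS_MASK     0x0003  /* 1=READ, 2=WRITE, 3=BOTH */
#define WANT_MASK       0x00f0  /* v4.1 delegation "want" hints */
#define WANT_MAX        0x0050  /* ..._WANT_CANCEL stand-in */
#define WHEN_MASK       0x0f00  /* v4.1 "when to push" hints */
#define WHEN_MAX        0x0200  /* ..._UNCONTENDED stand-in */

static bool access_valid(uint32_t x, int minorversion)
{
        if ((x & ACCESS_MASK) < 1 || (x & ACCESS_MASK) > 3)
                return false;           /* READ..BOTH required */
        x &= ~ACCESS_MASK;
        if (minorversion && x) {        /* v4.1 may add bounded hint bits */
                if ((x & WANT_MASK) > WANT_MAX)
                        return false;
                if ((x & WHEN_MASK) > WHEN_MAX)
                        return false;
                x &= ~(WANT_MASK | WHEN_MASK);
        }
        return x == 0;                  /* anything left over is garbage */
}

int main(void)
{
        assert(access_valid(0x2, 0));           /* v4.0 plain WRITE */
        assert(!access_valid(0x42, 0));         /* v4.0 must not set hints */
        assert(access_valid(0x42, 1));          /* v4.1 WRITE + a want hint */
        assert(!access_valid(0x1002, 1));       /* WRITE + unknown high bit */
        return 0;
}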
@@ -1409,7 +2193,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1409 2193
1410 2194
1411__be32 2195__be32
1412nfsd4_process_open1(struct nfsd4_open *open) 2196nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2197 struct nfsd4_open *open)
1413{ 2198{
1414 clientid_t *clientid = &open->op_clientid; 2199 clientid_t *clientid = &open->op_clientid;
1415 struct nfs4_client *clp = NULL; 2200 struct nfs4_client *clp = NULL;
@@ -1432,10 +2217,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
1432 return nfserr_expired; 2217 return nfserr_expired;
1433 goto renew; 2218 goto renew;
1434 } 2219 }
2220 /* When sessions are used, skip open sequenceid processing */
2221 if (nfsd4_has_session(cstate))
2222 goto renew;
1435 if (!sop->so_confirmed) { 2223 if (!sop->so_confirmed) {
1436 /* Replace unconfirmed owners without checking for replay. */ 2224 /* Replace unconfirmed owners without checking for replay. */
1437 clp = sop->so_client; 2225 clp = sop->so_client;
1438 release_stateowner(sop); 2226 release_openowner(sop);
1439 open->op_stateowner = NULL; 2227 open->op_stateowner = NULL;
1440 goto renew; 2228 goto renew;
1441 } 2229 }
@@ -1709,6 +2497,7 @@ out:
1709__be32 2497__be32
1710nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2498nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1711{ 2499{
2500 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1712 struct nfs4_file *fp = NULL; 2501 struct nfs4_file *fp = NULL;
1713 struct inode *ino = current_fh->fh_dentry->d_inode; 2502 struct inode *ino = current_fh->fh_dentry->d_inode;
1714 struct nfs4_stateid *stp = NULL; 2503 struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2505,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1716 __be32 status; 2505 __be32 status;
1717 2506
1718 status = nfserr_inval; 2507 status = nfserr_inval;
1719 if (!access_valid(open->op_share_access) 2508 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
1720 || !deny_valid(open->op_share_deny)) 2509 || !deny_valid(open->op_share_deny))
1721 goto out; 2510 goto out;
1722 /* 2511 /*
@@ -1764,12 +2553,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1764 init_stateid(stp, fp, open); 2553 init_stateid(stp, fp, open);
1765 status = nfsd4_truncate(rqstp, current_fh, open); 2554 status = nfsd4_truncate(rqstp, current_fh, open);
1766 if (status) { 2555 if (status) {
1767 release_stateid(stp, OPEN_STATE); 2556 release_open_stateid(stp);
1768 goto out; 2557 goto out;
1769 } 2558 }
2559 if (nfsd4_has_session(&resp->cstate))
2560 update_stateid(&stp->st_stateid);
1770 } 2561 }
1771 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2562 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
1772 2563
2564 if (nfsd4_has_session(&resp->cstate))
2565 open->op_stateowner->so_confirmed = 1;
2566
1773 /* 2567 /*
1774 * Attempt to hand out a delegation. No error return, because the 2568 * Attempt to hand out a delegation. No error return, because the
1775 * OPEN succeeds even if we fail. 2569 * OPEN succeeds even if we fail.
@@ -1790,7 +2584,8 @@ out:
1790 * To finish the open response, we just need to set the rflags. 2584 * To finish the open response, we just need to set the rflags.
1791 */ 2585 */
1792 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2586 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
1793 if (!open->op_stateowner->so_confirmed) 2587 if (!open->op_stateowner->so_confirmed &&
2588 !nfsd4_has_session(&resp->cstate))
1794 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2589 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
1795 2590
1796 return status; 2591 return status;
@@ -1898,7 +2693,7 @@ nfs4_laundromat(void)
1898 } 2693 }
1899 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 2694 dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
1900 sop->so_id); 2695 sop->so_id);
1901 release_stateowner(sop); 2696 release_openowner(sop);
1902 } 2697 }
1903 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 2698 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
1904 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 2699 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2778,7 @@ out:
1983static inline __be32 2778static inline __be32
1984check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2779check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1985{ 2780{
1986 /* Trying to call delegreturn with a special stateid? Yuch: */ 2781 if (ONE_STATEID(stateid) && (flags & RD_STATE))
1987 if (!(flags & (RD_STATE | WR_STATE)))
1988 return nfserr_bad_stateid;
1989 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1990 return nfs_ok; 2782 return nfs_ok;
1991 else if (locks_in_grace()) { 2783 else if (locks_in_grace()) {
1992 /* Answer in remaining cases depends on existence of 2784
@@ -2005,14 +2797,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2005 * that are not able to provide mandatory locking. 2797 * that are not able to provide mandatory locking.
2006 */ 2798 */
2007static inline int 2799static inline int
2008io_during_grace_disallowed(struct inode *inode, int flags) 2800grace_disallows_io(struct inode *inode)
2009{ 2801{
2010 return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) 2802 return locks_in_grace() && mandatory_lock(inode);
2011 && mandatory_lock(inode);
2012} 2803}
2013 2804
2014static int check_stateid_generation(stateid_t *in, stateid_t *ref) 2805static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
2015{ 2806{
2807 /*
2808 * When sessions are used the stateid generation number is ignored
2809 * when it is zero.
2810 */
2811 if ((flags & HAS_SESSION) && in->si_generation == 0)
2812 goto out;
2813
2016 /* If the client sends us a stateid from the future, it's buggy: */ 2814 /* If the client sends us a stateid from the future, it's buggy: */
2017 if (in->si_generation > ref->si_generation) 2815 if (in->si_generation > ref->si_generation)
2018 return nfserr_bad_stateid; 2816 return nfserr_bad_stateid;
@@ -2028,74 +2826,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
2028 */ 2826 */
2029 if (in->si_generation < ref->si_generation) 2827 if (in->si_generation < ref->si_generation)
2030 return nfserr_old_stateid; 2828 return nfserr_old_stateid;
2829out:
2031 return nfs_ok; 2830 return nfs_ok;
2032} 2831}
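
check_stateid_generation() now has three verdicts plus the sessions shortcut: generation 0 under a session is accepted as "current", a generation ahead of the server's is a client bug, and one behind is merely old. As a sketch:

#include <assert.h>
#include <stdint.h>

enum { GEN_OK, GEN_BAD, GEN_OLD };

static int check_generation(uint32_t in, uint32_t ref, int has_session)
{
        if (has_session && in == 0)
                return GEN_OK;  /* v4.1 treats generation 0 as "most recent" */
        if (in > ref)
                return GEN_BAD; /* stateid from the future: client bug */
        if (in < ref)
                return GEN_OLD; /* client missed an OPEN upgrade/downgrade */
        return GEN_OK;
}

int main(void)
{
        assert(check_generation(0, 7, 1) == GEN_OK);    /* session shortcut */
        assert(check_generation(0, 7, 0) == GEN_OLD);   /* v4.0: just old */
        assert(check_generation(8, 7, 1) == GEN_BAD);
        assert(check_generation(7, 7, 0) == GEN_OK);
        return 0;
}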
2033 2832
2833static int is_delegation_stateid(stateid_t *stateid)
2834{
2835 return stateid->si_fileid == 0;
2836}
2837
2034/* 2838/*
2035* Checks for stateid operations 2839* Checks for stateid operations
2036*/ 2840*/
2037__be32 2841__be32
2038nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2842nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2843 stateid_t *stateid, int flags, struct file **filpp)
2039{ 2844{
2040 struct nfs4_stateid *stp = NULL; 2845 struct nfs4_stateid *stp = NULL;
2041 struct nfs4_delegation *dp = NULL; 2846 struct nfs4_delegation *dp = NULL;
2042 stateid_t *stidp; 2847 struct svc_fh *current_fh = &cstate->current_fh;
2043 struct inode *ino = current_fh->fh_dentry->d_inode; 2848 struct inode *ino = current_fh->fh_dentry->d_inode;
2044 __be32 status; 2849 __be32 status;
2045 2850
2046 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2047 stateid->si_boot, stateid->si_stateownerid,
2048 stateid->si_fileid, stateid->si_generation);
2049 if (filpp) 2851 if (filpp)
2050 *filpp = NULL; 2852 *filpp = NULL;
2051 2853
2052 if (io_during_grace_disallowed(ino, flags)) 2854 if (grace_disallows_io(ino))
2053 return nfserr_grace; 2855 return nfserr_grace;
2054 2856
2857 if (nfsd4_has_session(cstate))
2858 flags |= HAS_SESSION;
2859
2055 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 2860 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2056 return check_special_stateids(current_fh, stateid, flags); 2861 return check_special_stateids(current_fh, stateid, flags);
2057 2862
2058 /* STALE STATEID */
2059 status = nfserr_stale_stateid; 2863 status = nfserr_stale_stateid;
2060 if (STALE_STATEID(stateid)) 2864 if (STALE_STATEID(stateid))
2061 goto out; 2865 goto out;
2062 2866
2063 /* BAD STATEID */
2064 status = nfserr_bad_stateid; 2867 status = nfserr_bad_stateid;
2065 if (!stateid->si_fileid) { /* delegation stateid */ 2868 if (is_delegation_stateid(stateid)) {
2066 if(!(dp = find_delegation_stateid(ino, stateid))) { 2869 dp = find_delegation_stateid(ino, stateid);
2067 dprintk("NFSD: delegation stateid not found\n"); 2870 if (!dp)
2068 goto out; 2871 goto out;
2069 } 2872 status = check_stateid_generation(stateid, &dp->dl_stateid,
2070 stidp = &dp->dl_stateid; 2873 flags);
2874 if (status)
2875 goto out;
2876 status = nfs4_check_delegmode(dp, flags);
2877 if (status)
2878 goto out;
2879 renew_client(dp->dl_client);
2880 if (filpp)
2881 *filpp = dp->dl_vfs_file;
2071 } else { /* open or lock stateid */ 2882 } else { /* open or lock stateid */
2072 if (!(stp = find_stateid(stateid, flags))) { 2883 stp = find_stateid(stateid, flags);
2073 dprintk("NFSD: open or lock stateid not found\n"); 2884 if (!stp)
2074 goto out; 2885 goto out;
2075 } 2886 if (nfs4_check_fh(current_fh, stp))
2076 if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
2077 goto out; 2887 goto out;
2078 if (!stp->st_stateowner->so_confirmed) 2888 if (!stp->st_stateowner->so_confirmed)
2079 goto out; 2889 goto out;
2080 stidp = &stp->st_stateid; 2890 status = check_stateid_generation(stateid, &stp->st_stateid,
2081 } 2891 flags);
2082 status = check_stateid_generation(stateid, stidp); 2892 if (status)
2083 if (status) 2893 goto out;
2084 goto out; 2894 status = nfs4_check_openmode(stp, flags);
2085 if (stp) { 2895 if (status)
2086 if ((status = nfs4_check_openmode(stp,flags)))
2087 goto out; 2896 goto out;
2088 renew_client(stp->st_stateowner->so_client); 2897 renew_client(stp->st_stateowner->so_client);
2089 if (filpp) 2898 if (filpp)
2090 *filpp = stp->st_vfs_file; 2899 *filpp = stp->st_vfs_file;
2091 } else {
2092 if ((status = nfs4_check_delegmode(dp, flags)))
2093 goto out;
2094 renew_client(dp->dl_client);
2095 if (flags & DELEG_RET)
2096 unhash_delegation(dp);
2097 if (filpp)
2098 *filpp = dp->dl_vfs_file;
2099 } 2900 }
2100 status = nfs_ok; 2901 status = nfs_ok;
2101out: 2902out:
@@ -2113,10 +2914,14 @@ setlkflg (int type)
2113 * Checks for sequence id mutating operations. 2914 * Checks for sequence id mutating operations.
2114 */ 2915 */
2115static __be32 2916static __be32
2116nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2917nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2918 stateid_t *stateid, int flags,
2919 struct nfs4_stateowner **sopp,
2920 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2117{ 2921{
2118 struct nfs4_stateid *stp; 2922 struct nfs4_stateid *stp;
2119 struct nfs4_stateowner *sop; 2923 struct nfs4_stateowner *sop;
2924 struct svc_fh *current_fh = &cstate->current_fh;
2120 __be32 status; 2925 __be32 status;
2121 2926
2122 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2927 dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2939,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2134 2939
2135 if (STALE_STATEID(stateid)) 2940 if (STALE_STATEID(stateid))
2136 return nfserr_stale_stateid; 2941 return nfserr_stale_stateid;
2942
2943 if (nfsd4_has_session(cstate))
2944 flags |= HAS_SESSION;
2945
2137 /* 2946 /*
2138 * We return BAD_STATEID if filehandle doesn't match stateid, 2947 * We return BAD_STATEID if filehandle doesn't match stateid,
2139 * the confirmed flag is incorrectly set, or the generation 2948
@@ -2166,8 +2975,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2166 if (lock->lk_is_new) { 2975 if (lock->lk_is_new) {
2167 if (!sop->so_is_open_owner) 2976 if (!sop->so_is_open_owner)
2168 return nfserr_bad_stateid; 2977 return nfserr_bad_stateid;
2169 if (!same_clid(&clp->cl_clientid, lockclid)) 2978 if (!(flags & HAS_SESSION) &&
2170 return nfserr_bad_stateid; 2979 !same_clid(&clp->cl_clientid, lockclid))
2980 return nfserr_bad_stateid;
2171 /* stp is the open stateid */ 2981 /* stp is the open stateid */
2172 status = nfs4_check_openmode(stp, lkflg); 2982 status = nfs4_check_openmode(stp, lkflg);
2173 if (status) 2983 if (status)
@@ -2190,7 +3000,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2190 * For the moment, we ignore the possibility of 3000 * For the moment, we ignore the possibility of
2191 * generation number wraparound. 3001 * generation number wraparound.
2192 */ 3002 */
2193 if (seqid != sop->so_seqid) 3003 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
2194 goto check_replay; 3004 goto check_replay;
2195 3005
2196 if (sop->so_confirmed && flags & CONFIRM) { 3006 if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3013,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2203 " confirmed yet!\n"); 3013 " confirmed yet!\n");
2204 return nfserr_bad_stateid; 3014 return nfserr_bad_stateid;
2205 } 3015 }
2206 status = check_stateid_generation(stateid, &stp->st_stateid); 3016 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
2207 if (status) 3017 if (status)
2208 return status; 3018 return status;
2209 renew_client(sop->so_client); 3019 renew_client(sop->so_client);
@@ -2239,7 +3049,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2239 3049
2240 nfs4_lock_state(); 3050 nfs4_lock_state();
2241 3051
2242 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3052 if ((status = nfs4_preprocess_seqid_op(cstate,
2243 oc->oc_seqid, &oc->oc_req_stateid, 3053 oc->oc_seqid, &oc->oc_req_stateid,
2244 CONFIRM | OPEN_STATE, 3054 CONFIRM | OPEN_STATE,
2245 &oc->oc_stateowner, &stp, NULL))) 3055 &oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3114,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
2304 (int)cstate->current_fh.fh_dentry->d_name.len, 3114 (int)cstate->current_fh.fh_dentry->d_name.len,
2305 cstate->current_fh.fh_dentry->d_name.name); 3115 cstate->current_fh.fh_dentry->d_name.name);
2306 3116
2307 if (!access_valid(od->od_share_access) 3117 if (!access_valid(od->od_share_access, cstate->minorversion)
2308 || !deny_valid(od->od_share_deny)) 3118 || !deny_valid(od->od_share_deny))
2309 return nfserr_inval; 3119 return nfserr_inval;
2310 3120
2311 nfs4_lock_state(); 3121 nfs4_lock_state();
2312 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3122 if ((status = nfs4_preprocess_seqid_op(cstate,
2313 od->od_seqid, 3123 od->od_seqid,
2314 &od->od_stateid, 3124 &od->od_stateid,
2315 OPEN_STATE, 3125 OPEN_STATE,
@@ -2362,7 +3172,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2362 3172
2363 nfs4_lock_state(); 3173 nfs4_lock_state();
2364 /* check close_lru for replay */ 3174 /* check close_lru for replay */
2365 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3175 if ((status = nfs4_preprocess_seqid_op(cstate,
2366 close->cl_seqid, 3176 close->cl_seqid,
2367 &close->cl_stateid, 3177 &close->cl_stateid,
2368 OPEN_STATE | CLOSE_STATE, 3178 OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3183,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2373 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3183 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
2374 3184
2375 /* release_stateid() calls nfsd_close() if needed */ 3185 /* release_stateid() calls nfsd_close() if needed */
2376 release_stateid(stp, OPEN_STATE); 3186 release_open_stateid(stp);
2377 3187
2378 /* place unused nfs4_stateowners on so_close_lru list to be 3188 /* place unused nfs4_stateowners on so_close_lru list to be
2379 * released by the laundromat service after the lease period 3189 * released by the laundromat service after the lease period
@@ -2394,16 +3204,40 @@ __be32
2394nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3204nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2395 struct nfsd4_delegreturn *dr) 3205 struct nfsd4_delegreturn *dr)
2396{ 3206{
3207 struct nfs4_delegation *dp;
3208 stateid_t *stateid = &dr->dr_stateid;
3209 struct inode *inode;
2397 __be32 status; 3210 __be32 status;
3211 int flags = 0;
2398 3212
2399 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3213 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
2400 goto out; 3214 return status;
3215 inode = cstate->current_fh.fh_dentry->d_inode;
2401 3216
3217 if (nfsd4_has_session(cstate))
3218 flags |= HAS_SESSION;
2402 nfs4_lock_state(); 3219 nfs4_lock_state();
2403 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 3220 status = nfserr_bad_stateid;
2404 &dr->dr_stateid, DELEG_RET, NULL); 3221 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2405 nfs4_unlock_state(); 3222 goto out;
3223 status = nfserr_stale_stateid;
3224 if (STALE_STATEID(stateid))
3225 goto out;
3226 status = nfserr_bad_stateid;
3227 if (!is_delegation_stateid(stateid))
3228 goto out;
3229 dp = find_delegation_stateid(inode, stateid);
3230 if (!dp)
3231 goto out;
3232 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3233 if (status)
3234 goto out;
3235 renew_client(dp->dl_client);
3236
3237 unhash_delegation(dp);
2406out: 3238out:
3239 nfs4_unlock_state();
3240
2407 return status; 3241 return status;
2408} 3242}
2409 3243
@@ -2684,11 +3518,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2684 struct nfs4_file *fp; 3518 struct nfs4_file *fp;
2685 3519
2686 status = nfserr_stale_clientid; 3520 status = nfserr_stale_clientid;
2687 if (STALE_CLIENTID(&lock->lk_new_clientid)) 3521 if (!nfsd4_has_session(cstate) &&
3522 STALE_CLIENTID(&lock->lk_new_clientid))
2688 goto out; 3523 goto out;
2689 3524
2690 /* validate and update open stateid and open seqid */ 3525 /* validate and update open stateid and open seqid */
2691 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3526 status = nfs4_preprocess_seqid_op(cstate,
2692 lock->lk_new_open_seqid, 3527 lock->lk_new_open_seqid,
2693 &lock->lk_new_open_stateid, 3528 &lock->lk_new_open_stateid,
2694 OPEN_STATE, 3529 OPEN_STATE,
@@ -2715,7 +3550,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2715 goto out; 3550 goto out;
2716 } else { 3551 } else {
2717 /* lock (lock owner + lock stateid) already exists */ 3552 /* lock (lock owner + lock stateid) already exists */
2718 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3553 status = nfs4_preprocess_seqid_op(cstate,
2719 lock->lk_old_lock_seqid, 3554 lock->lk_old_lock_seqid,
2720 &lock->lk_old_lock_stateid, 3555 &lock->lk_old_lock_stateid,
2721 LOCK_STATE, 3556 LOCK_STATE,
@@ -2788,7 +3623,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 } 3623 }
2789out: 3624out:
2790 if (status && lock->lk_is_new && lock_sop) 3625 if (status && lock->lk_is_new && lock_sop)
2791 release_stateowner(lock_sop); 3626 release_lockowner(lock_sop);
2792 if (lock->lk_replay_owner) { 3627 if (lock->lk_replay_owner) {
2793 nfs4_get_stateowner(lock->lk_replay_owner); 3628 nfs4_get_stateowner(lock->lk_replay_owner);
2794 cstate->replay_owner = lock->lk_replay_owner; 3629 cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3673,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2838 nfs4_lock_state(); 3673 nfs4_lock_state();
2839 3674
2840 status = nfserr_stale_clientid; 3675 status = nfserr_stale_clientid;
2841 if (STALE_CLIENTID(&lockt->lt_clientid)) 3676 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
2842 goto out; 3677 goto out;
2843 3678
2844 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 3679 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3746,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2911 3746
2912 nfs4_lock_state(); 3747 nfs4_lock_state();
2913 3748
2914 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3749 if ((status = nfs4_preprocess_seqid_op(cstate,
2915 locku->lu_seqid, 3750 locku->lu_seqid,
2916 &locku->lu_stateid, 3751 &locku->lu_stateid,
2917 LOCK_STATE, 3752 LOCK_STATE,
@@ -3037,7 +3872,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3037 /* unhash_stateowner deletes so_perclient only 3872 /* unhash_stateowner deletes so_perclient only
3038 * for openowners. */ 3873 * for openowners. */
3039 list_del(&sop->so_perclient); 3874 list_del(&sop->so_perclient);
3040 release_stateowner(sop); 3875 release_lockowner(sop);
3041 } 3876 }
3042out: 3877out:
3043 nfs4_unlock_state(); 3878 nfs4_unlock_state();
@@ -3051,12 +3886,12 @@ alloc_reclaim(void)
3051} 3886}
3052 3887
3053int 3888int
3054nfs4_has_reclaimed_state(const char *name) 3889nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3055{ 3890{
3056 unsigned int strhashval = clientstr_hashval(name); 3891 unsigned int strhashval = clientstr_hashval(name);
3057 struct nfs4_client *clp; 3892 struct nfs4_client *clp;
3058 3893
3059 clp = find_confirmed_client_by_str(name, strhashval); 3894 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
3060 return clp ? 1 : 0; 3895 return clp ? 1 : 0;
3061} 3896}
3062 3897
@@ -3153,6 +3988,8 @@ nfs4_state_init(void)
3153 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 3988 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3154 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 3989 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
3155 } 3990 }
3991 for (i = 0; i < SESSION_HASH_SIZE; i++)
3992 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
3156 for (i = 0; i < FILE_HASH_SIZE; i++) { 3993 for (i = 0; i < FILE_HASH_SIZE; i++) {
3157 INIT_LIST_HEAD(&file_hashtbl[i]); 3994 INIT_LIST_HEAD(&file_hashtbl[i]);
3158 } 3995 }
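The sessionid_hashtbl initialized above follows the same pattern as the other nfsd state tables: a fixed array of list heads indexed by a hash of the session id. A minimal sketch of the lookup side (the helper name sessionid_hashval(), the se_hash member and the sessionid type name are illustrative assumptions, not taken from this hunk):

	static struct nfsd4_session *find_session_sketch(struct nfs4_sessionid *sid)
	{
		unsigned int idx = sessionid_hashval(sid);	/* hash of the raw id bytes */
		struct nfsd4_session *ses;

		list_for_each_entry(ses, &sessionid_hashtbl[idx], se_hash)
			if (!memcmp(ses->se_sessionid.data, sid->data,
				    NFS4_MAX_SESSIONID_LEN))
				return ses;
		return NULL;
	}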
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b73549d293be 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
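The two mask tests above implement the spec rule quoted in the comment: bits outside the supported set yield NFS4ERR_ATTRNOTSUPP, while supported but non-writable bits yield NFS4ERR_INVAL. The same policy in condensed standalone form (a sketch; supported and writable are three-word masks like nfsd_attrmask[]):

	static int check_attr_masks(const u32 *bm, const u32 *supported,
				    const u32 *writable)
	{
		int i;

		for (i = 0; i < 3; i++)
			if (bm[i] & ~supported[i])
				return -1;	/* -> nfserr_attrnotsupp */
		for (i = 0; i < 3; i++)
			if (bm[i] & ~writable[i])
				return -2;	/* -> nfserr_inval */
		return 0;
	}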
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 /* skip the remaining gids in place; READ32 into dummy would clobber the count */
1223 p += dummy;
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
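Several of the new decoders above skip variable-length XDR fields with the same three-step idiom: read the 4-byte length, make sure that many payload bytes are buffered, then advance the cursor by the length rounded up to whole 4-byte words. Reduced to the bare arithmetic (a fragment, not standalone code; in the real macros READ_BUF() also refills from the request pages and READ32() advances p):

	u32 len;

	READ_BUF(4);		/* ensure the 4-byte length word is present */
	READ32(len);		/* opaque length in bytes */
	READ_BUF(len);		/* ensure the payload is present */
	p += XDR_QUADLEN(len);	/* skip it, padded to a 4-byte boundary */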
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION] = (nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
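With the [1] entry added above, the compound decoder can select an op vector purely by the compound's minor version. A hedged sketch of that dispatch (the enclosing function is not part of this hunk, so the op/argp usage and the error names are assumptions):

	struct nfsd4_minorversion_ops *ops;

	if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion))
		return nfserr_minor_vers_mismatch;	/* assumed error name */
	ops = &nfsd4_minorversion[argp->minorversion];
	if (op->opnum >= 0 && op->opnum < ops->nops && ops->decoders[op->opnum])
		op->status = ops->decoders[op->opnum](argp, &op->u);
	else
		op->status = nfserr_op_illegal;		/* assumed error name */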
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
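The likely/unlikely chain above emits the shortest attribute bitmap that still covers the highest set word, so a v4.0 reply keeps its two-word (or one-word) encoding and only v4.1 replies that actually set word 2 pay for the third word. The same choice in condensed form (a sketch of the code above, using the same macros):

	u32 words = bmval2 ? 3 : (bmval1 ? 2 : 1);

	WRITE32(words);
	WRITE32(bmval0);
	if (words > 1)
		WRITE32(bmval1);
	if (words > 2)
		WRITE32(bmval2);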
@@ -1843,6 +2214,15 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1843 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 2214 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
1844 if (IS_ERR(dentry)) 2215 if (IS_ERR(dentry))
1845 return nfserrno(PTR_ERR(dentry)); 2216 return nfserrno(PTR_ERR(dentry));
2217 if (!dentry->d_inode) {
2218 /*
2219 * nfsd_buffered_readdir drops the i_mutex between
2220 * readdir and calling this callback, leaving a window
2221 * where this directory entry could have gone away.
2222 */
2223 dput(dentry);
2224 return nfserr_noent;
2225 }
1846 2226
1847 exp_get(exp); 2227 exp_get(exp);
1848 /* 2228 /*
@@ -1905,6 +2285,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1905 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); 2285 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
1906 int buflen; 2286 int buflen;
1907 __be32 *p = cd->buffer; 2287 __be32 *p = cd->buffer;
2288 __be32 *cookiep;
1908 __be32 nfserr = nfserr_toosmall; 2289 __be32 nfserr = nfserr_toosmall;
1909 2290
1910 /* In nfsv4, "." and ".." never make it onto the wire.. */ 2291 /* In nfsv4, "." and ".." never make it onto the wire.. */
@@ -1921,7 +2302,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1921 goto fail; 2302 goto fail;
1922 2303
1923 *p++ = xdr_one; /* mark entry present */ 2304 *p++ = xdr_one; /* mark entry present */
1924 cd->offset = p; /* remember pointer */ 2305 cookiep = p;
1925 p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ 2306 p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
1926 p = xdr_encode_array(p, name, namlen); /* name length & name */ 2307 p = xdr_encode_array(p, name, namlen); /* name length & name */
1927 2308
@@ -1935,6 +2316,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1935 goto fail; 2316 goto fail;
1936 case nfserr_dropit: 2317 case nfserr_dropit:
1937 goto fail; 2318 goto fail;
2319 case nfserr_noent:
2320 goto skip_entry;
1938 default: 2321 default:
1939 /* 2322 /*
1940 * If the client requested the RDATTR_ERROR attribute, 2323 * If the client requested the RDATTR_ERROR attribute,
@@ -1953,6 +2336,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1953 } 2336 }
1954 cd->buflen -= (p - cd->buffer); 2337 cd->buflen -= (p - cd->buffer);
1955 cd->buffer = p; 2338 cd->buffer = p;
2339 cd->offset = cookiep;
2340skip_entry:
1956 cd->common.err = nfs_ok; 2341 cd->common.err = nfs_ok;
1957 return 0; 2342 return 0;
1958fail: 2343fail:
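The cookiep change works because p is only a scratch cursor: cd->buffer remains the committed end of the reply, so an entry abandoned with nfserr_noent is discarded simply by never being committed, and cd->offset is now published only for completed entries. Reduced to its essentials (a sketch, not the literal code; encode_one_entry() is hypothetical):

	p = encode_one_entry(cd->buffer, name);	/* hypothetical helper */
	if (nfserr == nfserr_noent)
		return 0;		/* cd->buffer unchanged: entry discarded */
	cd->buflen -= (p - cd->buffer);
	cd->buffer = p;			/* entry complete: commit it */
	cd->offset = cookiep;		/* only now publish the cookie slot */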
@@ -2572,6 +2957,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2957}
2573 2958
2574static __be32 2959static __be32
2960nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2961 struct nfsd4_exchange_id *exid)
2962{
2963 ENCODE_HEAD;
2964 char *major_id;
2965 char *server_scope;
2966 int major_id_sz;
2967 int server_scope_sz;
2968 uint64_t minor_id = 0;
2969
2970 if (nfserr)
2971 return nfserr;
2972
2973 major_id = utsname()->nodename;
2974 major_id_sz = strlen(major_id);
2975 server_scope = utsname()->nodename;
2976 server_scope_sz = strlen(server_scope);
2977
2978 RESERVE_SPACE(
2979 8 /* eir_clientid */ +
2980 4 /* eir_sequenceid */ +
2981 4 /* eir_flags */ +
2982 4 /* spr_how (SP4_NONE) */ +
2983 8 /* so_minor_id */ +
2984 4 /* so_major_id.len */ +
2985 (XDR_QUADLEN(major_id_sz) * 4) +
2986 4 /* eir_server_scope.len */ +
2987 (XDR_QUADLEN(server_scope_sz) * 4) +
2988 4 /* eir_server_impl_id.count (0) */);
2989
2990 WRITEMEM(&exid->clientid, 8);
2991 WRITE32(exid->seqid);
2992 WRITE32(exid->flags);
2993
2994 /* state_protect4_r. Currently only support SP4_NONE */
2995 BUG_ON(exid->spa_how != SP4_NONE);
2996 WRITE32(exid->spa_how);
2997
2998 /* The server_owner struct */
2999 WRITE64(minor_id); /* Minor id */
3000 /* major id */
3001 WRITE32(major_id_sz);
3002 WRITEMEM(major_id, major_id_sz);
3003
3004 /* Server scope */
3005 WRITE32(server_scope_sz);
3006 WRITEMEM(server_scope, server_scope_sz);
3007
3008 /* Implementation id */
3009 WRITE32(0); /* zero length nfs_impl_id4 array */
3010 ADJUST_ARGS();
3011 return 0;
3012}
3013
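The RESERVE_SPACE() sum above rounds each opaque field up to whole XDR quads with XDR_QUADLEN(). A quick user-space check of that rounding (the macro body matches the sunrpc definition; the sample length is arbitrary):

	#include <stdio.h>

	#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* as in sunrpc/xdr.h */

	int main(void)
	{
		int major_id_sz = 9;	/* arbitrary sample nodename length */

		/* 9 opaque bytes occupy 3 quads = 12 bytes on the wire */
		printf("%d\n", XDR_QUADLEN(major_id_sz) * 4);
		return 0;
	}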
3014static __be32
3015nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3016 struct nfsd4_create_session *sess)
3017{
3018 ENCODE_HEAD;
3019
3020 if (nfserr)
3021 return nfserr;
3022
3023 RESERVE_SPACE(24);
3024 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3025 WRITE32(sess->seqid);
3026 WRITE32(sess->flags);
3027 ADJUST_ARGS();
3028
3029 RESERVE_SPACE(28);
3030 WRITE32(0); /* headerpadsz */
3031 WRITE32(sess->fore_channel.maxreq_sz);
3032 WRITE32(sess->fore_channel.maxresp_sz);
3033 WRITE32(sess->fore_channel.maxresp_cached);
3034 WRITE32(sess->fore_channel.maxops);
3035 WRITE32(sess->fore_channel.maxreqs);
3036 WRITE32(sess->fore_channel.nr_rdma_attrs);
3037 ADJUST_ARGS();
3038
3039 if (sess->fore_channel.nr_rdma_attrs) {
3040 RESERVE_SPACE(4);
3041 WRITE32(sess->fore_channel.rdma_attrs);
3042 ADJUST_ARGS();
3043 }
3044
3045 RESERVE_SPACE(28);
3046 WRITE32(0); /* headerpadsz */
3047 WRITE32(sess->back_channel.maxreq_sz);
3048 WRITE32(sess->back_channel.maxresp_sz);
3049 WRITE32(sess->back_channel.maxresp_cached);
3050 WRITE32(sess->back_channel.maxops);
3051 WRITE32(sess->back_channel.maxreqs);
3052 WRITE32(sess->back_channel.nr_rdma_attrs);
3053 ADJUST_ARGS();
3054
3055 if (sess->back_channel.nr_rdma_attrs) {
3056 RESERVE_SPACE(4);
3057 WRITE32(sess->back_channel.rdma_attrs);
3058 ADJUST_ARGS();
3059 }
3060 return 0;
3061}
3062
3063static __be32
3064nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3065 struct nfsd4_destroy_session *destroy_session)
3066{
3067 return nfserr;
3068}
3069
3070__be32
3071nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3072 struct nfsd4_sequence *seq)
3073{
3074 ENCODE_HEAD;
3075
3076 if (nfserr)
3077 return nfserr;
3078
3079 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3080 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3081 WRITE32(seq->seqid);
3082 WRITE32(seq->slotid);
3083 WRITE32(seq->maxslots);
3084 /*
3085 * FIXME: for now:
3086 * target_maxslots = maxslots
3087 * status_flags = 0
3088 */
3089 WRITE32(seq->maxslots);
3090 WRITE32(0);
3091
3092 ADJUST_ARGS();
3093 return 0;
3094}
3095
3096static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3097nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3098{
2577 return nfserr; 3099 return nfserr;
@@ -2579,6 +3101,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3101
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3102typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3103
3104/*
3105 * Note: the nfsd4_enc_ops vector is shared by v4.0 and v4.1;
3106 * there is no need to filter out obsolete ops here because that
3107 * is already done in the decoding phase.
3108 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3109static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3110 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3111 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3144,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3144 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3145 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3146 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3147
3148 /* NFSv4.1 operations */
3149 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3152 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3153 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3154 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3158 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3159 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3161 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3162 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3163 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3164 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3165 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3166 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3167 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3168};
2621 3169
3170/*
3171 * Calculate the total amount of memory that the compound response has taken
3172 * after encoding the current operation.
3173 *
3174 * pad: add on 8 bytes for the next operation's op_code and status so that
3175 * there is room to cache a failure on the next operation.
3176 *
3177 * Compare this length to the session se_fmaxresp_cached.
3178 *
3179 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3180 * will be at least a page and will therefore hold the xdr_buf head.
3181 */
3182static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3183{
3184 int status = 0;
3185 struct xdr_buf *xb = &resp->rqstp->rq_res;
3186 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3187 struct nfsd4_session *session = NULL;
3188 struct nfsd4_slot *slot = resp->cstate.slot;
3189 u32 length, tlen = 0, pad = 8;
3190
3191 if (!nfsd4_has_session(&resp->cstate))
3192 return status;
3193
3194 session = resp->cstate.session;
3195 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3196 return status;
3197
3198 if (resp->opcnt >= args->opcnt)
3199 pad = 0; /* this is the last operation */
3200
3201 if (xb->page_len == 0) {
3202 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3203 } else {
3204 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3205 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3206
3207 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3208 }
3209 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3210 length, xb->page_len, tlen, pad);
3211
3212 if (length <= session->se_fmaxresp_cached)
3213 return status;
3214 else
3215 return nfserr_rep_too_big_to_cache;
3216}
3217
2622void 3218void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3219nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3220{
@@ -2635,6 +3231,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3231 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3232 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3233 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3234 /* nfsd4_check_drc_limit guarantees enough room for error status */
3235 if (!op->status && nfsd4_check_drc_limit(resp))
3236 op->status = nfserr_rep_too_big_to_cache;
2638status: 3237status:
2639 /* 3238 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3239 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3334,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3334 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3335 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3336 BUG_ON(iov->iov_len > PAGE_SIZE);
3337 if (nfsd4_has_session(&resp->cstate)) {
3338 if (resp->cstate.status == nfserr_replay_cache &&
3339 !nfsd4_not_cached(resp)) {
3340 iov->iov_len = resp->cstate.iovlen;
3341 } else {
3342 nfsd4_store_cache_entry(resp);
3343 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3344 resp->cstate.slot->sl_inuse = 0;
3345 }
3346 if (resp->cstate.session)
3347 nfsd4_put_session(resp->cstate.session);
3348 }
2738 return 1; 3349 return 1;
2739} 3350}
2740 3351
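The tail of nfs4svc_encode_compoundres() above ties the encoder into the v4.1 DRC: a detected replay (nfserr_replay_cache) reuses the reply length cached on the slot, while any other outcome stores the fresh reply and releases the slot. The same decision as a hypothetical standalone helper (nfsd4_not_cached(), nfsd4_store_cache_entry() and the slot fields are used as in the hunk):

	static void finish_session_reply(struct nfsd4_compoundres *resp,
					 struct kvec *iov)
	{
		if (resp->cstate.status == nfserr_replay_cache &&
		    !nfsd4_not_cached(resp)) {
			/* replay: resend the reply bytes cached on this slot */
			iov->iov_len = resp->cstate.iovlen;
		} else {
			/* new reply: cache it, then free the slot */
			nfsd4_store_cache_entry(resp);
			resp->cstate.slot->sl_inuse = 0;
		}
	}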
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
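The token parsing added above accepts strings like "+4.1" or "-4.1": the number before the dot must be at least 4 (only NFSv4 has minor versions) and the minor itself must be non-zero. A user-space sketch of the same rule (hypothetical helper; strtol stands in for the kernel's simple_strtol):

	#include <stdlib.h>

	/* Returns 0 if vers names a toggleable minor version, -1 otherwise. */
	static int parse_minor_token(const char *vers, char sign, unsigned long *minor)
	{
		char *minorp;
		long num;

		num = strtol((sign == '+' || sign == '-') ? vers + 1 : vers,
			     &minorp, 0);
		if (*minorp != '.')
			return -1;	/* plain major version, handled elsewhere */
		if (num < 4)
			return -1;	/* only NFSv4 has minor versions */
		*minor = strtoul(minorp + 1, NULL, 0);
		if (*minor == 0)
			return -1;	/* "4.0" cannot be toggled this way */
		return 0;
	}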
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1248 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1249 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1251 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1252 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1253 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7c09852be713..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
 226 * Each session guarantees a negotiated per-slot memory cache for replies,
 227 * which in turn consumes memory beyond what the v2/v3/v4.0 server needs. A
 228 * dedicated NFSv4.1 server might want to use more memory for a DRC than a
 229 * machine running multiple services.
 230 *
 231 * Impose a hard limit on the number of pages for the DRC, which varies
 232 * according to the machine's free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
234 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
235 280
236 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
237 return err; 282 return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
375 return error; 420 return error;
376} 421}
377 422
378static inline void
379update_thread_usage(int busy_threads)
380{
381 unsigned long prev_call;
382 unsigned long diff;
383 int decile;
384
385 spin_lock(&nfsd_call_lock);
386 prev_call = nfsd_last_call;
387 nfsd_last_call = jiffies;
388 decile = busy_threads*10/nfsdstats.th_cnt;
389 if (decile>0 && decile <= 10) {
390 diff = nfsd_last_call - prev_call;
391 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
392 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
393 if (decile == 10)
394 nfsdstats.th_fullcnt++;
395 }
396 spin_unlock(&nfsd_call_lock);
397}
398 423
399/* 424/*
400 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -460,8 +485,6 @@ nfsd(void *vrqstp)
460 continue; 485 continue;
461 } 486 }
462 487
463 update_thread_usage(atomic_read(&nfsd_busy));
464 atomic_inc(&nfsd_busy);
465 488
466 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
467 exp_readlock(); 490 exp_readlock();
@@ -470,8 +493,6 @@ nfsd(void *vrqstp)
470 493
471 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
472 exp_readunlock(); 495 exp_readunlock();
473 update_thread_usage(atomic_read(&nfsd_busy));
474 atomic_dec(&nfsd_busy);
475 } 496 }
476 497
477 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
539 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
540 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
541 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
542 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
543 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
544 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
570 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
571 return 1; 596 return 1;
572} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..6c68ffd6b4bb 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
116 } 116 }
117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
118 /* successfully crossed mount point */ 118 /* successfully crossed mount point */
119 exp_put(exp); 119 /*
120 *expp = exp2; 120 * This is subtle: dentry is *not* under mnt at this point.
121 * The only reason we are safe is that original mnt is pinned
122 * down by exp, so we should dput before putting exp.
123 */
121 dput(dentry); 124 dput(dentry);
122 *dpp = mounts; 125 *dpp = mounts;
126 exp_put(exp);
127 *expp = exp2;
123 } else { 128 } else {
124 exp_put(exp2); 129 exp_put(exp2);
125 dput(mounts); 130 dput(mounts);
@@ -366,8 +371,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 371 }
367 372
368 /* Revoke setuid/setgid on chown */ 373 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 374 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 375 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
376 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 377 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 378 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 379 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +966,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 966static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 967nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 968 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 969 unsigned long *cnt, int *stablep)
964{ 970{
965 struct svc_export *exp; 971 struct svc_export *exp;
966 struct dentry *dentry; 972 struct dentry *dentry;
@@ -974,7 +980,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 980 err = nfserr_perm;
975 981
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 982 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 983 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 984 goto out;
979#endif 985#endif
980 986
@@ -1009,7 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1015 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1016 set_fs(oldfs);
1011 if (host_err >= 0) { 1017 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1018 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1019 fsnotify_modify(file->f_path.dentry);
1014 } 1020 }
1015 1021
@@ -1054,9 +1060,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1054 } 1060 }
1055 1061
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1062 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1063 if (host_err >= 0) {
1058 err = 0; 1064 err = 0;
1059 else 1065 *cnt = host_err;
1066 } else
1060 err = nfserrno(host_err); 1067 err = nfserrno(host_err);
1061out: 1068out:
1062 return err; 1069 return err;
@@ -1098,7 +1105,7 @@ out:
1098 */ 1105 */
1099__be32 1106__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1107nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1108 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1109 int *stablep)
1103{ 1110{
1104 __be32 err = 0; 1111 __be32 err = 0;
@@ -1179,6 +1186,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1186 return 0;
1180} 1187}
1181 1188
 1189/* The HPUX client sometimes creates a file in mode 000 and sets its size to 0.
 1190 * Setting the size to 0 may fail on some filesystems, because the permission
 1191 * check requires WRITE permission and mode 000 grants none.
 1192 * We ignore a resize to 0 on a freshly created file, since its size is
 1193 * already 0 right after creation.
 1194 *
 1195 * Call this only after vfs_create() has been called.
 1196 */
1197static void
1198nfsd_check_ignore_resizing(struct iattr *iap)
1199{
1200 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1201 iap->ia_valid &= ~ATTR_SIZE;
1202}
1203
1182/* 1204/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1205 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1206 * not yet implemented.
@@ -1274,6 +1296,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1296 switch (type) {
1275 case S_IFREG: 1297 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1298 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1299 if (!host_err)
1300 nfsd_check_ignore_resizing(iap);
1277 break; 1301 break;
1278 case S_IFDIR: 1302 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1303 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1451,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1451 /* setattr will sync the child (or not) */
1428 } 1452 }
1429 1453
1454 nfsd_check_ignore_resizing(iap);
1455
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1456 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1457 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1458 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
@@ -1864,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
1864 return 0; 1890 return 0;
1865} 1891}
1866 1892
1867static int nfsd_buffered_readdir(struct file *file, filldir_t func, 1893static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
1868 struct readdir_cd *cdp, loff_t *offsetp) 1894 struct readdir_cd *cdp, loff_t *offsetp)
1869{ 1895{
1870 struct readdir_data buf; 1896 struct readdir_data buf;
1871 struct buffered_dirent *de; 1897 struct buffered_dirent *de;
@@ -1875,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
1875 1901
1876 buf.dirent = (void *)__get_free_page(GFP_KERNEL); 1902 buf.dirent = (void *)__get_free_page(GFP_KERNEL);
1877 if (!buf.dirent) 1903 if (!buf.dirent)
1878 return -ENOMEM; 1904 return nfserrno(-ENOMEM);
1879 1905
1880 offset = *offsetp; 1906 offset = *offsetp;
1881 1907
1882 while (1) { 1908 while (1) {
1909 struct inode *dir_inode = file->f_path.dentry->d_inode;
1883 unsigned int reclen; 1910 unsigned int reclen;
1884 1911
1885 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1912 cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1898,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
1898 if (!size) 1925 if (!size)
1899 break; 1926 break;
1900 1927
1928 /*
1929 * Various filldir functions may end up calling back into
1930 * lookup_one_len() and the file system's ->lookup() method.
1931 * These expect i_mutex to be held, as it would within readdir.
1932 */
1933 host_err = mutex_lock_killable(&dir_inode->i_mutex);
1934 if (host_err)
1935 break;
1936
1901 de = (struct buffered_dirent *)buf.dirent; 1937 de = (struct buffered_dirent *)buf.dirent;
1902 while (size > 0) { 1938 while (size > 0) {
1903 offset = de->offset; 1939 offset = de->offset;
1904 1940
1905 if (func(cdp, de->name, de->namlen, de->offset, 1941 if (func(cdp, de->name, de->namlen, de->offset,
1906 de->ino, de->d_type)) 1942 de->ino, de->d_type))
1907 goto done; 1943 break;
1908 1944
1909 if (cdp->err != nfs_ok) 1945 if (cdp->err != nfs_ok)
1910 goto done; 1946 break;
1911 1947
1912 reclen = ALIGN(sizeof(*de) + de->namlen, 1948 reclen = ALIGN(sizeof(*de) + de->namlen,
1913 sizeof(u64)); 1949 sizeof(u64));
1914 size -= reclen; 1950 size -= reclen;
1915 de = (struct buffered_dirent *)((char *)de + reclen); 1951 de = (struct buffered_dirent *)((char *)de + reclen);
1916 } 1952 }
1953 mutex_unlock(&dir_inode->i_mutex);
1954 if (size > 0) /* We bailed out early */
1955 break;
1956
1917 offset = vfs_llseek(file, 0, SEEK_CUR); 1957 offset = vfs_llseek(file, 0, SEEK_CUR);
1918 } 1958 }
1919 1959
1920 done:
1921 free_page((unsigned long)(buf.dirent)); 1960 free_page((unsigned long)(buf.dirent));
1922 1961
1923 if (host_err) 1962 if (host_err)
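The mutex_lock_killable() added in that hunk restores the locking environment that ->lookup() implementations expect when the filldir callbacks re-enter the filesystem, while still keeping the blocking directory read outside the lock. The shape of the pattern, stripped down (a sketch; the iteration helpers are hypothetical):

	host_err = mutex_lock_killable(&dir_inode->i_mutex);
	if (host_err)
		break;				/* interrupted by a fatal signal */
	while (entries_remain(&buf))		/* hypothetical helper */
		emit_entry(cdp, de);		/* may call lookup_one_len() */
	mutex_unlock(&dir_inode->i_mutex);
	/* the next vfs_read()/vfs_llseek() happens outside the lock */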
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 000000000000..df3e62c1ddc5
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 000000000000..d69e6ae59251
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
1/*
2 * alloc.c - NILFS dat/inode allocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include <linux/fs.h>
28#include <linux/bitops.h>
29#include "mdt.h"
30#include "alloc.h"
31
32
33static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{
36 return (1UL << inode->i_blkbits) /
37 sizeof(struct nilfs_palloc_group_desc);
38}
39
40static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode)
42{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44}
45
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
49
50 mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
51 if (!mi->mi_bgl)
52 return -ENOMEM;
53
54 bgl_lock_init(mi->mi_bgl);
55
56 nilfs_mdt_set_entry_size(inode, entry_size, 0);
57
58 mi->mi_blocks_per_group =
59 DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
60 mi->mi_entries_per_block) + 1;
61 /* Number of blocks in a group including entry blocks and
62 a bitmap block */
63 mi->mi_blocks_per_desc_block =
64 nilfs_palloc_groups_per_desc_block(inode) *
65 mi->mi_blocks_per_group + 1;
66 /* Number of blocks per descriptor including the
67 descriptor block */
68 return 0;
69}
70
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset)
73{
74 __u64 group = nr;
75
76 *offset = do_div(group, nilfs_palloc_entries_per_group(inode));
77 return group;
78}
79
80static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{
83 unsigned long desc_block =
84 group / nilfs_palloc_groups_per_desc_block(inode);
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86}
87
88static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{
91 unsigned long desc_offset =
92 group % nilfs_palloc_groups_per_desc_block(inode);
93 return nilfs_palloc_desc_blkoff(inode, group) + 1 +
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95}
96
97static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc)
100{
101 unsigned long nfree;
102
103 spin_lock(nilfs_mdt_bgl_lock(inode, group));
104 nfree = le32_to_cpu(desc->pg_nfrees);
105 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
106 return nfree;
107}
108
109static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group,
112 struct nilfs_palloc_group_desc *desc,
113 u32 n)
114{
115 spin_lock(nilfs_mdt_bgl_lock(inode, group));
116 le32_add_cpu(&desc->pg_nfrees, n);
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118}
119
120static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{
123 unsigned long group, group_offset;
124
125 group = nilfs_palloc_group(inode, nr, &group_offset);
126
127 return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129}
130
131static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr)
133{
134 struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
135 unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
136 __le32 nfrees;
137
138 nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
139 while (n-- > 0) {
140 desc->pg_nfrees = nfrees;
141 desc++;
142 }
143}
144
145static int nilfs_palloc_get_desc_block(struct inode *inode,
146 unsigned long group,
147 int create, struct buffer_head **bhp)
148{
149 return nilfs_mdt_get_block(inode,
150 nilfs_palloc_desc_blkoff(inode, group),
151 create, nilfs_palloc_desc_block_init, bhp);
152}
153
154static int nilfs_palloc_get_bitmap_block(struct inode *inode,
155 unsigned long group,
156 int create, struct buffer_head **bhp)
157{
158 return nilfs_mdt_get_block(inode,
159 nilfs_palloc_bitmap_blkoff(inode, group),
160 create, NULL, bhp);
161}
162
163int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
164 int create, struct buffer_head **bhp)
165{
166 return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
167 create, NULL, bhp);
168}
169
170static struct nilfs_palloc_group_desc *
171nilfs_palloc_block_get_group_desc(const struct inode *inode,
172 unsigned long group,
173 const struct buffer_head *bh, void *kaddr)
174{
175 return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
176 group % nilfs_palloc_groups_per_desc_block(inode);
177}
178
179static unsigned char *
180nilfs_palloc_block_get_bitmap(const struct inode *inode,
181 const struct buffer_head *bh, void *kaddr)
182{
183 return (unsigned char *)(kaddr + bh_offset(bh));
184}
185
186void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
187 const struct buffer_head *bh, void *kaddr)
188{
189 unsigned long entry_offset, group_offset;
190
191 nilfs_palloc_group(inode, nr, &group_offset);
192 entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
193
194 return kaddr + bh_offset(bh) +
195 entry_offset * NILFS_MDT(inode)->mi_entry_size;
196}
197
198static int nilfs_palloc_find_available_slot(struct inode *inode,
199 unsigned long group,
200 unsigned long target,
201 unsigned char *bitmap,
202 int bsize) /* size in bits */
203{
204 int curr, pos, end, i;
205
206 if (target > 0) {
207 end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
208 if (end > bsize)
209 end = bsize;
210 pos = nilfs_find_next_zero_bit(bitmap, end, target);
211 if (pos < end &&
212 !nilfs_set_bit_atomic(
213 nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
214 return pos;
215 } else
216 end = 0;
217
218 for (i = 0, curr = end;
219 i < bsize;
220 i += BITS_PER_LONG, curr += BITS_PER_LONG) {
221 /* wrap around */
222 if (curr >= bsize)
223 curr = 0;
224 while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
225 != ~0UL) {
226 end = curr + BITS_PER_LONG;
227 if (end > bsize)
228 end = bsize;
229 pos = nilfs_find_next_zero_bit(bitmap, end, curr);
230 if ((pos < end) &&
231 !nilfs_set_bit_atomic(
232 nilfs_mdt_bgl_lock(inode, group), pos,
233 bitmap))
234 return pos;
235 }
236 }
237 return -ENOSPC;
238}
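/*
 * Note on the search above: it is two-phase.  The word-aligned window
 * around the caller's hint (@target) is probed first; the fallback then
 * scans the whole bitmap with wrap-around, skipping fully-set words one
 * unsigned long at a time.  The bit set is retried under the block-group
 * lock because a concurrent allocator can win the bit between
 * nilfs_find_next_zero_bit() and nilfs_set_bit_atomic().
 */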
239
240static unsigned long
241nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
242 unsigned long curr, unsigned long max)
243{
244 return min_t(unsigned long,
245 nilfs_palloc_groups_per_desc_block(inode) -
246 curr % nilfs_palloc_groups_per_desc_block(inode),
247 max - curr + 1);
248}
249
250int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
251 struct nilfs_palloc_req *req)
252{
253 struct buffer_head *desc_bh, *bitmap_bh;
254 struct nilfs_palloc_group_desc *desc;
255 unsigned char *bitmap;
256 void *desc_kaddr, *bitmap_kaddr;
257 unsigned long group, maxgroup, ngroups;
258 unsigned long group_offset, maxgroup_offset;
259 unsigned long n, entries_per_group, groups_per_desc_block;
260 unsigned long i, j;
261 int pos, ret;
262
263 ngroups = nilfs_palloc_groups_count(inode);
264 maxgroup = ngroups - 1;
265 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
266 entries_per_group = nilfs_palloc_entries_per_group(inode);
267 groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
268
269 for (i = 0; i < ngroups; i += n) {
270 if (group >= ngroups) {
271 /* wrap around */
272 group = 0;
273 maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
274 &maxgroup_offset) - 1;
275 }
276 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
277 if (ret < 0)
278 return ret;
279 desc_kaddr = kmap(desc_bh->b_page);
280 desc = nilfs_palloc_block_get_group_desc(
281 inode, group, desc_bh, desc_kaddr);
282 n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
283 maxgroup);
284 for (j = 0; j < n; j++, desc++, group++) {
285 if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
286 > 0) {
287 ret = nilfs_palloc_get_bitmap_block(
288 inode, group, 1, &bitmap_bh);
289 if (ret < 0)
290 goto out_desc;
291 bitmap_kaddr = kmap(bitmap_bh->b_page);
292 bitmap = nilfs_palloc_block_get_bitmap(
293 inode, bitmap_bh, bitmap_kaddr);
294 pos = nilfs_palloc_find_available_slot(
295 inode, group, group_offset, bitmap,
296 entries_per_group);
297 if (pos >= 0) {
298 /* found a free entry */
299 nilfs_palloc_group_desc_add_entries(
300 inode, group, desc, -1);
301 req->pr_entry_nr =
302 entries_per_group * group + pos;
303 kunmap(desc_bh->b_page);
304 kunmap(bitmap_bh->b_page);
305
306 req->pr_desc_bh = desc_bh;
307 req->pr_bitmap_bh = bitmap_bh;
308 return 0;
309 }
310 kunmap(bitmap_bh->b_page);
311 brelse(bitmap_bh);
312 }
313
314 group_offset = 0;
315 }
316
317 kunmap(desc_bh->b_page);
318 brelse(desc_bh);
319 }
320
321 /* no entries left */
322 return -ENOSPC;
323
324 out_desc:
325 kunmap(desc_bh->b_page);
326 brelse(desc_bh);
327 return ret;
328}
329
330void nilfs_palloc_commit_alloc_entry(struct inode *inode,
331 struct nilfs_palloc_req *req)
332{
333 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
334 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
335 nilfs_mdt_mark_dirty(inode);
336
337 brelse(req->pr_bitmap_bh);
338 brelse(req->pr_desc_bh);
339}
340
341void nilfs_palloc_commit_free_entry(struct inode *inode,
342 struct nilfs_palloc_req *req)
343{
344 struct nilfs_palloc_group_desc *desc;
345 unsigned long group, group_offset;
346 unsigned char *bitmap;
347 void *desc_kaddr, *bitmap_kaddr;
348
349 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
350 desc_kaddr = kmap(req->pr_desc_bh->b_page);
351 desc = nilfs_palloc_block_get_group_desc(inode, group,
352 req->pr_desc_bh, desc_kaddr);
353 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
354 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
355 bitmap_kaddr);
356
357 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
358 group_offset, bitmap))
359 printk(KERN_WARNING "%s: entry number %llu already freed\n",
360 __func__, (unsigned long long)req->pr_entry_nr);
361
362 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
363
364 kunmap(req->pr_bitmap_bh->b_page);
365 kunmap(req->pr_desc_bh->b_page);
366
367 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
368 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
369 nilfs_mdt_mark_dirty(inode);
370
371 brelse(req->pr_bitmap_bh);
372 brelse(req->pr_desc_bh);
373}
374
375void nilfs_palloc_abort_alloc_entry(struct inode *inode,
376 struct nilfs_palloc_req *req)
377{
378 struct nilfs_palloc_group_desc *desc;
379 void *desc_kaddr, *bitmap_kaddr;
380 unsigned char *bitmap;
381 unsigned long group, group_offset;
382
383 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
384 desc_kaddr = kmap(req->pr_desc_bh->b_page);
385 desc = nilfs_palloc_block_get_group_desc(inode, group,
386 req->pr_desc_bh, desc_kaddr);
387 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
388 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
389 bitmap_kaddr);
390 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
391 group_offset, bitmap))
392		printk(KERN_WARNING "%s: entry number %llu already freed\n",
393 __func__, (unsigned long long)req->pr_entry_nr);
394
395 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
396
397 kunmap(req->pr_bitmap_bh->b_page);
398 kunmap(req->pr_desc_bh->b_page);
399
400 brelse(req->pr_bitmap_bh);
401 brelse(req->pr_desc_bh);
402
403 req->pr_entry_nr = 0;
404 req->pr_bitmap_bh = NULL;
405 req->pr_desc_bh = NULL;
406}
407
408int nilfs_palloc_prepare_free_entry(struct inode *inode,
409 struct nilfs_palloc_req *req)
410{
411 struct buffer_head *desc_bh, *bitmap_bh;
412 unsigned long group, group_offset;
413 int ret;
414
415 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
416 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
417 if (ret < 0)
418 return ret;
419 ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
420 if (ret < 0) {
421 brelse(desc_bh);
422 return ret;
423 }
424
425 req->pr_desc_bh = desc_bh;
426 req->pr_bitmap_bh = bitmap_bh;
427 return 0;
428}
429
430void nilfs_palloc_abort_free_entry(struct inode *inode,
431 struct nilfs_palloc_req *req)
432{
433 brelse(req->pr_bitmap_bh);
434 brelse(req->pr_desc_bh);
435
436 req->pr_entry_nr = 0;
437 req->pr_bitmap_bh = NULL;
438 req->pr_desc_bh = NULL;
439}
440
441static int
442nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
443{
444 __u64 first, last;
445
446 first = group * nilfs_palloc_entries_per_group(inode);
447 last = first + nilfs_palloc_entries_per_group(inode) - 1;
448 return (nr >= first) && (nr <= last);
449}
450
451int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
452{
453 struct buffer_head *desc_bh, *bitmap_bh;
454 struct nilfs_palloc_group_desc *desc;
455 unsigned char *bitmap;
456 void *desc_kaddr, *bitmap_kaddr;
457 unsigned long group, group_offset;
458 int i, j, n, ret;
459
460 for (i = 0; i < nitems; i += n) {
461 group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
462 ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
463 if (ret < 0)
464 return ret;
465 ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
466 &bitmap_bh);
467 if (ret < 0) {
468 brelse(desc_bh);
469 return ret;
470 }
471 desc_kaddr = kmap(desc_bh->b_page);
472 desc = nilfs_palloc_block_get_group_desc(
473 inode, group, desc_bh, desc_kaddr);
474 bitmap_kaddr = kmap(bitmap_bh->b_page);
475 bitmap = nilfs_palloc_block_get_bitmap(
476 inode, bitmap_bh, bitmap_kaddr);
477 for (j = i, n = 0;
478 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
479 entry_nrs[j]);
480 j++, n++) {
481 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
482 if (!nilfs_clear_bit_atomic(
483 nilfs_mdt_bgl_lock(inode, group),
484 group_offset, bitmap)) {
485 printk(KERN_WARNING
486 "%s: entry number %llu already freed\n",
487 __func__,
488 (unsigned long long)entry_nrs[j]);
489 }
490 }
491 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
492
493 kunmap(bitmap_bh->b_page);
494 kunmap(desc_bh->b_page);
495
496 nilfs_mdt_mark_buffer_dirty(desc_bh);
497 nilfs_mdt_mark_buffer_dirty(bitmap_bh);
498 nilfs_mdt_mark_dirty(inode);
499
500 brelse(bitmap_bh);
501 brelse(desc_bh);
502 }
503 return 0;
504}
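The allocator above follows NILFS's three-phase prepare/commit/abort convention: prepare reserves an entry and leaves the pinned descriptor and bitmap buffers in the request, and the caller later either commits or rolls back. Note that pr_entry_nr doubles as the search hint on input and the allocated number on output. A minimal usage sketch, assuming a metadata-file inode set up with nilfs_palloc_init_blockgroup(); the caller example_alloc_entry() is hypothetical and error handling is abridged:

static int example_alloc_entry(struct inode *inode, __u64 *nrp)
{
	struct nilfs_palloc_req req = { .pr_entry_nr = 0 };	/* 0 = no hint */
	int err;

	err = nilfs_palloc_prepare_alloc_entry(inode, &req);
	if (err)
		return err;	/* e.g. -ENOSPC when every group is full */

	/* ... initialize the entry's block here; on failure call
	 * nilfs_palloc_abort_alloc_entry(inode, &req) instead ... */

	nilfs_palloc_commit_alloc_entry(inode, &req);
	*nrp = req.pr_entry_nr;
	return 0;
}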
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 000000000000..4ace5475c2c7
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
1/*
2 * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#ifndef _NILFS_ALLOC_H
26#define _NILFS_ALLOC_H
27
28#include <linux/types.h>
29#include <linux/buffer_head.h>
30#include <linux/fs.h>
31
32static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode)
34{
35	return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BIT) */);
36}
37
38int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
39int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
40 struct buffer_head **);
41void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *);
43
44/**
45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
49 * @pr_entry_bh: buffer head of the buffer containing translation entries
50 */
51struct nilfs_palloc_req {
52 __u64 pr_entry_nr;
53 struct buffer_head *pr_desc_bh;
54 struct buffer_head *pr_bitmap_bh;
55 struct buffer_head *pr_entry_bh;
56};
57
58int nilfs_palloc_prepare_alloc_entry(struct inode *,
59 struct nilfs_palloc_req *);
60void nilfs_palloc_commit_alloc_entry(struct inode *,
61 struct nilfs_palloc_req *);
62void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
63void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
64int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
65void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
66int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
67
68#define nilfs_set_bit_atomic ext2_set_bit_atomic
69#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
70#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
71
72#endif /* _NILFS_ALLOC_H */
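To make the block-group geometry in alloc.c concrete, consider 4 KiB blocks (i_blkbits = 12) and, purely for illustration, 128-byte entries; the descriptor size is assumed to be the single __le32 counter that the pg_nfrees accesses imply. Under those assumptions the offsets work out as follows:

/* Worked example under the assumptions above (illustration only):
 * entries_per_group     = 1UL << (12 + 3)              = 32768
 * entries_per_block     = 4096 / 128                   = 32
 * blocks_per_group      = DIV_ROUND_UP(32768, 32) + 1  = 1025    (+1 bitmap)
 * groups_per_desc_block = 4096 / 4                     = 1024
 * blocks_per_desc_block = 1024 * 1025 + 1              = 1049601 (+1 desc)
 *
 * For a group g and an entry number nr:
 *   desc_blkoff(g)   = (g / 1024) * 1049601
 *   bitmap_blkoff(g) = desc_blkoff(g) + 1 + (g % 1024) * 1025
 *   entry_blkoff(nr) = bitmap_blkoff(g) + 1 + (nr % 32768) / 32
 */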
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 000000000000..064279e33bbb
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,788 @@
1/*
2 * bmap.c - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include "nilfs.h"
27#include "bmap.h"
28#include "sb.h"
29#include "btnode.h"
30#include "mdt.h"
31#include "dat.h"
32#include "alloc.h"
33
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp)
36{
37 __u64 ptr;
38 int ret;
39
40 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0)
43 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
46 if (ret < 0)
47 goto out;
48 *ptrp = ptr;
49 }
50
51 out:
52 up_read(&bmap->b_sem);
53 return ret;
54}
55
56
57/**
58 * nilfs_bmap_lookup - find a record
59 * @bmap: bmap
60 * @key: key
61 * @recp: pointer to record
62 *
63 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
64 * @bmap.
65 *
66 * Return Value: On success, 0 is returned and the record associated with @key
67 * is stored in the place pointed by @recp. On error, one of the following
68 * negative error codes is returned.
69 *
70 * %-EIO - I/O error.
71 *
72 * %-ENOMEM - Insufficient amount of memory available.
73 *
74 * %-ENOENT - A record associated with @key does not exist.
75 */
76int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
77 unsigned long key,
78 unsigned long *recp)
79{
80 __u64 ptr;
81 int ret;
82
83 /* XXX: use macro for level 1 */
84 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
85 if (recp != NULL)
86 *recp = ptr;
87 return ret;
88}
89
90static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
91{
92 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
93 __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
94 int ret, n;
95
96 if (bmap->b_ops->bop_check_insert != NULL) {
97 ret = bmap->b_ops->bop_check_insert(bmap, key);
98 if (ret > 0) {
99 n = bmap->b_ops->bop_gather_data(
100 bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
101 if (n < 0)
102 return n;
103 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n,
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108
109 return ret;
110 } else if (ret < 0)
111 return ret;
112 }
113
114 return bmap->b_ops->bop_insert(bmap, key, ptr);
115}
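/*
 * Note: a positive return from bop_check_insert() means the inline
 * direct map would overflow, so the existing key/pointer pairs are
 * gathered and re-inserted through nilfs_btree_convert_and_insert(),
 * and NILFS_BMAP_LARGE records the representation switch in u_flags.
 */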
116
117/**
118 * nilfs_bmap_insert - insert a new key-record pair into a bmap
119 * @bmap: bmap
120 * @key: key
121 * @rec: record
122 *
123 * Description: nilfs_bmap_insert() inserts the new key-record pair specified
124 * by @key and @rec into @bmap.
125 *
126 * Return Value: On success, 0 is returned. On error, one of the following
127 * negative error codes is returned.
128 *
129 * %-EIO - I/O error.
130 *
131 * %-ENOMEM - Insufficient amount of memory available.
132 *
133 * %-EEXIST - A record associated with @key already exists.
134 */
135int nilfs_bmap_insert(struct nilfs_bmap *bmap,
136 unsigned long key,
137 unsigned long rec)
138{
139 int ret;
140
141 down_write(&bmap->b_sem);
142 ret = nilfs_bmap_do_insert(bmap, key, rec);
143 up_write(&bmap->b_sem);
144 return ret;
145}
146
147static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
148{
149 __u64 keys[NILFS_BMAP_LARGE_LOW + 1];
150 __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
151 int ret, n;
152
153 if (bmap->b_ops->bop_check_delete != NULL) {
154 ret = bmap->b_ops->bop_check_delete(bmap, key);
155 if (ret > 0) {
156 n = bmap->b_ops->bop_gather_data(
157 bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
158 if (n < 0)
159 return n;
160 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n,
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165
166 return ret;
167 } else if (ret < 0)
168 return ret;
169 }
170
171 return bmap->b_ops->bop_delete(bmap, key);
172}
173
174int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
175{
176 __u64 lastkey;
177 int ret;
178
179 down_read(&bmap->b_sem);
180 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
181 if (!ret)
182 *key = lastkey;
183 up_read(&bmap->b_sem);
184 return ret;
185}
186
187/**
188 * nilfs_bmap_delete - delete a key-record pair from a bmap
189 * @bmap: bmap
190 * @key: key
191 *
192 * Description: nilfs_bmap_delete() deletes the key-record pair specified by
193 * @key from @bmap.
194 *
195 * Return Value: On success, 0 is returned. On error, one of the following
196 * negative error codes is returned.
197 *
198 * %-EIO - I/O error.
199 *
200 * %-ENOMEM - Insufficient amount of memory available.
201 *
202 * %-ENOENT - A record associated with @key does not exist.
203 */
204int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
205{
206 int ret;
207
208 down_write(&bmap->b_sem);
209 ret = nilfs_bmap_do_delete(bmap, key);
210 up_write(&bmap->b_sem);
211 return ret;
212}
213
214static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
215{
216 __u64 lastkey;
217 int ret;
218
219 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
220 if (ret < 0) {
221 if (ret == -ENOENT)
222 ret = 0;
223 return ret;
224 }
225
226 while (key <= lastkey) {
227 ret = nilfs_bmap_do_delete(bmap, lastkey);
228 if (ret < 0)
229 return ret;
230 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
231 if (ret < 0) {
232 if (ret == -ENOENT)
233 ret = 0;
234 return ret;
235 }
236 }
237 return 0;
238}
239
240/**
241 * nilfs_bmap_truncate - truncate a bmap to a specified key
242 * @bmap: bmap
243 * @key: key
244 *
245 * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
246 * greater than or equal to @key from @bmap.
247 *
248 * Return Value: On success, 0 is returned. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-EIO - I/O error.
252 *
253 * %-ENOMEM - Insufficient amount of memory available.
254 */
255int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
256{
257 int ret;
258
259 down_write(&bmap->b_sem);
260 ret = nilfs_bmap_do_truncate(bmap, key);
261 up_write(&bmap->b_sem);
262 return ret;
263}
264
265/**
266 * nilfs_bmap_clear - free resources a bmap holds
267 * @bmap: bmap
268 *
269 * Description: nilfs_bmap_clear() frees resources associated with @bmap.
270 */
271void nilfs_bmap_clear(struct nilfs_bmap *bmap)
272{
273 down_write(&bmap->b_sem);
274 if (bmap->b_ops->bop_clear != NULL)
275 bmap->b_ops->bop_clear(bmap);
276 up_write(&bmap->b_sem);
277}
278
279/**
280 * nilfs_bmap_propagate - propagate dirty state
281 * @bmap: bmap
282 * @bh: buffer head
283 *
284 * Description: nilfs_bmap_propagate() marks the buffers that directly or
285 * indirectly refer to the block specified by @bh dirty.
286 *
287 * Return Value: On success, 0 is returned. On error, one of the following
288 * negative error codes is returned.
289 *
290 * %-EIO - I/O error.
291 *
292 * %-ENOMEM - Insufficient amount of memory available.
293 */
294int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
295{
296 int ret;
297
298 down_write(&bmap->b_sem);
299 ret = bmap->b_ops->bop_propagate(bmap, bh);
300 up_write(&bmap->b_sem);
301 return ret;
302}
303
304/**
305 * nilfs_bmap_lookup_dirty_buffers - collect the dirty buffers of a bmap
306 * @bmap: bmap
307 * @listp: pointer to buffer head list
308 */
309void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
310 struct list_head *listp)
311{
312 if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
313 bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
314}
315
316/**
317 * nilfs_bmap_assign - assign a new block number to a block
318 * @bmap: bmap
319 * @bh: pointer to buffer head
320 * @blocknr: block number
321 * @binfo: block information
322 *
323 * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
324 * buffer specified by @bh.
325 *
326 * Return Value: On success, 0 is returned and the buffer head of a newly
327 * created buffer and the block information associated with the buffer are
328 * stored in the places pointed to by @bh and @binfo, respectively. On error, one
329 * of the following negative error codes is returned.
330 *
331 * %-EIO - I/O error.
332 *
333 * %-ENOMEM - Insufficient amount of memory available.
334 */
335int nilfs_bmap_assign(struct nilfs_bmap *bmap,
336 struct buffer_head **bh,
337 unsigned long blocknr,
338 union nilfs_binfo *binfo)
339{
340 int ret;
341
342 down_write(&bmap->b_sem);
343 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
344 up_write(&bmap->b_sem);
345 return ret;
346}
347
348/**
349 * nilfs_bmap_mark - mark block dirty
350 * @bmap: bmap
351 * @key: key
352 * @level: level
353 *
354 * Description: nilfs_bmap_mark() marks the block specified by @key and @level
355 * as dirty.
356 *
357 * Return Value: On success, 0 is returned. On error, one of the following
358 * negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 */
364int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
365{
366 int ret;
367
368 if (bmap->b_ops->bop_mark == NULL)
369 return 0;
370
371 down_write(&bmap->b_sem);
372 ret = bmap->b_ops->bop_mark(bmap, key, level);
373 up_write(&bmap->b_sem);
374 return ret;
375}
376
377/**
378 * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
379 * @bmap: bmap
380 *
381 * Description: nilfs_bmap_test_and_clear_dirty() atomically tests and
382 * clears the dirty state of @bmap.
383 *
384 * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
385 */
386int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
387{
388 int ret;
389
390 down_write(&bmap->b_sem);
391 ret = nilfs_bmap_dirty(bmap);
392 nilfs_bmap_clear_dirty(bmap);
393 up_write(&bmap->b_sem);
394 return ret;
395}
396
397
398/*
399 * Internal use only
400 */
401
402void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
403{
404 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
405 if (NILFS_MDT(bmap->b_inode))
406 nilfs_mdt_mark_dirty(bmap->b_inode);
407 else
408 mark_inode_dirty(bmap->b_inode);
409}
410
411void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
412{
413 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
414 if (NILFS_MDT(bmap->b_inode))
415 nilfs_mdt_mark_dirty(bmap->b_inode);
416 else
417 mark_inode_dirty(bmap->b_inode);
418}
419
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh)
454{
455 struct buffer_head *pbh;
456 __u64 key;
457
458 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
459 bmap->b_inode->i_blkbits);
460 for (pbh = page_buffers(bh->b_page); pbh != bh;
461 pbh = pbh->b_this_page, key++);
462
463 return key;
464}
465
466__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
467{
468 __s64 diff;
469
470 diff = key - bmap->b_last_allocated_key;
471 if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
472 (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
473 (bmap->b_last_allocated_ptr + diff > 0))
474 return bmap->b_last_allocated_ptr + diff;
475 else
476 return NILFS_BMAP_INVALID_PTR;
477}
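/*
 * Note: this heuristic biases a new pointer to sit at the same signed
 * distance from the last allocation as the keys do, so blocks that are
 * close together in a file tend to receive nearby DAT entries.
 */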
478
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
484#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{
487 struct inode *dat = nilfs_bmap_get_dat(bmap);
488 unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
489 unsigned long group = bmap->b_inode->i_ino / entries_per_group;
490
491 return group * entries_per_group +
492 (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
493 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494}
495
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req)
498{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500}
501
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512}
513
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
515 union nilfs_bmap_ptr_req *req)
516{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
518}
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
529 union nilfs_bmap_ptr_req *req)
530{
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
532}
533
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req)
536{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538}
539
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
550}
551
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req)
554{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556}
557
558int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
559 sector_t blocknr)
560{
561 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
562}
563
564int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
565{
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567}
568
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq)
572{
573 int ret;
574
575 ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
576 if (ret < 0)
577 return ret;
578 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
579 if (ret < 0)
580 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
581
582 return ret;
583}
584
585void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
586 union nilfs_bmap_ptr_req *oldreq,
587 union nilfs_bmap_ptr_req *newreq)
588{
589 bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
590 bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
591}
592
593void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
594 union nilfs_bmap_ptr_req *oldreq,
595 union nilfs_bmap_ptr_req *newreq)
596{
597 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
598 bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
599}
600
601static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
602 __u64 *ptrp)
603{
604 sector_t blocknr;
605 int ret;
606
607 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
608 if (ret < 0)
609 return ret;
610 if (ptrp != NULL)
611 *ptrp = blocknr;
612 return 0;
613}
614
615static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
616 union nilfs_bmap_ptr_req *req)
617{
618 /* ignore target ptr */
619 req->bpr_ptr = bmap->b_last_allocated_ptr++;
620 return 0;
621}
622
623static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
624 union nilfs_bmap_ptr_req *req)
625{
626 /* do nothing */
627}
628
629static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
630 union nilfs_bmap_ptr_req *req)
631{
632 bmap->b_last_allocated_ptr--;
633}
634
635static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
636 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
637 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
638 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
639 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
640 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
641 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
642 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
643 .bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
644 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
645
646 .bpop_translate = nilfs_bmap_translate_v,
647};
648
649static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
650 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
651 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
652 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
653 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
654 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
655 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
656 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
657 .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
658 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
659
660 .bpop_translate = nilfs_bmap_translate_v,
661};
662
663static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
664 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
665 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
666 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
667 .bpop_prepare_start_ptr = NULL,
668 .bpop_commit_start_ptr = NULL,
669 .bpop_abort_start_ptr = NULL,
670 .bpop_prepare_end_ptr = NULL,
671 .bpop_commit_end_ptr = NULL,
672 .bpop_abort_end_ptr = NULL,
673
674 .bpop_translate = NULL,
675};
676
677static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
678 .bpop_prepare_alloc_ptr = NULL,
679 .bpop_commit_alloc_ptr = NULL,
680 .bpop_abort_alloc_ptr = NULL,
681 .bpop_prepare_start_ptr = NULL,
682 .bpop_commit_start_ptr = NULL,
683 .bpop_abort_start_ptr = NULL,
684 .bpop_prepare_end_ptr = NULL,
685 .bpop_commit_end_ptr = NULL,
686 .bpop_abort_end_ptr = NULL,
687
688 .bpop_translate = NULL,
689};
690
691static struct lock_class_key nilfs_bmap_dat_lock_key;
692
693/**
694 * nilfs_bmap_read - read a bmap from an inode
695 * @bmap: bmap
696 * @raw_inode: on-disk inode
697 *
698 * Description: nilfs_bmap_read() initializes the bmap @bmap.
699 *
700 * Return Value: On success, 0 is returned. On error, the following negative
701 * error code is returned.
702 *
703 * %-ENOMEM - Insufficient amount of memory available.
704 */
705int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
706{
707 if (raw_inode == NULL)
708 memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
709 else
710 memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
711
712 init_rwsem(&bmap->b_sem);
713 bmap->b_state = 0;
714 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
715 switch (bmap->b_inode->i_ino) {
716 case NILFS_DAT_INO:
717 bmap->b_pops = &nilfs_bmap_ptr_ops_p;
718 bmap->b_last_allocated_key = 0; /* XXX: use macro */
719 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
720 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
721 break;
722 case NILFS_CPFILE_INO:
723 case NILFS_SUFILE_INO:
724 bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
725 bmap->b_last_allocated_key = 0; /* XXX: use macro */
726 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
727 break;
728 default:
729 bmap->b_pops = &nilfs_bmap_ptr_ops_v;
730 bmap->b_last_allocated_key = 0; /* XXX: use macro */
731 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
732 break;
733 }
734
735 return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
736 nilfs_btree_init(bmap,
737 NILFS_BMAP_LARGE_LOW,
738 NILFS_BMAP_LARGE_HIGH) :
739 nilfs_direct_init(bmap,
740 NILFS_BMAP_SMALL_LOW,
741 NILFS_BMAP_SMALL_HIGH);
742}
743
744/**
745 * nilfs_bmap_write - write back a bmap to an inode
746 * @bmap: bmap
747 * @raw_inode: on-disk inode
748 *
749 * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
750 */
751void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
752{
753 down_write(&bmap->b_sem);
754 memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
755 NILFS_INODE_BMAP_SIZE * sizeof(__le64));
756 if (bmap->b_inode->i_ino == NILFS_DAT_INO)
757 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
758
759 up_write(&bmap->b_sem);
760}
761
762void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
763{
764 memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
765 init_rwsem(&bmap->b_sem);
766 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
767 bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
768 bmap->b_last_allocated_key = 0;
769 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
770 bmap->b_state = 0;
771 nilfs_btree_init_gc(bmap);
772}
773
774void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
775{
776 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
777 init_rwsem(&gcbmap->b_sem);
778 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
779 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
780}
781
782void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
783{
784 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
785 init_rwsem(&bmap->b_sem);
786 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
787 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
788}
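Taken together, bmap exposes a small locked facade: lookup takes the semaphore shared, while insert, delete, and truncate take it exclusively and may transparently convert between the direct-map and B-tree representations. A minimal sketch of mapping one file block; the caller is hypothetical and assumes it already holds a valid struct nilfs_bmap pointer, which the rest of the tree (outside this excerpt) obtains from the in-memory inode:

static int example_map_block(struct nilfs_bmap *bmap, unsigned long blkoff,
			     unsigned long blocknr)
{
	unsigned long ptr;
	int err;

	err = nilfs_bmap_lookup(bmap, blkoff, &ptr);
	if (!err)
		return -EEXIST;		/* already mapped to ptr */
	if (err != -ENOENT)
		return err;		/* -EIO, -ENOMEM, ... */

	return nilfs_bmap_insert(bmap, blkoff, blocknr);
}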
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
1/*
2 * bmap.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_H
24#define _NILFS_BMAP_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "alloc.h"
31
32#define NILFS_BMAP_INVALID_PTR 0
33
34#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
35#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
36#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
37#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
38
39#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
40
41
42struct nilfs_bmap;
43
44/**
45 * union nilfs_bmap_ptr_req - request for bmap ptr
46 * @bpr_ptr: bmap pointer
47 * @bpr_req: request for persistent allocator
48 */
49union nilfs_bmap_ptr_req {
50 __u64 bpr_ptr;
51 struct nilfs_palloc_req bpr_req;
52};
53
54/**
55 * struct nilfs_bmap_stats - bmap statistics
56 * @bs_nblocks: number of blocks created or deleted
57 */
58struct nilfs_bmap_stats {
59 unsigned int bs_nblocks;
60};
61
62/**
63 * struct nilfs_bmap_operations - bmap operation table
64 */
65struct nilfs_bmap_operations {
66 int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
67 int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
68 int (*bop_delete)(struct nilfs_bmap *, __u64);
69 void (*bop_clear)(struct nilfs_bmap *);
70
71 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
72 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
73 struct list_head *);
74
75 int (*bop_assign)(struct nilfs_bmap *,
76 struct buffer_head **,
77 sector_t,
78 union nilfs_binfo *);
79 int (*bop_mark)(struct nilfs_bmap *, __u64, int);
80
81 /* The following functions are internal use only. */
82 int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
83 int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
84 int (*bop_check_delete)(struct nilfs_bmap *, __u64);
85 int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
86};
87
88
89/**
90 * struct nilfs_bmap_ptr_operations - bmap ptr operation table
91 */
92struct nilfs_bmap_ptr_operations {
93 int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
94 union nilfs_bmap_ptr_req *);
95 void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
96 union nilfs_bmap_ptr_req *);
97 void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
98 union nilfs_bmap_ptr_req *);
99 int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
100 union nilfs_bmap_ptr_req *);
101 void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
102 union nilfs_bmap_ptr_req *,
103 sector_t);
104 void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
105 union nilfs_bmap_ptr_req *);
106 int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
107 union nilfs_bmap_ptr_req *);
108 void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
109 union nilfs_bmap_ptr_req *);
110 void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
111 union nilfs_bmap_ptr_req *);
112
113 int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
114};
115
116
117#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
118#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
119#define NILFS_BMAP_NEW_PTR_INIT \
120 (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
121
122static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
123{
124 return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
125}
126
127
128/**
129 * struct nilfs_bmap - bmap structure
130 * @b_u: raw data
131 * @b_sem: semaphore
132 * @b_inode: owner of bmap
133 * @b_ops: bmap operation table
134 * @b_pops: bmap ptr operation table
135 * @b_low: low watermark of conversion
136 * @b_high: high watermark of conversion
137 * @b_last_allocated_key: last allocated key for data block
138 * @b_last_allocated_ptr: last allocated ptr for data block
139 * @b_state: state
140 */
141struct nilfs_bmap {
142 union {
143 __u8 u_flags;
144 __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
145 } b_u;
146 struct rw_semaphore b_sem;
147 struct inode *b_inode;
148 const struct nilfs_bmap_operations *b_ops;
149 const struct nilfs_bmap_ptr_operations *b_pops;
150 __u64 b_low;
151 __u64 b_high;
152 __u64 b_last_allocated_key;
153 __u64 b_last_allocated_ptr;
154 int b_state;
155};
156
157/* state */
158#define NILFS_BMAP_DIRTY 0x00000001
159
160
161int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
162int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
163void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
164int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
165int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
166int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
167int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
168int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
169void nilfs_bmap_clear(struct nilfs_bmap *);
170int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
171void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
172int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
173 unsigned long, union nilfs_binfo *);
174int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
175int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
176
177void nilfs_bmap_init_gc(struct nilfs_bmap *);
178void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
179void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
180
181
182/*
183 * Internal use only
184 */
185
186int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
187int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
188
189
190__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
191 const struct buffer_head *);
192
193__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
194__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
195
196int nilfs_bmap_prepare_update(struct nilfs_bmap *,
197 union nilfs_bmap_ptr_req *,
198 union nilfs_bmap_ptr_req *);
199void nilfs_bmap_commit_update(struct nilfs_bmap *,
200 union nilfs_bmap_ptr_req *,
201 union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_update(struct nilfs_bmap *,
203 union nilfs_bmap_ptr_req *,
204 union nilfs_bmap_ptr_req *);
205
206void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
207void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
208
209
210int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
211 struct buffer_head **);
212void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
213int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
214 struct buffer_head **);
215void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
216
217
218/* Assume that bmap semaphore is locked. */
219static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
220{
221 return !!(bmap->b_state & NILFS_BMAP_DIRTY);
222}
223
224/* Assume that bmap semaphore is locked. */
225static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
226{
227 bmap->b_state |= NILFS_BMAP_DIRTY;
228}
229
230/* Assume that bmap semaphore is locked. */
231static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
232{
233 bmap->b_state &= ~NILFS_BMAP_DIRTY;
234}
235
236
237#define NILFS_BMAP_LARGE 0x1
238
239#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN
240#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX
241#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX
242#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX
243
244#endif /* _NILFS_BMAP_H */
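The SMALL/LARGE watermarks at the bottom of this header drive the representation switch seen in bmap.c: a bmap stays a direct map while its keys fit within the small range, and becomes a B-tree (flagged NILFS_BMAP_LARGE) once an insert would exceed it. A hedged illustration of how a caller might test the current representation, using only names defined in this excerpt:

/* Illustration only: reads the flag byte that nilfs_bmap_do_insert()
 * sets after a successful direct-to-btree conversion.  The bmap
 * semaphore should be held, as with the other b_state helpers. */
static inline int example_bmap_is_large(const struct nilfs_bmap *bmap)
{
	return !!(bmap->b_u.u_flags & NILFS_BMAP_LARGE);
}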
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 000000000000..d41509bff47b
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * union nilfs_bmap_union - block mapping union
32 * @bi_bmap: bmap structure
33 * @bi_direct: direct map structure
34 * @bi_btree: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
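The GC-DAT helpers in bmap.c copy sizeof(union nilfs_bmap_union) bytes between bmaps, which is only safe because every representation (base bmap, direct map, B-tree) is a member of this union and embeds the base bmap at offset zero. A hedged compile-time guard one could add to document that dependency; it is illustrative, not present in the source, and the member names d_bmap/bt_bmap are assumptions inferred from the naming convention, since direct.h and btree.h are outside this excerpt:

/* Illustration: both specialized forms must start with the base bmap,
 * or the memcpy()-based gcdat handoff in bmap.c would corrupt state. */
static inline void example_bmap_union_check(void)
{
	BUILD_BUG_ON(offsetof(struct nilfs_direct, d_bmap) != 0);
	BUILD_BUG_ON(offsetof(struct nilfs_btree, bt_bmap) != 0);
}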
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
1/*
2 * btnode.c - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * This file was originally written by Seiji Kihara <kihara@osrg.net>
21 * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
22 * stabilization and simplification.
23 *
24 */
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/mm.h>
29#include <linux/backing-dev.h>
30#include "nilfs.h"
31#include "mdt.h"
32#include "dat.h"
33#include "page.h"
34#include "btnode.h"
35
36
37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list);
42 spin_lock_init(&btnc->private_lock);
43
44 spin_lock_init(&btnc->i_mmap_lock);
45 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47}
48
49static struct address_space_operations def_btnode_aops;
50
51void nilfs_btnode_cache_init(struct address_space *btnc)
52{
53 btnc->host = NULL; /* can safely set to host inode ? */
54 btnc->flags = 0;
55 mapping_set_gfp_mask(btnc, GFP_NOFS);
56 btnc->assoc_mapping = NULL;
57 btnc->backing_dev_info = &default_backing_dev_info;
58 btnc->a_ops = &def_btnode_aops;
59}
60
61void nilfs_btnode_cache_clear(struct address_space *btnc)
62{
63 invalidate_mapping_pages(btnc, 0, -1);
64 truncate_inode_pages(btnc, 0);
65}
66
67int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
68 sector_t pblocknr, struct buffer_head **pbh,
69 int newblk)
70{
71 struct buffer_head *bh;
72 struct inode *inode = NILFS_BTNC_I(btnc);
73 int err;
74
75 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
76 if (unlikely(!bh))
77 return -ENOMEM;
78
79 err = -EEXIST; /* internal code */
80 if (newblk) {
81 if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
82 buffer_dirty(bh))) {
83 brelse(bh);
84 BUG();
85 }
86 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
87 bh->b_blocknr = blocknr;
88 set_buffer_mapped(bh);
89 set_buffer_uptodate(bh);
90 goto found;
91 }
92
93 if (buffer_uptodate(bh) || buffer_dirty(bh))
94 goto found;
95
96 if (pblocknr == 0) {
97 pblocknr = blocknr;
98 if (inode->i_ino != NILFS_DAT_INO) {
99 struct inode *dat =
100 nilfs_dat_inode(NILFS_I_NILFS(inode));
101
102 /* blocknr is a virtual block number */
103 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
104 if (unlikely(err)) {
105 brelse(bh);
106 goto out_locked;
107 }
108 }
109 }
110 lock_buffer(bh);
111 if (buffer_uptodate(bh)) {
112 unlock_buffer(bh);
113 err = -EEXIST; /* internal code */
114 goto found;
115 }
116 set_buffer_mapped(bh);
117 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
118 bh->b_blocknr = pblocknr; /* set block address for read */
119 bh->b_end_io = end_buffer_read_sync;
120 get_bh(bh);
121 submit_bh(READ, bh);
122 bh->b_blocknr = blocknr; /* set back to the given block address */
123 err = 0;
124found:
125 *pbh = bh;
126
127out_locked:
128 unlock_page(bh->b_page);
129 page_cache_release(bh->b_page);
130 return err;
131}
132
133int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
134 sector_t pblocknr, struct buffer_head **pbh, int newblk)
135{
136 struct buffer_head *bh;
137 int err;
138
139 err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
140 if (err == -EEXIST) /* internal code (cache hit) */
141 return 0;
142 if (unlikely(err))
143 return err;
144
145 bh = *pbh;
146 wait_on_buffer(bh);
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151 return 0;
152}
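/*
 * Note: keeping nilfs_btnode_submit_block() as a separate entry point
 * lets callers start node reads without blocking (e.g. for readahead),
 * while this wrapper is the synchronous path; the internal -EEXIST
 * code is the cache-hit fast path that skips the wait entirely.
 */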
153
154/**
155 * nilfs_btnode_delete - delete B-tree node buffer
156 * @bh: buffer to be deleted
157 *
158 * nilfs_btnode_delete() invalidates the specified buffer and deletes the
159 * page containing it once the page is no longer busy.
160 */
161void nilfs_btnode_delete(struct buffer_head *bh)
162{
163 struct address_space *mapping;
164 struct page *page = bh->b_page;
165 pgoff_t index = page_index(page);
166 int still_dirty;
167
168 page_cache_get(page);
169 lock_page(page);
170 wait_on_page_writeback(page);
171
172 nilfs_forget_buffer(bh);
173 still_dirty = PageDirty(page);
174 mapping = page->mapping;
175 unlock_page(page);
176 page_cache_release(page);
177
178 if (!still_dirty && mapping)
179 invalidate_inode_pages2_range(mapping, index, index);
180}
181
182/**
183 * nilfs_btnode_prepare_change_key
184 * prepare to move the contents of the block at the old key to the new key.
185 * the old buffer will not be removed, but it might be reused for the new
186 * buffer.  might return -ENOMEM on memory allocation failure, or -EIO on
187 * disk read errors.
188 */
189int nilfs_btnode_prepare_change_key(struct address_space *btnc,
190 struct nilfs_btnode_chkey_ctxt *ctxt)
191{
192 struct buffer_head *obh, *nbh;
193 struct inode *inode = NILFS_BTNC_I(btnc);
194 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
195 int err;
196
197 if (oldkey == newkey)
198 return 0;
199
200 obh = ctxt->bh;
201 ctxt->newbh = NULL;
202
203 if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
204 lock_page(obh->b_page);
205 /*
206 * We cannot call radix_tree_preload for the kernels older
207 * than 2.6.23, because it is not exported for modules.
208 */
209 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
210 if (err)
211 goto failed_unlock;
212 /* BUG_ON(oldkey != obh->b_page->index); */
213 if (unlikely(oldkey != obh->b_page->index))
214 NILFS_PAGE_BUG(obh->b_page,
215 "invalid oldkey %lld (newkey=%lld)",
216 (unsigned long long)oldkey,
217 (unsigned long long)newkey);
218
219retry:
220 spin_lock_irq(&btnc->tree_lock);
221 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
222 spin_unlock_irq(&btnc->tree_lock);
223 /*
224 * Note: page->index will not change to newkey until
225 * nilfs_btnode_commit_change_key() will be called.
226 * To protect the page in intermediate state, the page lock
227 * is held.
228 */
229 radix_tree_preload_end();
230 if (!err)
231 return 0;
232 else if (err != -EEXIST)
233 goto failed_unlock;
234
235 err = invalidate_inode_pages2_range(btnc, newkey, newkey);
236 if (!err)
237 goto retry;
238 /* fallback to copy mode */
239 unlock_page(obh->b_page);
240 }
241
242 err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
243 if (likely(!err)) {
244 BUG_ON(nbh == obh);
245 ctxt->newbh = nbh;
246 }
247 return err;
248
249 failed_unlock:
250 unlock_page(obh->b_page);
251 return err;
252}
253
254/**
255 * nilfs_btnode_commit_change_key
256 * commit the change_key operation prepared by prepare_change_key().
257 */
258void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 struct nilfs_btnode_chkey_ctxt *ctxt)
260{
261 struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
262 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
263 struct page *opage;
264
265 if (oldkey == newkey)
266 return;
267
268 if (nbh == NULL) { /* blocksize == pagesize */
269 opage = obh->b_page;
270 if (unlikely(oldkey != opage->index))
271 NILFS_PAGE_BUG(opage,
272 "invalid oldkey %lld (newkey=%lld)",
273 (unsigned long long)oldkey,
274 (unsigned long long)newkey);
275 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
276 BUG();
277
278 spin_lock_irq(&btnc->tree_lock);
279 radix_tree_delete(&btnc->page_tree, oldkey);
280 radix_tree_tag_set(&btnc->page_tree, newkey,
281 PAGECACHE_TAG_DIRTY);
282 spin_unlock_irq(&btnc->tree_lock);
283
284 opage->index = obh->b_blocknr = newkey;
285 unlock_page(opage);
286 } else {
287 nilfs_copy_buffer(nbh, obh);
288 nilfs_btnode_mark_dirty(nbh);
289
290 nbh->b_blocknr = newkey;
291 ctxt->bh = nbh;
292 nilfs_btnode_delete(obh); /* will decrement bh->b_count */
293 }
294}
295
296/**
297 * nilfs_btnode_abort_change_key - abort the change of a btnode key
298 * Abort the change-key operation prepared by nilfs_btnode_prepare_change_key().
299 */
300void nilfs_btnode_abort_change_key(struct address_space *btnc,
301 struct nilfs_btnode_chkey_ctxt *ctxt)
302{
303 struct buffer_head *nbh = ctxt->newbh;
304 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
305
306 if (oldkey == newkey)
307 return;
308
309 if (nbh == NULL) { /* blocksize == pagesize */
310 spin_lock_irq(&btnc->tree_lock);
311 radix_tree_delete(&btnc->page_tree, newkey);
312 spin_unlock_irq(&btnc->tree_lock);
313 unlock_page(ctxt->bh->b_page);
314 } else
315 brelse(nbh);
316}
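/*
 * The three change-key helpers above form a prepare/commit/abort
 * protocol.  A minimal caller sketch follows; do_update() is a
 * placeholder for whatever operation uses the new key, and the real
 * callers are the _update_v helpers in btree.c:
 *
 *	struct nilfs_btnode_chkey_ctxt ctxt = {
 *		.oldkey = old_blocknr,
 *		.newkey = new_blocknr,
 *		.bh = bh,
 *	};
 *	err = nilfs_btnode_prepare_change_key(btnc, &ctxt);
 *	if (err)
 *		return err;	/* nothing to undo yet */
 *	if (do_update() < 0)
 *		nilfs_btnode_abort_change_key(btnc, &ctxt);
 *	else
 *		nilfs_btnode_commit_change_key(btnc, &ctxt);
 */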
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 000000000000..35faa86444a7
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
1/*
2 * btnode.h - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_BTNODE_H
25#define _NILFS_BTNODE_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/backing-dev.h>
31
32
33struct nilfs_btnode_chkey_ctxt {
34 __u64 oldkey;
35 __u64 newkey;
36 struct buffer_head *bh;
37 struct buffer_head *newbh;
38};
39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *);
42void nilfs_btnode_cache_clear(struct address_space *);
43int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
44 struct buffer_head **, int);
45int nilfs_btnode_get(struct address_space *, __u64, sector_t,
46 struct buffer_head **, int);
47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *);
50void nilfs_btnode_commit_change_key(struct address_space *,
51 struct nilfs_btnode_chkey_ctxt *);
52void nilfs_btnode_abort_change_key(struct address_space *,
53 struct nilfs_btnode_chkey_ctxt *);
54
55#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
56
57
58#endif /* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 000000000000..6b37a2767293
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
1/*
2 * btree.c - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/slab.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include <linux/pagevec.h>
27#include "nilfs.h"
28#include "page.h"
29#include "btnode.h"
30#include "btree.h"
31#include "alloc.h"
32
33/**
34 * struct nilfs_btree_path - A path on which B-tree operations are executed
35 * @bp_bh: buffer head of node block
36 * @bp_sib_bh: buffer head of sibling node block
37 * @bp_index: index of child node
38 * @bp_oldreq: ptr end request for old ptr
39 * @bp_newreq: ptr alloc request for new ptr
 * @bp_ctxt: change-key context used when a btnode buffer moves to a new key
40 * @bp_op: rebalance operation
41 */
42struct nilfs_btree_path {
43 struct buffer_head *bp_bh;
44 struct buffer_head *bp_sib_bh;
45 int bp_index;
46 union nilfs_bmap_ptr_req bp_oldreq;
47 union nilfs_bmap_ptr_req bp_newreq;
48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
49 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
50 int, __u64 *, __u64 *);
51};
52
53/*
54 * B-tree path operations
55 */
56
57static struct kmem_cache *nilfs_btree_path_cache;
58
59int __init nilfs_btree_path_cache_init(void)
60{
61 nilfs_btree_path_cache =
62 kmem_cache_create("nilfs2_btree_path_cache",
63 sizeof(struct nilfs_btree_path) *
64 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
65 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
66}
67
68void nilfs_btree_path_cache_destroy(void)
69{
70 kmem_cache_destroy(nilfs_btree_path_cache);
71}
72
73static inline struct nilfs_btree_path *
74nilfs_btree_alloc_path(const struct nilfs_btree *btree)
75{
76 return (struct nilfs_btree_path *)
77 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78}
79
80static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
81 struct nilfs_btree_path *path)
82{
83 kmem_cache_free(nilfs_btree_path_cache, path);
84}
85
86static void nilfs_btree_init_path(const struct nilfs_btree *btree,
87 struct nilfs_btree_path *path)
88{
89 int level;
90
91 for (level = NILFS_BTREE_LEVEL_DATA;
92 level < NILFS_BTREE_LEVEL_MAX;
93 level++) {
94 path[level].bp_bh = NULL;
95 path[level].bp_sib_bh = NULL;
96 path[level].bp_index = 0;
97 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
98 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
99 path[level].bp_op = NULL;
100 }
101}
102
103static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
104 struct nilfs_btree_path *path)
105{
106 int level;
107
108 for (level = NILFS_BTREE_LEVEL_DATA;
109 level < NILFS_BTREE_LEVEL_MAX;
110 level++) {
111 if (path[level].bp_bh != NULL) {
112 nilfs_bmap_put_block(&btree->bt_bmap,
113 path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124}
125
126
127/*
128 * B-tree node operations
129 */
130
131static inline int
132nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
133 const struct nilfs_btree_node *node)
134{
135 return node->bn_flags;
136}
137
138static inline void
139nilfs_btree_node_set_flags(struct nilfs_btree *btree,
140 struct nilfs_btree_node *node,
141 int flags)
142{
143 node->bn_flags = flags;
144}
145
146static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
147 const struct nilfs_btree_node *node)
148{
149 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
150}
151
152static inline int
153nilfs_btree_node_get_level(const struct nilfs_btree *btree,
154 const struct nilfs_btree_node *node)
155{
156 return node->bn_level;
157}
158
159static inline void
160nilfs_btree_node_set_level(struct nilfs_btree *btree,
161 struct nilfs_btree_node *node,
162 int level)
163{
164 node->bn_level = level;
165}
166
167static inline int
168nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
169 const struct nilfs_btree_node *node)
170{
171 return le16_to_cpu(node->bn_nchildren);
172}
173
174static inline void
175nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
176 struct nilfs_btree_node *node,
177 int nchildren)
178{
179 node->bn_nchildren = cpu_to_le16(nchildren);
180}
181
182static inline int
183nilfs_btree_node_size(const struct nilfs_btree *btree)
184{
185 return 1 << btree->bt_bmap.b_inode->i_blkbits;
186}
187
188static inline int
189nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
190 const struct nilfs_btree_node *node)
191{
192 return nilfs_btree_node_root(btree, node) ?
193 NILFS_BTREE_ROOT_NCHILDREN_MIN :
194 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
195}
196
197static inline int
198nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
199 const struct nilfs_btree_node *node)
200{
201 return nilfs_btree_node_root(btree, node) ?
202 NILFS_BTREE_ROOT_NCHILDREN_MAX :
203 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
204}
205
206static inline __le64 *
207nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
208 const struct nilfs_btree_node *node)
209{
210 return (__le64 *)((char *)(node + 1) +
211 (nilfs_btree_node_root(btree, node) ?
212 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
213}
214
215static inline __le64 *
216nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
217 const struct nilfs_btree_node *node)
218{
219 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
220 nilfs_btree_node_nchildren_max(btree, node));
221}
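/*
 * The two accessors above encode the node layout: a struct
 * nilfs_btree_node header, then (for non-root nodes only)
 * NILFS_BTREE_NODE_EXTRA_PAD_SIZE bytes of padding, then the key
 * array, then the pointer array.  Roughly:
 *
 *	+----------------------------+ <- (char *)(node + 1)
 *	| pad (non-root nodes only)  |
 *	+----------------------------+ <- dkeys
 *	| __le64 keys[nchildren_max] |
 *	+----------------------------+ <- dptrs = dkeys + nchildren_max
 *	| __le64 ptrs[nchildren_max] |
 *	+----------------------------+
 */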
222
223static inline __u64
224nilfs_btree_node_get_key(const struct nilfs_btree *btree,
225 const struct nilfs_btree_node *node, int index)
226{
227 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
228 index));
229}
230
231static inline void
232nilfs_btree_node_set_key(struct nilfs_btree *btree,
233 struct nilfs_btree_node *node, int index, __u64 key)
234{
235 *(nilfs_btree_node_dkeys(btree, node) + index) =
236 nilfs_bmap_key_to_dkey(key);
237}
238
239static inline __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
241 const struct nilfs_btree_node *node,
242 int index)
243{
244 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
245 index));
246}
247
248static inline void
249nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
250 struct nilfs_btree_node *node,
251 int index,
252 __u64 ptr)
253{
254 *(nilfs_btree_node_dptrs(btree, node) + index) =
255 nilfs_bmap_ptr_to_dptr(ptr);
256}
257
258static void nilfs_btree_node_init(struct nilfs_btree *btree,
259 struct nilfs_btree_node *node,
260 int flags, int level, int nchildren,
261 const __u64 *keys, const __u64 *ptrs)
262{
263 __le64 *dkeys;
264 __le64 *dptrs;
265 int i;
266
267 nilfs_btree_node_set_flags(btree, node, flags);
268 nilfs_btree_node_set_level(btree, node, level);
269 nilfs_btree_node_set_nchildren(btree, node, nchildren);
270
271 dkeys = nilfs_btree_node_dkeys(btree, node);
272 dptrs = nilfs_btree_node_dptrs(btree, node);
273 for (i = 0; i < nchildren; i++) {
274 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
275 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
276 }
277}
278
279/* Assume the buffer heads corresponding to left and right are locked. */
280static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
281 struct nilfs_btree_node *left,
282 struct nilfs_btree_node *right,
283 int n)
284{
285 __le64 *ldkeys, *rdkeys;
286 __le64 *ldptrs, *rdptrs;
287 int lnchildren, rnchildren;
288
289 ldkeys = nilfs_btree_node_dkeys(btree, left);
290 ldptrs = nilfs_btree_node_dptrs(btree, left);
291 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
292
293 rdkeys = nilfs_btree_node_dkeys(btree, right);
294 rdptrs = nilfs_btree_node_dptrs(btree, right);
295 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
296
297 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
298 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
299 memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
300 memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
301
302 lnchildren += n;
303 rnchildren -= n;
304 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
305 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
306}
307
308/* Assume that the buffer heads corresponding to left and right are locked. */
309static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
310 struct nilfs_btree_node *left,
311 struct nilfs_btree_node *right,
312 int n)
313{
314 __le64 *ldkeys, *rdkeys;
315 __le64 *ldptrs, *rdptrs;
316 int lnchildren, rnchildren;
317
318 ldkeys = nilfs_btree_node_dkeys(btree, left);
319 ldptrs = nilfs_btree_node_dptrs(btree, left);
320 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
321
322 rdkeys = nilfs_btree_node_dkeys(btree, right);
323 rdptrs = nilfs_btree_node_dptrs(btree, right);
324 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
325
326 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
327 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
328 memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
329 memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
330
331 lnchildren -= n;
332 rnchildren += n;
333 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
334 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
335}
336
337/* Assume that the buffer head corresponding to node is locked. */
338static void nilfs_btree_node_insert(struct nilfs_btree *btree,
339 struct nilfs_btree_node *node,
340 __u64 key, __u64 ptr, int index)
341{
342 __le64 *dkeys;
343 __le64 *dptrs;
344 int nchildren;
345
346 dkeys = nilfs_btree_node_dkeys(btree, node);
347 dptrs = nilfs_btree_node_dptrs(btree, node);
348 nchildren = nilfs_btree_node_get_nchildren(btree, node);
349 if (index < nchildren) {
350 memmove(dkeys + index + 1, dkeys + index,
351 (nchildren - index) * sizeof(*dkeys));
352 memmove(dptrs + index + 1, dptrs + index,
353 (nchildren - index) * sizeof(*dptrs));
354 }
355 dkeys[index] = nilfs_bmap_key_to_dkey(key);
356 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
357 nchildren++;
358 nilfs_btree_node_set_nchildren(btree, node, nchildren);
359}
360
361/* Assume that the buffer head corresponding to node is locked. */
362static void nilfs_btree_node_delete(struct nilfs_btree *btree,
363 struct nilfs_btree_node *node,
364 __u64 *keyp, __u64 *ptrp, int index)
365{
366 __u64 key;
367 __u64 ptr;
368 __le64 *dkeys;
369 __le64 *dptrs;
370 int nchildren;
371
372 dkeys = nilfs_btree_node_dkeys(btree, node);
373 dptrs = nilfs_btree_node_dptrs(btree, node);
374 key = nilfs_bmap_dkey_to_key(dkeys[index]);
375 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
376 nchildren = nilfs_btree_node_get_nchildren(btree, node);
377 if (keyp != NULL)
378 *keyp = key;
379 if (ptrp != NULL)
380 *ptrp = ptr;
381
382 if (index < nchildren - 1) {
383 memmove(dkeys + index, dkeys + index + 1,
384 (nchildren - index - 1) * sizeof(*dkeys));
385 memmove(dptrs + index, dptrs + index + 1,
386 (nchildren - index - 1) * sizeof(*dptrs));
387 }
388 nchildren--;
389 nilfs_btree_node_set_nchildren(btree, node, nchildren);
390}
391
392static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
393 const struct nilfs_btree_node *node,
394 __u64 key, int *indexp)
395{
396 __u64 nkey;
397 int index, low, high, s;
398
399 /* binary search */
400 low = 0;
401 high = nilfs_btree_node_get_nchildren(btree, node) - 1;
402 index = 0;
403 s = 0;
404 while (low <= high) {
405 index = (low + high) / 2;
406 nkey = nilfs_btree_node_get_key(btree, node, index);
407 if (nkey == key) {
408 s = 0;
409 goto out;
410 } else if (nkey < key) {
411 low = index + 1;
412 s = -1;
413 } else {
414 high = index - 1;
415 s = 1;
416 }
417 }
418
419 /* adjust index */
420 if (nilfs_btree_node_get_level(btree, node) >
421 NILFS_BTREE_LEVEL_NODE_MIN) {
422 if ((s > 0) && (index > 0))
423 index--;
424 } else if (s < 0)
425 index++;
426
427 out:
428 *indexp = index;
429
430 return s == 0;
431}
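/*
 * A worked example of the lookup above (values illustrative): for an
 * internal node with keys {10, 20, 30} and search key 25, the binary
 * search terminates with s > 0 at index 2 (key 30), and the adjustment
 * step backs up to index 1 so the descent follows the child covering
 * keys [20, 30).  For a leaf-level node the s < 0 case instead
 * advances the index to the insertion point, which is what
 * nilfs_btree_prepare_insert() relies on.  The return value is nonzero
 * only on an exact key match.
 */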
432
433static inline struct nilfs_btree_node *
434nilfs_btree_get_root(const struct nilfs_btree *btree)
435{
436 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
437}
438
439static inline struct nilfs_btree_node *
440nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
441 const struct nilfs_btree_path *path,
442 int level)
443{
444 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
445}
446
447static inline struct nilfs_btree_node *
448nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
449 const struct nilfs_btree_path *path,
450 int level)
451{
452 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
453}
454
455static inline int nilfs_btree_height(const struct nilfs_btree *btree)
456{
457 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
458 + 1;
459}
460
461static inline struct nilfs_btree_node *
462nilfs_btree_get_node(const struct nilfs_btree *btree,
463 const struct nilfs_btree_path *path,
464 int level)
465{
466 return (level == nilfs_btree_height(btree) - 1) ?
467 nilfs_btree_get_root(btree) :
468 nilfs_btree_get_nonroot_node(btree, path, level);
469}
470
471static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
472 struct nilfs_btree_path *path,
473 __u64 key, __u64 *ptrp, int minlevel)
474{
475 struct nilfs_btree_node *node;
476 __u64 ptr;
477 int level, index, found, ret;
478
479 node = nilfs_btree_get_root(btree);
480 level = nilfs_btree_node_get_level(btree, node);
481 if ((level < minlevel) ||
482 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
483 return -ENOENT;
484
485 found = nilfs_btree_node_lookup(btree, node, key, &index);
486 ptr = nilfs_btree_node_get_ptr(btree, node, index);
487 path[level].bp_bh = NULL;
488 path[level].bp_index = index;
489
490 for (level--; level >= minlevel; level--) {
491 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
492 &path[level].bp_bh);
493 if (ret < 0)
494 return ret;
495 node = nilfs_btree_get_nonroot_node(btree, path, level);
496 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
497 if (!found)
498 found = nilfs_btree_node_lookup(btree, node, key,
499 &index);
500 else
501 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(btree, node))
503 ptr = nilfs_btree_node_get_ptr(btree, node, index);
504 else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR;
508 }
509 path[level].bp_index = index;
510 }
511 if (!found)
512 return -ENOENT;
513
514 if (ptrp != NULL)
515 *ptrp = ptr;
516
517 return 0;
518}
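/*
 * The descent above records, for every level down to @minlevel, the
 * buffer and child index it passed through; insert and delete then
 * reuse this recorded path instead of searching again.  A ptr of
 * NILFS_BMAP_INVALID_PTR at the lowest level marks "key absent, index
 * is the insertion point", which is why nilfs_btree_insert() treats
 * -ENOENT from this lookup as the expected case.
 */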
519
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
521 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp)
523{
524 struct nilfs_btree_node *node;
525 __u64 ptr;
526 int index, level, ret;
527
528 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
530 if (index < 0)
531 return -ENOENT;
532 level = nilfs_btree_node_get_level(btree, node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index);
534 path[level].bp_bh = NULL;
535 path[level].bp_index = index;
536
537 for (level--; level > 0; level--) {
538 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
539 &path[level].bp_bh);
540 if (ret < 0)
541 return ret;
542 node = nilfs_btree_get_nonroot_node(btree, path, level);
543 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
544 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index);
546 path[level].bp_index = index;
547 }
548
549 if (keyp != NULL)
550 *keyp = nilfs_btree_node_get_key(btree, node, index);
551 if (ptrp != NULL)
552 *ptrp = ptr;
553
554 return 0;
555}
556
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
558 __u64 key, int level, __u64 *ptrp)
559{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret;
564
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(btree);
567 if (path == NULL)
568 return -ENOMEM;
569 nilfs_btree_init_path(btree, path);
570
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575
576 nilfs_btree_clear_path(btree, path);
577 nilfs_btree_free_path(btree, path);
578
579 return ret;
580}
581
582static void nilfs_btree_promote_key(struct nilfs_btree *btree,
583 struct nilfs_btree_path *path,
584 int level, __u64 key)
585{
586 if (level < nilfs_btree_height(btree) - 1) {
587 do {
588 lock_buffer(path[level].bp_bh);
589 nilfs_btree_node_set_key(
590 btree,
591 nilfs_btree_get_nonroot_node(
592 btree, path, level),
593 path[level].bp_index, key);
594 if (!buffer_dirty(path[level].bp_bh))
595 nilfs_btnode_mark_dirty(path[level].bp_bh);
596 unlock_buffer(path[level].bp_bh);
597 } while ((path[level].bp_index == 0) &&
598 (++level < nilfs_btree_height(btree) - 1));
599 }
600
601 /* root */
602 if (level == nilfs_btree_height(btree) - 1) {
603 nilfs_btree_node_set_key(btree,
604 nilfs_btree_get_root(btree),
605 path[level].bp_index, key);
606 }
607}
608
609static void nilfs_btree_do_insert(struct nilfs_btree *btree,
610 struct nilfs_btree_path *path,
611 int level, __u64 *keyp, __u64 *ptrp)
612{
613 struct nilfs_btree_node *node;
614
615 if (level < nilfs_btree_height(btree) - 1) {
616 lock_buffer(path[level].bp_bh);
617 node = nilfs_btree_get_nonroot_node(btree, path, level);
618 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
619 path[level].bp_index);
620 if (!buffer_dirty(path[level].bp_bh))
621 nilfs_btnode_mark_dirty(path[level].bp_bh);
622 unlock_buffer(path[level].bp_bh);
623
624 if (path[level].bp_index == 0)
625 nilfs_btree_promote_key(btree, path, level + 1,
626 nilfs_btree_node_get_key(
627 btree, node, 0));
628 } else {
629 node = nilfs_btree_get_root(btree);
630 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
631 path[level].bp_index);
632 }
633}
634
635static void nilfs_btree_carry_left(struct nilfs_btree *btree,
636 struct nilfs_btree_path *path,
637 int level, __u64 *keyp, __u64 *ptrp)
638{
639 struct nilfs_btree_node *node, *left;
640 int nchildren, lnchildren, n, move;
641
642 lock_buffer(path[level].bp_bh);
643 lock_buffer(path[level].bp_sib_bh);
644
645 node = nilfs_btree_get_nonroot_node(btree, path, level);
646 left = nilfs_btree_get_sib_node(btree, path, level);
647 nchildren = nilfs_btree_node_get_nchildren(btree, node);
648 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
649 move = 0;
650
651 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
652 if (n > path[level].bp_index) {
653 /* move insert point */
654 n--;
655 move = 1;
656 }
657
658 nilfs_btree_node_move_left(btree, left, node, n);
659
660 if (!buffer_dirty(path[level].bp_bh))
661 nilfs_btnode_mark_dirty(path[level].bp_bh);
662 if (!buffer_dirty(path[level].bp_sib_bh))
663 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
664
665 unlock_buffer(path[level].bp_bh);
666 unlock_buffer(path[level].bp_sib_bh);
667
668 nilfs_btree_promote_key(btree, path, level + 1,
669 nilfs_btree_node_get_key(btree, node, 0));
670
671 if (move) {
672 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
673 path[level].bp_bh = path[level].bp_sib_bh;
674 path[level].bp_sib_bh = NULL;
675 path[level].bp_index += lnchildren;
676 path[level + 1].bp_index--;
677 } else {
678 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
679 path[level].bp_sib_bh = NULL;
680 path[level].bp_index -= n;
681 }
682
683 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
684}
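/*
 * Worked arithmetic for the carry above (numbers illustrative): with
 * the node full at nchildren = 7, lnchildren = 3 and bp_index = 5,
 * n = (7 + 3 + 1) / 2 - 3 = 2 entries move left and the insert point
 * stays in the original node with bp_index = 5 - 2 = 3.  Had bp_index
 * been 1 (less than n), n would drop to 1 and the insert point would
 * move into the left sibling, bp_index becoming 1 + 3 = 4 (using the
 * lnchildren value read before the move).
 */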
685
686static void nilfs_btree_carry_right(struct nilfs_btree *btree,
687 struct nilfs_btree_path *path,
688 int level, __u64 *keyp, __u64 *ptrp)
689{
690 struct nilfs_btree_node *node, *right;
691 int nchildren, rnchildren, n, move;
692
693 lock_buffer(path[level].bp_bh);
694 lock_buffer(path[level].bp_sib_bh);
695
696 node = nilfs_btree_get_nonroot_node(btree, path, level);
697 right = nilfs_btree_get_sib_node(btree, path, level);
698 nchildren = nilfs_btree_node_get_nchildren(btree, node);
699 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
700 move = 0;
701
702 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
703 if (n > nchildren - path[level].bp_index) {
704 /* move insert point */
705 n--;
706 move = 1;
707 }
708
709 nilfs_btree_node_move_right(btree, node, right, n);
710
711 if (!buffer_dirty(path[level].bp_bh))
712 nilfs_btnode_mark_dirty(path[level].bp_bh);
713 if (!buffer_dirty(path[level].bp_sib_bh))
714 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
715
716 unlock_buffer(path[level].bp_bh);
717 unlock_buffer(path[level].bp_sib_bh);
718
719 path[level + 1].bp_index++;
720 nilfs_btree_promote_key(btree, path, level + 1,
721 nilfs_btree_node_get_key(btree, right, 0));
722 path[level + 1].bp_index--;
723
724 if (move) {
725 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
726 path[level].bp_bh = path[level].bp_sib_bh;
727 path[level].bp_sib_bh = NULL;
728 path[level].bp_index -=
729 nilfs_btree_node_get_nchildren(btree, node);
730 path[level + 1].bp_index++;
731 } else {
732 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
733 path[level].bp_sib_bh = NULL;
734 }
735
736 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
737}
738
739static void nilfs_btree_split(struct nilfs_btree *btree,
740 struct nilfs_btree_path *path,
741 int level, __u64 *keyp, __u64 *ptrp)
742{
743 struct nilfs_btree_node *node, *right;
744 __u64 newkey;
745 __u64 newptr;
746 int nchildren, n, move;
747
748 lock_buffer(path[level].bp_bh);
749 lock_buffer(path[level].bp_sib_bh);
750
751 node = nilfs_btree_get_nonroot_node(btree, path, level);
752 right = nilfs_btree_get_sib_node(btree, path, level);
753 nchildren = nilfs_btree_node_get_nchildren(btree, node);
754 move = 0;
755
756 n = (nchildren + 1) / 2;
757 if (n > nchildren - path[level].bp_index) {
758 n--;
759 move = 1;
760 }
761
762 nilfs_btree_node_move_right(btree, node, right, n);
763
764 if (!buffer_dirty(path[level].bp_bh))
765 nilfs_btnode_mark_dirty(path[level].bp_bh);
766 if (!buffer_dirty(path[level].bp_sib_bh))
767 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
768
769 unlock_buffer(path[level].bp_bh);
770 unlock_buffer(path[level].bp_sib_bh);
771
772 newkey = nilfs_btree_node_get_key(btree, right, 0);
773 newptr = path[level].bp_newreq.bpr_ptr;
774
775 if (move) {
776 path[level].bp_index -=
777 nilfs_btree_node_get_nchildren(btree, node);
778 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
779 path[level].bp_index);
780
781 *keyp = nilfs_btree_node_get_key(btree, right, 0);
782 *ptrp = path[level].bp_newreq.bpr_ptr;
783
784 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
785 path[level].bp_bh = path[level].bp_sib_bh;
786 path[level].bp_sib_bh = NULL;
787 } else {
788 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
789
790 *keyp = nilfs_btree_node_get_key(btree, right, 0);
791 *ptrp = path[level].bp_newreq.bpr_ptr;
792
793 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
794 path[level].bp_sib_bh = NULL;
795 }
796
797 path[level + 1].bp_index++;
798}
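/*
 * Note on the split above: on return, *keyp and *ptrp no longer refer
 * to the caller's original entry but to the first key of the new right
 * node and the block pointer allocated for it.  The loop in
 * nilfs_btree_commit_insert() then feeds exactly this pair to the
 * operation registered at level + 1, which is how the split propagates
 * upward.
 */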
799
800static void nilfs_btree_grow(struct nilfs_btree *btree,
801 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp)
803{
804 struct nilfs_btree_node *root, *child;
805 int n;
806
807 lock_buffer(path[level].bp_sib_bh);
808
809 root = nilfs_btree_get_root(btree);
810 child = nilfs_btree_get_sib_node(btree, path, level);
811
812 n = nilfs_btree_node_get_nchildren(btree, root);
813
814 nilfs_btree_node_move_right(btree, root, child, n);
815 nilfs_btree_node_set_level(btree, root, level + 1);
816
817 if (!buffer_dirty(path[level].bp_sib_bh))
818 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
819
820 unlock_buffer(path[level].bp_sib_bh);
821
822 path[level].bp_bh = path[level].bp_sib_bh;
823 path[level].bp_sib_bh = NULL;
824
825 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
826
827 *keyp = nilfs_btree_node_get_key(btree, child, 0);
828 *ptrp = path[level].bp_newreq.bpr_ptr;
829}
830
831static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
832 const struct nilfs_btree_path *path)
833{
834 struct nilfs_btree_node *node;
835 int level;
836
837 if (path == NULL)
838 return NILFS_BMAP_INVALID_PTR;
839
840 /* left sibling */
841 level = NILFS_BTREE_LEVEL_NODE_MIN;
842 if (path[level].bp_index > 0) {
843 node = nilfs_btree_get_node(btree, path, level);
844 return nilfs_btree_node_get_ptr(btree, node,
845 path[level].bp_index - 1);
846 }
847
848 /* parent */
849 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
850 if (level <= nilfs_btree_height(btree) - 1) {
851 node = nilfs_btree_get_node(btree, path, level);
852 return nilfs_btree_node_get_ptr(btree, node,
853 path[level].bp_index);
854 }
855
856 return NILFS_BMAP_INVALID_PTR;
857}
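/*
 * The hint above prefers, in order, the pointer just left of the
 * insertion point (so logically adjacent blocks land near each other
 * on disk) and then the parent node's own block.
 * nilfs_btree_find_target_v() below falls back to it only when no
 * sequential-access target is recorded in the bmap.
 */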
858
859static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
860 const struct nilfs_btree_path *path,
861 __u64 key)
862{
863 __u64 ptr;
864
865 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
866 if (ptr != NILFS_BMAP_INVALID_PTR)
867 /* sequential access */
868 return ptr;
869 else {
870 ptr = nilfs_btree_find_near(btree, path);
871 if (ptr != NILFS_BMAP_INVALID_PTR)
872 /* near */
873 return ptr;
874 }
875 /* block group */
876 return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
877}
878
879static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
880 __u64 ptr)
881{
882 btree->bt_bmap.b_last_allocated_key = key;
883 btree->bt_bmap.b_last_allocated_ptr = ptr;
884}
885
886static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
887 struct nilfs_btree_path *path,
888 int *levelp, __u64 key, __u64 ptr,
889 struct nilfs_bmap_stats *stats)
890{
891 struct buffer_head *bh;
892 struct nilfs_btree_node *node, *parent, *sib;
893 __u64 sibptr;
894 int pindex, level, ret;
895
896 stats->bs_nblocks = 0;
897 level = NILFS_BTREE_LEVEL_DATA;
898
899 /* allocate a new ptr for data block */
900 if (btree->bt_ops->btop_find_target != NULL)
901 path[level].bp_newreq.bpr_ptr =
902 btree->bt_ops->btop_find_target(btree, path, key);
903
904 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
905 &btree->bt_bmap, &path[level].bp_newreq);
906 if (ret < 0)
907 goto err_out_data;
908
909 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
910 level < nilfs_btree_height(btree) - 1;
911 level++) {
912 node = nilfs_btree_get_nonroot_node(btree, path, level);
913 if (nilfs_btree_node_get_nchildren(btree, node) <
914 nilfs_btree_node_nchildren_max(btree, node)) {
915 path[level].bp_op = nilfs_btree_do_insert;
916 stats->bs_nblocks++;
917 goto out;
918 }
919
920 parent = nilfs_btree_get_node(btree, path, level + 1);
921 pindex = path[level + 1].bp_index;
922
923 /* left sibling */
924 if (pindex > 0) {
925 sibptr = nilfs_btree_node_get_ptr(btree, parent,
926 pindex - 1);
927 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
928 &bh);
929 if (ret < 0)
930 goto err_out_child_node;
931 sib = (struct nilfs_btree_node *)bh->b_data;
932 if (nilfs_btree_node_get_nchildren(btree, sib) <
933 nilfs_btree_node_nchildren_max(btree, sib)) {
934 path[level].bp_sib_bh = bh;
935 path[level].bp_op = nilfs_btree_carry_left;
936 stats->bs_nblocks++;
937 goto out;
938 } else
939 nilfs_bmap_put_block(&btree->bt_bmap, bh);
940 }
941
942 /* right sibling */
943 if (pindex <
944 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
945 sibptr = nilfs_btree_node_get_ptr(btree, parent,
946 pindex + 1);
947 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
948 &bh);
949 if (ret < 0)
950 goto err_out_child_node;
951 sib = (struct nilfs_btree_node *)bh->b_data;
952 if (nilfs_btree_node_get_nchildren(btree, sib) <
953 nilfs_btree_node_nchildren_max(btree, sib)) {
954 path[level].bp_sib_bh = bh;
955 path[level].bp_op = nilfs_btree_carry_right;
956 stats->bs_nblocks++;
957 goto out;
958 } else
959 nilfs_bmap_put_block(&btree->bt_bmap, bh);
960 }
961
962 /* split */
963 path[level].bp_newreq.bpr_ptr =
964 path[level - 1].bp_newreq.bpr_ptr + 1;
965 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
966 &btree->bt_bmap, &path[level].bp_newreq);
967 if (ret < 0)
968 goto err_out_child_node;
969 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
970 path[level].bp_newreq.bpr_ptr,
971 &bh);
972 if (ret < 0)
973 goto err_out_curr_node;
974
975 stats->bs_nblocks++;
976
977 lock_buffer(bh);
978 nilfs_btree_node_init(btree,
979 (struct nilfs_btree_node *)bh->b_data,
980 0, level, 0, NULL, NULL);
981 unlock_buffer(bh);
982 path[level].bp_sib_bh = bh;
983 path[level].bp_op = nilfs_btree_split;
984 }
985
986 /* root */
987 node = nilfs_btree_get_root(btree);
988 if (nilfs_btree_node_get_nchildren(btree, node) <
989 nilfs_btree_node_nchildren_max(btree, node)) {
990 path[level].bp_op = nilfs_btree_do_insert;
991 stats->bs_nblocks++;
992 goto out;
993 }
994
995 /* grow */
996 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
997 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
998 &btree->bt_bmap, &path[level].bp_newreq);
999 if (ret < 0)
1000 goto err_out_child_node;
1001 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
1002 path[level].bp_newreq.bpr_ptr, &bh);
1003 if (ret < 0)
1004 goto err_out_curr_node;
1005
1006 lock_buffer(bh);
1007 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1008 0, level, 0, NULL, NULL);
1009 unlock_buffer(bh);
1010 path[level].bp_sib_bh = bh;
1011 path[level].bp_op = nilfs_btree_grow;
1012
1013 level++;
1014 path[level].bp_op = nilfs_btree_do_insert;
1015
1016 /* a newly-created node block and a data block are added */
1017 stats->bs_nblocks += 2;
1018
1019 /* success */
1020 out:
1021 *levelp = level;
1022 return ret;
1023
1024 /* error */
1025 err_out_curr_node:
1026 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1027 &path[level].bp_newreq);
1028 err_out_child_node:
1029 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1030 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1031 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
1032 &btree->bt_bmap, &path[level].bp_newreq);
1033
1034 }
1035
1036 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1037 &path[level].bp_newreq);
1038 err_out_data:
1039 *levelp = level;
1040 stats->bs_nblocks = 0;
1041 return ret;
1042}
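/*
 * Summary of the insert preparation above, chosen bottom-up per level:
 *   - node has a free slot   -> nilfs_btree_do_insert
 *   - left sibling has room  -> nilfs_btree_carry_left
 *   - right sibling has room -> nilfs_btree_carry_right
 *   - otherwise              -> allocate a block and nilfs_btree_split
 * and at the root: free slot -> do_insert, else allocate a block and
 * nilfs_btree_grow (tree height increases by one).  On failure, every
 * pointer allocation prepared so far is aborted in reverse order.
 */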
1043
1044static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1045 struct nilfs_btree_path *path,
1046 int maxlevel, __u64 key, __u64 ptr)
1047{
1048 int level;
1049
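	/* ptr must be a pointer to a buffer head */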
1050 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1051 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1052 if (btree->bt_ops->btop_set_target != NULL)
1053 btree->bt_ops->btop_set_target(btree, key, ptr);
1054
1055 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1056 if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
1057 btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
1058 &btree->bt_bmap, &path[level - 1].bp_newreq);
1059 }
1060 path[level].bp_op(btree, path, level, &key, &ptr);
1061 }
1062
1063 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1064 nilfs_bmap_set_dirty(&btree->bt_bmap);
1065}
1066
1067static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1068{
1069 struct nilfs_btree *btree;
1070 struct nilfs_btree_path *path;
1071 struct nilfs_bmap_stats stats;
1072 int level, ret;
1073
1074 btree = (struct nilfs_btree *)bmap;
1075 path = nilfs_btree_alloc_path(btree);
1076 if (path == NULL)
1077 return -ENOMEM;
1078 nilfs_btree_init_path(btree, path);
1079
1080 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1081 NILFS_BTREE_LEVEL_NODE_MIN);
1082 if (ret != -ENOENT) {
1083 if (ret == 0)
1084 ret = -EEXIST;
1085 goto out;
1086 }
1087
1088 ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
1089 if (ret < 0)
1090 goto out;
1091 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1092 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1093
1094 out:
1095 nilfs_btree_clear_path(btree, path);
1096 nilfs_btree_free_path(btree, path);
1097 return ret;
1098}
1099
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1101 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp)
1103{
1104 struct nilfs_btree_node *node;
1105
1106 if (level < nilfs_btree_height(btree) - 1) {
1107 lock_buffer(path[level].bp_bh);
1108 node = nilfs_btree_get_nonroot_node(btree, path, level);
1109 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1110 path[level].bp_index);
1111 if (!buffer_dirty(path[level].bp_bh))
1112 nilfs_btnode_mark_dirty(path[level].bp_bh);
1113 unlock_buffer(path[level].bp_bh);
1114 if (path[level].bp_index == 0)
1115 nilfs_btree_promote_key(btree, path, level + 1,
1116 nilfs_btree_node_get_key(btree, node, 0));
1117 } else {
1118 node = nilfs_btree_get_root(btree);
1119 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1120 path[level].bp_index);
1121 }
1122}
1123
1124static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1125 struct nilfs_btree_path *path,
1126 int level, __u64 *keyp, __u64 *ptrp)
1127{
1128 struct nilfs_btree_node *node, *left;
1129 int nchildren, lnchildren, n;
1130
1131 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1132
1133 lock_buffer(path[level].bp_bh);
1134 lock_buffer(path[level].bp_sib_bh);
1135
1136 node = nilfs_btree_get_nonroot_node(btree, path, level);
1137 left = nilfs_btree_get_sib_node(btree, path, level);
1138 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1139 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
1140
1141 n = (nchildren + lnchildren) / 2 - nchildren;
1142
1143 nilfs_btree_node_move_right(btree, left, node, n);
1144
1145 if (!buffer_dirty(path[level].bp_bh))
1146 nilfs_btnode_mark_dirty(path[level].bp_bh);
1147 if (!buffer_dirty(path[level].bp_sib_bh))
1148 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1149
1150 unlock_buffer(path[level].bp_bh);
1151 unlock_buffer(path[level].bp_sib_bh);
1152
1153 nilfs_btree_promote_key(btree, path, level + 1,
1154 nilfs_btree_node_get_key(btree, node, 0));
1155
1156 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1157 path[level].bp_sib_bh = NULL;
1158 path[level].bp_index += n;
1159}
1160
1161static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1162 struct nilfs_btree_path *path,
1163 int level, __u64 *keyp, __u64 *ptrp)
1164{
1165 struct nilfs_btree_node *node, *right;
1166 int nchildren, rnchildren, n;
1167
1168 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1169
1170 lock_buffer(path[level].bp_bh);
1171 lock_buffer(path[level].bp_sib_bh);
1172
1173 node = nilfs_btree_get_nonroot_node(btree, path, level);
1174 right = nilfs_btree_get_sib_node(btree, path, level);
1175 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1176 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
1177
1178 n = (nchildren + rnchildren) / 2 - nchildren;
1179
1180 nilfs_btree_node_move_left(btree, node, right, n);
1181
1182 if (!buffer_dirty(path[level].bp_bh))
1183 nilfs_btnode_mark_dirty(path[level].bp_bh);
1184 if (!buffer_dirty(path[level].bp_sib_bh))
1185 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1186
1187 unlock_buffer(path[level].bp_bh);
1188 unlock_buffer(path[level].bp_sib_bh);
1189
1190 path[level + 1].bp_index++;
1191 nilfs_btree_promote_key(btree, path, level + 1,
1192 nilfs_btree_node_get_key(btree, right, 0));
1193 path[level + 1].bp_index--;
1194
1195 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1196 path[level].bp_sib_bh = NULL;
1197}
1198
1199static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1200 struct nilfs_btree_path *path,
1201 int level, __u64 *keyp, __u64 *ptrp)
1202{
1203 struct nilfs_btree_node *node, *left;
1204 int n;
1205
1206 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1207
1208 lock_buffer(path[level].bp_bh);
1209 lock_buffer(path[level].bp_sib_bh);
1210
1211 node = nilfs_btree_get_nonroot_node(btree, path, level);
1212 left = nilfs_btree_get_sib_node(btree, path, level);
1213
1214 n = nilfs_btree_node_get_nchildren(btree, node);
1215
1216 nilfs_btree_node_move_left(btree, left, node, n);
1217
1218 if (!buffer_dirty(path[level].bp_sib_bh))
1219 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1220
1221 unlock_buffer(path[level].bp_bh);
1222 unlock_buffer(path[level].bp_sib_bh);
1223
1224 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1225 path[level].bp_bh = path[level].bp_sib_bh;
1226 path[level].bp_sib_bh = NULL;
1227 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
1228}
1229
1230static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 struct nilfs_btree_path *path,
1232 int level, __u64 *keyp, __u64 *ptrp)
1233{
1234 struct nilfs_btree_node *node, *right;
1235 int n;
1236
1237 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1238
1239 lock_buffer(path[level].bp_bh);
1240 lock_buffer(path[level].bp_sib_bh);
1241
1242 node = nilfs_btree_get_nonroot_node(btree, path, level);
1243 right = nilfs_btree_get_sib_node(btree, path, level);
1244
1245 n = nilfs_btree_node_get_nchildren(btree, right);
1246
1247 nilfs_btree_node_move_left(btree, node, right, n);
1248
1249 if (!buffer_dirty(path[level].bp_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_bh);
1251
1252 unlock_buffer(path[level].bp_bh);
1253 unlock_buffer(path[level].bp_sib_bh);
1254
1255 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1256 path[level].bp_sib_bh = NULL;
1257 path[level + 1].bp_index++;
1258}
1259
1260static void nilfs_btree_shrink(struct nilfs_btree *btree,
1261 struct nilfs_btree_path *path,
1262 int level, __u64 *keyp, __u64 *ptrp)
1263{
1264 struct nilfs_btree_node *root, *child;
1265 int n;
1266
1267 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1268
1269 lock_buffer(path[level].bp_bh);
1270 root = nilfs_btree_get_root(btree);
1271 child = nilfs_btree_get_nonroot_node(btree, path, level);
1272
1273 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1274 nilfs_btree_node_set_level(btree, root, level);
1275 n = nilfs_btree_node_get_nchildren(btree, child);
1276 nilfs_btree_node_move_left(btree, root, child, n);
1277 unlock_buffer(path[level].bp_bh);
1278
1279 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1280 path[level].bp_bh = NULL;
1281}
1282
1283
1284static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1285 struct nilfs_btree_path *path,
1286 int *levelp,
1287 struct nilfs_bmap_stats *stats)
1288{
1289 struct buffer_head *bh;
1290 struct nilfs_btree_node *node, *parent, *sib;
1291 __u64 sibptr;
1292 int pindex, level, ret;
1293
1294 ret = 0;
1295 stats->bs_nblocks = 0;
1296 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1297 level < nilfs_btree_height(btree) - 1;
1298 level++) {
1299 node = nilfs_btree_get_nonroot_node(btree, path, level);
1300 path[level].bp_oldreq.bpr_ptr =
1301 nilfs_btree_node_get_ptr(btree, node,
1302 path[level].bp_index);
1303 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1304 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1305 &btree->bt_bmap, &path[level].bp_oldreq);
1306 if (ret < 0)
1307 goto err_out_child_node;
1308 }
1309
1310 if (nilfs_btree_node_get_nchildren(btree, node) >
1311 nilfs_btree_node_nchildren_min(btree, node)) {
1312 path[level].bp_op = nilfs_btree_do_delete;
1313 stats->bs_nblocks++;
1314 goto out;
1315 }
1316
1317 parent = nilfs_btree_get_node(btree, path, level + 1);
1318 pindex = path[level + 1].bp_index;
1319
1320 if (pindex > 0) {
1321 /* left sibling */
1322 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1323 pindex - 1);
1324 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1325 &bh);
1326 if (ret < 0)
1327 goto err_out_curr_node;
1328 sib = (struct nilfs_btree_node *)bh->b_data;
1329 if (nilfs_btree_node_get_nchildren(btree, sib) >
1330 nilfs_btree_node_nchildren_min(btree, sib)) {
1331 path[level].bp_sib_bh = bh;
1332 path[level].bp_op = nilfs_btree_borrow_left;
1333 stats->bs_nblocks++;
1334 goto out;
1335 } else {
1336 path[level].bp_sib_bh = bh;
1337 path[level].bp_op = nilfs_btree_concat_left;
1338 stats->bs_nblocks++;
1339 /* continue; */
1340 }
1341 } else if (pindex <
1342 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
1343 /* right sibling */
1344 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1345 pindex + 1);
1346 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1347 &bh);
1348 if (ret < 0)
1349 goto err_out_curr_node;
1350 sib = (struct nilfs_btree_node *)bh->b_data;
1351 if (nilfs_btree_node_get_nchildren(btree, sib) >
1352 nilfs_btree_node_nchildren_min(btree, sib)) {
1353 path[level].bp_sib_bh = bh;
1354 path[level].bp_op = nilfs_btree_borrow_right;
1355 stats->bs_nblocks++;
1356 goto out;
1357 } else {
1358 path[level].bp_sib_bh = bh;
1359 path[level].bp_op = nilfs_btree_concat_right;
1360 stats->bs_nblocks++;
1361 /* continue; */
1362 }
1363 } else {
1364 /* no siblings */
1365 /* the only child of the root node */
1366 WARN_ON(level != nilfs_btree_height(btree) - 2);
1367 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
1368 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1369 path[level].bp_op = nilfs_btree_shrink;
1370 stats->bs_nblocks += 2;
1371 } else {
1372 path[level].bp_op = nilfs_btree_do_delete;
1373 stats->bs_nblocks++;
1374 }
1375
1376 goto out;
1377
1378 }
1379 }
1380
1381 node = nilfs_btree_get_root(btree);
1382 path[level].bp_oldreq.bpr_ptr =
1383 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1384 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1385 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1386 &btree->bt_bmap, &path[level].bp_oldreq);
1387 if (ret < 0)
1388 goto err_out_child_node;
1389 }
1390 /* child of the root node is deleted */
1391 path[level].bp_op = nilfs_btree_do_delete;
1392 stats->bs_nblocks++;
1393
1394 /* success */
1395 out:
1396 *levelp = level;
1397 return ret;
1398
1399 /* error */
1400 err_out_curr_node:
1401 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1402 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1403 &btree->bt_bmap, &path[level].bp_oldreq);
1404 err_out_child_node:
1405 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1406 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1407 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1408 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1409 &btree->bt_bmap, &path[level].bp_oldreq);
1410 }
1411 *levelp = level;
1412 stats->bs_nblocks = 0;
1413 return ret;
1414}
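/*
 * The delete preparation mirrors the insert case, again bottom-up:
 *   - node stays above the minimum -> nilfs_btree_do_delete
 *   - left sibling can spare one   -> nilfs_btree_borrow_left
 *   - right sibling can spare one  -> nilfs_btree_borrow_right
 *   - the sibling is also minimal  -> concat (merge) with it
 * and when the root's only child would fit into the root after the
 * delete, nilfs_btree_shrink collapses one level of the tree.
 */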
1415
1416static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1417 struct nilfs_btree_path *path,
1418 int maxlevel)
1419{
1420 int level;
1421
1422 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1423 if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
1424 btree->bt_bmap.b_pops->bpop_commit_end_ptr(
1425 &btree->bt_bmap, &path[level].bp_oldreq);
1426 path[level].bp_op(btree, path, level, NULL, NULL);
1427 }
1428
1429 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1430 nilfs_bmap_set_dirty(&btree->bt_bmap);
1431}
1432
1433static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1435{
1436 struct nilfs_btree *btree;
1437 struct nilfs_btree_path *path;
1438 struct nilfs_bmap_stats stats;
1439 int level, ret;
1440
1441 btree = (struct nilfs_btree *)bmap;
1442 path = nilfs_btree_alloc_path(btree);
1443 if (path == NULL)
1444 return -ENOMEM;
1445 nilfs_btree_init_path(btree, path);
1446 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1447 NILFS_BTREE_LEVEL_NODE_MIN);
1448 if (ret < 0)
1449 goto out;
1450
1451 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
1452 if (ret < 0)
1453 goto out;
1454 nilfs_btree_commit_delete(btree, path, level);
1455 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1456
1457out:
1458 nilfs_btree_clear_path(btree, path);
1459 nilfs_btree_free_path(btree, path);
1460 return ret;
1461}
1462
1463static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1464{
1465 struct nilfs_btree *btree;
1466 struct nilfs_btree_path *path;
1467 int ret;
1468
1469 btree = (struct nilfs_btree *)bmap;
1470 path = nilfs_btree_alloc_path(btree);
1471 if (path == NULL)
1472 return -ENOMEM;
1473 nilfs_btree_init_path(btree, path);
1474
1475 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1476
1477 nilfs_btree_clear_path(btree, path);
1478 nilfs_btree_free_path(btree, path);
1479
1480 return ret;
1481}
1482
1483static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1484{
1485 struct buffer_head *bh;
1486 struct nilfs_btree *btree;
1487 struct nilfs_btree_node *root, *node;
1488 __u64 maxkey, nextmaxkey;
1489 __u64 ptr;
1490 int nchildren, ret;
1491
1492 btree = (struct nilfs_btree *)bmap;
1493 root = nilfs_btree_get_root(btree);
1494 switch (nilfs_btree_height(btree)) {
1495 case 2:
1496 bh = NULL;
1497 node = root;
1498 break;
1499 case 3:
1500 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1501 if (nchildren > 1)
1502 return 0;
1503 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1504 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1505 if (ret < 0)
1506 return ret;
1507 node = (struct nilfs_btree_node *)bh->b_data;
1508 break;
1509 default:
1510 return 0;
1511 }
1512
1513 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1514 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
1515 nextmaxkey = (nchildren > 1) ?
1516 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
1517 if (bh != NULL)
1518 nilfs_bmap_put_block(bmap, bh);
1519
1520 return (maxkey == key) && (nextmaxkey < bmap->b_low);
1521}
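/*
 * nilfs_btree_check_delete() above answers a narrow question: after
 * deleting @key, would every remaining key fall below bmap->b_low, so
 * that the B-tree could be turned back into a direct mapping?  It only
 * bothers with trees of height 2 or 3 that have a single leaf;
 * anything larger trivially answers no.
 */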
1522
1523static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1524 __u64 *keys, __u64 *ptrs, int nitems)
1525{
1526 struct buffer_head *bh;
1527 struct nilfs_btree *btree;
1528 struct nilfs_btree_node *node, *root;
1529 __le64 *dkeys;
1530 __le64 *dptrs;
1531 __u64 ptr;
1532 int nchildren, i, ret;
1533
1534 btree = (struct nilfs_btree *)bmap;
1535 root = nilfs_btree_get_root(btree);
1536 switch (nilfs_btree_height(btree)) {
1537 case 2:
1538 bh = NULL;
1539 node = root;
1540 break;
1541 case 3:
1542 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1543 WARN_ON(nchildren > 1);
1544 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1545 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1546 if (ret < 0)
1547 return ret;
1548 node = (struct nilfs_btree_node *)bh->b_data;
1549 break;
1550 default:
1551 node = NULL;
1552 return -EINVAL;
1553 }
1554
1555 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1556 if (nchildren < nitems)
1557 nitems = nchildren;
1558 dkeys = nilfs_btree_node_dkeys(btree, node);
1559 dptrs = nilfs_btree_node_dptrs(btree, node);
1560 for (i = 0; i < nitems; i++) {
1561 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1562 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
1563 }
1564
1565 if (bh != NULL)
1566 nilfs_bmap_put_block(bmap, bh);
1567
1568 return nitems;
1569}
1570
1571static int
1572nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1573 union nilfs_bmap_ptr_req *dreq,
1574 union nilfs_bmap_ptr_req *nreq,
1575 struct buffer_head **bhp,
1576 struct nilfs_bmap_stats *stats)
1577{
1578 struct buffer_head *bh;
1579 struct nilfs_btree *btree;
1580 int ret;
1581
1582 btree = (struct nilfs_btree *)bmap;
1583 stats->bs_nblocks = 0;
1584
1585 /* for data */
1586 /* cannot find near ptr */
1587 if (btree->bt_ops->btop_find_target != NULL)
1588 dreq->bpr_ptr
1589 = btree->bt_ops->btop_find_target(btree, NULL, key);
1590 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
1591 if (ret < 0)
1592 return ret;
1593
1594 *bhp = NULL;
1595 stats->bs_nblocks++;
1596 if (nreq != NULL) {
1597 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1598 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
1599 if (ret < 0)
1600 goto err_out_dreq;
1601
1602 ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
1603 if (ret < 0)
1604 goto err_out_nreq;
1605
1606 *bhp = bh;
1607 stats->bs_nblocks++;
1608 }
1609
1610 /* success */
1611 return 0;
1612
1613 /* error */
1614 err_out_nreq:
1615 bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
1616 err_out_dreq:
1617 bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
1618 stats->bs_nblocks = 0;
1619 return ret;
1620
1621}
1622
1623static void
1624nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1625 __u64 key, __u64 ptr,
1626 const __u64 *keys, const __u64 *ptrs,
1627 int n, __u64 low, __u64 high,
1628 union nilfs_bmap_ptr_req *dreq,
1629 union nilfs_bmap_ptr_req *nreq,
1630 struct buffer_head *bh)
1631{
1632 struct nilfs_btree *btree;
1633 struct nilfs_btree_node *node;
1634 __u64 tmpptr;
1635
1636 /* free resources */
1637 if (bmap->b_ops->bop_clear != NULL)
1638 bmap->b_ops->bop_clear(bmap);
1639
1640 /* ptr must be a pointer to a buffer head. */
1641 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1642
1643 /* convert and insert */
1644 btree = (struct nilfs_btree *)bmap;
1645 nilfs_btree_init(bmap, low, high);
1646 if (nreq != NULL) {
1647 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
1648 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1649 bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
1650 }
1651
1652 /* create child node at level 1 */
1653 lock_buffer(bh);
1654 node = (struct nilfs_btree_node *)bh->b_data;
1655 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1656 nilfs_btree_node_insert(btree, node,
1657 key, dreq->bpr_ptr, n);
1658 if (!buffer_dirty(bh))
1659 nilfs_btnode_mark_dirty(bh);
1660 if (!nilfs_bmap_dirty(bmap))
1661 nilfs_bmap_set_dirty(bmap);
1662
1663 unlock_buffer(bh);
1664 nilfs_bmap_put_block(bmap, bh);
1665
1666 /* create root node at level 2 */
1667 node = nilfs_btree_get_root(btree);
1668 tmpptr = nreq->bpr_ptr;
1669 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1670 2, 1, &keys[0], &tmpptr);
1671 } else {
1672 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
1673 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1674
1675 /* create root node at level 1 */
1676 node = nilfs_btree_get_root(btree);
1677 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1678 1, n, keys, ptrs);
1679 nilfs_btree_node_insert(btree, node,
1680 key, dreq->bpr_ptr, n);
1681 if (!nilfs_bmap_dirty(bmap))
1682 nilfs_bmap_set_dirty(bmap);
1683 }
1684
1685 if (btree->bt_ops->btop_set_target != NULL)
1686 btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
1687}
1688
1689/**
1690 * nilfs_btree_convert_and_insert - convert the bmap to a B-tree and insert
1691 * @bmap: bmap to be converted
1692 * @key: key of the new entry to insert
1693 * @ptr: pointer to the new entry (a buffer head cast to __u64)
1694 * @keys: keys taken over from the old mapping
1695 * @ptrs: pointers taken over from the old mapping
1696 * @n: number of entries in @keys and @ptrs
1697 * @low: lower bound of the key range
1698 * @high: upper bound of the key range
1699 */
1700int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1701 __u64 key, __u64 ptr,
1702 const __u64 *keys, const __u64 *ptrs,
1703 int n, __u64 low, __u64 high)
1704{
1705 struct buffer_head *bh;
1706 union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1707 struct nilfs_bmap_stats stats;
1708 int ret;
1709
1710 if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1711 di = &dreq;
1712 ni = NULL;
1713 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1714 1 << bmap->b_inode->i_blkbits)) {
1715 di = &dreq;
1716 ni = &nreq;
1717 } else {
1718 di = NULL;
1719 ni = NULL;
1720 BUG();
1721 }
1722
1723 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
1724 &stats);
1725 if (ret < 0)
1726 return ret;
1727 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
1728 low, high, di, ni, bh);
1729 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1730 return 0;
1731}
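/*
 * Size logic above: with the n carried-over entries plus the one being
 * inserted, n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX means everything
 * fits into the root node itself and no child block is needed
 * (ni == NULL); otherwise a single child node block is allocated and
 * the root keeps one pointer to it.  Callers convert from mappings
 * holding only a handful of entries, so the BUG() branch should be
 * unreachable for any sane block size.
 */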
1732
1733static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1734 struct nilfs_btree_path *path,
1735 int level,
1736 struct buffer_head *bh)
1737{
1738 while ((++level < nilfs_btree_height(btree) - 1) &&
1739 !buffer_dirty(path[level].bp_bh))
1740 nilfs_btnode_mark_dirty(path[level].bp_bh);
1741
1742 return 0;
1743}
1744
1745static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1746 struct nilfs_btree_path *path,
1747 int level)
1748{
1749 struct nilfs_btree_node *parent;
1750 int ret;
1751
1752 parent = nilfs_btree_get_node(btree, path, level + 1);
1753 path[level].bp_oldreq.bpr_ptr =
1754 nilfs_btree_node_get_ptr(btree, parent,
1755 path[level + 1].bp_index);
1756 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1757 ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
1758 &path[level].bp_oldreq,
1759 &path[level].bp_newreq);
1760 if (ret < 0)
1761 return ret;
1762
1763 if (buffer_nilfs_node(path[level].bp_bh)) {
1764 path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1765 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1766 path[level].bp_ctxt.bh = path[level].bp_bh;
1767 ret = nilfs_btnode_prepare_change_key(
1768 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1769 &path[level].bp_ctxt);
1770 if (ret < 0) {
1771 nilfs_bmap_abort_update(&btree->bt_bmap,
1772 &path[level].bp_oldreq,
1773 &path[level].bp_newreq);
1774 return ret;
1775 }
1776 }
1777
1778 return 0;
1779}
1780
1781static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1782 struct nilfs_btree_path *path,
1783 int level)
1784{
1785 struct nilfs_btree_node *parent;
1786
1787 nilfs_bmap_commit_update(&btree->bt_bmap,
1788 &path[level].bp_oldreq,
1789 &path[level].bp_newreq);
1790
1791 if (buffer_nilfs_node(path[level].bp_bh)) {
1792 nilfs_btnode_commit_change_key(
1793 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1794 &path[level].bp_ctxt);
1795 path[level].bp_bh = path[level].bp_ctxt.bh;
1796 }
1797 set_buffer_nilfs_volatile(path[level].bp_bh);
1798
1799 parent = nilfs_btree_get_node(btree, path, level + 1);
1800 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
1801 path[level].bp_newreq.bpr_ptr);
1802}
1803
1804static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1805 struct nilfs_btree_path *path,
1806 int level)
1807{
1808 nilfs_bmap_abort_update(&btree->bt_bmap,
1809 &path[level].bp_oldreq,
1810 &path[level].bp_newreq);
1811 if (buffer_nilfs_node(path[level].bp_bh))
1812 nilfs_btnode_abort_change_key(
1813 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1814 &path[level].bp_ctxt);
1815}
1816
1817static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1818 struct nilfs_btree_path *path,
1819 int minlevel,
1820 int *maxlevelp)
1821{
1822 int level, ret;
1823
1824 level = minlevel;
1825 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1826 ret = nilfs_btree_prepare_update_v(btree, path, level);
1827 if (ret < 0)
1828 return ret;
1829 }
1830 while ((++level < nilfs_btree_height(btree) - 1) &&
1831 !buffer_dirty(path[level].bp_bh)) {
1832
1833 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1834 ret = nilfs_btree_prepare_update_v(btree, path, level);
1835 if (ret < 0)
1836 goto out;
1837 }
1838
1839 /* success */
1840 *maxlevelp = level - 1;
1841 return 0;
1842
1843 /* error */
1844 out:
1845 while (--level > minlevel)
1846 nilfs_btree_abort_update_v(btree, path, level);
1847 if (!buffer_nilfs_volatile(path[level].bp_bh))
1848 nilfs_btree_abort_update_v(btree, path, level);
1849 return ret;
1850}
1851
1852static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1853 struct nilfs_btree_path *path,
1854 int minlevel,
1855 int maxlevel,
1856 struct buffer_head *bh)
1857{
1858 int level;
1859
1860 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1861 nilfs_btree_commit_update_v(btree, path, minlevel);
1862
1863 for (level = minlevel + 1; level <= maxlevel; level++)
1864 nilfs_btree_commit_update_v(btree, path, level);
1865}
1866
1867static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1868 struct nilfs_btree_path *path,
1869 int level,
1870 struct buffer_head *bh)
1871{
1872 int maxlevel, ret;
1873 struct nilfs_btree_node *parent;
1874 __u64 ptr;
1875
1876 get_bh(bh);
1877 path[level].bp_bh = bh;
1878 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
1879 if (ret < 0)
1880 goto out;
1881
1882 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1883 parent = nilfs_btree_get_node(btree, path, level + 1);
1884 ptr = nilfs_btree_node_get_ptr(btree, parent,
1885 path[level + 1].bp_index);
1886 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
1887 if (ret < 0)
1888 goto out;
1889 }
1890
1891 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
1892
1893 out:
1894 brelse(path[level].bp_bh);
1895 path[level].bp_bh = NULL;
1896 return ret;
1897}
1898
1899static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 struct buffer_head *bh)
1901{
1902 struct nilfs_btree *btree;
1903 struct nilfs_btree_path *path;
1904 struct nilfs_btree_node *node;
1905 __u64 key;
1906 int level, ret;
1907
1908 WARN_ON(!buffer_dirty(bh));
1909
1910 btree = (struct nilfs_btree *)bmap;
1911 path = nilfs_btree_alloc_path(btree);
1912 if (path == NULL)
1913 return -ENOMEM;
1914 nilfs_btree_init_path(btree, path);
1915
1916 if (buffer_nilfs_node(bh)) {
1917 node = (struct nilfs_btree_node *)bh->b_data;
1918 key = nilfs_btree_node_get_key(btree, node, 0);
1919 level = nilfs_btree_node_get_level(btree, node);
1920 } else {
1921 key = nilfs_bmap_data_get_key(bmap, bh);
1922 level = NILFS_BTREE_LEVEL_DATA;
1923 }
1924
1925 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
1926 if (ret < 0) {
1927 if (unlikely(ret == -ENOENT))
1928 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
1929 __func__, (unsigned long long)key, level);
1930 goto out;
1931 }
1932
1933 ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
1934
1935 out:
1936 nilfs_btree_clear_path(btree, path);
1937 nilfs_btree_free_path(btree, path);
1938
1939 return ret;
1940}
1941
1942static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
1943 struct buffer_head *bh)
1944{
1945 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
1946}
1947
1948static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1949 struct list_head *lists,
1950 struct buffer_head *bh)
1951{
1952 struct list_head *head;
1953 struct buffer_head *cbh;
1954 struct nilfs_btree_node *node, *cnode;
1955 __u64 key, ckey;
1956 int level;
1957
1958 get_bh(bh);
1959 node = (struct nilfs_btree_node *)bh->b_data;
1960 key = nilfs_btree_node_get_key(btree, node, 0);
1961 level = nilfs_btree_node_get_level(btree, node);
1962 list_for_each(head, &lists[level]) {
1963 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1964 cnode = (struct nilfs_btree_node *)cbh->b_data;
1965 ckey = nilfs_btree_node_get_key(btree, cnode, 0);
1966 if (key < ckey)
1967 break;
1968 }
1969 list_add_tail(&bh->b_assoc_buffers, head);
1970}
1971
1972static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1973 struct list_head *listp)
1974{
1975 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1976 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1977 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1978 struct pagevec pvec;
1979 struct buffer_head *bh, *head;
1980 pgoff_t index = 0;
1981 int level, i;
1982
1983 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1984 level < NILFS_BTREE_LEVEL_MAX;
1985 level++)
1986 INIT_LIST_HEAD(&lists[level]);
1987
1988 pagevec_init(&pvec, 0);
1989
1990 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
1991 PAGEVEC_SIZE)) {
1992 for (i = 0; i < pagevec_count(&pvec); i++) {
1993 bh = head = page_buffers(pvec.pages[i]);
1994 do {
1995 if (buffer_dirty(bh))
1996 nilfs_btree_add_dirty_buffer(btree,
1997 lists, bh);
1998 } while ((bh = bh->b_this_page) != head);
1999 }
2000 pagevec_release(&pvec);
2001 cond_resched();
2002 }
2003
2004 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2005 level < NILFS_BTREE_LEVEL_MAX;
2006 level++)
2007 list_splice(&lists[level], listp->prev);
2008}
2009
2010static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2011 struct nilfs_btree_path *path,
2012 int level,
2013 struct buffer_head **bh,
2014 sector_t blocknr,
2015 union nilfs_binfo *binfo)
2016{
2017 struct nilfs_btree_node *parent;
2018 __u64 key;
2019 __u64 ptr;
2020 int ret;
2021
2022 parent = nilfs_btree_get_node(btree, path, level + 1);
2023 ptr = nilfs_btree_node_get_ptr(btree, parent,
2024 path[level + 1].bp_index);
2025 if (buffer_nilfs_node(*bh)) {
2026 path[level].bp_ctxt.oldkey = ptr;
2027 path[level].bp_ctxt.newkey = blocknr;
2028 path[level].bp_ctxt.bh = *bh;
2029 ret = nilfs_btnode_prepare_change_key(
2030 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2031 &path[level].bp_ctxt);
2032 if (ret < 0)
2033 return ret;
2034 nilfs_btnode_commit_change_key(
2035 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2036 &path[level].bp_ctxt);
2037 *bh = path[level].bp_ctxt.bh;
2038 }
2039
2040 nilfs_btree_node_set_ptr(btree, parent,
2041 path[level + 1].bp_index, blocknr);
2042
2043 key = nilfs_btree_node_get_key(btree, parent,
2044 path[level + 1].bp_index);
2045 /* on-disk format */
2046 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2047 binfo->bi_dat.bi_level = level;
2048
2049 return 0;
2050}
2051
2052static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2053 struct nilfs_btree_path *path,
2054 int level,
2055 struct buffer_head **bh,
2056 sector_t blocknr,
2057 union nilfs_binfo *binfo)
2058{
2059 struct nilfs_btree_node *parent;
2060 __u64 key;
2061 __u64 ptr;
2062 union nilfs_bmap_ptr_req req;
2063 int ret;
2064
2065 parent = nilfs_btree_get_node(btree, path, level + 1);
2066 ptr = nilfs_btree_node_get_ptr(btree, parent,
2067 path[level + 1].bp_index);
2068 req.bpr_ptr = ptr;
2069 ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
2070 &req);
2071 if (ret < 0)
2072 return ret;
2073 btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
2074 &req, blocknr);
2075
2076 key = nilfs_btree_node_get_key(btree, parent,
2077 path[level + 1].bp_index);
2078 /* on-disk format */
2079 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2080 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2081
2082 return 0;
2083}
2084
2085static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2086 struct buffer_head **bh,
2087 sector_t blocknr,
2088 union nilfs_binfo *binfo)
2089{
2090 struct nilfs_btree *btree;
2091 struct nilfs_btree_path *path;
2092 struct nilfs_btree_node *node;
2093 __u64 key;
2094 int level, ret;
2095
2096 btree = (struct nilfs_btree *)bmap;
2097 path = nilfs_btree_alloc_path(btree);
2098 if (path == NULL)
2099 return -ENOMEM;
2100 nilfs_btree_init_path(btree, path);
2101
2102 if (buffer_nilfs_node(*bh)) {
2103 node = (struct nilfs_btree_node *)(*bh)->b_data;
2104 key = nilfs_btree_node_get_key(btree, node, 0);
2105 level = nilfs_btree_node_get_level(btree, node);
2106 } else {
2107 key = nilfs_bmap_data_get_key(bmap, *bh);
2108 level = NILFS_BTREE_LEVEL_DATA;
2109 }
2110
2111 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
2112 if (ret < 0) {
2113 WARN_ON(ret == -ENOENT);
2114 goto out;
2115 }
2116
2117 ret = btree->bt_ops->btop_assign(btree, path, level, bh,
2118 blocknr, binfo);
2119
2120 out:
2121 nilfs_btree_clear_path(btree, path);
2122 nilfs_btree_free_path(btree, path);
2123
2124 return ret;
2125}
2126
2127static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2128 struct buffer_head **bh,
2129 sector_t blocknr,
2130 union nilfs_binfo *binfo)
2131{
2132 struct nilfs_btree *btree;
2133 struct nilfs_btree_node *node;
2134 __u64 key;
2135 int ret;
2136
2137 btree = (struct nilfs_btree *)bmap;
2138 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
2139 if (ret < 0)
2140 return ret;
2141
2142 if (buffer_nilfs_node(*bh)) {
2143 node = (struct nilfs_btree_node *)(*bh)->b_data;
2144 key = nilfs_btree_node_get_key(btree, node, 0);
2145 } else
2146 key = nilfs_bmap_data_get_key(bmap, *bh);
2147
2148 /* on-disk format */
2149 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2150 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2151
2152 return 0;
2153}
2154
2155static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2156{
2157 struct buffer_head *bh;
2158 struct nilfs_btree *btree;
2159 struct nilfs_btree_path *path;
2160 __u64 ptr;
2161 int ret;
2162
2163 btree = (struct nilfs_btree *)bmap;
2164 path = nilfs_btree_alloc_path(btree);
2165 if (path == NULL)
2166 return -ENOMEM;
2167 nilfs_btree_init_path(btree, path);
2168
2169 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2170 if (ret < 0) {
2171 WARN_ON(ret == -ENOENT);
2172 goto out;
2173 }
2174 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
2175 if (ret < 0) {
2176 WARN_ON(ret == -ENOENT);
2177 goto out;
2178 }
2179
2180 if (!buffer_dirty(bh))
2181 nilfs_btnode_mark_dirty(bh);
2182 nilfs_bmap_put_block(&btree->bt_bmap, bh);
2183 if (!nilfs_bmap_dirty(&btree->bt_bmap))
2184 nilfs_bmap_set_dirty(&btree->bt_bmap);
2185
2186 out:
2187 nilfs_btree_clear_path(btree, path);
2188 nilfs_btree_free_path(btree, path);
2189 return ret;
2190}
2191
2192static const struct nilfs_bmap_operations nilfs_btree_ops = {
2193 .bop_lookup = nilfs_btree_lookup,
2194 .bop_insert = nilfs_btree_insert,
2195 .bop_delete = nilfs_btree_delete,
2196 .bop_clear = NULL,
2197
2198 .bop_propagate = nilfs_btree_propagate,
2199
2200 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2201
2202 .bop_assign = nilfs_btree_assign,
2203 .bop_mark = nilfs_btree_mark,
2204
2205 .bop_last_key = nilfs_btree_last_key,
2206 .bop_check_insert = NULL,
2207 .bop_check_delete = nilfs_btree_check_delete,
2208 .bop_gather_data = nilfs_btree_gather_data,
2209};
2210
2211static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2212 .bop_lookup = NULL,
2213 .bop_insert = NULL,
2214 .bop_delete = NULL,
2215 .bop_clear = NULL,
2216
2217 .bop_propagate = nilfs_btree_propagate_gc,
2218
2219 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2220
2221 .bop_assign = nilfs_btree_assign_gc,
2222 .bop_mark = NULL,
2223
2224 .bop_last_key = NULL,
2225 .bop_check_insert = NULL,
2226 .bop_check_delete = NULL,
2227 .bop_gather_data = NULL,
2228};
2229
2230static const struct nilfs_btree_operations nilfs_btree_ops_v = {
2231 .btop_find_target = nilfs_btree_find_target_v,
2232 .btop_set_target = nilfs_btree_set_target_v,
2233 .btop_propagate = nilfs_btree_propagate_v,
2234 .btop_assign = nilfs_btree_assign_v,
2235};
2236
2237static const struct nilfs_btree_operations nilfs_btree_ops_p = {
2238 .btop_find_target = NULL,
2239 .btop_set_target = NULL,
2240 .btop_propagate = nilfs_btree_propagate_p,
2241 .btop_assign = nilfs_btree_assign_p,
2242};
2243
2244int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
2245{
2246 struct nilfs_btree *btree;
2247
2248 btree = (struct nilfs_btree *)bmap;
2249 bmap->b_ops = &nilfs_btree_ops;
2250 bmap->b_low = low;
2251 bmap->b_high = high;
2252 switch (bmap->b_inode->i_ino) {
2253 case NILFS_DAT_INO:
2254 btree->bt_ops = &nilfs_btree_ops_p;
2255 break;
2256 default:
2257 btree->bt_ops = &nilfs_btree_ops_v;
2258 break;
2259 }
2260
2261 return 0;
2262}
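/*
 * Commentary (not part of the patch): the switch above selects the "p"
 * (physical) operation table only for the DAT inode. The DAT itself maps
 * virtual block numbers to physical ones, so its own B-tree must store
 * physical block numbers directly, while every other file uses the "v"
 * (virtual) operations and is translated through the DAT. The same
 * selection written as a helper, for illustration only:
 */
#if 0
static const struct nilfs_btree_operations *
nilfs_btree_select_ops(const struct nilfs_bmap *bmap)
{
	return bmap->b_inode->i_ino == NILFS_DAT_INO ?
		&nilfs_btree_ops_p : &nilfs_btree_ops_v;
}
#endif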
2263
2264void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2265{
2266 bmap->b_low = NILFS_BMAP_LARGE_LOW;
2267 bmap->b_high = NILFS_BMAP_LARGE_HIGH;
2268 bmap->b_ops = &nilfs_btree_ops_gc;
2269}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 000000000000..4766deb52fb1
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
1/*
2 * btree.h - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BTREE_H
24#define _NILFS_BTREE_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/list.h>
29#include <linux/nilfs2_fs.h>
30#include "btnode.h"
31#include "bmap.h"
32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/**
37 * struct nilfs_btree_operations - B-tree operation table
38 */
39struct nilfs_btree_operations {
40 __u64 (*btop_find_target)(const struct nilfs_btree *,
41 const struct nilfs_btree_path *, __u64);
42 void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
43
44 struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
45
46 int (*btop_propagate)(struct nilfs_btree *,
47 struct nilfs_btree_path *,
48 int,
49 struct buffer_head *);
50 int (*btop_assign)(struct nilfs_btree *,
51 struct nilfs_btree_path *,
52 int,
53 struct buffer_head **,
54 sector_t,
55 union nilfs_binfo *);
56};
57
58/**
59 * struct nilfs_btree_node - B-tree node
60 * @bn_flags: flags
61 * @bn_level: level
62 * @bn_nchildren: number of children
63 * @bn_pad: padding
64 */
65struct nilfs_btree_node {
66 __u8 bn_flags;
67 __u8 bn_level;
68 __le16 bn_nchildren;
69 __le32 bn_pad;
70};
71
72/* flags */
73#define NILFS_BTREE_NODE_ROOT 0x01
74
75/* level */
76#define NILFS_BTREE_LEVEL_DATA 0
77#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
78#define NILFS_BTREE_LEVEL_MAX 14
79
80/**
81 * struct nilfs_btree - B-tree structure
82 * @bt_bmap: bmap base structure
83 * @bt_ops: B-tree operation table
84 */
85struct nilfs_btree {
86 struct nilfs_bmap bt_bmap;
87
88 /* B-tree-specific members */
89 const struct nilfs_btree_operations *bt_ops;
90};
91
92
93#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
94#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
95 ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
96 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
97#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0
98#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64))
99#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \
100 (((nodesize) - sizeof(struct nilfs_btree_node) - \
101 NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \
102 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
103#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \
104 ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
105#define NILFS_BTREE_KEY_MIN ((__u64)0)
106#define NILFS_BTREE_KEY_MAX (~(__u64)0)
107
108
109int nilfs_btree_path_cache_init(void);
110void nilfs_btree_path_cache_destroy(void);
111int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
112int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
113 const __u64 *, const __u64 *,
114 int, __u64, __u64);
115void nilfs_btree_init_gc(struct nilfs_bmap *);
116
117#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000000..e90b60dfced9
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,925 @@
1/*
2 * cpfile.c - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "cpfile.h"
31
32
33static inline unsigned long
34nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
35{
36 return NILFS_MDT(cpfile)->mi_entries_per_block;
37}
38
39/* block number from the beginning of the file */
40static unsigned long
41nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
42{
43 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
44 do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
45 return (unsigned long)tcno;
46}
47
48/* offset in block */
49static unsigned long
50nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
51{
52 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
53 return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
54}
55
56static unsigned long
57nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
58 __u64 curr,
59 __u64 max)
60{
61 return min_t(__u64,
62 nilfs_cpfile_checkpoints_per_block(cpfile) -
63 nilfs_cpfile_get_offset(cpfile, curr),
64 max - curr);
65}
66
67static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
68 __u64 cno)
69{
70 return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
71}
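/*
 * Worked example (assumed numbers, for illustration only): with
 * mi_first_entry_offset == 1 and 16 checkpoints per block, checkpoint
 * number cno == 32 gives tcno == 32 + 1 - 1 == 32, so the block offset is
 * 32 / 16 == 2 and the offset within that block is 32 % 16 == 0; such a
 * checkpoint sits at the start of the third checkpoint block and is
 * therefore not "in first".
 */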
72
73static unsigned int
74nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
75 struct buffer_head *bh,
76 void *kaddr,
77 unsigned int n)
78{
79 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
80 unsigned int count;
81
82 count = le32_to_cpu(cp->cp_checkpoints_count) + n;
83 cp->cp_checkpoints_count = cpu_to_le32(count);
84 return count;
85}
86
87static unsigned int
88nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
89 struct buffer_head *bh,
90 void *kaddr,
91 unsigned int n)
92{
93 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
94 unsigned int count;
95
96 WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
97 count = le32_to_cpu(cp->cp_checkpoints_count) - n;
98 cp->cp_checkpoints_count = cpu_to_le32(count);
99 return count;
100}
101
102static inline struct nilfs_cpfile_header *
103nilfs_cpfile_block_get_header(const struct inode *cpfile,
104 struct buffer_head *bh,
105 void *kaddr)
106{
107 return kaddr + bh_offset(bh);
108}
109
110static struct nilfs_checkpoint *
111nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
112 struct buffer_head *bh,
113 void *kaddr)
114{
115 return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
116 NILFS_MDT(cpfile)->mi_entry_size;
117}
118
119static void nilfs_cpfile_block_init(struct inode *cpfile,
120 struct buffer_head *bh,
121 void *kaddr)
122{
123 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
124 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
125 int n = nilfs_cpfile_checkpoints_per_block(cpfile);
126
127 while (n-- > 0) {
128 nilfs_checkpoint_set_invalid(cp);
129 cp = (void *)cp + cpsz;
130 }
131}
132
133static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
134 struct buffer_head **bhp)
135{
136 return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
137}
138
139static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
140 __u64 cno,
141 int create,
142 struct buffer_head **bhp)
143{
144 return nilfs_mdt_get_block(cpfile,
145 nilfs_cpfile_get_blkoff(cpfile, cno),
146 create, nilfs_cpfile_block_init, bhp);
147}
148
149static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
150 __u64 cno)
151{
152 return nilfs_mdt_delete_block(cpfile,
153 nilfs_cpfile_get_blkoff(cpfile, cno));
154}
155
156/**
157 * nilfs_cpfile_get_checkpoint - get a checkpoint
158 * @cpfile: inode of checkpoint file
159 * @cno: checkpoint number
160 * @create: create flag
161 * @cpp: pointer to a checkpoint
162 * @bhp: pointer to a buffer head
163 *
164 * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
165 * specified by @cno. A new checkpoint will be created if @cno is the current
166 * checkpoint number and @create is nonzero.
167 *
168 * Return Value: On success, 0 is returned, and the checkpoint and the
169 * buffer head of the buffer on which the checkpoint is located are stored in
170 * the place pointed by @cpp and @bhp, respectively. On error, one of the
171 * following negative error codes is returned.
172 *
173 * %-EIO - I/O error.
174 *
175 * %-ENOMEM - Insufficient amount of memory available.
176 *
177 * %-ENOENT - No such checkpoint.
178 *
179 * %-EINVAL - invalid checkpoint.
180 */
181int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
182 __u64 cno,
183 int create,
184 struct nilfs_checkpoint **cpp,
185 struct buffer_head **bhp)
186{
187 struct buffer_head *header_bh, *cp_bh;
188 struct nilfs_cpfile_header *header;
189 struct nilfs_checkpoint *cp;
190 void *kaddr;
191 int ret;
192
193 if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
194 (cno < nilfs_mdt_cno(cpfile) && create)))
195 return -EINVAL;
196
197 down_write(&NILFS_MDT(cpfile)->mi_sem);
198
199 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
200 if (ret < 0)
201 goto out_sem;
202 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
203 if (ret < 0)
204 goto out_header;
205 kaddr = kmap(cp_bh->b_page);
206 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
207 if (nilfs_checkpoint_invalid(cp)) {
208 if (!create) {
209 kunmap(cp_bh->b_page);
210 brelse(cp_bh);
211 ret = -ENOENT;
212 goto out_header;
213 }
214 /* a newly-created checkpoint */
215 nilfs_checkpoint_clear_invalid(cp);
216 if (!nilfs_cpfile_is_in_first(cpfile, cno))
217 nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
218 kaddr, 1);
219 nilfs_mdt_mark_buffer_dirty(cp_bh);
220
221 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
222 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
223 kaddr);
224 le64_add_cpu(&header->ch_ncheckpoints, 1);
225 kunmap_atomic(kaddr, KM_USER0);
226 nilfs_mdt_mark_buffer_dirty(header_bh);
227 nilfs_mdt_mark_dirty(cpfile);
228 }
229
230 if (cpp != NULL)
231 *cpp = cp;
232 *bhp = cp_bh;
233
234 out_header:
235 brelse(header_bh);
236
237 out_sem:
238 up_write(&NILFS_MDT(cpfile)->mi_sem);
239 return ret;
240}
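/*
 * Usage sketch (illustrative, not part of the patch): callers must pair
 * nilfs_cpfile_get_checkpoint() with nilfs_cpfile_put_checkpoint(), which
 * undoes the kmap() and releases the buffer head. "cpfile" and "cno" are
 * assumed valid here; @create is 0, so a missing checkpoint yields -ENOENT.
 */
#if 0
static int example_touch_checkpoint(struct inode *cpfile, __u64 cno)
{
	struct nilfs_checkpoint *cp;
	struct buffer_head *bh;
	int ret;

	ret = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
	if (ret < 0)
		return ret;	/* -EIO, -ENOMEM, -ENOENT or -EINVAL */
	/* ... inspect or update *cp here ... */
	nilfs_cpfile_put_checkpoint(cpfile, cno, bh);
	return 0;
}
#endif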
241
242/**
243 * nilfs_cpfile_put_checkpoint - put a checkpoint
244 * @cpfile: inode of checkpoint file
245 * @cno: checkpoint number
246 * @bh: buffer head
247 *
248 * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
249 * specified by @cno. @bh must be the buffer head which has been returned by
250 * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
251 */
252void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
253 struct buffer_head *bh)
254{
255 kunmap(bh->b_page);
256 brelse(bh);
257}
258
259/**
260 * nilfs_cpfile_delete_checkpoints - delete checkpoints
261 * @cpfile: inode of checkpoint file
262 * @start: start checkpoint number
263 * @end: end checkpoint number
264 *
265 * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
266 * the period from @start to @end, excluding @end itself. The checkpoints
267 * that have already been deleted are ignored.
268 *
269 * Return Value: On success, 0 is returned. On error, one of the following
270 * negative error codes is returned.
271 *
272 * %-EIO - I/O error.
273 *
274 * %-ENOMEM - Insufficient amount of memory available.
275 *
276 * %-EINVAL - invalid checkpoints.
277 */
278int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
279 __u64 start,
280 __u64 end)
281{
282 struct buffer_head *header_bh, *cp_bh;
283 struct nilfs_cpfile_header *header;
284 struct nilfs_checkpoint *cp;
285 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
286 __u64 cno;
287 void *kaddr;
288 unsigned long tnicps;
289 int ret, ncps, nicps, count, i;
290
291 if (unlikely(start == 0 || start > end)) {
292 printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
293 "[%llu, %llu)\n", __func__,
294 (unsigned long long)start, (unsigned long long)end);
295 return -EINVAL;
296 }
297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem);
303
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
305 if (ret < 0)
306 goto out_sem;
307 tnicps = 0;
308
309 for (cno = start; cno < end; cno += ncps) {
310 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
311 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
312 if (ret < 0) {
313 if (ret != -ENOENT)
314 goto out_sem;
315 /* skip hole */
316 ret = 0;
317 continue;
318 }
319
320 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
321 cp = nilfs_cpfile_block_get_checkpoint(
322 cpfile, cno, cp_bh, kaddr);
323 nicps = 0;
324 for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
325 WARN_ON(nilfs_checkpoint_snapshot(cp));
326 if (!nilfs_checkpoint_invalid(cp)) {
327 nilfs_checkpoint_set_invalid(cp);
328 nicps++;
329 }
330 }
331 if (nicps > 0) {
332 tnicps += nicps;
333 nilfs_mdt_mark_buffer_dirty(cp_bh);
334 nilfs_mdt_mark_dirty(cpfile);
335 if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
336 (count = nilfs_cpfile_block_sub_valid_checkpoints(
337 cpfile, cp_bh, kaddr, nicps)) == 0) {
338 /* make hole */
339 kunmap_atomic(kaddr, KM_USER0);
340 brelse(cp_bh);
341 ret = nilfs_cpfile_delete_checkpoint_block(
342 cpfile, cno);
343 if (ret == 0)
344 continue;
345 printk(KERN_ERR "%s: cannot delete block\n",
346 __func__);
347 goto out_sem;
348 }
349 }
350
351 kunmap_atomic(kaddr, KM_USER0);
352 brelse(cp_bh);
353 }
354
355 if (tnicps > 0) {
356 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
357 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
358 kaddr);
359 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
360 nilfs_mdt_mark_buffer_dirty(header_bh);
361 nilfs_mdt_mark_dirty(cpfile);
362 kunmap_atomic(kaddr, KM_USER0);
363 }
364 brelse(header_bh);
365
366 out_sem:
367 up_write(&NILFS_MDT(cpfile)->mi_sem);
368 return ret;
369}
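/*
 * Range semantics sketch (illustrative, not part of the patch): the
 * interval is half-open, so deleting checkpoints 10 through 14 inclusive
 * is spelled as below; @end itself survives, and checkpoints in the range
 * that were already deleted are silently skipped.
 */
#if 0
	err = nilfs_cpfile_delete_checkpoints(cpfile, 10, 15);
#endif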
370
371static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
372 struct nilfs_checkpoint *cp,
373 struct nilfs_cpinfo *ci)
374{
375 ci->ci_flags = le32_to_cpu(cp->cp_flags);
376 ci->ci_cno = le64_to_cpu(cp->cp_cno);
377 ci->ci_create = le64_to_cpu(cp->cp_create);
378 ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
379 ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
380 ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
381 ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
382}
383
384static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
385 struct nilfs_cpinfo *ci, size_t nci)
386{
387 struct nilfs_checkpoint *cp;
388 struct buffer_head *bh;
389 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
390 __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
391 void *kaddr;
392 int n, ret;
393 int ncps, i;
394
395 if (cno == 0)
396 return -ENOENT; /* checkpoint number 0 is invalid */
397 down_read(&NILFS_MDT(cpfile)->mi_sem);
398
399 for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
400 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
401 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
402 if (ret < 0) {
403 if (ret != -ENOENT)
404 goto out;
405 continue; /* skip hole */
406 }
407
408 kaddr = kmap_atomic(bh->b_page, KM_USER0);
409 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
410 for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
411 if (!nilfs_checkpoint_invalid(cp))
412 nilfs_cpfile_checkpoint_to_cpinfo(
413 cpfile, cp, &ci[n++]);
414 }
415 kunmap_atomic(kaddr, KM_USER0);
416 brelse(bh);
417 }
418
419 ret = n;
420 if (n > 0)
421 *cnop = ci[n - 1].ci_cno + 1;
422
423 out:
424 up_read(&NILFS_MDT(cpfile)->mi_sem);
425 return ret;
426}
427
428static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
429 struct nilfs_cpinfo *ci, size_t nci)
430{
431 struct buffer_head *bh;
432 struct nilfs_cpfile_header *header;
433 struct nilfs_checkpoint *cp;
434 __u64 curr = *cnop, next;
435 unsigned long curr_blkoff, next_blkoff;
436 void *kaddr;
437 int n = 0, ret;
438
439 down_read(&NILFS_MDT(cpfile)->mi_sem);
440
441 if (curr == 0) {
442 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
443 if (ret < 0)
444 goto out;
445 kaddr = kmap_atomic(bh->b_page, KM_USER0);
446 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
447 curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
448 kunmap_atomic(kaddr, KM_USER0);
449 brelse(bh);
450 if (curr == 0) {
451 ret = 0;
452 goto out;
453 }
454 } else if (unlikely(curr == ~(__u64)0)) {
455 ret = 0;
456 goto out;
457 }
458
459 curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
460 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
461 if (unlikely(ret < 0)) {
462 if (ret == -ENOENT)
463 ret = 0; /* No snapshots (started from a hole block) */
464 goto out;
465 }
466 kaddr = kmap_atomic(bh->b_page, KM_USER0);
467 while (n < nci) {
468 cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
469 curr = ~(__u64)0; /* Terminator */
470 if (unlikely(nilfs_checkpoint_invalid(cp) ||
471 !nilfs_checkpoint_snapshot(cp)))
472 break;
473 nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
474 next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
475 if (next == 0)
476 break; /* reach end of the snapshot list */
477
478 next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
479 if (curr_blkoff != next_blkoff) {
480 kunmap_atomic(kaddr, KM_USER0);
481 brelse(bh);
482 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
483 0, &bh);
484 if (unlikely(ret < 0)) {
485 WARN_ON(ret == -ENOENT);
486 goto out;
487 }
488 kaddr = kmap_atomic(bh->b_page, KM_USER0);
489 }
490 curr = next;
491 curr_blkoff = next_blkoff;
492 }
493 kunmap_atomic(kaddr, KM_USER0);
494 brelse(bh);
495 *cnop = curr;
496 ret = n;
497
498 out:
499 up_read(&NILFS_MDT(cpfile)->mi_sem);
500 return ret;
501}
502
503/**
504 * nilfs_cpfile_get_cpinfo - get information on checkpoints
505 * @cpfile: inode of checkpoint file
506 * @cnop: place to store the next checkpoint number; used as a cursor
507 * @mode: mode of checkpoints to look up, NILFS_CHECKPOINT or NILFS_SNAPSHOT
508 * @ci: array of checkpoint info structures to fill in
509 * @nci: maximum number of entries that fit in @ci
510 */
511ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
512 struct nilfs_cpinfo *ci, size_t nci)
513{
514 switch (mode) {
515 case NILFS_CHECKPOINT:
516 return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
517 case NILFS_SNAPSHOT:
518 return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
519 default:
520 return -EINVAL;
521 }
522}
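/*
 * Iteration sketch (illustrative, not part of the patch): *cnop is a
 * cursor that each call advances, so scanning all checkpoints is a simple
 * loop that ends when the call returns 0 (or a negative error). The
 * buffer size 16 and the helper name are arbitrary choices.
 */
#if 0
static int example_scan_checkpoints(struct inode *cpfile)
{
	struct nilfs_cpinfo ci[16];
	__u64 cno = 1;	/* checkpoint number 0 is invalid */
	ssize_t n;

	while ((n = nilfs_cpfile_get_cpinfo(cpfile, &cno, NILFS_CHECKPOINT,
					    ci, 16)) > 0) {
		/* ... consume ci[0..n-1] here ... */
	}
	return n < 0 ? n : 0;
}
#endif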
523
524/**
525 * nilfs_cpfile_delete_checkpoint - delete a single checkpoint
526 * @cpfile: inode of checkpoint file
527 * @cno: checkpoint number to be deleted
528 */
529int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
530{
531 struct nilfs_cpinfo ci;
532 __u64 tcno = cno;
533 ssize_t nci;
534 int ret;
535
536 nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
537 if (nci < 0)
538 return nci;
539 else if (nci == 0 || ci.ci_cno != cno)
540 return -ENOENT;
541
542 /* cannot delete the latest checkpoint nor snapshots */
543 ret = nilfs_cpinfo_snapshot(&ci);
544 if (ret < 0)
545 return ret;
546 else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
547 return -EPERM;
548
549 return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
550}
551
552static struct nilfs_snapshot_list *
553nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
554 __u64 cno,
555 struct buffer_head *bh,
556 void *kaddr)
557{
558 struct nilfs_cpfile_header *header;
559 struct nilfs_checkpoint *cp;
560 struct nilfs_snapshot_list *list;
561
562 if (cno != 0) {
563 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
564 list = &cp->cp_snapshot_list;
565 } else {
566 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
567 list = &header->ch_snapshot_list;
568 }
569 return list;
570}
571
572static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
573{
574 struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
575 struct nilfs_cpfile_header *header;
576 struct nilfs_checkpoint *cp;
577 struct nilfs_snapshot_list *list;
578 __u64 curr, prev;
579 unsigned long curr_blkoff, prev_blkoff;
580 void *kaddr;
581 int ret;
582
583 if (cno == 0)
584 return -ENOENT; /* checkpoint number 0 is invalid */
585 down_write(&NILFS_MDT(cpfile)->mi_sem);
586
587 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
588 if (ret < 0)
589 goto out_sem;
590 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
591 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
592 if (nilfs_checkpoint_invalid(cp)) {
593 ret = -ENOENT;
594 kunmap_atomic(kaddr, KM_USER0);
595 goto out_cp;
596 }
597 if (nilfs_checkpoint_snapshot(cp)) {
598 ret = 0;
599 kunmap_atomic(kaddr, KM_USER0);
600 goto out_cp;
601 }
602 kunmap_atomic(kaddr, KM_USER0);
603
604 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
605 if (ret < 0)
606 goto out_cp;
607 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
608 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
609 list = &header->ch_snapshot_list;
610 curr_bh = header_bh;
611 get_bh(curr_bh);
612 curr = 0;
613 curr_blkoff = 0;
614 prev = le64_to_cpu(list->ssl_prev);
615 while (prev > cno) {
616 prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
617 curr = prev;
618 if (curr_blkoff != prev_blkoff) {
619 kunmap_atomic(kaddr, KM_USER0);
620 brelse(curr_bh);
621 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
622 0, &curr_bh);
623 if (ret < 0)
624 goto out_header;
625 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
626 }
627 curr_blkoff = prev_blkoff;
628 cp = nilfs_cpfile_block_get_checkpoint(
629 cpfile, curr, curr_bh, kaddr);
630 list = &cp->cp_snapshot_list;
631 prev = le64_to_cpu(list->ssl_prev);
632 }
633 kunmap_atomic(kaddr, KM_USER0);
634
635 if (prev != 0) {
636 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
637 &prev_bh);
638 if (ret < 0)
639 goto out_curr;
640 } else {
641 prev_bh = header_bh;
642 get_bh(prev_bh);
643 }
644
645 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
646 list = nilfs_cpfile_block_get_snapshot_list(
647 cpfile, curr, curr_bh, kaddr);
648 list->ssl_prev = cpu_to_le64(cno);
649 kunmap_atomic(kaddr, KM_USER0);
650
651 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
652 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
653 cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
654 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
655 nilfs_checkpoint_set_snapshot(cp);
656 kunmap_atomic(kaddr, KM_USER0);
657
658 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
659 list = nilfs_cpfile_block_get_snapshot_list(
660 cpfile, prev, prev_bh, kaddr);
661 list->ssl_next = cpu_to_le64(cno);
662 kunmap_atomic(kaddr, KM_USER0);
663
664 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
665 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
666 le64_add_cpu(&header->ch_nsnapshots, 1);
667 kunmap_atomic(kaddr, KM_USER0);
668
669 nilfs_mdt_mark_buffer_dirty(prev_bh);
670 nilfs_mdt_mark_buffer_dirty(curr_bh);
671 nilfs_mdt_mark_buffer_dirty(cp_bh);
672 nilfs_mdt_mark_buffer_dirty(header_bh);
673 nilfs_mdt_mark_dirty(cpfile);
674
675 brelse(prev_bh);
676
677 out_curr:
678 brelse(curr_bh);
679
680 out_header:
681 brelse(header_bh);
682
683 out_cp:
684 brelse(cp_bh);
685
686 out_sem:
687 up_write(&NILFS_MDT(cpfile)->mi_sem);
688 return ret;
689}
690
691static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
692{
693 struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
694 struct nilfs_cpfile_header *header;
695 struct nilfs_checkpoint *cp;
696 struct nilfs_snapshot_list *list;
697 __u64 next, prev;
698 void *kaddr;
699 int ret;
700
701 if (cno == 0)
702 return -ENOENT; /* checkpoint number 0 is invalid */
703 down_write(&NILFS_MDT(cpfile)->mi_sem);
704
705 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
706 if (ret < 0)
707 goto out_sem;
708 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
709 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
710 if (nilfs_checkpoint_invalid(cp)) {
711 ret = -ENOENT;
712 kunmap_atomic(kaddr, KM_USER0);
713 goto out_cp;
714 }
715 if (!nilfs_checkpoint_snapshot(cp)) {
716 ret = 0;
717 kunmap_atomic(kaddr, KM_USER0);
718 goto out_cp;
719 }
720
721 list = &cp->cp_snapshot_list;
722 next = le64_to_cpu(list->ssl_next);
723 prev = le64_to_cpu(list->ssl_prev);
724 kunmap_atomic(kaddr, KM_USER0);
725
726 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
727 if (ret < 0)
728 goto out_cp;
729 if (next != 0) {
730 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
731 &next_bh);
732 if (ret < 0)
733 goto out_header;
734 } else {
735 next_bh = header_bh;
736 get_bh(next_bh);
737 }
738 if (prev != 0) {
739 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
740 &prev_bh);
741 if (ret < 0)
742 goto out_next;
743 } else {
744 prev_bh = header_bh;
745 get_bh(prev_bh);
746 }
747
748 kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
749 list = nilfs_cpfile_block_get_snapshot_list(
750 cpfile, next, next_bh, kaddr);
751 list->ssl_prev = cpu_to_le64(prev);
752 kunmap_atomic(kaddr, KM_USER0);
753
754 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
755 list = nilfs_cpfile_block_get_snapshot_list(
756 cpfile, prev, prev_bh, kaddr);
757 list->ssl_next = cpu_to_le64(next);
758 kunmap_atomic(kaddr, KM_USER0);
759
760 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
761 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
762 cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
763 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
764 nilfs_checkpoint_clear_snapshot(cp);
765 kunmap_atomic(kaddr, KM_USER0);
766
767 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
768 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
769 le64_add_cpu(&header->ch_nsnapshots, -1);
770 kunmap_atomic(kaddr, KM_USER0);
771
772 nilfs_mdt_mark_buffer_dirty(next_bh);
773 nilfs_mdt_mark_buffer_dirty(prev_bh);
774 nilfs_mdt_mark_buffer_dirty(cp_bh);
775 nilfs_mdt_mark_buffer_dirty(header_bh);
776 nilfs_mdt_mark_dirty(cpfile);
777
778 brelse(prev_bh);
779
780 out_next:
781 brelse(next_bh);
782
783 out_header:
784 brelse(header_bh);
785
786 out_cp:
787 brelse(cp_bh);
788
789 out_sem:
790 up_write(&NILFS_MDT(cpfile)->mi_sem);
791 return ret;
792}
793
794/**
795 * nilfs_cpfile_is_snapshot - test whether a checkpoint is a snapshot
796 * @cpfile: inode of checkpoint file
797 * @cno: checkpoint number
798 *
799 * Description: nilfs_cpfile_is_snapshot() tests whether checkpoint @cno is a snapshot.
800 *
801 * Return Value: On success, 1 is returned if the checkpoint specified by
802 * @cno is a snapshot, or 0 if not. On error, one of the following negative
803 * error codes is returned.
804 *
805 * %-EIO - I/O error.
806 *
807 * %-ENOMEM - Insufficient amount of memory available.
808 *
809 * %-ENOENT - No such checkpoint.
810 */
811int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
812{
813 struct buffer_head *bh;
814 struct nilfs_checkpoint *cp;
815 void *kaddr;
816 int ret;
817
818 if (cno == 0)
819 return -ENOENT; /* checkpoint number 0 is invalid */
820 down_read(&NILFS_MDT(cpfile)->mi_sem);
821
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
823 if (ret < 0)
824 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh);
830
831 out:
832 up_read(&NILFS_MDT(cpfile)->mi_sem);
833 return ret;
834}
835
836/**
837 * nilfs_cpfile_change_cpmode - change checkpoint mode
838 * @cpfile: inode of checkpoint file
839 * @cno: checkpoint number
840 * @mode: new mode of checkpoint; NILFS_CHECKPOINT or NILFS_SNAPSHOT
841 *
842 * Description: nilfs_cpfile_change_cpmode() changes the mode of the
843 * checkpoint specified by @cno to @mode.
844 *
845 * Return Value: On success, 0 is returned. On error, one of the following
846 * negative error codes is returned.
847 *
848 * %-EIO - I/O error.
849 *
850 * %-ENOMEM - Insufficient amount of memory available.
851 *
852 * %-ENOENT - No such checkpoint.
853 */
854int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
855{
856 struct the_nilfs *nilfs;
857 int ret;
858
859 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
860
861 switch (mode) {
862 case NILFS_CHECKPOINT:
863 /*
864 * Check for protecting existing snapshot mounts:
865 * bd_mount_sem is used to make this operation atomic and
866 * exclusive with a new mount job. Though it doesn't cover
867 * umount, it's enough for the purpose.
868 */
869 down(&nilfs->ns_bdev->bd_mount_sem);
870 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
871 /* Current implementation does not have to protect
872 plain read-only mounts since they are exclusive
873 with a read/write mount and are protected from the
874 cleaner. */
875 ret = -EBUSY;
876 } else
877 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
878 up(&nilfs->ns_bdev->bd_mount_sem);
879 return ret;
880 case NILFS_SNAPSHOT:
881 return nilfs_cpfile_set_snapshot(cpfile, cno);
882 default:
883 return -EINVAL;
884 }
885}
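/*
 * Usage sketch (illustrative, not part of the patch): promoting a
 * checkpoint to a snapshot and demoting it again. Demotion fails with
 * -EBUSY while the snapshot is mounted, which is what the bd_mount_sem
 * exclusion above guards against. "err", "cpfile" and "cno" are assumed
 * to come from the caller.
 */
#if 0
	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_SNAPSHOT);
	/* ... */
	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_CHECKPOINT);
#endif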
886
887/**
888 * nilfs_cpfile_get_stat - get checkpoint statistics
889 * @cpfile: inode of checkpoint file
890 * @stat: pointer to a structure of checkpoint statistics
891 *
892 * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
893 *
894 * Return Value: On success, 0 is returned, and checkpoints information is
895 * stored in the place pointed by @stat. On error, one of the following
896 * negative error codes is returned.
897 *
898 * %-EIO - I/O error.
899 *
900 * %-ENOMEM - Insufficient amount of memory available.
901 */
902int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
903{
904 struct buffer_head *bh;
905 struct nilfs_cpfile_header *header;
906 void *kaddr;
907 int ret;
908
909 down_read(&NILFS_MDT(cpfile)->mi_sem);
910
911 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
912 if (ret < 0)
913 goto out_sem;
914 kaddr = kmap_atomic(bh->b_page, KM_USER0);
915 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
916 cpstat->cs_cno = nilfs_mdt_cno(cpfile);
917 cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
918 cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
919 kunmap_atomic(kaddr, KM_USER0);
920 brelse(bh);
921
922 out_sem:
923 up_read(&NILFS_MDT(cpfile)->mi_sem);
924 return ret;
925}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000000..1a8a1008c342
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
1/*
2 * cpfile.h - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_CPFILE_H
24#define _NILFS_CPFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **,
35 struct buffer_head **);
36void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
37int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
38int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
39int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
40int nilfs_cpfile_is_snapshot(struct inode *, __u64);
41int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
42ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
43 struct nilfs_cpinfo *, size_t);
44
45#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 000000000000..bb8a5818e7f1
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
1/*
2 * dat.c - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/types.h>
24#include <linux/buffer_head.h>
25#include <linux/string.h>
26#include <linux/errno.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "dat.h"
31
32
33#define NILFS_CNO_MIN ((__u64)1)
34#define NILFS_CNO_MAX (~(__u64)0)
35
36static int nilfs_dat_prepare_entry(struct inode *dat,
37 struct nilfs_palloc_req *req, int create)
38{
39 return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
40 create, &req->pr_entry_bh);
41}
42
43static void nilfs_dat_commit_entry(struct inode *dat,
44 struct nilfs_palloc_req *req)
45{
46 nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
47 nilfs_mdt_mark_dirty(dat);
48 brelse(req->pr_entry_bh);
49}
50
51static void nilfs_dat_abort_entry(struct inode *dat,
52 struct nilfs_palloc_req *req)
53{
54 brelse(req->pr_entry_bh);
55}
56
57int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
58{
59 int ret;
60
61 ret = nilfs_palloc_prepare_alloc_entry(dat, req);
62 if (ret < 0)
63 return ret;
64
65 ret = nilfs_dat_prepare_entry(dat, req, 1);
66 if (ret < 0)
67 nilfs_palloc_abort_alloc_entry(dat, req);
68
69 return ret;
70}
71
72void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
73{
74 struct nilfs_dat_entry *entry;
75 void *kaddr;
76
77 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
78 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
79 req->pr_entry_bh, kaddr);
80 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
81 entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
82 entry->de_blocknr = cpu_to_le64(0);
83 kunmap_atomic(kaddr, KM_USER0);
84
85 nilfs_palloc_commit_alloc_entry(dat, req);
86 nilfs_dat_commit_entry(dat, req);
87}
88
89void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
90{
91 nilfs_dat_abort_entry(dat, req);
92 nilfs_palloc_abort_alloc_entry(dat, req);
93}
94
95int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
96{
97 int ret;
98
99 ret = nilfs_palloc_prepare_free_entry(dat, req);
100 if (ret < 0)
101 return ret;
102 ret = nilfs_dat_prepare_entry(dat, req, 0);
103 if (ret < 0) {
104 nilfs_palloc_abort_free_entry(dat, req);
105 return ret;
106 }
107 return 0;
108}
109
110void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
111{
112 struct nilfs_dat_entry *entry;
113 void *kaddr;
114
115 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
116 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
117 req->pr_entry_bh, kaddr);
118 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
119 entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
120 entry->de_blocknr = cpu_to_le64(0);
121 kunmap_atomic(kaddr, KM_USER0);
122
123 nilfs_dat_commit_entry(dat, req);
124 nilfs_palloc_commit_free_entry(dat, req);
125}
126
127void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
128{
129 nilfs_dat_abort_entry(dat, req);
130 nilfs_palloc_abort_free_entry(dat, req);
131}
132
133int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
134{
135 int ret;
136
137 ret = nilfs_dat_prepare_entry(dat, req, 0);
138 WARN_ON(ret == -ENOENT);
139 return ret;
140}
141
142void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
143 sector_t blocknr)
144{
145 struct nilfs_dat_entry *entry;
146 void *kaddr;
147
148 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
149 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
150 req->pr_entry_bh, kaddr);
151 entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
152 if (entry->de_blocknr != cpu_to_le64(0) ||
153 entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
154 printk(KERN_CRIT
155 "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
156 __func__, (unsigned long long)req->pr_entry_nr,
157 (unsigned long long)le64_to_cpu(entry->de_start),
158 (unsigned long long)le64_to_cpu(entry->de_end),
159 (unsigned long long)le64_to_cpu(entry->de_blocknr));
160 }
161 entry->de_blocknr = cpu_to_le64(blocknr);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 nilfs_dat_commit_entry(dat, req);
165}
166
167void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
168{
169 nilfs_dat_abort_entry(dat, req);
170}
171
172int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
173{
174 struct nilfs_dat_entry *entry;
175 __u64 start;
176 sector_t blocknr;
177 void *kaddr;
178 int ret;
179
180 ret = nilfs_dat_prepare_entry(dat, req, 0);
181 if (ret < 0) {
182 WARN_ON(ret == -ENOENT);
183 return ret;
184 }
185
186 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
187 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
188 req->pr_entry_bh, kaddr);
189 start = le64_to_cpu(entry->de_start);
190 blocknr = le64_to_cpu(entry->de_blocknr);
191 kunmap_atomic(kaddr, KM_USER0);
192
193 if (blocknr == 0) {
194 ret = nilfs_palloc_prepare_free_entry(dat, req);
195 if (ret < 0) {
196 nilfs_dat_abort_entry(dat, req);
197 return ret;
198 }
199 }
200
201 return 0;
202}
203
204void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
205 int dead)
206{
207 struct nilfs_dat_entry *entry;
208 __u64 start, end;
209 sector_t blocknr;
210 void *kaddr;
211
212 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
213 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
214 req->pr_entry_bh, kaddr);
215 end = start = le64_to_cpu(entry->de_start);
216 if (!dead) {
217 end = nilfs_mdt_cno(dat);
218 WARN_ON(start > end);
219 }
220 entry->de_end = cpu_to_le64(end);
221 blocknr = le64_to_cpu(entry->de_blocknr);
222 kunmap_atomic(kaddr, KM_USER0);
223
224 if (blocknr == 0)
225 nilfs_dat_commit_free(dat, req);
226 else
227 nilfs_dat_commit_entry(dat, req);
228}
229
230void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
231{
232 struct nilfs_dat_entry *entry;
233 __u64 start;
234 sector_t blocknr;
235 void *kaddr;
236
237 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
238 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
239 req->pr_entry_bh, kaddr);
240 start = le64_to_cpu(entry->de_start);
241 blocknr = le64_to_cpu(entry->de_blocknr);
242 kunmap_atomic(kaddr, KM_USER0);
243
244 if (start == nilfs_mdt_cno(dat) && blocknr == 0)
245 nilfs_palloc_abort_free_entry(dat, req);
246 nilfs_dat_abort_entry(dat, req);
247}
248
249/**
250 * nilfs_dat_mark_dirty - mark the entry of a virtual block number dirty
251 * @dat: DAT file inode
252 * @vblocknr: virtual block number
253 *
254 * Description: nilfs_dat_mark_dirty() marks the DAT entry of @vblocknr dirty.
255 *
256 * Return Value: On success, 0 is returned. On error, one of the following
257 * negative error codes is returned.
258 *
259 * %-EIO - I/O error.
260 *
261 * %-ENOMEM - Insufficient amount of memory available.
262 */
263int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
264{
265 struct nilfs_palloc_req req;
266 int ret;
267
268 req.pr_entry_nr = vblocknr;
269 ret = nilfs_dat_prepare_entry(dat, &req, 0);
270 if (ret == 0)
271 nilfs_dat_commit_entry(dat, &req);
272 return ret;
273}
274
275/**
276 * nilfs_dat_freev - free virtual block numbers
277 * @dat: DAT file inode
278 * @vblocknrs: array of virtual block numbers
279 * @nitems: number of virtual block numbers
280 *
281 * Description: nilfs_dat_freev() frees the virtual block numbers specified by
282 * @vblocknrs and @nitems.
283 *
284 * Return Value: On success, 0 is returned. On error, one of the following
285 * negative error codes is returned.
286 *
287 * %-EIO - I/O error.
288 *
289 * %-ENOMEM - Insufficient amount of memory available.
290 *
291 * %-ENOENT - The virtual block numbers have not been allocated.
292 */
293int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
294{
295 return nilfs_palloc_freev(dat, vblocknrs, nitems);
296}
297
298/**
299 * nilfs_dat_move - change a block number
300 * @dat: DAT file inode
301 * @vblocknr: virtual block number
302 * @blocknr: block number
303 *
304 * Description: nilfs_dat_move() changes the block number associated with
305 * @vblocknr to @blocknr.
306 *
307 * Return Value: On success, 0 is returned. On error, one of the following
308 * negative error codes is returned.
309 *
310 * %-EIO - I/O error.
311 *
312 * %-ENOMEM - Insufficient amount of memory available.
313 */
314int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
315{
316 struct buffer_head *entry_bh;
317 struct nilfs_dat_entry *entry;
318 void *kaddr;
319 int ret;
320
321 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
322 if (ret < 0)
323 return ret;
324 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
325 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
326 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
327 printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
328 (unsigned long long)vblocknr,
329 (unsigned long long)le64_to_cpu(entry->de_start),
330 (unsigned long long)le64_to_cpu(entry->de_end));
331 kunmap_atomic(kaddr, KM_USER0);
332 brelse(entry_bh);
333 return -EINVAL;
334 }
335 WARN_ON(blocknr == 0);
336 entry->de_blocknr = cpu_to_le64(blocknr);
337 kunmap_atomic(kaddr, KM_USER0);
338
339 nilfs_mdt_mark_buffer_dirty(entry_bh);
340 nilfs_mdt_mark_dirty(dat);
341
342 brelse(entry_bh);
343
344 return 0;
345}
346
347/**
348 * nilfs_dat_translate - translate a virtual block number to a block number
349 * @dat: DAT file inode
350 * @vblocknr: virtual block number
351 * @blocknrp: pointer to a block number
352 *
353 * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
354 * to the corresponding block number.
355 *
356 * Return Value: On success, 0 is returned and the block number associated
357 * with @vblocknr is stored in the place pointed by @blocknrp. On error, one
358 * of the following negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 *
364 * %-ENOENT - A block number associated with @vblocknr does not exist.
365 */
366int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
367{
368 struct buffer_head *entry_bh;
369 struct nilfs_dat_entry *entry;
370 sector_t blocknr;
371 void *kaddr;
372 int ret;
373
374 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
375 if (ret < 0)
376 return ret;
377
378 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
379 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
380 blocknr = le64_to_cpu(entry->de_blocknr);
381 if (blocknr == 0) {
382 ret = -ENOENT;
383 goto out;
384 }
385 if (blocknrp != NULL)
386 *blocknrp = blocknr;
387
388 out:
389 kunmap_atomic(kaddr, KM_USER0);
390 brelse(entry_bh);
391 return ret;
392}
393
394ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
395 size_t nvi)
396{
397 struct buffer_head *entry_bh;
398 struct nilfs_dat_entry *entry;
399 __u64 first, last;
400 void *kaddr;
401 unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
402 int i, j, n, ret;
403
404 for (i = 0; i < nvi; i += n) {
405 ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
406 0, &entry_bh);
407 if (ret < 0)
408 return ret;
409 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
410 /* first and last virtual block numbers in this block */
411 first = vinfo[i].vi_vblocknr;
412 do_div(first, entries_per_block);
413 first *= entries_per_block;
414 last = first + entries_per_block - 1;
415 for (j = i, n = 0;
416 j < nvi && vinfo[j].vi_vblocknr >= first &&
417 vinfo[j].vi_vblocknr <= last;
418 j++, n++) {
419 entry = nilfs_palloc_block_get_entry(
420 dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
421 vinfo[j].vi_start = le64_to_cpu(entry->de_start);
422 vinfo[j].vi_end = le64_to_cpu(entry->de_end);
423 vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
424 }
425 kunmap_atomic(kaddr, KM_USER0);
426 brelse(entry_bh);
427 }
428
429 return nvi;
430}
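The two routines above do the actual translation work: nilfs_dat_translate() resolves a single virtual block number, returning -ENOENT while the entry's de_blocknr is still zero, and nilfs_dat_get_vinfo() amortizes buffer lookups by serving every request that falls into the same entry block from one kmap. With entries_per_block = 8, for instance, vblocknr 21 is served from the entry block covering virtual blocks 16..23, so neighbouring entries of a sorted vinfo array reuse the same buffer_head. Below is a minimal userspace sketch of the translate step only; the in-memory table, the field values, and the 32-entry size are illustrative assumptions, not the on-disk format:

#include <stdio.h>
#include <errno.h>

struct dat_entry {
	unsigned long long de_start;   /* first checkpoint the entry is live in */
	unsigned long long de_end;     /* first checkpoint it is dead in */
	unsigned long long de_blocknr; /* physical block, 0 = unallocated */
};

static struct dat_entry dat[32];       /* toy DAT: index == virtual block number */

/* mirrors nilfs_dat_translate(): 0 on success, -ENOENT if unmapped */
static int dat_translate(unsigned long long vblocknr, unsigned long long *blocknrp)
{
	unsigned long long blocknr = dat[vblocknr].de_blocknr;

	if (blocknr == 0)
		return -ENOENT;
	if (blocknrp != NULL)
		*blocknrp = blocknr;
	return 0;
}

int main(void)
{
	unsigned long long pbn;

	dat[3] = (struct dat_entry){ .de_start = 5, .de_end = ~0ULL, .de_blocknr = 4711 };
	if (dat_translate(3, &pbn) == 0)
		printf("vbn 3 -> pbn %llu\n", pbn);
	printf("vbn 4 -> %d (unallocated, -ENOENT)\n", dat_translate(4, NULL));
	return 0;
}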
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 000000000000..d9560654a4b7
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
1/*
2 * dat.h - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DAT_H
24#define _NILFS_DAT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/fs.h>
29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31
32struct nilfs_palloc_req;
33
34int nilfs_dat_translate(struct inode *, __u64, sector_t *);
35
36int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
37void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
38void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
46
47int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t);
49int nilfs_dat_move(struct inode *, __u64, sector_t);
50ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
51
52#endif /* _NILFS_DAT_H */
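Apart from the plain helpers at the bottom, the declarations above come in prepare/commit/abort triplets (alloc, start, end) — the two-phase update convention used throughout the nilfs metadata files: prepare reserves whatever the operation needs and is the only phase that may fail; commit publishes the change and must not fail; abort undoes the reservation. A toy model of that convention follows, with invented toy_* names standing in for the real struct nilfs_palloc_req plumbing:

#include <stdio.h>

struct toy_req {
	int reserved;		/* resource reserved by the prepare step */
};

static int toy_prepare(struct toy_req *req)
{
	req->reserved = 1;	/* the only step allowed to fail */
	return 0;
}

static void toy_commit(struct toy_req *req)
{
	req->reserved = 2;	/* publish the change; cannot fail */
}

static void toy_abort(struct toy_req *req)
{
	req->reserved = 0;	/* roll the reservation back */
}

int main(void)
{
	struct toy_req req;
	int later_step_failed = 0;

	if (toy_prepare(&req) < 0)
		return 1;	/* nothing was reserved, nothing to undo */
	if (later_step_failed)
		toy_abort(&req);
	else
		toy_commit(&req);
	printf("state = %d\n", req.reserved);
	return 0;
}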
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 000000000000..54100acc1102
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
1/*
2 * dir.c - NILFS directory entry operations
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
21 */
22/*
23 * linux/fs/ext2/dir.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/dir.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * ext2 directory handling functions
37 *
38 * Big-endian to little-endian byte-swapping/bitmaps by
39 * David S. Miller (davem@caip.rutgers.edu), 1995
40 *
41 * All code that works with directory layout had been switched to pagecache
42 * and moved here. AV
43 */
44
45#include <linux/pagemap.h>
46#include <linux/smp_lock.h>
47#include "nilfs.h"
48#include "page.h"
49
50/*
51 * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
52 * more robust, but we have what we have
53 */
54static inline unsigned nilfs_chunk_size(struct inode *inode)
55{
56 return inode->i_sb->s_blocksize;
57}
58
59static inline void nilfs_put_page(struct page *page)
60{
61 kunmap(page);
62 page_cache_release(page);
63}
64
65static inline unsigned long dir_pages(struct inode *inode)
66{
67 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
68}
69
70/*
71 * Return the offset into page `page_nr' of the last valid
72 * byte in that page, plus one.
73 */
74static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
75{
76 unsigned last_byte = inode->i_size;
77
78 last_byte -= page_nr << PAGE_CACHE_SHIFT;
79 if (last_byte > PAGE_CACHE_SIZE)
80 last_byte = PAGE_CACHE_SIZE;
81 return last_byte;
82}
83
84static int nilfs_prepare_chunk_uninterruptible(struct page *page,
85 struct address_space *mapping,
86 unsigned from, unsigned to)
87{
88 loff_t pos = page_offset(page) + from;
89 return block_write_begin(NULL, mapping, pos, to - from,
90 AOP_FLAG_UNINTERRUPTIBLE, &page,
91 NULL, nilfs_get_block);
92}
93
94static int nilfs_prepare_chunk(struct page *page,
95 struct address_space *mapping,
96 unsigned from, unsigned to)
97{
98 loff_t pos = page_offset(page) + from;
99 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
100 NULL, nilfs_get_block);
101}
102
103static int nilfs_commit_chunk(struct page *page,
104 struct address_space *mapping,
105 unsigned from, unsigned to)
106{
107 struct inode *dir = mapping->host;
108 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
109 loff_t pos = page_offset(page) + from;
110 unsigned len = to - from;
111 unsigned nr_dirty, copied;
112 int err;
113
114 nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
115 copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
116 if (pos + copied > dir->i_size) {
117 i_size_write(dir, pos + copied);
118 mark_inode_dirty(dir);
119 }
120 if (IS_DIRSYNC(dir))
121 nilfs_set_transaction_flag(NILFS_TI_SYNC);
122 err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
123 unlock_page(page);
124 return err;
125}
126
127static void nilfs_check_page(struct page *page)
128{
129 struct inode *dir = page->mapping->host;
130 struct super_block *sb = dir->i_sb;
131 unsigned chunk_size = nilfs_chunk_size(dir);
132 char *kaddr = page_address(page);
133 unsigned offs, rec_len;
134 unsigned limit = PAGE_CACHE_SIZE;
135 struct nilfs_dir_entry *p;
136 char *error;
137
138 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
139 limit = dir->i_size & ~PAGE_CACHE_MASK;
140 if (limit & (chunk_size - 1))
141 goto Ebadsize;
142 if (!limit)
143 goto out;
144 }
145 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
146 p = (struct nilfs_dir_entry *)(kaddr + offs);
147 rec_len = le16_to_cpu(p->rec_len);
148
149 if (rec_len < NILFS_DIR_REC_LEN(1))
150 goto Eshort;
151 if (rec_len & 3)
152 goto Ealign;
153 if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
154 goto Enamelen;
155 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
156 goto Espan;
157 }
158 if (offs != limit)
159 goto Eend;
160out:
161 SetPageChecked(page);
162 return;
163
164 /* Too bad, we had an error */
165
166Ebadsize:
167 nilfs_error(sb, "nilfs_check_page",
168 "size of directory #%lu is not a multiple of chunk size",
169 dir->i_ino
170 );
171 goto fail;
172Eshort:
173 error = "rec_len is smaller than minimal";
174 goto bad_entry;
175Ealign:
176 error = "unaligned directory entry";
177 goto bad_entry;
178Enamelen:
179 error = "rec_len is too small for name_len";
180 goto bad_entry;
181Espan:
182 error = "directory entry across blocks";
183bad_entry:
184 nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
185 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
186 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
187 (unsigned long) le64_to_cpu(p->inode),
188 rec_len, p->name_len);
189 goto fail;
190Eend:
191 p = (struct nilfs_dir_entry *)(kaddr + offs);
192 nilfs_error(sb, "nilfs_check_page",
193 "entry in directory #%lu spans the page boundary"
194 "offset=%lu, inode=%lu",
195 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
196 (unsigned long) le64_to_cpu(p->inode));
197fail:
198 SetPageChecked(page);
199 SetPageError(page);
200}
201
202static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
203{
204 struct address_space *mapping = dir->i_mapping;
205 struct page *page = read_cache_page(mapping, n,
206 (filler_t *)mapping->a_ops->readpage, NULL);
207 if (!IS_ERR(page)) {
208 wait_on_page_locked(page);
209 kmap(page);
210 if (!PageUptodate(page))
211 goto fail;
212 if (!PageChecked(page))
213 nilfs_check_page(page);
214 if (PageError(page))
215 goto fail;
216 }
217 return page;
218
219fail:
220 nilfs_put_page(page);
221 return ERR_PTR(-EIO);
222}
223
224/*
225 * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
226 *
227 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
228 */
229static int
230nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
231{
232 if (len != de->name_len)
233 return 0;
234 if (!de->inode)
235 return 0;
236 return !memcmp(name, de->name, len);
237}
238
239/*
240 * p is at least 6 bytes before the end of page
241 */
242static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
243{
244 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
245}
246
247static unsigned char
248nilfs_filetype_table[NILFS_FT_MAX] = {
249 [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
250 [NILFS_FT_REG_FILE] = DT_REG,
251 [NILFS_FT_DIR] = DT_DIR,
252 [NILFS_FT_CHRDEV] = DT_CHR,
253 [NILFS_FT_BLKDEV] = DT_BLK,
254 [NILFS_FT_FIFO] = DT_FIFO,
255 [NILFS_FT_SOCK] = DT_SOCK,
256 [NILFS_FT_SYMLINK] = DT_LNK,
257};
258
259#define S_SHIFT 12
260static unsigned char
261nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
262 [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
263 [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
264 [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
265 [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
266 [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
267 [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
268 [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
269};
270
271static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
272{
273 mode_t mode = inode->i_mode;
274
275 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
276}
277
278static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
279{
280 loff_t pos = filp->f_pos;
281 struct inode *inode = filp->f_dentry->d_inode;
282 struct super_block *sb = inode->i_sb;
283 unsigned int offset = pos & ~PAGE_CACHE_MASK;
284 unsigned long n = pos >> PAGE_CACHE_SHIFT;
285 unsigned long npages = dir_pages(inode);
286/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
287 unsigned char *types = NULL;
288 int ret;
289
290 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
291 goto success;
292
293 types = nilfs_filetype_table;
294
295 for ( ; n < npages; n++, offset = 0) {
296 char *kaddr, *limit;
297 struct nilfs_dir_entry *de;
298 struct page *page = nilfs_get_page(inode, n);
299
300 if (IS_ERR(page)) {
301 nilfs_error(sb, __func__, "bad page in #%lu",
302 inode->i_ino);
303 filp->f_pos += PAGE_CACHE_SIZE - offset;
304 ret = -EIO;
305 goto done;
306 }
307 kaddr = page_address(page);
308 de = (struct nilfs_dir_entry *)(kaddr + offset);
309 limit = kaddr + nilfs_last_byte(inode, n) -
310 NILFS_DIR_REC_LEN(1);
311 for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
312 if (de->rec_len == 0) {
313 nilfs_error(sb, __func__,
314 "zero-length directory entry");
315 ret = -EIO;
316 nilfs_put_page(page);
317 goto done;
318 }
319 if (de->inode) {
320 int over;
321 unsigned char d_type = DT_UNKNOWN;
322
323 if (types && de->file_type < NILFS_FT_MAX)
324 d_type = types[de->file_type];
325
326 offset = (char *)de - kaddr;
327 over = filldir(dirent, de->name, de->name_len,
328 (n<<PAGE_CACHE_SHIFT) | offset,
329 le64_to_cpu(de->inode), d_type);
330 if (over) {
331 nilfs_put_page(page);
332 goto success;
333 }
334 }
335 filp->f_pos += le16_to_cpu(de->rec_len);
336 }
337 nilfs_put_page(page);
338 }
339
340success:
341 ret = 0;
342done:
343 return ret;
344}
345
346/*
347 * nilfs_find_entry()
348 *
349 * finds an entry in the specified directory with the wanted name. It
350 * returns the page in which the entry was found, and the entry itself
351 * (as a parameter - res_page). Page is returned mapped and unlocked.
352 * Entry is guaranteed to be valid.
353 */
354struct nilfs_dir_entry *
355nilfs_find_entry(struct inode *dir, struct dentry *dentry,
356 struct page **res_page)
357{
358 const char *name = dentry->d_name.name;
359 int namelen = dentry->d_name.len;
360 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
361 unsigned long start, n;
362 unsigned long npages = dir_pages(dir);
363 struct page *page = NULL;
364 struct nilfs_inode_info *ei = NILFS_I(dir);
365 struct nilfs_dir_entry *de;
366
367 /* OFFSET_CACHE */
368 *res_page = NULL;
369
370 if (npages == 0)
371 goto out;
372
373 start = ei->i_dir_start_lookup;
374 if (start >= npages)
375 start = 0;
376 n = start;
377 do {
378 char *kaddr;
379 page = nilfs_get_page(dir, n);
380 if (!IS_ERR(page)) {
381 kaddr = page_address(page);
382 de = (struct nilfs_dir_entry *)kaddr;
383 kaddr += nilfs_last_byte(dir, n) - reclen;
384 while ((char *) de <= kaddr) {
385 if (de->rec_len == 0) {
386 nilfs_error(dir->i_sb, __func__,
387 "zero-length directory entry");
388 nilfs_put_page(page);
389 goto out;
390 }
391 if (nilfs_match(namelen, name, de))
392 goto found;
393 de = nilfs_next_entry(de);
394 }
395 nilfs_put_page(page);
396 }
397 if (++n >= npages)
398 n = 0;
399 /* next page is past the blocks we've got */
400 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
401 nilfs_error(dir->i_sb, __func__,
402 "dir %lu size %lld exceeds block cout %llu",
403 dir->i_ino, dir->i_size,
404 (unsigned long long)dir->i_blocks);
405 goto out;
406 }
407 } while (n != start);
408out:
409 return NULL;
410
411found:
412 *res_page = page;
413 ei->i_dir_start_lookup = n;
414 return de;
415}
416
417struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
418{
419 struct page *page = nilfs_get_page(dir, 0);
420 struct nilfs_dir_entry *de = NULL;
421
422 if (!IS_ERR(page)) {
423 de = nilfs_next_entry(
424 (struct nilfs_dir_entry *)page_address(page));
425 *p = page;
426 }
427 return de;
428}
429
430ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
431{
432 ino_t res = 0;
433 struct nilfs_dir_entry *de;
434 struct page *page;
435
436 de = nilfs_find_entry(dir, dentry, &page);
437 if (de) {
438 res = le64_to_cpu(de->inode);
439 kunmap(page);
440 page_cache_release(page);
441 }
442 return res;
443}
444
445/* Releases the page */
446void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
447 struct page *page, struct inode *inode)
448{
449 unsigned from = (char *) de - (char *) page_address(page);
450 unsigned to = from + le16_to_cpu(de->rec_len);
451 struct address_space *mapping = page->mapping;
452 int err;
453
454 lock_page(page);
455 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
456 BUG_ON(err);
457 de->inode = cpu_to_le64(inode->i_ino);
458 nilfs_set_de_type(de, inode);
459 err = nilfs_commit_chunk(page, mapping, from, to);
460 nilfs_put_page(page);
461 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
462/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
463 mark_inode_dirty(dir);
464}
465
466/*
467 * Parent is locked.
468 */
469int nilfs_add_link(struct dentry *dentry, struct inode *inode)
470{
471 struct inode *dir = dentry->d_parent->d_inode;
472 const char *name = dentry->d_name.name;
473 int namelen = dentry->d_name.len;
474 unsigned chunk_size = nilfs_chunk_size(dir);
475 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
476 unsigned short rec_len, name_len;
477 struct page *page = NULL;
478 struct nilfs_dir_entry *de;
479 unsigned long npages = dir_pages(dir);
480 unsigned long n;
481 char *kaddr;
482 unsigned from, to;
483 int err;
484
485 /*
486 * We take care of directory expansion in the same loop.
487 * This code plays outside i_size, so it locks the page
488 * to protect that region.
489 */
490 for (n = 0; n <= npages; n++) {
491 char *dir_end;
492
493 page = nilfs_get_page(dir, n);
494 err = PTR_ERR(page);
495 if (IS_ERR(page))
496 goto out;
497 lock_page(page);
498 kaddr = page_address(page);
499 dir_end = kaddr + nilfs_last_byte(dir, n);
500 de = (struct nilfs_dir_entry *)kaddr;
501 kaddr += PAGE_CACHE_SIZE - reclen;
502 while ((char *)de <= kaddr) {
503 if ((char *)de == dir_end) {
504 /* We hit i_size */
505 name_len = 0;
506 rec_len = chunk_size;
507 de->rec_len = cpu_to_le16(chunk_size);
508 de->inode = 0;
509 goto got_it;
510 }
511 if (de->rec_len == 0) {
512 nilfs_error(dir->i_sb, __func__,
513 "zero-length directory entry");
514 err = -EIO;
515 goto out_unlock;
516 }
517 err = -EEXIST;
518 if (nilfs_match(namelen, name, de))
519 goto out_unlock;
520 name_len = NILFS_DIR_REC_LEN(de->name_len);
521 rec_len = le16_to_cpu(de->rec_len);
522 if (!de->inode && rec_len >= reclen)
523 goto got_it;
524 if (rec_len >= name_len + reclen)
525 goto got_it;
526 de = (struct nilfs_dir_entry *)((char *)de + rec_len);
527 }
528 unlock_page(page);
529 nilfs_put_page(page);
530 }
531 BUG();
532 return -EINVAL;
533
534got_it:
535 from = (char *)de - (char *)page_address(page);
536 to = from + rec_len;
537 err = nilfs_prepare_chunk(page, page->mapping, from, to);
538 if (err)
539 goto out_unlock;
540 if (de->inode) {
541 struct nilfs_dir_entry *de1;
542
543 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
544 de1->rec_len = cpu_to_le16(rec_len - name_len);
545 de->rec_len = cpu_to_le16(name_len);
546 de = de1;
547 }
548 de->name_len = namelen;
549 memcpy(de->name, name, namelen);
550 de->inode = cpu_to_le64(inode->i_ino);
551 nilfs_set_de_type(de, inode);
552 err = nilfs_commit_chunk(page, page->mapping, from, to);
553 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
554/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
555 mark_inode_dirty(dir);
556 /* OFFSET_CACHE */
557out_put:
558 nilfs_put_page(page);
559out:
560 return err;
561out_unlock:
562 unlock_page(page);
563 goto out_put;
564}
565
566/*
567 * nilfs_delete_entry deletes a directory entry by merging it with the
568 * previous entry. Page is up-to-date. Releases the page.
569 */
570int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
571{
572 struct address_space *mapping = page->mapping;
573 struct inode *inode = mapping->host;
574 char *kaddr = page_address(page);
575 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
576 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
577 struct nilfs_dir_entry *pde = NULL;
578 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
579 int err;
580
581 while ((char *)de < (char *)dir) {
582 if (de->rec_len == 0) {
583 nilfs_error(inode->i_sb, __func__,
584 "zero-length directory entry");
585 err = -EIO;
586 goto out;
587 }
588 pde = de;
589 de = nilfs_next_entry(de);
590 }
591 if (pde)
592 from = (char *)pde - (char *)page_address(page);
593 lock_page(page);
594 err = nilfs_prepare_chunk(page, mapping, from, to);
595 BUG_ON(err);
596 if (pde)
597 pde->rec_len = cpu_to_le16(to - from);
598 dir->inode = 0;
599 err = nilfs_commit_chunk(page, mapping, from, to);
600 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
601/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
602 mark_inode_dirty(inode);
603out:
604 nilfs_put_page(page);
605 return err;
606}
607
608/*
609 * Set the first fragment of directory.
610 */
611int nilfs_make_empty(struct inode *inode, struct inode *parent)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page = grab_cache_page(mapping, 0);
615 unsigned chunk_size = nilfs_chunk_size(inode);
616 struct nilfs_dir_entry *de;
617 int err;
618 void *kaddr;
619
620 if (!page)
621 return -ENOMEM;
622
623 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
624 if (unlikely(err)) {
625 unlock_page(page);
626 goto fail;
627 }
628 kaddr = kmap_atomic(page, KM_USER0);
629 memset(kaddr, 0, chunk_size);
630 de = (struct nilfs_dir_entry *)kaddr;
631 de->name_len = 1;
632 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
633 memcpy(de->name, ".\0\0", 4);
634 de->inode = cpu_to_le64(inode->i_ino);
635 nilfs_set_de_type(de, inode);
636
637 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
638 de->name_len = 2;
639 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
640 de->inode = cpu_to_le64(parent->i_ino);
641 memcpy(de->name, "..\0", 4);
642 nilfs_set_de_type(de, inode);
643 kunmap_atomic(kaddr, KM_USER0);
644 err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
645fail:
646 page_cache_release(page);
647 return err;
648}
649
650/*
651 * routine to check that the specified directory is empty (for rmdir)
652 */
653int nilfs_empty_dir(struct inode *inode)
654{
655 struct page *page = NULL;
656 unsigned long i, npages = dir_pages(inode);
657
658 for (i = 0; i < npages; i++) {
659 char *kaddr;
660 struct nilfs_dir_entry *de;
661
662 page = nilfs_get_page(inode, i);
663 if (IS_ERR(page))
664 continue;
665
666 kaddr = page_address(page);
667 de = (struct nilfs_dir_entry *)kaddr;
668 kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
669
670 while ((char *)de <= kaddr) {
671 if (de->rec_len == 0) {
672 nilfs_error(inode->i_sb, __func__,
673 "zero-length directory entry "
674 "(kaddr=%p, de=%p)\n", kaddr, de);
675 goto not_empty;
676 }
677 if (de->inode != 0) {
678 /* check for . and .. */
679 if (de->name[0] != '.')
680 goto not_empty;
681 if (de->name_len > 2)
682 goto not_empty;
683 if (de->name_len < 2) {
684 if (de->inode !=
685 cpu_to_le64(inode->i_ino))
686 goto not_empty;
687 } else if (de->name[1] != '.')
688 goto not_empty;
689 }
690 de = nilfs_next_entry(de);
691 }
692 nilfs_put_page(page);
693 }
694 return 1;
695
696not_empty:
697 nilfs_put_page(page);
698 return 0;
699}
700
701struct file_operations nilfs_dir_operations = {
702 .llseek = generic_file_llseek,
703 .read = generic_read_dir,
704 .readdir = nilfs_readdir,
705 .unlocked_ioctl = nilfs_ioctl,
706#ifdef CONFIG_COMPAT
707 .compat_ioctl = nilfs_ioctl,
708#endif /* CONFIG_COMPAT */
709 .fsync = nilfs_sync_file,
710
711};
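All of the directory code above walks the same ext2-style on-disk layout: each block-sized chunk holds variable-length records chained by rec_len, a record with inode == 0 is free space, and deletion merges a record into its predecessor by widening the predecessor's rec_len. Here is a self-contained userspace sketch of that record walk; the struct layout and the REC_LEN rounding mimic the scheme but are illustrative, not the exact nilfs format:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>

struct dirent_rec {
	uint64_t inode;		/* 0 marks an unused slot */
	uint16_t rec_len;	/* byte distance to the next record */
	uint8_t  name_len;
	char     name[];
};

/* header plus name, rounded up to 4 bytes, like NILFS_DIR_REC_LEN() */
#define REC_LEN(n) ((offsetof(struct dirent_rec, name) + (n) + 3) & ~3u)

int main(void)
{
	union { unsigned char bytes[64]; uint64_t align; } chunk = { {0} };
	struct dirent_rec *de = (struct dirent_rec *)chunk.bytes;

	/* one entry "foo"; the rest of the chunk is a single empty record */
	de->inode = 12;
	de->name_len = 3;
	memcpy(de->name, "foo", 3);
	de->rec_len = REC_LEN(3);
	de = (struct dirent_rec *)((char *)de + de->rec_len);
	de->inode = 0;
	de->rec_len = sizeof(chunk.bytes) - REC_LEN(3);

	for (de = (struct dirent_rec *)chunk.bytes;
	     (unsigned char *)de < chunk.bytes + sizeof(chunk.bytes);
	     de = (struct dirent_rec *)((char *)de + de->rec_len)) {
		if (de->rec_len == 0)	/* corruption guard, as in the code above */
			break;
		if (de->inode)
			printf("%.*s -> inode %llu\n", de->name_len, de->name,
			       (unsigned long long)de->inode);
	}
	return 0;
}

The rec_len == 0 guard mirrors the corruption checks above: without it, a damaged chunk would make the walk loop forever.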
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 000000000000..c6379e482781
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
1/*
2 * direct.c - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/errno.h>
24#include "nilfs.h"
25#include "page.h"
26#include "direct.h"
27#include "alloc.h"
28
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{
31 return (__le64 *)
32 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
33}
34
35static inline __u64
36nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
37{
38 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
39}
40
41static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
42 __u64 key, __u64 ptr)
43{
44 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
45}
46
47static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
48 __u64 key, int level, __u64 *ptrp)
49{
50 struct nilfs_direct *direct;
51 __u64 ptr;
52
53 direct = (struct nilfs_direct *)bmap;
54 if ((key > NILFS_DIRECT_KEY_MAX) ||
55 (level != 1) || /* XXX: use macro for level 1 */
56 ((ptr = nilfs_direct_get_ptr(direct, key)) ==
57 NILFS_BMAP_INVALID_PTR))
58 return -ENOENT;
59
60 if (ptrp != NULL)
61 *ptrp = ptr;
62 return 0;
63}
64
65static __u64
66nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
67{
68 __u64 ptr;
69
70 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
71 if (ptr != NILFS_BMAP_INVALID_PTR)
72 /* sequential access */
73 return ptr;
74 else
75 /* block group */
76 return nilfs_bmap_find_target_in_group(&direct->d_bmap);
77}
78
79static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
80 __u64 key, __u64 ptr)
81{
82 direct->d_bmap.b_last_allocated_key = key;
83 direct->d_bmap.b_last_allocated_ptr = ptr;
84}
85
86static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
87 __u64 key,
88 union nilfs_bmap_ptr_req *req,
89 struct nilfs_bmap_stats *stats)
90{
91 int ret;
92
93 if (direct->d_ops->dop_find_target != NULL)
94 req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
95 ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
96 req);
97 if (ret < 0)
98 return ret;
99
100 stats->bs_nblocks = 1;
101 return 0;
102}
103
104static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
105 union nilfs_bmap_ptr_req *req,
106 __u64 key, __u64 ptr)
107{
108 struct buffer_head *bh;
109
110 /* ptr must be a pointer to a buffer head. */
111 bh = (struct buffer_head *)((unsigned long)ptr);
112 set_buffer_nilfs_volatile(bh);
113
114 if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
115 direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
116 &direct->d_bmap, req);
117 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
118
119 if (!nilfs_bmap_dirty(&direct->d_bmap))
120 nilfs_bmap_set_dirty(&direct->d_bmap);
121
122 if (direct->d_ops->dop_set_target != NULL)
123 direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
124}
125
126static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
127{
128 struct nilfs_direct *direct;
129 union nilfs_bmap_ptr_req req;
130 struct nilfs_bmap_stats stats;
131 int ret;
132
133 direct = (struct nilfs_direct *)bmap;
134 if (key > NILFS_DIRECT_KEY_MAX)
135 return -ENOENT;
136 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
137 return -EEXIST;
138
139 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
140 if (ret < 0)
141 return ret;
142 nilfs_direct_commit_insert(direct, &req, key, ptr);
143 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
144
145 return 0;
146}
147
148static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
149 union nilfs_bmap_ptr_req *req,
150 __u64 key,
151 struct nilfs_bmap_stats *stats)
152{
153 int ret;
154
155 if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
156 req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
157 ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
158 &direct->d_bmap, req);
159 if (ret < 0)
160 return ret;
161 }
162
163 stats->bs_nblocks = 1;
164 return 0;
165}
166
167static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
168 union nilfs_bmap_ptr_req *req,
169 __u64 key)
170{
171 if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
172 direct->d_bmap.b_pops->bpop_commit_end_ptr(
173 &direct->d_bmap, req);
174 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
175}
176
177static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
178{
179 struct nilfs_direct *direct;
180 union nilfs_bmap_ptr_req req;
181 struct nilfs_bmap_stats stats;
182 int ret;
183
184 direct = (struct nilfs_direct *)bmap;
185 if ((key > NILFS_DIRECT_KEY_MAX) ||
186 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
187 return -ENOENT;
188
189 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
190 if (ret < 0)
191 return ret;
192 nilfs_direct_commit_delete(direct, &req, key);
193 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
194
195 return 0;
196}
197
198static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
199{
200 struct nilfs_direct *direct;
201 __u64 key, lastkey;
202
203 direct = (struct nilfs_direct *)bmap;
204 lastkey = NILFS_DIRECT_KEY_MAX + 1;
205 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
206 if (nilfs_direct_get_ptr(direct, key) !=
207 NILFS_BMAP_INVALID_PTR)
208 lastkey = key;
209
210 if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
211 return -ENOENT;
212
213 *keyp = lastkey;
214
215 return 0;
216}
217
218static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
219{
220 return key > NILFS_DIRECT_KEY_MAX;
221}
222
223static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
224 __u64 *keys, __u64 *ptrs, int nitems)
225{
226 struct nilfs_direct *direct;
227 __u64 key;
228 __u64 ptr;
229 int n;
230
231 direct = (struct nilfs_direct *)bmap;
232 if (nitems > NILFS_DIRECT_NBLOCKS)
233 nitems = NILFS_DIRECT_NBLOCKS;
234 n = 0;
235 for (key = 0; key < nitems; key++) {
236 ptr = nilfs_direct_get_ptr(direct, key);
237 if (ptr != NILFS_BMAP_INVALID_PTR) {
238 keys[n] = key;
239 ptrs[n] = ptr;
240 n++;
241 }
242 }
243 return n;
244}
245
246int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
247 __u64 key, __u64 *keys, __u64 *ptrs,
248 int n, __u64 low, __u64 high)
249{
250 struct nilfs_direct *direct;
251 __le64 *dptrs;
252 int ret, i, j;
253
254 /* no need to allocate any resource for conversion */
255
256 /* delete */
257 ret = bmap->b_ops->bop_delete(bmap, key);
258 if (ret < 0)
259 return ret;
260
261 /* free resources */
262 if (bmap->b_ops->bop_clear != NULL)
263 bmap->b_ops->bop_clear(bmap);
264
265 /* convert */
266 direct = (struct nilfs_direct *)bmap;
267 dptrs = nilfs_direct_dptrs(direct);
268 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
269 if ((j < n) && (i == keys[j])) {
270 dptrs[i] = (i != key) ?
271 nilfs_bmap_ptr_to_dptr(ptrs[j]) :
272 NILFS_BMAP_INVALID_PTR;
273 j++;
274 } else
275 dptrs[i] = NILFS_BMAP_INVALID_PTR;
276 }
277
278 nilfs_direct_init(bmap, low, high);
279
280 return 0;
281}
282
283static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
284 struct buffer_head *bh)
285{
286 union nilfs_bmap_ptr_req oldreq, newreq;
287 __u64 key;
288 __u64 ptr;
289 int ret;
290
291 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
292 ptr = nilfs_direct_get_ptr(direct, key);
293 if (!buffer_nilfs_volatile(bh)) {
294 oldreq.bpr_ptr = ptr;
295 newreq.bpr_ptr = ptr;
296 ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
297 &newreq);
298 if (ret < 0)
299 return ret;
300 nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
301 set_buffer_nilfs_volatile(bh);
302 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
303 } else
304 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
305
306 return ret;
307}
308
309static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
310 struct buffer_head *bh)
311{
312 struct nilfs_direct *direct;
313
314 direct = (struct nilfs_direct *)bmap;
315 return (direct->d_ops->dop_propagate != NULL) ?
316 direct->d_ops->dop_propagate(direct, bh) :
317 0;
318}
319
320static int nilfs_direct_assign_v(struct nilfs_direct *direct,
321 __u64 key, __u64 ptr,
322 struct buffer_head **bh,
323 sector_t blocknr,
324 union nilfs_binfo *binfo)
325{
326 union nilfs_bmap_ptr_req req;
327 int ret;
328
329 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
331 &direct->d_bmap, &req);
332 if (ret < 0)
333 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
339
340 return 0;
341}
342
343static int nilfs_direct_assign_p(struct nilfs_direct *direct,
344 __u64 key, __u64 ptr,
345 struct buffer_head **bh,
346 sector_t blocknr,
347 union nilfs_binfo *binfo)
348{
349 nilfs_direct_set_ptr(direct, key, blocknr);
350
351 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
352 binfo->bi_dat.bi_level = 0;
353
354 return 0;
355}
356
357static int nilfs_direct_assign(struct nilfs_bmap *bmap,
358 struct buffer_head **bh,
359 sector_t blocknr,
360 union nilfs_binfo *binfo)
361{
362 struct nilfs_direct *direct;
363 __u64 key;
364 __u64 ptr;
365
366 direct = (struct nilfs_direct *)bmap;
367 key = nilfs_bmap_data_get_key(bmap, *bh);
368 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
369 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
370 (unsigned long long)key);
371 return -EINVAL;
372 }
373 ptr = nilfs_direct_get_ptr(direct, key);
374 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
375 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
376 (unsigned long long)ptr);
377 return -EINVAL;
378 }
379
380 return direct->d_ops->dop_assign(direct, key, ptr, bh,
381 blocknr, binfo);
382}
383
384static const struct nilfs_bmap_operations nilfs_direct_ops = {
385 .bop_lookup = nilfs_direct_lookup,
386 .bop_insert = nilfs_direct_insert,
387 .bop_delete = nilfs_direct_delete,
388 .bop_clear = NULL,
389
390 .bop_propagate = nilfs_direct_propagate,
391
392 .bop_lookup_dirty_buffers = NULL,
393
394 .bop_assign = nilfs_direct_assign,
395 .bop_mark = NULL,
396
397 .bop_last_key = nilfs_direct_last_key,
398 .bop_check_insert = nilfs_direct_check_insert,
399 .bop_check_delete = NULL,
400 .bop_gather_data = nilfs_direct_gather_data,
401};
402
403
404static const struct nilfs_direct_operations nilfs_direct_ops_v = {
405 .dop_find_target = nilfs_direct_find_target_v,
406 .dop_set_target = nilfs_direct_set_target_v,
407 .dop_propagate = nilfs_direct_propagate_v,
408 .dop_assign = nilfs_direct_assign_v,
409};
410
411static const struct nilfs_direct_operations nilfs_direct_ops_p = {
412 .dop_find_target = NULL,
413 .dop_set_target = NULL,
414 .dop_propagate = NULL,
415 .dop_assign = nilfs_direct_assign_p,
416};
417
418int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
419{
420 struct nilfs_direct *direct;
421
422 direct = (struct nilfs_direct *)bmap;
423 bmap->b_ops = &nilfs_direct_ops;
424 bmap->b_low = low;
425 bmap->b_high = high;
426 switch (bmap->b_inode->i_ino) {
427 case NILFS_DAT_INO:
428 direct->d_ops = &nilfs_direct_ops_p;
429 break;
430 default:
431 direct->d_ops = &nilfs_direct_ops_v;
432 break;
433 }
434
435 return 0;
436}
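The direct bmap above is simply a small fixed array of 64-bit block pointers indexed by file block offset (the key), with NILFS_BMAP_INVALID_PTR marking holes; keys beyond NILFS_DIRECT_KEY_MAX are rejected so the caller can convert the mapping to a btree instead. A hedged userspace sketch of lookup and insert over such an array, with invented constants (15 slots, 0 as the invalid pointer):

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define DIRECT_NBLOCKS 15
#define INVALID_PTR    0

static uint64_t dptrs[DIRECT_NBLOCKS];	/* index == file block offset (key) */

static int direct_lookup(uint64_t key, uint64_t *ptrp)
{
	if (key >= DIRECT_NBLOCKS || dptrs[key] == INVALID_PTR)
		return -ENOENT;
	*ptrp = dptrs[key];
	return 0;
}

static int direct_insert(uint64_t key, uint64_t ptr)
{
	if (key >= DIRECT_NBLOCKS)
		return -ENOENT;		/* caller must convert to a btree */
	if (dptrs[key] != INVALID_PTR)
		return -EEXIST;
	dptrs[key] = ptr;
	return 0;
}

int main(void)
{
	uint64_t ptr;

	direct_insert(0, 100);
	direct_insert(7, 107);
	if (direct_lookup(7, &ptr) == 0)
		printf("key 7 -> ptr %llu\n", (unsigned long long)ptr);
	printf("key 20 -> %d (out of range: btree territory)\n",
	       direct_lookup(20, &ptr));
	return 0;
}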
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 000000000000..45d2c5cda812
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
1/*
2 * direct.h - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DIRECT_H
24#define _NILFS_DIRECT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include "bmap.h"
29
30
31struct nilfs_direct;
32
33/**
34 * struct nilfs_direct_operations - direct mapping operation table
35 */
36struct nilfs_direct_operations {
37 __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
38 void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
39 int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
40 int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
41 struct buffer_head **, sector_t,
42 union nilfs_binfo *);
43};
44
45/**
46 * struct nilfs_direct_node - direct node
47 * @dn_flags: flags
48 * @pad: padding
49 */
50struct nilfs_direct_node {
51 __u8 dn_flags;
52 __u8 pad[7];
53};
54
55/**
56 * struct nilfs_direct - direct mapping
57 * @d_bmap: bmap structure
58 * @d_ops: direct mapping operation table
59 */
60struct nilfs_direct {
61 struct nilfs_bmap d_bmap;
62
63 /* direct-mapping-specific members */
64 const struct nilfs_direct_operations *d_ops;
65};
66
67
68#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
69#define NILFS_DIRECT_KEY_MIN 0
70#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
71
72
73int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
74int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
75 __u64 *, int, __u64, __u64);
76
77
78#endif /* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 000000000000..6bd84a0d8238
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
1/*
2 * file.c - NILFS regular file handling primitives including fsync().
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include "nilfs.h"
28#include "segment.h"
29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 /*
33 * Called from fsync() system call
34 * This is the only entry point that can catch write and sync
35 * timing for both data blocks and intermediate blocks.
36 *
37 * This function should be revised once the writeback function
38 * is implemented.
39 */
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 if (!nilfs_inode_dirty(inode))
44 return 0;
45
46 if (datasync)
47 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
48 LLONG_MAX);
49 else
50 err = nilfs_construct_segment(inode->i_sb);
51
52 return err;
53}
54
55static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
56{
57 struct page *page = vmf->page;
58 struct inode *inode = vma->vm_file->f_dentry->d_inode;
59 struct nilfs_transaction_info ti;
60 int ret;
61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
68 unlock_page(page);
69 return VM_FAULT_NOPAGE; /* make the VM retry the fault */
70 }
71
72 /*
73 * check to see if the page is mapped already (no holes)
74 */
75 if (PageMappedToDisk(page)) {
76 unlock_page(page);
77 goto mapped;
78 }
79 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head;
81 int fully_mapped = 1;
82
83 bh = head = page_buffers(page);
84 do {
85 if (!buffer_mapped(bh)) {
86 fully_mapped = 0;
87 break;
88 }
89 } while (bh = bh->b_this_page, bh != head);
90
91 if (fully_mapped) {
92 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped;
95 }
96 }
97 unlock_page(page);
98
99 /*
100 * fill hole blocks
101 */
102 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
103 /* never returns -ENOMEM, but may return -ENOSPC */
104 if (unlikely(ret))
105 return VM_FAULT_SIGBUS;
106
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) {
109 nilfs_transaction_abort(inode->i_sb);
110 return ret;
111 }
112 nilfs_transaction_commit(inode->i_sb);
113
114 mapped:
115 SetPageChecked(page);
116 wait_on_page_writeback(page);
117 return 0;
118}
119
120struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite,
123};
124
125static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128 vma->vm_ops = &nilfs_file_vm_ops;
129 vma->vm_flags |= VM_CAN_NONLINEAR;
130 return 0;
131}
132
133/*
134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem.
136 */
137struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek,
139 .read = do_sync_read,
140 .write = do_sync_write,
141 .aio_read = generic_file_aio_read,
142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl,
146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open,
149 /* .release = nilfs_release_file, */
150 .fsync = nilfs_sync_file,
151 .splice_read = generic_file_splice_read,
152};
153
154struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission,
158};
159
160/* end of file */
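nilfs_page_mkwrite() above sorts write faults into three cases: a stale page makes the VM retry, a page whose buffers are all mapped needs no allocation, and a page with holes gets its blocks allocated inside a transaction before the write proceeds. A toy model of that decision flow, with made-up FAULT_* codes and a toy_page struct standing in for struct page and its buffer list:

#include <stdio.h>

#define FAULT_OK      0
#define FAULT_NOPAGE  1	/* tell the VM to retry the fault */
#define FAULT_SIGBUS  2

struct toy_page {
	int uptodate;
	int fully_mapped;	/* all buffers already have disk blocks */
};

static int toy_page_mkwrite(struct toy_page *p, int disk_nearly_full)
{
	if (disk_nearly_full)
		return FAULT_SIGBUS;	/* refuse early, as with -ENOSPC */
	if (!p->uptodate)
		return FAULT_NOPAGE;
	if (p->fully_mapped)
		return FAULT_OK;	/* no holes: nothing to allocate */

	/* hole present: block allocation must run inside a transaction */
	printf("begin transaction, fill holes, commit\n");
	p->fully_mapped = 1;
	return FAULT_OK;
}

int main(void)
{
	struct toy_page hole = { .uptodate = 1, .fully_mapped = 0 };
	struct toy_page full = { .uptodate = 1, .fully_mapped = 1 };

	printf("hole -> %d\n", toy_page_mkwrite(&hole, 0));
	printf("full -> %d\n", toy_page_mkwrite(&full, 0));
	printf("enospc -> %d\n", toy_page_mkwrite(&full, 1));
	return 0;
}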
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 000000000000..93383c5cee90
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_clear_dirty_pages(mapping);
65 nilfs_copy_back_pages(mapping, gmapping);
66 /* note: mdt dirty flags should be cleared by segctor. */
67
68 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
69 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
70
71 up_write(&NILFS_MDT(dat)->mi_sem);
72}
73
74void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
75{
76 struct inode *gcdat = nilfs->ns_gc_dat;
77 struct nilfs_inode_info *gii = NILFS_I(gcdat);
78
79 gcdat->i_state = I_CLEAR;
80 gii->i_flags = 0;
81
82 truncate_inode_pages(gcdat->i_mapping, 0);
83 truncate_inode_pages(&gii->i_btnode_cache, 0);
84}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
1/*
2 * gcinode.c - dummy inodes to buffer blocks for garbage collection
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
23 *
24 */
25/*
26 * This file adds the cache of on-disk blocks to be moved in garbage
27 * collection. The disk blocks are held with dummy inodes (called
28 * gcinodes), and this file provides the lookup function for the dummy
29 * inodes and their buffer read functions.
30 *
31 * Since NILFS2 keeps multiple checkpoints/snapshots across GC, it
32 * has to handle blocks that belong to the same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) takes a checkpoint number argument as
36 * well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes are released after they
39 * have been copied to a new log. Dirty blocks made in the current
40 * generation and the blocks to be moved by GC never overlap, because
41 * the dirty blocks form a new generation; rather, they must be
42 * written individually.
43 */
44
45#include <linux/buffer_head.h>
46#include <linux/mpage.h>
47#include <linux/hash.h>
48#include <linux/swap.h>
49#include "nilfs.h"
50#include "page.h"
51#include "mdt.h"
52#include "dat.h"
53#include "ifile.h"
54
55static struct address_space_operations def_gcinode_aops = {};
56/* XXX need def_gcinode_iops/fops? */
57
58/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
60 * @inode - gc inode
61 * @blkoff - dummy offset treated as the key for the page cache
62 * @pbn - physical block number of the block
63 * @vbn - virtual block number of the block, 0 for non-virtual block
64 * @out_bh - indirect pointer to a buffer_head struct to receive the results
65 *
66 * Description: nilfs_gccache_submit_read_data() registers the data buffer
67 * specified by @pbn to the GC pagecache with the key @blkoff.
68 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
69 *
70 * Return Value: On success, 0 is returned. On error, one of the following
71 * negative error codes is returned.
72 *
73 * %-EIO - I/O error.
74 *
75 * %-ENOMEM - Insufficient amount of memory available.
76 *
77 * %-ENOENT - The block specified with @pbn does not exist.
78 */
79int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
80 sector_t pbn, __u64 vbn,
81 struct buffer_head **out_bh)
82{
83 struct buffer_head *bh;
84 int err;
85
86 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
87 if (unlikely(!bh))
88 return -ENOMEM;
89
90 if (buffer_uptodate(bh))
91 goto out;
92
93 if (pbn == 0) {
94 struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
95 /* use original dat, not gc dat. */
96 err = nilfs_dat_translate(dat_inode, vbn, &pbn);
97 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
98 brelse(bh);
99 goto failed;
100 }
101 }
102
103 lock_buffer(bh);
104 if (buffer_uptodate(bh)) {
105 unlock_buffer(bh);
106 goto out;
107 }
108
109 if (!buffer_mapped(bh)) {
110 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
111 set_buffer_mapped(bh);
112 }
113 bh->b_blocknr = pbn;
114 bh->b_end_io = end_buffer_read_sync;
115 get_bh(bh);
116 submit_bh(READ, bh);
117 if (vbn)
118 bh->b_blocknr = vbn;
119 out:
120 err = 0;
121 *out_bh = bh;
122
123 failed:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 return err;
127}
128
129/*
130 * nilfs_gccache_submit_read_node() - add node buffer and submit read request
131 * @inode - gc inode
132 * @pbn - physical block number for the block
133 * @vbn - virtual block number for the block
134 * @out_bh - indirect pointer to a buffer_head struct to receive the results
135 *
136 * Description: nilfs_gccache_submit_read_node() registers the node buffer
137 * specified by @vbn to the GC pagecache. @pbn can be supplied by the
138 * caller to avoid translation of the disk block address.
139 *
140 * Return Value: On success, 0 is returned. On error, one of the following
141 * negative error codes is returned.
142 *
143 * %-EIO - I/O error.
144 *
145 * %-ENOMEM - Insufficient amount of memory available.
146 */
147int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
148 __u64 vbn, struct buffer_head **out_bh)
149{
150 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
151 vbn ? : pbn, pbn, out_bh, 0);
152 if (ret == -EEXIST) /* internal code (cache hit) */
153 ret = 0;
154 return ret;
155}
156
157int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
158{
159 wait_on_buffer(bh);
160 if (!buffer_uptodate(bh))
161 return -EIO;
162 if (buffer_dirty(bh))
163 return -EEXIST;
164
165 if (buffer_nilfs_node(bh))
166 nilfs_btnode_mark_dirty(bh);
167 else
168 nilfs_mdt_mark_buffer_dirty(bh);
169 return 0;
170}
171
172/*
173 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
174 * @nilfs - the_nilfs
175 *
176 * Return Value: On success, 0.
177 * On error, a negative error code is returned.
178 */
179int nilfs_init_gccache(struct the_nilfs *nilfs)
180{
181 int loop;
182
183 BUG_ON(nilfs->ns_gc_inodes_h);
184
185 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
186
187 nilfs->ns_gc_inodes_h =
188 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
189 GFP_NOFS);
190 if (nilfs->ns_gc_inodes_h == NULL)
191 return -ENOMEM;
192
193 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
194 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
195 return 0;
196}
197
198/*
199 * nilfs_destroy_gccache() - free gc_inode hash table
200 * @nilfs - the nilfs
201 */
202void nilfs_destroy_gccache(struct the_nilfs *nilfs)
203{
204 if (nilfs->ns_gc_inodes_h) {
205 nilfs_remove_all_gcinode(nilfs);
206 kfree(nilfs->ns_gc_inodes_h);
207 nilfs->ns_gc_inodes_h = NULL;
208 }
209}
210
211static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
212 __u64 cno)
213{
214 struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
215 struct nilfs_inode_info *ii;
216
217 if (!inode)
218 return NULL;
219
220 inode->i_op = NULL;
221 inode->i_fop = NULL;
222 inode->i_mapping->a_ops = &def_gcinode_aops;
223
224 ii = NILFS_I(inode);
225 ii->i_cno = cno;
226 ii->i_flags = 0;
227 ii->i_state = 1 << NILFS_I_GCINODE;
228 ii->i_bh = NULL;
229 nilfs_bmap_init_gc(ii->i_bmap);
230
231 return inode;
232}
233
234static unsigned long ihash(ino_t ino, __u64 cno)
235{
236 return hash_long((unsigned long)((ino << 2) + cno),
237 NILFS_GCINODE_HASH_BITS);
238}
239
240/*
241 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
242 */
243struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
244{
245 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
246 struct hlist_node *node;
247 struct inode *inode;
248
249 hlist_for_each_entry(inode, node, head, i_hash) {
250 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
251 return inode;
252 }
253
254 inode = alloc_gcinode(nilfs, ino, cno);
255 if (likely(inode)) {
256 hlist_add_head(&inode->i_hash, head);
257 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
258 }
259 return inode;
260}
261
262/*
263 * nilfs_clear_gcinode() - clear and free a gc inode
264 */
265void nilfs_clear_gcinode(struct inode *inode)
266{
267 nilfs_mdt_clear(inode);
268 nilfs_mdt_destroy(inode);
269}
270
271/*
272 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
273 */
274void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
275{
276 struct hlist_head *head = nilfs->ns_gc_inodes_h;
277 struct hlist_node *node, *n;
278 struct inode *inode;
279 int loop;
280
281 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
282 hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
283 hlist_del_init(&inode->i_hash);
284 list_del_init(&NILFS_I(inode)->i_dirty);
285 nilfs_clear_gcinode(inode); /* might sleep */
286 }
287 }
288}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 000000000000..de86401f209f
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
1/*
2 * ifile.c - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "ifile.h"
31
32/**
33 * nilfs_ifile_create_inode - create a new disk inode
34 * @ifile: ifile inode
35 * @out_ino: pointer to a variable to store inode number
36 * @out_bh: buffer_head containing the newly allocated disk inode
37 *
38 * Return Value: On success, 0 is returned, the newly allocated inode
39 * number is stored in the place pointed to by @out_ino, and a pointer to
40 * the buffer_head containing the newly allocated disk inode structure is
41 * stored in the place pointed to by @out_bh.
42 * On error, one of the following negative error codes is returned.
43 *
44 * %-EIO - I/O error.
45 *
46 * %-ENOMEM - Insufficient amount of memory available.
47 *
48 * %-ENOSPC - No inode left.
49 */
50int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
51 struct buffer_head **out_bh)
52{
53 struct nilfs_palloc_req req;
54 int ret;
55
56	req.pr_entry_nr = 0;  /* 0 means: search for a free inode from
57				 the beginning of a group */
58 req.pr_entry_bh = NULL;
59
60 ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
61 if (!ret) {
62 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
63 &req.pr_entry_bh);
64 if (ret < 0)
65 nilfs_palloc_abort_alloc_entry(ifile, &req);
66 }
67 if (ret < 0) {
68 brelse(req.pr_entry_bh);
69 return ret;
70 }
71 nilfs_palloc_commit_alloc_entry(ifile, &req);
72 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
73 nilfs_mdt_mark_dirty(ifile);
74 *out_ino = (ino_t)req.pr_entry_nr;
75 *out_bh = req.pr_entry_bh;
76 return 0;
77}
78
79/**
80 * nilfs_ifile_delete_inode - delete a disk inode
81 * @ifile: ifile inode
82 * @ino: inode number
83 *
84 * Return Value: On success, 0 is returned. On error, one of the following
85 * negative error codes is returned.
86 *
87 * %-EIO - I/O error.
88 *
89 * %-ENOMEM - Insufficient amount of memory available.
90 *
91 * %-ENOENT - The inode number @ino has not been allocated.
92 */
93int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
94{
95 struct nilfs_palloc_req req = {
96 .pr_entry_nr = ino, .pr_entry_bh = NULL
97 };
98 struct nilfs_inode *raw_inode;
99 void *kaddr;
100 int ret;
101
102 ret = nilfs_palloc_prepare_free_entry(ifile, &req);
103 if (!ret) {
104 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
105 &req.pr_entry_bh);
106 if (ret < 0)
107 nilfs_palloc_abort_free_entry(ifile, &req);
108 }
109 if (ret < 0) {
110 brelse(req.pr_entry_bh);
111 return ret;
112 }
113
114 kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
115 raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
116 req.pr_entry_bh, kaddr);
117 raw_inode->i_flags = 0;
118 kunmap_atomic(kaddr, KM_USER0);
119
120 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
121 brelse(req.pr_entry_bh);
122
123 nilfs_palloc_commit_free_entry(ifile, &req);
124
125 return 0;
126}
127
128int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
129 struct buffer_head **out_bh)
130{
131 struct super_block *sb = ifile->i_sb;
132 int err;
133
134 if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
135 nilfs_error(sb, __func__, "bad inode number: %lu",
136 (unsigned long) ino);
137 return -EINVAL;
138 }
139
140 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
141 if (unlikely(err)) {
142 if (err == -EINVAL)
143 nilfs_error(sb, __func__, "ifile is broken");
144 else
145 nilfs_warning(sb, __func__,
146 "unable to read inode: %lu",
147 (unsigned long) ino);
148 }
149 return err;
150}
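
The three entry points above give the ifile a small allocate/read/free lifecycle; the real consumers are nilfs_new_inode() and nilfs_free_inode() in inode.c further down. A hedged round-trip sketch (the function name and error handling are invented for illustration):

	/* Sketch: allocate a disk inode, then release it again immediately. */
	static int ifile_alloc_free_roundtrip(struct inode *ifile)
	{
		struct buffer_head *bh;
		ino_t ino;
		int err;

		err = nilfs_ifile_create_inode(ifile, &ino, &bh);
		if (err)
			return err;	/* -EIO, -ENOMEM or -ENOSPC */
		brelse(bh);		/* drop the ref returned by create */

		return nilfs_ifile_delete_inode(ifile, ino);
	}
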
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 000000000000..5d30a35679b5
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
1/*
2 * ifile.h - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24
25#ifndef _NILFS_IFILE_H
26#define _NILFS_IFILE_H
27
28#include <linux/fs.h>
29#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h>
31#include "mdt.h"
32#include "alloc.h"
33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35
36static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
38{
39 void *kaddr = kmap(ibh->b_page);
40 return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
41}
42
43static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
44 struct buffer_head *ibh)
45{
46 kunmap(ibh->b_page);
47}
48
49int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
50int nilfs_ifile_delete_inode(struct inode *, ino_t);
51int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
52
53#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 000000000000..49ab4a49bb4f
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
1/*
2 * inode.c - NILFS inode operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/mpage.h>
26#include <linux/writeback.h>
27#include <linux/uio.h>
28#include "nilfs.h"
29#include "segment.h"
30#include "page.h"
31#include "mdt.h"
32#include "cpfile.h"
33#include "ifile.h"
34
35
36/**
37 * nilfs_get_block() - get a file block on the filesystem (callback function)
38 * @inode: inode struct of the target file
39 * @blkoff: file block number
40 * @bh_result: buffer head to be mapped on
41 * @create: indicates whether to allocate the block when it has not been
42 *	allocated yet
43 *
44 * This function does not issue an actual read request for the specified
45 * data block; that is done by the VFS.
46 * Bulk read for direct I/O is not supported yet. (should be supported)
47 */
48int nilfs_get_block(struct inode *inode, sector_t blkoff,
49 struct buffer_head *bh_result, int create)
50{
51 struct nilfs_inode_info *ii = NILFS_I(inode);
52 unsigned long blknum = 0;
53 int err = 0, ret;
54 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
55
56 /* This exclusion control is a workaround; should be revised */
57 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
58 ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
59 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
60 if (ret == 0) { /* found */
61 map_bh(bh_result, inode->i_sb, blknum);
62 goto out;
63 }
64 /* data block was not found */
65 if (ret == -ENOENT && create) {
66 struct nilfs_transaction_info ti;
67
68 bh_result->b_blocknr = 0;
69 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
70 if (unlikely(err))
71 goto out;
72 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
73 (unsigned long)bh_result);
74 if (unlikely(err != 0)) {
75 if (err == -EEXIST) {
76 /*
77 * The get_block() function could be called
78 * from multiple callers for an inode.
79 * However, the page having this block must
80 * be locked in this case.
81 */
82 printk(KERN_WARNING
83 "nilfs_get_block: a race condition "
84 "while inserting a data block. "
85 "(inode number=%lu, file block "
86 "offset=%llu)\n",
87 inode->i_ino,
88 (unsigned long long)blkoff);
89 err = 0;
90 } else if (err == -EINVAL) {
91 nilfs_error(inode->i_sb, __func__,
92 "broken bmap (inode=%lu)\n",
93 inode->i_ino);
94 err = -EIO;
95 }
96 nilfs_transaction_abort(inode->i_sb);
97 goto out;
98 }
99 nilfs_transaction_commit(inode->i_sb); /* never fails */
100 /* Error handling should be detailed */
101 set_buffer_new(bh_result);
102 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
103 to proper value */
104 } else if (ret == -ENOENT) {
105		/* not found is not an error (e.g. a hole); must return
106		   without the mapped state flag. */
107 ;
108 } else {
109 err = ret;
110 }
111
112 out:
113 return err;
114}
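
The contract implemented here is the standard get_block_t one: on return @bh_result is mapped (the block exists), left unmapped (a hole), or, with @create, mapped and additionally marked new. A caller can classify the outcomes like this (illustrative sketch using an on-stack buffer_head; not code from this patch):

	/* Sketch: probe a file block without triggering allocation. */
	static void classify_block(struct inode *inode, sector_t blkoff)
	{
		struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };

		bh.b_size = 1 << inode->i_blkbits;
		if (nilfs_get_block(inode, blkoff, &bh, 0 /* no create */))
			return;		/* I/O error or broken bmap */
		if (buffer_mapped(&bh))
			printk(KERN_DEBUG "blkoff %llu -> blocknr %llu\n",
			       (unsigned long long)blkoff,
			       (unsigned long long)bh.b_blocknr);
		else
			printk(KERN_DEBUG "blkoff %llu is a hole\n",
			       (unsigned long long)blkoff);
	}
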
115
116/**
117 * nilfs_readpage() - implement the readpage() method of the nilfs_aops
118 * address_space_operations
119 * @file: file struct of the file to be read
120 * @page: the page to be read
121 */
122static int nilfs_readpage(struct file *file, struct page *page)
123{
124 return mpage_readpage(page, nilfs_get_block);
125}
126
127/**
128 * nilfs_readpages() - implement the readpages() method of the nilfs_aops
129 * address_space_operations
130 * @file: file struct of the file to be read
131 * @mapping: address_space struct used for reading multiple pages
132 * @pages: the pages to be read
133 * @nr_pages: number of pages to be read
134 */
135static int nilfs_readpages(struct file *file, struct address_space *mapping,
136 struct list_head *pages, unsigned nr_pages)
137{
138 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
139}
140
141static int nilfs_writepages(struct address_space *mapping,
142 struct writeback_control *wbc)
143{
144 struct inode *inode = mapping->host;
145 int err = 0;
146
147 if (wbc->sync_mode == WB_SYNC_ALL)
148 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
149 wbc->range_start,
150 wbc->range_end);
151 return err;
152}
153
154static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
155{
156 struct inode *inode = page->mapping->host;
157 int err;
158
159 redirty_page_for_writepage(wbc, page);
160 unlock_page(page);
161
162 if (wbc->sync_mode == WB_SYNC_ALL) {
163 err = nilfs_construct_segment(inode->i_sb);
164 if (unlikely(err))
165 return err;
166 } else if (wbc->for_reclaim)
167 nilfs_flush_segment(inode->i_sb, inode->i_ino);
168
169 return 0;
170}
171
172static int nilfs_set_page_dirty(struct page *page)
173{
174 int ret = __set_page_dirty_buffers(page);
175
176 if (ret) {
177 struct inode *inode = page->mapping->host;
178 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
179 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
180
181 nilfs_set_file_dirty(sbi, inode, nr_dirty);
182 }
183 return ret;
184}
185
186static int nilfs_write_begin(struct file *file, struct address_space *mapping,
187 loff_t pos, unsigned len, unsigned flags,
188 struct page **pagep, void **fsdata)
189
190{
191 struct inode *inode = mapping->host;
192 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
193
194 if (unlikely(err))
195 return err;
196
197 *pagep = NULL;
198 err = block_write_begin(file, mapping, pos, len, flags, pagep,
199 fsdata, nilfs_get_block);
200 if (unlikely(err))
201 nilfs_transaction_abort(inode->i_sb);
202 return err;
203}
204
205static int nilfs_write_end(struct file *file, struct address_space *mapping,
206 loff_t pos, unsigned len, unsigned copied,
207 struct page *page, void *fsdata)
208{
209 struct inode *inode = mapping->host;
210 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
211 unsigned nr_dirty;
212 int err;
213
214 nr_dirty = nilfs_page_count_clean_buffers(page, start,
215 start + copied);
216 copied = generic_write_end(file, mapping, pos, len, copied, page,
217 fsdata);
218 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
219 err = nilfs_transaction_commit(inode->i_sb);
220 return err ? : copied;
221}
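
Note how the two callbacks bracket one transaction: nilfs_write_begin() opens it and nilfs_write_end() (or the abort path) closes it, so each buffered write is covered end to end. Reduced to its skeleton, the pattern looks like this (hypothetical helper name, using the transaction primitives from segment.h):

	/* Sketch: one NILFS transaction spanning a prepare/commit pair. */
	static int demo_transactional_update(struct super_block *sb)
	{
		struct nilfs_transaction_info ti;
		int err;

		err = nilfs_transaction_begin(sb, &ti, 1);
		if (err)
			return err;
		/* ... dirty pages/buffers here; on failure call
		   nilfs_transaction_abort(sb) and return instead ... */
		return nilfs_transaction_commit(sb);
	}
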
222
223static ssize_t
224nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
225 loff_t offset, unsigned long nr_segs)
226{
227 struct file *file = iocb->ki_filp;
228 struct inode *inode = file->f_mapping->host;
229 ssize_t size;
230
231 if (rw == WRITE)
232 return 0;
233
234 /* Needs synchronization with the cleaner */
235 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
236 offset, nr_segs, nilfs_get_block, NULL);
237 return size;
238}
239
240struct address_space_operations nilfs_aops = {
241 .writepage = nilfs_writepage,
242 .readpage = nilfs_readpage,
243 /* .sync_page = nilfs_sync_page, */
244 .writepages = nilfs_writepages,
245 .set_page_dirty = nilfs_set_page_dirty,
246 .readpages = nilfs_readpages,
247 .write_begin = nilfs_write_begin,
248 .write_end = nilfs_write_end,
249 /* .releasepage = nilfs_releasepage, */
250 .invalidatepage = block_invalidatepage,
251 .direct_IO = nilfs_direct_IO,
252};
253
254struct inode *nilfs_new_inode(struct inode *dir, int mode)
255{
256 struct super_block *sb = dir->i_sb;
257 struct nilfs_sb_info *sbi = NILFS_SB(sb);
258 struct inode *inode;
259 struct nilfs_inode_info *ii;
260 int err = -ENOMEM;
261 ino_t ino;
262
263 inode = new_inode(sb);
264 if (unlikely(!inode))
265 goto failed;
266
267 mapping_set_gfp_mask(inode->i_mapping,
268 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
269
270 ii = NILFS_I(inode);
271 ii->i_state = 1 << NILFS_I_NEW;
272
273 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
274 if (unlikely(err))
275 goto failed_ifile_create_inode;
276	/* the reference count of i_bh is inherited from nilfs_mdt_read_block() */
277
278 atomic_inc(&sbi->s_inodes_count);
279
280 inode->i_uid = current_fsuid();
281 if (dir->i_mode & S_ISGID) {
282 inode->i_gid = dir->i_gid;
283 if (S_ISDIR(mode))
284 mode |= S_ISGID;
285 } else
286 inode->i_gid = current_fsgid();
287
288 inode->i_mode = mode;
289 inode->i_ino = ino;
290 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
291
292 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
293 err = nilfs_bmap_read(ii->i_bmap, NULL);
294 if (err < 0)
295 goto failed_bmap;
296
297 set_bit(NILFS_I_BMAP, &ii->i_state);
298 /* No lock is needed; iget() ensures it. */
299 }
300
301 ii->i_flags = NILFS_I(dir)->i_flags;
302 if (S_ISLNK(mode))
303 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
304 if (!S_ISDIR(mode))
305 ii->i_flags &= ~NILFS_DIRSYNC_FL;
306
307 /* ii->i_file_acl = 0; */
308 /* ii->i_dir_acl = 0; */
309 ii->i_dir_start_lookup = 0;
310#ifdef CONFIG_NILFS_FS_POSIX_ACL
311 ii->i_acl = NULL;
312 ii->i_default_acl = NULL;
313#endif
314 ii->i_cno = 0;
315 nilfs_set_inode_flags(inode);
316 spin_lock(&sbi->s_next_gen_lock);
317 inode->i_generation = sbi->s_next_generation++;
318 spin_unlock(&sbi->s_next_gen_lock);
319 insert_inode_hash(inode);
320
321 err = nilfs_init_acl(inode, dir);
322 if (unlikely(err))
323		goto failed_acl; /* never occurs. When nilfs_init_acl() is
324				    supported, proper cancellation of the
325				    jobs above should be considered */
326
327 mark_inode_dirty(inode);
328 return inode;
329
330 failed_acl:
331 failed_bmap:
332 inode->i_nlink = 0;
333 iput(inode); /* raw_inode will be deleted through
334 generic_delete_inode() */
335 goto failed;
336
337 failed_ifile_create_inode:
338 make_bad_inode(inode);
339 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
340 called */
341 failed:
342 return ERR_PTR(err);
343}
344
345void nilfs_free_inode(struct inode *inode)
346{
347 struct super_block *sb = inode->i_sb;
348 struct nilfs_sb_info *sbi = NILFS_SB(sb);
349
350 clear_inode(inode);
351 /* XXX: check error code? Is there any thing I can do? */
352 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
353 atomic_dec(&sbi->s_inodes_count);
354}
355
356void nilfs_set_inode_flags(struct inode *inode)
357{
358 unsigned int flags = NILFS_I(inode)->i_flags;
359
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL)
363 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL)
365 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
376}
377
378int nilfs_read_inode_common(struct inode *inode,
379 struct nilfs_inode *raw_inode)
380{
381 struct nilfs_inode_info *ii = NILFS_I(inode);
382 int err;
383
384 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
385 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
386 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
387 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
388 inode->i_size = le64_to_cpu(raw_inode->i_size);
389	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); /* no atime on disk */
390 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
391 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
392 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
393 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
394 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
395 if (inode->i_nlink == 0 && inode->i_mode == 0)
396 return -EINVAL; /* this inode is deleted */
397
398 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
399 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
400#if 0
401 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
402 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
403 0 : le32_to_cpu(raw_inode->i_dir_acl);
404#endif
405 ii->i_cno = 0;
406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
407
408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
409 S_ISLNK(inode->i_mode)) {
410 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
411 if (err < 0)
412 return err;
413 set_bit(NILFS_I_BMAP, &ii->i_state);
414 /* No lock is needed; iget() ensures it. */
415 }
416 return 0;
417}
418
419static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
420 struct inode *inode)
421{
422 struct nilfs_sb_info *sbi = NILFS_SB(sb);
423 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
424 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode;
426 int err;
427
428 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
429 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
430 if (unlikely(err))
431 goto bad_inode;
432
433 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
434
435#ifdef CONFIG_NILFS_FS_POSIX_ACL
436	NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
437	NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
438#endif
439 if (nilfs_read_inode_common(inode, raw_inode))
440 goto failed_unmap;
441
442 if (S_ISREG(inode->i_mode)) {
443 inode->i_op = &nilfs_file_inode_operations;
444 inode->i_fop = &nilfs_file_operations;
445 inode->i_mapping->a_ops = &nilfs_aops;
446 } else if (S_ISDIR(inode->i_mode)) {
447 inode->i_op = &nilfs_dir_inode_operations;
448 inode->i_fop = &nilfs_dir_operations;
449 inode->i_mapping->a_ops = &nilfs_aops;
450 } else if (S_ISLNK(inode->i_mode)) {
451 inode->i_op = &nilfs_symlink_inode_operations;
452 inode->i_mapping->a_ops = &nilfs_aops;
453 } else {
454 inode->i_op = &nilfs_special_inode_operations;
455 init_special_inode(
456 inode, inode->i_mode,
457 new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
458 }
459 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
460 brelse(bh);
461 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
462 nilfs_set_inode_flags(inode);
463 return 0;
464
465 failed_unmap:
466 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
467 brelse(bh);
468
469 bad_inode:
470 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
471 return err;
472}
473
474struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
475{
476 struct inode *inode;
477 int err;
478
479 inode = iget_locked(sb, ino);
480 if (unlikely(!inode))
481 return ERR_PTR(-ENOMEM);
482 if (!(inode->i_state & I_NEW))
483 return inode;
484
485 err = __nilfs_read_inode(sb, ino, inode);
486 if (unlikely(err)) {
487 iget_failed(inode);
488 return ERR_PTR(err);
489 }
490 unlock_new_inode(inode);
491 return inode;
492}
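
This follows the standard iget_locked() protocol: a cached inode comes back without I_NEW and is ready for use; a fresh one must be filled in and published with unlock_new_inode(), or torn down with iget_failed(). A hypothetical caller only needs to handle the ERR_PTR convention:

	/* Sketch: resolve an inode number and release the reference. */
	static int demo_use_inode(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = nilfs_iget(sb, ino);

		if (IS_ERR(inode))
			return PTR_ERR(inode);	/* -ENOMEM or a read error */
		/* ... inspect or use the inode here ... */
		iput(inode);
		return 0;
	}
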
493
494void nilfs_write_inode_common(struct inode *inode,
495 struct nilfs_inode *raw_inode, int has_bmap)
496{
497 struct nilfs_inode_info *ii = NILFS_I(inode);
498
499 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
500 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
501 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
502 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
503 raw_inode->i_size = cpu_to_le64(inode->i_size);
504 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
505 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
506 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
507 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
508 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
509
510 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
511 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
512
513 if (has_bmap)
514 nilfs_bmap_write(ii->i_bmap, raw_inode);
515 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
516 raw_inode->i_device_code =
517 cpu_to_le64(new_encode_dev(inode->i_rdev));
518	/* When the inode format is extended, nilfs->ns_inode_size should be
519	   checked before filling in any appended fields */
520}
521
522void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
523{
524 ino_t ino = inode->i_ino;
525 struct nilfs_inode_info *ii = NILFS_I(inode);
526 struct super_block *sb = inode->i_sb;
527 struct nilfs_sb_info *sbi = NILFS_SB(sb);
528 struct nilfs_inode *raw_inode;
529
530 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
531
532 /* The buffer is guarded with lock_buffer() by the caller */
533 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
534 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
535 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
536
537 nilfs_write_inode_common(inode, raw_inode, 0);
538	/* XXX: calling with has_bmap = 0 is a workaround to avoid a
539	   bmap deadlock.  It delays the update of i_bmap until just
540	   before writing */
541 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
542}
543
544#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
545
546static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
547 unsigned long from)
548{
549 unsigned long b;
550 int ret;
551
552 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
553 return;
554 repeat:
555 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
556 if (ret == -ENOENT)
557 return;
558 else if (ret < 0)
559 goto failed;
560
561 if (b < from)
562 return;
563
564 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
565 ret = nilfs_bmap_truncate(ii->i_bmap, b);
566 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
567 if (!ret || (ret == -ENOMEM &&
568 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
569 goto repeat;
570
571 failed:
572 if (ret == -EINVAL)
573 nilfs_error(ii->vfs_inode.i_sb, __func__,
574 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
575 else
576 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 "failed to truncate bmap (ino=%lu, err=%d)",
578 ii->vfs_inode.i_ino, ret);
579}
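
The loop above deliberately truncates in bounded steps, each removing at most NILFS_MAX_TRUNCATE_BLOCKS (64 MB worth of 4 KB blocks), and relaxes memory pressure between passes instead of dropping the whole range at once. The step

	b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);

means that, for example, with from = 0 and a dense file whose last key is 40000, successive passes truncate at 23616, then 7231, then 0, with one nilfs_bmap_truncate() call per pass (two on an -ENOMEM retry).
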
580
581void nilfs_truncate(struct inode *inode)
582{
583 unsigned long blkoff;
584 unsigned int blocksize;
585 struct nilfs_transaction_info ti;
586 struct super_block *sb = inode->i_sb;
587 struct nilfs_inode_info *ii = NILFS_I(inode);
588
589 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
590 return;
591 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
592 return;
593
594 blocksize = sb->s_blocksize;
595 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
596 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
597
598 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
599
600 nilfs_truncate_bmap(ii, blkoff);
601
602 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
603 if (IS_SYNC(inode))
604 nilfs_set_transaction_flag(NILFS_TI_SYNC);
605
606 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
607 nilfs_transaction_commit(sb);
608 /* May construct a logical segment and may fail in sync mode.
609 But truncate has no return value. */
610}
611
612void nilfs_delete_inode(struct inode *inode)
613{
614 struct nilfs_transaction_info ti;
615 struct super_block *sb = inode->i_sb;
616 struct nilfs_inode_info *ii = NILFS_I(inode);
617
618 if (unlikely(is_bad_inode(inode))) {
619 if (inode->i_data.nrpages)
620 truncate_inode_pages(&inode->i_data, 0);
621 clear_inode(inode);
622 return;
623 }
624 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
625
626 if (inode->i_data.nrpages)
627 truncate_inode_pages(&inode->i_data, 0);
628
629 nilfs_truncate_bmap(ii, 0);
630 nilfs_free_inode(inode);
631 /* nilfs_free_inode() marks inode buffer dirty */
632 if (IS_SYNC(inode))
633 nilfs_set_transaction_flag(NILFS_TI_SYNC);
634 nilfs_transaction_commit(sb);
635 /* May construct a logical segment and may fail in sync mode.
636 But delete_inode has no return value. */
637}
638
639int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
640{
641 struct nilfs_transaction_info ti;
642 struct inode *inode = dentry->d_inode;
643 struct super_block *sb = inode->i_sb;
644 int err;
645
646 err = inode_change_ok(inode, iattr);
647 if (err)
648 return err;
649
650 err = nilfs_transaction_begin(sb, &ti, 0);
651 if (unlikely(err))
652 return err;
653 err = inode_setattr(inode, iattr);
654 if (!err && (iattr->ia_valid & ATTR_MODE))
655 err = nilfs_acl_chmod(inode);
656 if (likely(!err))
657 err = nilfs_transaction_commit(sb);
658 else
659 nilfs_transaction_abort(sb);
660
661 return err;
662}
663
664int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
665 struct buffer_head **pbh)
666{
667 struct nilfs_inode_info *ii = NILFS_I(inode);
668 int err;
669
670 spin_lock(&sbi->s_inode_lock);
671 /* Caller of this function MUST lock s_inode_lock */
672 if (ii->i_bh == NULL) {
673 spin_unlock(&sbi->s_inode_lock);
674 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
675 pbh);
676 if (unlikely(err))
677 return err;
678 spin_lock(&sbi->s_inode_lock);
679 if (ii->i_bh == NULL)
680 ii->i_bh = *pbh;
681 else {
682 brelse(*pbh);
683 *pbh = ii->i_bh;
684 }
685 } else
686 *pbh = ii->i_bh;
687
688 get_bh(*pbh);
689 spin_unlock(&sbi->s_inode_lock);
690 return 0;
691}
692
693int nilfs_inode_dirty(struct inode *inode)
694{
695 struct nilfs_inode_info *ii = NILFS_I(inode);
696 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
697 int ret = 0;
698
699 if (!list_empty(&ii->i_dirty)) {
700 spin_lock(&sbi->s_inode_lock);
701 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
702 test_bit(NILFS_I_BUSY, &ii->i_state);
703 spin_unlock(&sbi->s_inode_lock);
704 }
705 return ret;
706}
707
708int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
709 unsigned nr_dirty)
710{
711 struct nilfs_inode_info *ii = NILFS_I(inode);
712
713 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
714
715 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
716 return 0;
717
718 spin_lock(&sbi->s_inode_lock);
719 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
720 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
721 /* Because this routine may race with nilfs_dispose_list(),
722 we have to check NILFS_I_QUEUED here, too. */
723 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
724 /* This will happen when somebody is freeing
725 this inode. */
726 nilfs_warning(sbi->s_super, __func__,
727 "cannot get inode (ino=%lu)\n",
728 inode->i_ino);
729 spin_unlock(&sbi->s_inode_lock);
730 return -EINVAL; /* NILFS_I_DIRTY may remain for
731 freeing inode */
732 }
733 list_del(&ii->i_dirty);
734 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
735 set_bit(NILFS_I_QUEUED, &ii->i_state);
736 }
737 spin_unlock(&sbi->s_inode_lock);
738 return 0;
739}
740
741int nilfs_mark_inode_dirty(struct inode *inode)
742{
743 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
744 struct buffer_head *ibh;
745 int err;
746
747 err = nilfs_load_inode_block(sbi, inode, &ibh);
748 if (unlikely(err)) {
749 nilfs_warning(inode->i_sb, __func__,
750 "failed to reget inode block.\n");
751 return err;
752 }
753 lock_buffer(ibh);
754 nilfs_update_inode(inode, ibh);
755 unlock_buffer(ibh);
756 nilfs_mdt_mark_buffer_dirty(ibh);
757 nilfs_mdt_mark_dirty(sbi->s_ifile);
758 brelse(ibh);
759 return 0;
760}
761
762/**
763 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
764 * @inode: inode of the file to be registered.
765 *
766 * nilfs_dirty_inode() loads an inode block containing the specified
767 * @inode and copies the in-memory inode data into the corresponding
768 * nilfs_inode entry in that block. This operation is excluded from the segment
769 * construction. This function can be called both as a single operation
770 * and as a part of indivisible file operations.
771 */
772void nilfs_dirty_inode(struct inode *inode)
773{
774 struct nilfs_transaction_info ti;
775
776 if (is_bad_inode(inode)) {
777 nilfs_warning(inode->i_sb, __func__,
778 "tried to mark bad_inode dirty. ignored.\n");
779 dump_stack();
780 return;
781 }
782 nilfs_transaction_begin(inode->i_sb, &ti, 0);
783 nilfs_mark_inode_dirty(inode);
784 nilfs_transaction_commit(inode->i_sb); /* never fails */
785}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000000..50ff3f2cdf24
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,665 @@
1/*
2 * ioctl.c - NILFS ioctl operations.
3 *
4 * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h>
29#include <linux/nilfs2_fs.h>
30#include "nilfs.h"
31#include "segment.h"
32#include "bmap.h"
33#include "cpfile.h"
34#include "sufile.h"
35#include "dat.h"
36
37
38static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
39 struct nilfs_argv *argv, int dir,
40 ssize_t (*dofunc)(struct the_nilfs *,
41 __u64 *, int,
42 void *, size_t, size_t))
43{
44 void *buf;
45 void __user *base = (void __user *)(unsigned long)argv->v_base;
46 size_t maxmembs, total, n;
47 ssize_t nr;
48 int ret, i;
49 __u64 pos, ppos;
50
51 if (argv->v_nmembs == 0)
52 return 0;
53
54 if (argv->v_size > PAGE_SIZE)
55 return -EINVAL;
56
57 buf = (void *)__get_free_pages(GFP_NOFS, 0);
58 if (unlikely(!buf))
59 return -ENOMEM;
60 maxmembs = PAGE_SIZE / argv->v_size;
61
62 ret = 0;
63 total = 0;
64 pos = argv->v_index;
65 for (i = 0; i < argv->v_nmembs; i += n) {
66 n = (argv->v_nmembs - i < maxmembs) ?
67 argv->v_nmembs - i : maxmembs;
68 if ((dir & _IOC_WRITE) &&
69 copy_from_user(buf, base + argv->v_size * i,
70 argv->v_size * n)) {
71 ret = -EFAULT;
72 break;
73 }
74 ppos = pos;
75 nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
76 n);
77 if (nr < 0) {
78 ret = nr;
79 break;
80 }
81 if ((dir & _IOC_READ) &&
82 copy_to_user(base + argv->v_size * i, buf,
83 argv->v_size * nr)) {
84 ret = -EFAULT;
85 break;
86 }
87 total += nr;
88 if ((size_t)nr < n)
89 break;
90 if (pos == ppos)
91 pos += n;
92 }
93 argv->v_nmembs = total;
94
95 free_pages((unsigned long)buf, 0);
96 return ret;
97}
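
nilfs_ioctl_wrap_copy() is a generic pump: it slices a user array of v_nmembs fixed-size records into batches that fit one kernel page, bouncing each batch through that page in whichever direction the ioctl's _IOC bits request. The batching arithmetic in isolation (illustrative sketch; names are invented):

	/* Sketch: batch nmembs records of `size` bytes through one page. */
	static void demo_pump(size_t nmembs, size_t size)	/* size <= PAGE_SIZE */
	{
		size_t maxmembs = PAGE_SIZE / size;	/* whole records per batch */
		size_t i, n;

		for (i = 0; i < nmembs; i += n) {
			n = min(nmembs - i, maxmembs);
			/* copy_from_user() n records in, run dofunc() on them,
			   copy_to_user() up to n records back out */
		}
	}
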
98
99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
100 unsigned int cmd, void __user *argp)
101{
102 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
103 struct nilfs_transaction_info ti;
104 struct nilfs_cpmode cpmode;
105 int ret;
106
107 if (!capable(CAP_SYS_ADMIN))
108 return -EPERM;
109 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
110 return -EFAULT;
111
112 nilfs_transaction_begin(inode->i_sb, &ti, 0);
113 ret = nilfs_cpfile_change_cpmode(
114 cpfile, cpmode.cm_cno, cpmode.cm_mode);
115 if (unlikely(ret < 0)) {
116 nilfs_transaction_abort(inode->i_sb);
117 return ret;
118 }
119 nilfs_transaction_commit(inode->i_sb); /* never fails */
120 return ret;
121}
122
123static int
124nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
125 unsigned int cmd, void __user *argp)
126{
127 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
128 struct nilfs_transaction_info ti;
129 __u64 cno;
130 int ret;
131
132 if (!capable(CAP_SYS_ADMIN))
133 return -EPERM;
134 if (copy_from_user(&cno, argp, sizeof(cno)))
135 return -EFAULT;
136
137 nilfs_transaction_begin(inode->i_sb, &ti, 0);
138 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
139 if (unlikely(ret < 0)) {
140 nilfs_transaction_abort(inode->i_sb);
141 return ret;
142 }
143 nilfs_transaction_commit(inode->i_sb); /* never fails */
144 return ret;
145}
146
147static ssize_t
148nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
149 void *buf, size_t size, size_t nmembs)
150{
151 int ret;
152
153 down_read(&nilfs->ns_segctor_sem);
154 ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
155 nmembs);
156 up_read(&nilfs->ns_segctor_sem);
157 return ret;
158}
159
160static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
161 unsigned int cmd, void __user *argp)
162{
163 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
164 struct nilfs_cpstat cpstat;
165 int ret;
166
167 down_read(&nilfs->ns_segctor_sem);
168 ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
169 up_read(&nilfs->ns_segctor_sem);
170 if (ret < 0)
171 return ret;
172
173 if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
174 ret = -EFAULT;
175 return ret;
176}
177
178static ssize_t
179nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180 void *buf, size_t size, size_t nmembs)
181{
182 int ret;
183
184 down_read(&nilfs->ns_segctor_sem);
185 ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
186 up_read(&nilfs->ns_segctor_sem);
187 return ret;
188}
189
190static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
191 unsigned int cmd, void __user *argp)
192{
193 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
194 struct nilfs_sustat sustat;
195 int ret;
196
197 down_read(&nilfs->ns_segctor_sem);
198 ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
199 up_read(&nilfs->ns_segctor_sem);
200 if (ret < 0)
201 return ret;
202
203 if (copy_to_user(argp, &sustat, sizeof(sustat)))
204 ret = -EFAULT;
205 return ret;
206}
207
208static ssize_t
209nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
210 void *buf, size_t size, size_t nmembs)
211{
212 int ret;
213
214 down_read(&nilfs->ns_segctor_sem);
215 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
216 up_read(&nilfs->ns_segctor_sem);
217 return ret;
218}
219
220static ssize_t
221nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
222 void *buf, size_t size, size_t nmembs)
223{
224 struct inode *dat = nilfs_dat_inode(nilfs);
225 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
226 struct nilfs_bdesc *bdescs = buf;
227 int ret, i;
228
229 down_read(&nilfs->ns_segctor_sem);
230 for (i = 0; i < nmembs; i++) {
231 ret = nilfs_bmap_lookup_at_level(bmap,
232 bdescs[i].bd_offset,
233 bdescs[i].bd_level + 1,
234 &bdescs[i].bd_blocknr);
235 if (ret < 0) {
236 if (ret != -ENOENT) {
237 up_read(&nilfs->ns_segctor_sem);
238 return ret;
239 }
240 bdescs[i].bd_blocknr = 0;
241 }
242 }
243 up_read(&nilfs->ns_segctor_sem);
244 return nmembs;
245}
246
247static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
248 unsigned int cmd, void __user *argp)
249{
250 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
251 struct nilfs_argv argv;
252 int ret;
253
254 if (copy_from_user(&argv, argp, sizeof(argv)))
255 return -EFAULT;
256
257 if (argv.v_size != sizeof(struct nilfs_bdesc))
258 return -EINVAL;
259
260 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
261 nilfs_ioctl_do_get_bdescs);
262 if (ret < 0)
263 return ret;
264
265 if (copy_to_user(argp, &argv, sizeof(argv)))
266 ret = -EFAULT;
267 return ret;
268}
269
270static int nilfs_ioctl_move_inode_block(struct inode *inode,
271 struct nilfs_vdesc *vdesc,
272 struct list_head *buffers)
273{
274 struct buffer_head *bh;
275 int ret;
276
277 if (vdesc->vd_flags == 0)
278 ret = nilfs_gccache_submit_read_data(
279 inode, vdesc->vd_offset, vdesc->vd_blocknr,
280 vdesc->vd_vblocknr, &bh);
281 else
282 ret = nilfs_gccache_submit_read_node(
283 inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
284
285 if (unlikely(ret < 0)) {
286 if (ret == -ENOENT)
287 printk(KERN_CRIT
288 "%s: invalid virtual block address (%s): "
289 "ino=%llu, cno=%llu, offset=%llu, "
290 "blocknr=%llu, vblocknr=%llu\n",
291 __func__, vdesc->vd_flags ? "node" : "data",
292 (unsigned long long)vdesc->vd_ino,
293 (unsigned long long)vdesc->vd_cno,
294 (unsigned long long)vdesc->vd_offset,
295 (unsigned long long)vdesc->vd_blocknr,
296 (unsigned long long)vdesc->vd_vblocknr);
297 return ret;
298 }
299 bh->b_private = vdesc;
300 list_add_tail(&bh->b_assoc_buffers, buffers);
301 return 0;
302}
303
304static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
305 struct nilfs_argv *argv, void *buf)
306{
307 size_t nmembs = argv->v_nmembs;
308 struct inode *inode;
309 struct nilfs_vdesc *vdesc;
310 struct buffer_head *bh, *n;
311 LIST_HEAD(buffers);
312 ino_t ino;
313 __u64 cno;
314 int i, ret;
315
316 for (i = 0, vdesc = buf; i < nmembs; ) {
317 ino = vdesc->vd_ino;
318 cno = vdesc->vd_cno;
319 inode = nilfs_gc_iget(nilfs, ino, cno);
320 if (unlikely(inode == NULL)) {
321 ret = -ENOMEM;
322 goto failed;
323 }
324 do {
325 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
326 &buffers);
327 if (unlikely(ret < 0))
328 goto failed;
329 vdesc++;
330 } while (++i < nmembs &&
331 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
332 }
333
334 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
335 ret = nilfs_gccache_wait_and_mark_dirty(bh);
336 if (unlikely(ret < 0)) {
337 if (ret == -EEXIST) {
338 vdesc = bh->b_private;
339 printk(KERN_CRIT
340 "%s: conflicting %s buffer: "
341 "ino=%llu, cno=%llu, offset=%llu, "
342 "blocknr=%llu, vblocknr=%llu\n",
343 __func__,
344 vdesc->vd_flags ? "node" : "data",
345 (unsigned long long)vdesc->vd_ino,
346 (unsigned long long)vdesc->vd_cno,
347 (unsigned long long)vdesc->vd_offset,
348 (unsigned long long)vdesc->vd_blocknr,
349 (unsigned long long)vdesc->vd_vblocknr);
350 }
351 goto failed;
352 }
353 list_del_init(&bh->b_assoc_buffers);
354 bh->b_private = NULL;
355 brelse(bh);
356 }
357 return nmembs;
358
359 failed:
360 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
361 list_del_init(&bh->b_assoc_buffers);
362 bh->b_private = NULL;
363 brelse(bh);
364 }
365 return ret;
366}
367
368static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
369 struct nilfs_argv *argv, void *buf)
370{
371 size_t nmembs = argv->v_nmembs;
372 struct inode *cpfile = nilfs->ns_cpfile;
373 struct nilfs_period *periods = buf;
374 int ret, i;
375
376 for (i = 0; i < nmembs; i++) {
377 ret = nilfs_cpfile_delete_checkpoints(
378 cpfile, periods[i].p_start, periods[i].p_end);
379 if (ret < 0)
380 return ret;
381 }
382 return nmembs;
383}
384
385static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
386 struct nilfs_argv *argv, void *buf)
387{
388 size_t nmembs = argv->v_nmembs;
389 int ret;
390
391 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
392
393 return (ret < 0) ? ret : nmembs;
394}
395
396static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
397 struct nilfs_argv *argv, void *buf)
398{
399 size_t nmembs = argv->v_nmembs;
400 struct inode *dat = nilfs_dat_inode(nilfs);
401 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
402 struct nilfs_bdesc *bdescs = buf;
403 int ret, i;
404
405 for (i = 0; i < nmembs; i++) {
406 /* XXX: use macro or inline func to check liveness */
407 ret = nilfs_bmap_lookup_at_level(bmap,
408 bdescs[i].bd_offset,
409 bdescs[i].bd_level + 1,
410 &bdescs[i].bd_blocknr);
411 if (ret < 0) {
412 if (ret != -ENOENT)
413 return ret;
414 bdescs[i].bd_blocknr = 0;
415 }
416 if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
417 /* skip dead block */
418 continue;
419 if (bdescs[i].bd_level == 0) {
420 ret = nilfs_mdt_mark_block_dirty(dat,
421 bdescs[i].bd_offset);
422 if (ret < 0) {
423 WARN_ON(ret == -ENOENT);
424 return ret;
425 }
426 } else {
427 ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
428 bdescs[i].bd_level);
429 if (ret < 0) {
430 WARN_ON(ret == -ENOENT);
431 return ret;
432 }
433 }
434 }
435 return nmembs;
436}
437
438static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
439 struct nilfs_argv *argv, void *buf)
440{
441 size_t nmembs = argv->v_nmembs;
442 struct nilfs_sb_info *sbi = nilfs->ns_writer;
443 int ret;
444
445 if (unlikely(!sbi)) {
446 /* never happens because called for a writable mount */
447 WARN_ON(1);
448 return -EROFS;
449 }
450 ret = nilfs_segctor_add_segments_to_be_freed(
451 NILFS_SC(sbi), buf, nmembs);
452
453 return (ret < 0) ? ret : nmembs;
454}
455
456int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
457 struct nilfs_argv *argv, void **kbufs)
458{
459 const char *msg;
460 int ret;
461
462 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
463 if (ret < 0) {
464 msg = "cannot read source blocks";
465 goto failed;
466 }
467
468 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
469 if (ret < 0) {
470 /*
471 * can safely abort because checkpoints can be removed
472 * independently.
473 */
474 msg = "cannot delete checkpoints";
475 goto failed;
476 }
477 ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]);
478 if (ret < 0) {
479 /*
480 * can safely abort because DAT file is updated atomically
481 * using a copy-on-write technique.
482 */
483 msg = "cannot delete virtual blocks from DAT file";
484 goto failed;
485 }
486 ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]);
487 if (ret < 0) {
488 /*
489 * can safely abort because the operation is nondestructive.
490 */
491 msg = "cannot mark copying blocks dirty";
492 goto failed;
493 }
494 ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
495 if (ret < 0) {
496 /*
497 * can safely abort because this operation is atomic.
498 */
499 msg = "cannot set segments to be freed";
500 goto failed;
501 }
502 return 0;
503
504 failed:
505 nilfs_remove_all_gcinode(nilfs);
506 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
507 msg, ret);
508 return ret;
509}
510
511static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
512 unsigned int cmd, void __user *argp)
513{
514 struct nilfs_argv argv[5];
515	static const size_t argsz[5] = {
516 sizeof(struct nilfs_vdesc),
517 sizeof(struct nilfs_period),
518 sizeof(__u64),
519 sizeof(struct nilfs_bdesc),
520 sizeof(__u64),
521 };
522 void __user *base;
523 void *kbufs[5];
524 struct the_nilfs *nilfs;
525 size_t len, nsegs;
526 int n, ret;
527
528 if (!capable(CAP_SYS_ADMIN))
529 return -EPERM;
530
531 if (copy_from_user(argv, argp, sizeof(argv)))
532 return -EFAULT;
533
534 nsegs = argv[4].v_nmembs;
535 if (argv[4].v_size != argsz[4])
536 return -EINVAL;
537 /*
538	 * argv[4] points to the segment numbers this ioctl cleans.  We
539	 * copy it via memdup_user(), which uses kmalloc(), because the
540	 * memory used for the segment numbers is small enough.
541 */
542 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
543 nsegs * sizeof(__u64));
544 if (IS_ERR(kbufs[4]))
545 return PTR_ERR(kbufs[4]);
546
547 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
548
549 for (n = 0; n < 4; n++) {
550 ret = -EINVAL;
551 if (argv[n].v_size != argsz[n])
552 goto out_free;
553
554 if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
555 goto out_free;
556
557 len = argv[n].v_size * argv[n].v_nmembs;
558 base = (void __user *)(unsigned long)argv[n].v_base;
559 if (len == 0) {
560 kbufs[n] = NULL;
561 continue;
562 }
563
564 kbufs[n] = vmalloc(len);
565 if (!kbufs[n]) {
566 ret = -ENOMEM;
567 goto out_free;
568 }
569 if (copy_from_user(kbufs[n], base, len)) {
570 ret = -EFAULT;
571 vfree(kbufs[n]);
572 goto out_free;
573 }
574 }
575
576 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
577
578 out_free:
579	while (--n >= 0)
580 vfree(kbufs[n]);
581 kfree(kbufs[4]);
582 return ret;
583}
584
585static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
586 unsigned int cmd, void __user *argp)
587{
588 __u64 cno;
589 int ret;
590
591 ret = nilfs_construct_segment(inode->i_sb);
592 if (ret < 0)
593 return ret;
594
595 if (argp != NULL) {
596 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
597 if (copy_to_user(argp, &cno, sizeof(cno)))
598 return -EFAULT;
599 }
600 return 0;
601}
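
From user space, this ioctl is the way to force a checkpoint and learn its number. A minimal hypothetical caller, assuming the exported <linux/nilfs2_fs.h> header and a file descriptor on the mounted filesystem:

	/* Userspace sketch: flush NILFS and print the new checkpoint number. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/nilfs2_fs.h>	/* NILFS_IOCTL_SYNC */

	int main(int argc, char *argv[])
	{
		__u64 cno;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);	/* any file on the NILFS mount */
		if (fd < 0 || ioctl(fd, NILFS_IOCTL_SYNC, &cno) < 0) {
			perror("NILFS_IOCTL_SYNC");
			return 1;
		}
		printf("checkpoint %llu written\n", (unsigned long long)cno);
		close(fd);
		return 0;
	}
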
602
603static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
604 unsigned int cmd, void __user *argp,
605 size_t membsz,
606 ssize_t (*dofunc)(struct the_nilfs *,
607 __u64 *, int,
608 void *, size_t, size_t))
609
610{
611 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
612 struct nilfs_argv argv;
613 int ret;
614
615 if (copy_from_user(&argv, argp, sizeof(argv)))
616 return -EFAULT;
617
618 if (argv.v_size != membsz)
619 return -EINVAL;
620
621 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
622 if (ret < 0)
623 return ret;
624
625 if (copy_to_user(argp, &argv, sizeof(argv)))
626 ret = -EFAULT;
627 return ret;
628}
629
630long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
631{
632 struct inode *inode = filp->f_dentry->d_inode;
633	void __user *argp = (void __user *)arg;
634
635 switch (cmd) {
636 case NILFS_IOCTL_CHANGE_CPMODE:
637 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
638 case NILFS_IOCTL_DELETE_CHECKPOINT:
639 return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
640 case NILFS_IOCTL_GET_CPINFO:
641 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
642 sizeof(struct nilfs_cpinfo),
643 nilfs_ioctl_do_get_cpinfo);
644 case NILFS_IOCTL_GET_CPSTAT:
645 return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
646 case NILFS_IOCTL_GET_SUINFO:
647 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
648 sizeof(struct nilfs_suinfo),
649 nilfs_ioctl_do_get_suinfo);
650 case NILFS_IOCTL_GET_SUSTAT:
651 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
652 case NILFS_IOCTL_GET_VINFO:
653 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
654 sizeof(struct nilfs_vinfo),
655 nilfs_ioctl_do_get_vinfo);
656 case NILFS_IOCTL_GET_BDESCS:
657 return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
658 case NILFS_IOCTL_CLEAN_SEGMENTS:
659 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
660 case NILFS_IOCTL_SYNC:
661 return nilfs_ioctl_sync(inode, filp, cmd, argp);
662 default:
663 return -ENOTTY;
664 }
665}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 000000000000..bb78745a0e30
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,564 @@
1/*
2 * mdt.c - meta data file for NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/mpage.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include <linux/backing-dev.h>
28#include <linux/swap.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "page.h"
32#include "mdt.h"
33
34
35#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
36
37#define INIT_UNUSED_INODE_FIELDS
38
39static int
40nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
41 struct buffer_head *bh,
42 void (*init_block)(struct inode *,
43 struct buffer_head *, void *))
44{
45 struct nilfs_inode_info *ii = NILFS_I(inode);
46 void *kaddr;
47 int ret;
48
49	/* The caller excludes read accesses using the page lock */
50
51 /* set_buffer_new(bh); */
52 bh->b_blocknr = 0;
53
54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
55 if (unlikely(ret))
56 return ret;
57
58 set_buffer_mapped(bh);
59
60 kaddr = kmap_atomic(bh->b_page, KM_USER0);
61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
62 if (init_block)
63 init_block(inode, bh, kaddr);
64 flush_dcache_page(bh->b_page);
65 kunmap_atomic(kaddr, KM_USER0);
66
67 set_buffer_uptodate(bh);
68 nilfs_mark_buffer_dirty(bh);
69 nilfs_mdt_mark_dirty(inode);
70 return 0;
71}
72
73static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
74 struct buffer_head **out_bh,
75 void (*init_block)(struct inode *,
76 struct buffer_head *,
77 void *))
78{
79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
80 struct super_block *sb = inode->i_sb;
81 struct nilfs_transaction_info ti;
82 struct buffer_head *bh;
83 int err;
84
85 if (!sb) {
86 /*
87 * Make sure this function is not called from any
88 * read-only context.
89 */
90 if (!nilfs->ns_writer) {
91 WARN_ON(1);
92 err = -EROFS;
93 goto out;
94 }
95 sb = nilfs->ns_writer->s_super;
96 }
97
98 nilfs_transaction_begin(sb, &ti, 0);
99
100 err = -ENOMEM;
101 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
102 if (unlikely(!bh))
103 goto failed_unlock;
104
105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh))
107 goto failed_bh;
108#if 0
109 /* The uptodate flag is not protected by the page lock, but
110	   the mapped flag is. Thus, we don't have to wait for the buffer. */
111 wait_on_buffer(bh);
112 if (buffer_uptodate(bh))
113 goto failed_bh;
114#endif
115
116 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
118 if (likely(!err)) {
119 get_bh(bh);
120 *out_bh = bh;
121 }
122
123 failed_bh:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 brelse(bh);
127
128 failed_unlock:
129 if (likely(!err))
130 err = nilfs_transaction_commit(sb);
131 else
132 nilfs_transaction_abort(sb);
133 out:
134 return err;
135}
136
137static int
138nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh)
140{
141 struct buffer_head *bh;
142 unsigned long blknum = 0;
143 int ret = -ENOMEM;
144
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
146 if (unlikely(!bh))
147 goto failed;
148
149 ret = -EEXIST; /* internal code */
150 if (buffer_uptodate(bh))
151 goto out;
152
153 if (mode == READA) {
154 if (!trylock_buffer(bh)) {
155 ret = -EBUSY;
156 goto failed_bh;
157 }
158 } else /* mode == READ */
159 lock_buffer(bh);
160
161 if (buffer_uptodate(bh)) {
162 unlock_buffer(bh);
163 goto out;
164 }
165 if (!buffer_mapped(bh)) { /* unused buffer */
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
167 &blknum);
168 if (unlikely(ret)) {
169 unlock_buffer(bh);
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 }
176
177 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh);
179 submit_bh(mode, bh);
180 ret = 0;
181 out:
182 get_bh(bh);
183 *out_bh = bh;
184
185 failed_bh:
186 unlock_page(bh->b_page);
187 page_cache_release(bh->b_page);
188 brelse(bh);
189 failed:
190 return ret;
191}
192
193static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
194 struct buffer_head **out_bh)
195{
196 struct buffer_head *first_bh, *bh;
197 unsigned long blkoff;
198 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
199 int err;
200
201 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
202 if (err == -EEXIST) /* internal code */
203 goto out;
204
205 if (unlikely(err))
206 goto failed;
207
208 blkoff = block + 1;
209 for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
210 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
211 if (likely(!err || err == -EEXIST))
212 brelse(bh);
213 else if (err != -EBUSY)
214 break; /* abort readahead if bmap lookup failed */
215
216 if (!buffer_locked(first_bh))
217 goto out_no_wait;
218 }
219
220 wait_on_buffer(first_bh);
221
222 out_no_wait:
223 err = -EIO;
224 if (!buffer_uptodate(first_bh))
225 goto failed_bh;
226 out:
227 *out_bh = first_bh;
228 return 0;
229
230 failed_bh:
231 brelse(first_bh);
232 failed:
233 return err;
234}
235
236/**
237 * nilfs_mdt_get_block - read or create a buffer on meta data file.
238 * @inode: inode of the meta data file
239 * @blkoff: block offset
240 * @create: create flag
241 * @init_block: initializer used for newly allocated block
242 * @out_bh: output of a pointer to the buffer_head
243 *
244 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
245 * a new buffer if @create is nonzero.  On success, the returned buffer is
246 * assured to be either existing or newly formatted, under the buffer lock.
247 * @out_bh is substituted only when zero is returned.
248 *
249 * Return Value: On success, it returns 0. On error, one of the following
250 * negative error codes is returned.
251 *
252 * %-ENOMEM - Insufficient memory available.
253 *
254 * %-EIO - I/O error
255 *
256 * %-ENOENT - the specified block does not exist (hole block)
257 *
258 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
259 *
260 * %-EROFS - Read only filesystem (for create mode)
261 */
262int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
263 void (*init_block)(struct inode *,
264 struct buffer_head *, void *),
265 struct buffer_head **out_bh)
266{
267 int ret;
268
269 /* Should be rewritten with merging nilfs_mdt_read_block() */
270 retry:
271 ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
272 if (!create || ret != -ENOENT)
273 return ret;
274
275 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
276 if (unlikely(ret == -EEXIST)) {
277 /* create = 0; */ /* limit read-create loop retries */
278 goto retry;
279 }
280 return ret;
281}
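/*
 * A minimal usage sketch of the read-or-create semantics above; the
 * caller and my_init_block() are hypothetical, not part of this patch.
 * The initializer receives the mapped page address, so the new block
 * starts at kaddr + bh_offset(bh).  *out_bh carries an extra reference,
 * which the caller drops with brelse():
 *
 *	struct buffer_head *bh;
 *	int err = nilfs_mdt_get_block(inode, blkoff, 1, my_init_block, &bh);
 *	if (!err)
 *		brelse(bh);
 */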
282
283/**
284 * nilfs_mdt_delete_block - make a hole on the meta data file.
285 * @inode: inode of the meta data file
286 * @block: block offset
287 *
288 * Return Value: On success, zero is returned.
289 * On error, one of the following negative error codes is returned.
290 *
291 * %-ENOMEM - Insufficient memory available.
292 *
293 * %-EIO - I/O error
294 *
295 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
296 */
297int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
298{
299 struct nilfs_inode_info *ii = NILFS_I(inode);
300 int err;
301
302 err = nilfs_bmap_delete(ii->i_bmap, block);
303 if (!err || err == -ENOENT) {
304 nilfs_mdt_mark_dirty(inode);
305 nilfs_mdt_forget_block(inode, block);
306 }
307 return err;
308}
309
310/**
311 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
312 * @inode: inode of the meta data file
313 * @block: block offset
314 *
315 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer, and
316 * tries to release the page containing the buffer from the page cache.
317 *
318 * Return Value: On success, 0 is returned. On error, one of the following
319 * negative error codes is returned.
320 *
321 * %-EBUSY - page has an active buffer.
322 *
323 * %-ENOENT - page cache has no page addressed by the offset.
324 */
325int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
326{
327 pgoff_t index = (pgoff_t)block >>
328 (PAGE_CACHE_SHIFT - inode->i_blkbits);
329 struct page *page;
330 unsigned long first_block;
331 int ret = 0;
332 int still_dirty;
333
334 page = find_lock_page(inode->i_mapping, index);
335 if (!page)
336 return -ENOENT;
337
338 wait_on_page_writeback(page);
339
340 first_block = (unsigned long)index <<
341 (PAGE_CACHE_SHIFT - inode->i_blkbits);
342 if (page_has_buffers(page)) {
343 struct buffer_head *bh;
344
345 bh = nilfs_page_get_nth_block(page, block - first_block);
346 nilfs_forget_buffer(bh);
347 }
348 still_dirty = PageDirty(page);
349 unlock_page(page);
350 page_cache_release(page);
351
352 if (still_dirty ||
353 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
354 ret = -EBUSY;
355 return ret;
356}
357
358/**
359 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
360 * @inode: inode of the meta data file
361 * @block: block offset
362 *
363 * Return Value: On success, it returns 0. On error, one of the following
364 * negative error codes is returned.
365 *
366 * %-ENOMEM - Insufficient memory available.
367 *
368 * %-EIO - I/O error
369 *
370 * %-ENOENT - the specified block does not exist (hole block)
371 *
372 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
373 */
374int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
375{
376 struct buffer_head *bh;
377 int err;
378
379 err = nilfs_mdt_read_block(inode, block, &bh);
380 if (unlikely(err))
381 return err;
382 nilfs_mark_buffer_dirty(bh);
383 nilfs_mdt_mark_dirty(inode);
384 brelse(bh);
385 return 0;
386}
387
388int nilfs_mdt_fetch_dirty(struct inode *inode)
389{
390 struct nilfs_inode_info *ii = NILFS_I(inode);
391
392 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
393 set_bit(NILFS_I_DIRTY, &ii->i_state);
394 return 1;
395 }
396 return test_bit(NILFS_I_DIRTY, &ii->i_state);
397}
398
399static int
400nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
401{
402 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data);
404 struct super_block *sb = inode->i_sb;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0;
407
408 redirty_page_for_writepage(wbc, page);
409 unlock_page(page);
410
411 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
415 if (!writer)
416 return -EROFS;
417 sb = writer->s_super;
418 }
419
420 if (wbc->sync_mode == WB_SYNC_ALL)
421 err = nilfs_construct_segment(sb);
422 else if (wbc->for_reclaim)
423 nilfs_flush_segment(sb, inode->i_ino);
424
425 if (writer)
426 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
427 return err;
428}
429
430
431static struct address_space_operations def_mdt_aops = {
432 .writepage = nilfs_mdt_write_page,
433};
434
435static struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops;
437
438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
440 * ifile, or gcinodes. This allows the B-tree code and segment constructor
441 * to treat them like regular files, and this helps to simplify the
442 * implementation.
443 * On the other hand, some of the pseudo inodes are irregular in one respect:
444 * they don't have a valid inode->i_sb pointer because their lifetimes are
445 * longer than those of the super block structs; they may persist across
446 * several consecutive mounts/umounts. This still needs discussion.
447 */
448struct inode *
449nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
450 ino_t ino, gfp_t gfp_mask)
451{
452 struct inode *inode = nilfs_alloc_inode(sb);
453
454 if (!inode)
455 return NULL;
456 else {
457 struct address_space * const mapping = &inode->i_data;
458 struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
459
460 if (!mi) {
461 nilfs_destroy_inode(inode);
462 return NULL;
463 }
464 mi->mi_nilfs = nilfs;
465 init_rwsem(&mi->mi_sem);
466
467 inode->i_sb = sb; /* sb may be NULL for some meta data files */
468 inode->i_blkbits = nilfs->ns_blocksize_bits;
469 inode->i_flags = 0;
470 atomic_set(&inode->i_count, 1);
471 inode->i_nlink = 1;
472 inode->i_ino = ino;
473 inode->i_mode = S_IFREG;
474 inode->i_private = mi;
475
476#ifdef INIT_UNUSED_INODE_FIELDS
477 atomic_set(&inode->i_writecount, 0);
478 inode->i_size = 0;
479 inode->i_blocks = 0;
480 inode->i_bytes = 0;
481 inode->i_generation = 0;
482#ifdef CONFIG_QUOTA
483 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
484#endif
485 inode->i_pipe = NULL;
486 inode->i_bdev = NULL;
487 inode->i_cdev = NULL;
488 inode->i_rdev = 0;
489#ifdef CONFIG_SECURITY
490 inode->i_security = NULL;
491#endif
492 inode->dirtied_when = 0;
493
494 INIT_LIST_HEAD(&inode->i_list);
495 INIT_LIST_HEAD(&inode->i_sb_list);
496 inode->i_state = 0;
497#endif
498
499 spin_lock_init(&inode->i_lock);
500 mutex_init(&inode->i_mutex);
501 init_rwsem(&inode->i_alloc_sem);
502
503 mapping->host = NULL; /* instead of inode */
504 mapping->flags = 0;
505 mapping_set_gfp_mask(mapping, gfp_mask);
506 mapping->assoc_mapping = NULL;
507 mapping->backing_dev_info = nilfs->ns_bdi;
508
509 inode->i_mapping = mapping;
510 }
511
512 return inode;
513}
514
515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
516 ino_t ino, gfp_t gfp_mask)
517{
518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
519
520 if (!inode)
521 return NULL;
522
523 inode->i_op = &def_mdt_iops;
524 inode->i_fop = &def_mdt_fops;
525 inode->i_mapping->a_ops = &def_mdt_aops;
526 return inode;
527}
528
529void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
530 unsigned header_size)
531{
532 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
533
534 mi->mi_entry_size = entry_size;
535 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
536 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
537}
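/*
 * Worked example of the layout arithmetic above (sizes illustrative,
 * not fixed by this patch): with a 4096-byte block, entry_size == 128
 * and header_size == 64,
 *	mi_entries_per_block  = 4096 / 128 = 32
 *	mi_first_entry_offset = DIV_ROUND_UP(64, 128) = 1
 * i.e. the first entry slot is reserved for the header and slots 1..31
 * hold entries.
 */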
538
539void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
540{
541 shadow->i_mapping->assoc_mapping = orig->i_mapping;
542 NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
543 &NILFS_I(orig)->i_btnode_cache;
544}
545
546void nilfs_mdt_clear(struct inode *inode)
547{
548 struct nilfs_inode_info *ii = NILFS_I(inode);
549
550 invalidate_mapping_pages(inode->i_mapping, 0, -1);
551 truncate_inode_pages(inode->i_mapping, 0);
552
553 nilfs_bmap_clear(ii->i_bmap);
554 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
555}
556
557void nilfs_mdt_destroy(struct inode *inode)
558{
559 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
560
561 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
562 kfree(mdi);
563 nilfs_destroy_inode(inode);
564}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 000000000000..df683e0bca6a
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
1/*
2 * mdt.h - NILFS meta data file prototype and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#ifndef _NILFS_MDT_H
24#define _NILFS_MDT_H
25
26#include <linux/buffer_head.h>
27#include <linux/blockgroup_lock.h>
28#include "nilfs.h"
29#include "page.h"
30
31/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block
39 * @mi_blocks_per_group: number of blocks in a group
40 * @mi_blocks_per_desc_block: number of blocks per descriptor block
41 */
42struct nilfs_mdt_info {
43 struct the_nilfs *mi_nilfs;
44 struct rw_semaphore mi_sem;
45 struct blockgroup_lock *mi_bgl;
46 unsigned mi_entry_size;
47 unsigned mi_first_entry_offset;
48 unsigned long mi_entries_per_block;
49 unsigned long mi_blocks_per_group;
50 unsigned long mi_blocks_per_desc_block;
51};
52
53static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
54{
55 return inode->i_private;
56}
57
58static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
59{
60 struct super_block *sb = inode->i_sb;
61
62 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
63}
64
65/* Default GFP flags using highmem */
66#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
67
68int nilfs_mdt_get_block(struct inode *, unsigned long, int,
69 void (*init_block)(struct inode *,
70 struct buffer_head *, void *),
71 struct buffer_head **);
72int nilfs_mdt_delete_block(struct inode *, unsigned long);
73int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *);
76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *);
82void nilfs_mdt_clear(struct inode *);
83void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
84void nilfs_mdt_set_shadow(struct inode *, struct inode *);
85
86
87#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
88
89static inline void nilfs_mdt_mark_dirty(struct inode *inode)
90{
91 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
92 set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
93}
94
95static inline void nilfs_mdt_clear_dirty(struct inode *inode)
96{
97 clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
98}
99
100static inline __u64 nilfs_mdt_cno(struct inode *inode)
101{
102 return NILFS_MDT(inode)->mi_nilfs->ns_cno;
103}
104
105#define nilfs_mdt_bgl_lock(inode, bg) \
106 (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
107
108
109static inline int
110nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
111 unsigned n)
112{
113 return nilfs_read_inode_common(
114 inode, (struct nilfs_inode *)(bh->b_data + n));
115}
116
117static inline void
118nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
119 unsigned n)
120{
121 nilfs_write_inode_common(
122 inode, (struct nilfs_inode *)(bh->b_data + n), 1);
123}
124
125#endif /* _NILFS_MDT_H */
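As a rough sketch of how the declarations above fit together (the inode number and entry/header sizes are illustrative; the real callers live in the DAT/cpfile/sufile code, which is outside this hunk):

	struct inode *inode;

	inode = nilfs_mdt_new(nilfs, sb, ino, NILFS_MDT_GFP);
	if (inode) {
		nilfs_mdt_set_entry_size(inode, 64, 64);
		/* ... use nilfs_mdt_get_block() and friends ... */
		nilfs_mdt_destroy(inode);
	}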
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 000000000000..df70dadb336f
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
1/*
2 * namei.c - NILFS pathname lookup operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23/*
24 * linux/fs/ext2/namei.c
25 *
26 * Copyright (C) 1992, 1993, 1994, 1995
27 * Remy Card (card@masi.ibp.fr)
28 * Laboratoire MASI - Institut Blaise Pascal
29 * Universite Pierre et Marie Curie (Paris VI)
30 *
31 * from
32 *
33 * linux/fs/minix/namei.c
34 *
35 * Copyright (C) 1991, 1992 Linus Torvalds
36 *
37 * Big-endian to little-endian byte-swapping/bitmaps by
38 * David S. Miller (davem@caip.rutgers.edu), 1995
39 */
40
41#include <linux/pagemap.h>
42#include "nilfs.h"
43
44
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{
47 int err = nilfs_add_link(dentry, inode);
48 if (!err) {
49 d_instantiate(dentry, inode);
50 return 0;
51 }
52 inode_dec_link_count(inode);
53 iput(inode);
54 return err;
55}
56
57/*
58 * Methods themselves.
59 */
60
61static struct dentry *
62nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
63{
64 struct inode *inode;
65 ino_t ino;
66
67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 ino = nilfs_inode_by_name(dir, dentry);
71 inode = NULL;
72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino);
74 if (IS_ERR(inode))
75 return ERR_CAST(inode);
76 }
77 return d_splice_alias(inode, dentry);
78}
79
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct dentry dotdot;
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino)
91 return ERR_PTR(-ENOENT);
92
93 inode = nilfs_iget(child->d_inode->i_sb, ino);
94 if (IS_ERR(inode))
95 return ERR_CAST(inode);
96 return d_obtain_alias(inode);
97}
98
99/*
100 * By the time this is called, we already have created
101 * the directory cache entry for the new file, but it
102 * is so far negative - it has no inode.
103 *
104 * If the create succeeds, we fill in the inode information
105 * with d_instantiate().
106 */
107static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
108 struct nameidata *nd)
109{
110 struct inode *inode;
111 struct nilfs_transaction_info ti;
112 int err;
113
114 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
115 if (err)
116 return err;
117 inode = nilfs_new_inode(dir, mode);
118 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) {
120 inode->i_op = &nilfs_file_inode_operations;
121 inode->i_fop = &nilfs_file_operations;
122 inode->i_mapping->a_ops = &nilfs_aops;
123 mark_inode_dirty(inode);
124 err = nilfs_add_nondir(dentry, inode);
125 }
126 if (!err)
127 err = nilfs_transaction_commit(dir->i_sb);
128 else
129 nilfs_transaction_abort(dir->i_sb);
130
131 return err;
132}
133
134static int
135nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
136{
137 struct inode *inode;
138 struct nilfs_transaction_info ti;
139 int err;
140
141 if (!new_valid_dev(rdev))
142 return -EINVAL;
143
144 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
145 if (err)
146 return err;
147 inode = nilfs_new_inode(dir, mode);
148 err = PTR_ERR(inode);
149 if (!IS_ERR(inode)) {
150 init_special_inode(inode, inode->i_mode, rdev);
151 mark_inode_dirty(inode);
152 err = nilfs_add_nondir(dentry, inode);
153 }
154 if (!err)
155 err = nilfs_transaction_commit(dir->i_sb);
156 else
157 nilfs_transaction_abort(dir->i_sb);
158
159 return err;
160}
161
162static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
163 const char *symname)
164{
165 struct nilfs_transaction_info ti;
166 struct super_block *sb = dir->i_sb;
167 unsigned l = strlen(symname)+1;
168 struct inode *inode;
169 int err;
170
171 if (l > sb->s_blocksize)
172 return -ENAMETOOLONG;
173
174 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
175 if (err)
176 return err;
177
178 inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
179 err = PTR_ERR(inode);
180 if (IS_ERR(inode))
181 goto out;
182
183 /* slow symlink */
184 inode->i_op = &nilfs_symlink_inode_operations;
185 inode->i_mapping->a_ops = &nilfs_aops;
186 err = page_symlink(inode, symname, l);
187 if (err)
188 goto out_fail;
189
190 /* mark_inode_dirty(inode); */
191 /* nilfs_new_inode() and page_symlink() do this */
192
193 err = nilfs_add_nondir(dentry, inode);
194out:
195 if (!err)
196 err = nilfs_transaction_commit(dir->i_sb);
197 else
198 nilfs_transaction_abort(dir->i_sb);
199
200 return err;
201
202out_fail:
203 inode_dec_link_count(inode);
204 iput(inode);
205 goto out;
206}
207
208static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
209 struct dentry *dentry)
210{
211 struct inode *inode = old_dentry->d_inode;
212 struct nilfs_transaction_info ti;
213 int err;
214
215 if (inode->i_nlink >= NILFS_LINK_MAX)
216 return -EMLINK;
217
218 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
219 if (err)
220 return err;
221
222 inode->i_ctime = CURRENT_TIME;
223 inode_inc_link_count(inode);
224 atomic_inc(&inode->i_count);
225
226 err = nilfs_add_nondir(dentry, inode);
227 if (!err)
228 err = nilfs_transaction_commit(dir->i_sb);
229 else
230 nilfs_transaction_abort(dir->i_sb);
231
232 return err;
233}
234
235static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
236{
237 struct inode *inode;
238 struct nilfs_transaction_info ti;
239 int err;
240
241 if (dir->i_nlink >= NILFS_LINK_MAX)
242 return -EMLINK;
243
244 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
245 if (err)
246 return err;
247
248 inode_inc_link_count(dir);
249
250 inode = nilfs_new_inode(dir, S_IFDIR | mode);
251 err = PTR_ERR(inode);
252 if (IS_ERR(inode))
253 goto out_dir;
254
255 inode->i_op = &nilfs_dir_inode_operations;
256 inode->i_fop = &nilfs_dir_operations;
257 inode->i_mapping->a_ops = &nilfs_aops;
258
259 inode_inc_link_count(inode);
260
261 err = nilfs_make_empty(inode, dir);
262 if (err)
263 goto out_fail;
264
265 err = nilfs_add_link(dentry, inode);
266 if (err)
267 goto out_fail;
268
269 d_instantiate(dentry, inode);
270out:
271 if (!err)
272 err = nilfs_transaction_commit(dir->i_sb);
273 else
274 nilfs_transaction_abort(dir->i_sb);
275
276 return err;
277
278out_fail:
279 inode_dec_link_count(inode);
280 inode_dec_link_count(inode);
281 iput(inode);
282out_dir:
283 inode_dec_link_count(dir);
284 goto out;
285}
286
287static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
288{
289 struct inode *inode;
290 struct nilfs_dir_entry *de;
291 struct page *page;
292 struct nilfs_transaction_info ti;
293 int err;
294
295 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
296 if (err)
297 return err;
298
299 err = -ENOENT;
300 de = nilfs_find_entry(dir, dentry, &page);
301 if (!de)
302 goto out;
303
304 inode = dentry->d_inode;
305 err = -EIO;
306 if (le64_to_cpu(de->inode) != inode->i_ino)
307 goto out;
308
309 if (!inode->i_nlink) {
310 nilfs_warning(inode->i_sb, __func__,
311 "deleting nonexistent file (%lu), %d\n",
312 inode->i_ino, inode->i_nlink);
313 inode->i_nlink = 1;
314 }
315 err = nilfs_delete_entry(de, page);
316 if (err)
317 goto out;
318
319 inode->i_ctime = dir->i_ctime;
320 inode_dec_link_count(inode);
321 err = 0;
322out:
323 if (!err)
324 err = nilfs_transaction_commit(dir->i_sb);
325 else
326 nilfs_transaction_abort(dir->i_sb);
327
328 return err;
329}
330
331static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
332{
333 struct inode *inode = dentry->d_inode;
334 struct nilfs_transaction_info ti;
335 int err;
336
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err)
339 return err;
340
341 err = -ENOTEMPTY;
342 if (nilfs_empty_dir(inode)) {
343 err = nilfs_unlink(dir, dentry);
344 if (!err) {
345 inode->i_size = 0;
346 inode_dec_link_count(inode);
347 inode_dec_link_count(dir);
348 }
349 }
350 if (!err)
351 err = nilfs_transaction_commit(dir->i_sb);
352 else
353 nilfs_transaction_abort(dir->i_sb);
354
355 return err;
356}
357
358static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
359 struct inode *new_dir, struct dentry *new_dentry)
360{
361 struct inode *old_inode = old_dentry->d_inode;
362 struct inode *new_inode = new_dentry->d_inode;
363 struct page *dir_page = NULL;
364 struct nilfs_dir_entry *dir_de = NULL;
365 struct page *old_page;
366 struct nilfs_dir_entry *old_de;
367 struct nilfs_transaction_info ti;
368 int err;
369
370 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
371 if (unlikely(err))
372 return err;
373
374 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
376 if (!old_de)
377 goto out;
378
379 if (S_ISDIR(old_inode->i_mode)) {
380 err = -EIO;
381 dir_de = nilfs_dotdot(old_inode, &dir_page);
382 if (!dir_de)
383 goto out_old;
384 }
385
386 if (new_inode) {
387 struct page *new_page;
388 struct nilfs_dir_entry *new_de;
389
390 err = -ENOTEMPTY;
391 if (dir_de && !nilfs_empty_dir(new_inode))
392 goto out_dir;
393
394 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
396 if (!new_de)
397 goto out_dir;
398 inode_inc_link_count(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode);
400 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de)
402 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode);
404 } else {
405 if (dir_de) {
406 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir;
409 }
410 inode_inc_link_count(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) {
413 inode_dec_link_count(old_inode);
414 goto out_dir;
415 }
416 if (dir_de)
417 inode_inc_link_count(new_dir);
418 }
419
420 /*
421 * Like most other Unix systems, set the ctime for inodes on a
422 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */
425 old_inode->i_ctime = CURRENT_TIME;
426
427 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode);
429
430 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir);
433 }
434
435 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err;
437
438out_dir:
439 if (dir_de) {
440 kunmap(dir_page);
441 page_cache_release(dir_page);
442 }
443out_old:
444 kunmap(old_page);
445 page_cache_release(old_page);
446out:
447 nilfs_transaction_abort(old_dir->i_sb);
448 return err;
449}
450
451struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create,
453 .lookup = nilfs_lookup,
454 .link = nilfs_link,
455 .unlink = nilfs_unlink,
456 .symlink = nilfs_symlink,
457 .mkdir = nilfs_mkdir,
458 .rmdir = nilfs_rmdir,
459 .mknod = nilfs_mknod,
460 .rename = nilfs_rename,
461 .setattr = nilfs_setattr,
462 .permission = nilfs_permission,
463};
464
465struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission,
468};
469
470struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link,
474};
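Every method above wraps its namespace change in the same transaction pattern, distilled here as a sketch (op() stands in for the actual directory change; the third argument to nilfs_transaction_begin() varies per operation):

	struct nilfs_transaction_info ti;
	int err;

	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
	if (err)
		return err;
	err = op();			/* e.g. add or remove an entry */
	if (!err)
		err = nilfs_transaction_commit(dir->i_sb);
	else
		nilfs_transaction_abort(dir->i_sb);
	return err;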
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000000..da6fc0bba2e5
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,314 @@
1/*
2 * nilfs.h - NILFS local header file.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_H
25#define _NILFS_H
26
27#include <linux/kernel.h>
28#include <linux/buffer_head.h>
29#include <linux/spinlock.h>
30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h"
35#include "bmap_union.h"
36
37/*
38 * nilfs inode data in memory
39 */
40struct nilfs_inode_info {
41 __u32 i_flags;
42 unsigned long i_state; /* Dynamic state flags */
43 struct nilfs_bmap *i_bmap;
44 union nilfs_bmap_union i_bmap_union;
45 __u64 i_xattr; /* sector_t ??? */
46 __u32 i_dir_start_lookup;
47 __u64 i_cno; /* check point number for GC inode */
48 struct address_space i_btnode_cache;
49 struct list_head i_dirty; /* List for connecting dirty files */
50
51#ifdef CONFIG_NILFS_XATTR
52 /*
53 * Extended attributes can be read independently of the main file
54 * data. Taking i_sem even when reading would cause contention
55 * between readers of EAs and writers of regular file data, so
56 * instead we synchronize on xattr_sem when reading or changing
57 * EAs.
58 */
59 struct rw_semaphore xattr_sem;
60#endif
61#ifdef CONFIG_NILFS_POSIX_ACL
62 struct posix_acl *i_acl;
63 struct posix_acl *i_default_acl;
64#endif
65 struct buffer_head *i_bh; /* i_bh contains a new or dirty
66 disk inode */
67 struct inode vfs_inode;
68};
69
70static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
71{
72 return container_of(inode, struct nilfs_inode_info, vfs_inode);
73}
74
75static inline struct nilfs_inode_info *
76NILFS_BMAP_I(const struct nilfs_bmap *bmap)
77{
78 return container_of((union nilfs_bmap_union *)bmap,
79 struct nilfs_inode_info,
80 i_bmap_union);
81}
82
83static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
84{
85 struct nilfs_inode_info *ii =
86 container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
87 return &ii->vfs_inode;
88}
89
90static inline struct inode *NILFS_AS_I(struct address_space *mapping)
91{
92 return (mapping->host) ? :
93 container_of(mapping, struct inode, i_data);
94}
95
96/*
97 * Dynamic state flags of NILFS on-memory inode (i_state)
98 */
99enum {
100 NILFS_I_NEW = 0, /* Inode is newly created */
101 NILFS_I_DIRTY, /* The file is dirty */
102 NILFS_I_QUEUED, /* inode is in dirty_files list */
103 NILFS_I_BUSY, /* inode is grabbed by a segment
104 constructor */
105 NILFS_I_COLLECTED, /* All dirty blocks are collected */
106 NILFS_I_UPDATED, /* The file has been written back */
107 NILFS_I_INODE_DIRTY, /* write_inode is requested */
108 NILFS_I_BMAP, /* has bmap and btnode_cache */
109 NILFS_I_GCINODE, /* inode for GC, on memory only */
110 NILFS_I_GCDAT, /* shadow DAT, on memory only */
111};
112
113/*
114 * Macros to check inode numbers
115 */
116#define NILFS_MDT_INO_BITS \
117 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
118 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
119 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
120
121#define NILFS_SYS_INO_BITS \
122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
123
124#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
125
126#define NILFS_MDT_INODE(sb, ino) \
127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
128#define NILFS_VALID_INODE(sb, ino) \
129 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
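/*
 * Reading aid for the macros above (illustrative): a reserved metadata
 * inode number m satisfies NILFS_MDT_INODE(sb, m) when m is below
 * ns_first_ino and bit m is set in NILFS_MDT_INO_BITS; any ordinary
 * inode number at or above ns_first_ino is accepted by the first
 * branch of NILFS_VALID_INODE().
 */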
130
131/**
132 * struct nilfs_transaction_info: context information for synchronization
133 * @ti_magic: Magic number
134 * @ti_save: Backup of journal_info field of task_struct
135 * @ti_flags: Flags
136 * @ti_count: Nest level
137 * @ti_garbage: List of inodes to be put when releasing the semaphore
138 */
139struct nilfs_transaction_info {
140 u32 ti_magic;
141 void *ti_save;
142	/* This should never be used. If it is,
143	   one of the other filesystems has a bug. */
144 unsigned short ti_flags;
145 unsigned short ti_count;
146 struct list_head ti_garbage;
147};
148
149/* ti_magic */
150#define NILFS_TI_MAGIC 0xd9e392fb
151
152/* ti_flags */
153#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
154#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
155 end of transaction. */
156#define NILFS_TI_GC 0x0004 /* GC context */
157#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
158#define NILFS_TI_WRITER 0x0010 /* Constructor context */
159
160
161int nilfs_transaction_begin(struct super_block *,
162 struct nilfs_transaction_info *, int);
163int nilfs_transaction_commit(struct super_block *);
164void nilfs_transaction_abort(struct super_block *);
165
166static inline void nilfs_set_transaction_flag(unsigned int flag)
167{
168 struct nilfs_transaction_info *ti = current->journal_info;
169
170 ti->ti_flags |= flag;
171}
172
173static inline int nilfs_test_transaction_flag(unsigned int flag)
174{
175 struct nilfs_transaction_info *ti = current->journal_info;
176
177 if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
178 return 0;
179 return !!(ti->ti_flags & flag);
180}
181
182static inline int nilfs_doing_gc(void)
183{
184 return nilfs_test_transaction_flag(NILFS_TI_GC);
185}
186
187static inline int nilfs_doing_construction(void)
188{
189 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
190}
191
192static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
193{
194 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
195}
196
197/*
198 * function prototype
199 */
200#ifdef CONFIG_NILFS_POSIX_ACL
201#error "NILFS: not yet supported POSIX ACL"
202extern int nilfs_permission(struct inode *, int, struct nameidata *);
203extern int nilfs_acl_chmod(struct inode *);
204extern int nilfs_init_acl(struct inode *, struct inode *);
205#else
206#define nilfs_permission NULL
207
208static inline int nilfs_acl_chmod(struct inode *inode)
209{
210 return 0;
211}
212
213static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
214{
215 inode->i_mode &= ~current_umask();
216 return 0;
217}
218#endif
219
220#define NILFS_ATIME_DISABLE
221
222/* dir.c */
223extern int nilfs_add_link(struct dentry *, struct inode *);
224extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
225extern int nilfs_make_empty(struct inode *, struct inode *);
226extern struct nilfs_dir_entry *
227nilfs_find_entry(struct inode *, struct dentry *, struct page **);
228extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
229extern int nilfs_empty_dir(struct inode *);
230extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
231extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
232 struct page *, struct inode *);
233
234/* file.c */
235extern int nilfs_sync_file(struct file *, struct dentry *, int);
236
237/* ioctl.c */
238long nilfs_ioctl(struct file *, unsigned int, unsigned long);
239int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
240 void **);
241
242/* inode.c */
243extern struct inode *nilfs_new_inode(struct inode *, int);
244extern void nilfs_free_inode(struct inode *);
245extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
246extern void nilfs_set_inode_flags(struct inode *);
247extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
248extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
249extern struct inode *nilfs_iget(struct super_block *, unsigned long);
250extern void nilfs_update_inode(struct inode *, struct buffer_head *);
251extern void nilfs_truncate(struct inode *);
252extern void nilfs_delete_inode(struct inode *);
253extern int nilfs_setattr(struct dentry *, struct iattr *);
254extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
255 struct buffer_head **);
256extern int nilfs_inode_dirty(struct inode *);
257extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
258 unsigned);
259extern int nilfs_mark_inode_dirty(struct inode *);
260extern void nilfs_dirty_inode(struct inode *);
261
262/* namei.c */
263extern struct dentry *nilfs_get_parent(struct dentry *);
264
265/* super.c */
266extern struct inode *nilfs_alloc_inode(struct super_block *);
267extern void nilfs_destroy_inode(struct inode *);
268extern void nilfs_error(struct super_block *, const char *, const char *, ...)
269 __attribute__ ((format (printf, 3, 4)));
270extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
271 __attribute__ ((format (printf, 3, 4)));
272extern struct nilfs_super_block *
273nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
274extern int nilfs_store_magic_and_option(struct super_block *,
275 struct nilfs_super_block *, char *);
276extern int nilfs_commit_super(struct nilfs_sb_info *, int);
277extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
278extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
279
280/* gcinode.c */
281int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
282 struct buffer_head **);
283int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
284 struct buffer_head **);
285int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
286int nilfs_init_gccache(struct the_nilfs *);
287void nilfs_destroy_gccache(struct the_nilfs *);
288void nilfs_clear_gcinode(struct inode *);
289struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
290void nilfs_remove_all_gcinode(struct the_nilfs *);
291
292/* gcdat.c */
293int nilfs_init_gcdat_inode(struct the_nilfs *);
294void nilfs_commit_gcdat_inode(struct the_nilfs *);
295void nilfs_clear_gcdat_inode(struct the_nilfs *);
296
297/*
298 * Inodes and files operations
299 */
300extern struct file_operations nilfs_dir_operations;
301extern struct inode_operations nilfs_file_inode_operations;
302extern struct file_operations nilfs_file_operations;
303extern struct address_space_operations nilfs_aops;
304extern struct inode_operations nilfs_dir_inode_operations;
305extern struct inode_operations nilfs_special_inode_operations;
306extern struct inode_operations nilfs_symlink_inode_operations;
307
308/*
309 * filesystem type
310 */
311extern struct file_system_type nilfs_fs_type;
312
313
314#endif /* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 000000000000..a2692bbc7b50
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,541 @@
1/*
2 * page.c - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#include <linux/pagemap.h>
25#include <linux/writeback.h>
26#include <linux/swap.h>
27#include <linux/bitops.h>
28#include <linux/page-flags.h>
29#include <linux/list.h>
30#include <linux/highmem.h>
31#include <linux/pagevec.h>
32#include "nilfs.h"
33#include "page.h"
34#include "mdt.h"
35
36
37#define NILFS_BUFFER_INHERENT_BITS \
38 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
39 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
40
41static struct buffer_head *
42__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
43 int blkbits, unsigned long b_state)
44
45{
46 unsigned long first_block;
47 struct buffer_head *bh;
48
49 if (!page_has_buffers(page))
50 create_empty_buffers(page, 1 << blkbits, b_state);
51
52 first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
53 bh = nilfs_page_get_nth_block(page, block - first_block);
54
55 touch_buffer(bh);
56 wait_on_buffer(bh);
57 return bh;
58}
59
60/*
61 * Since the page cache of B-tree node pages and the data page cache of pseudo
62 * inodes do not have a valid mapping->host pointer, calling
63 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
64 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
65 * To avoid this problem, the old style mark_buffer_dirty() is used instead.
66 */
67void nilfs_mark_buffer_dirty(struct buffer_head *bh)
68{
69 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
70 __set_page_dirty_nobuffers(bh->b_page);
71}
72
73struct buffer_head *nilfs_grab_buffer(struct inode *inode,
74 struct address_space *mapping,
75 unsigned long blkoff,
76 unsigned long b_state)
77{
78 int blkbits = inode->i_blkbits;
79 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
80 struct page *page, *opage;
81 struct buffer_head *bh, *obh;
82
83 page = grab_cache_page(mapping, index);
84 if (unlikely(!page))
85 return NULL;
86
87 bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
88 if (unlikely(!bh)) {
89 unlock_page(page);
90 page_cache_release(page);
91 return NULL;
92 }
93 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
94 /*
95 * Shadow page cache uses assoc_mapping to point its original
96 * page cache. The following code tries the original cache
97 * if the given cache is a shadow and it didn't hit.
98 */
99 opage = find_lock_page(mapping->assoc_mapping, index);
100 if (!opage)
101 return bh;
102
103 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
104 b_state);
105 if (buffer_uptodate(obh)) {
106 nilfs_copy_buffer(bh, obh);
107 if (buffer_dirty(obh)) {
108 nilfs_mark_buffer_dirty(bh);
109 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
110 nilfs_mdt_mark_dirty(inode);
111 }
112 }
113 brelse(obh);
114 unlock_page(opage);
115 page_cache_release(opage);
116 }
117 return bh;
118}
119
120/**
121 * nilfs_forget_buffer - discard dirty state
122 * @bh: buffer head of the buffer to be discarded; its dirty, uptodate and
123 * mapped states are cleared and its reference is dropped
124 */
125void nilfs_forget_buffer(struct buffer_head *bh)
126{
127 struct page *page = bh->b_page;
128
129 lock_buffer(bh);
130 clear_buffer_nilfs_volatile(bh);
131 clear_buffer_dirty(bh);
132 if (nilfs_page_buffers_clean(page))
133 __nilfs_clear_page_dirty(page);
134
135 clear_buffer_uptodate(bh);
136 clear_buffer_mapped(bh);
137 bh->b_blocknr = -1;
138 ClearPageUptodate(page);
139 ClearPageMappedToDisk(page);
140 unlock_buffer(bh);
141 brelse(bh);
142}
143
144/**
145 * nilfs_copy_buffer -- copy buffer data and flags
146 * @dbh: destination buffer
147 * @sbh: source buffer
148 */
149void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
150{
151 void *kaddr0, *kaddr1;
152 unsigned long bits;
153 struct page *spage = sbh->b_page, *dpage = dbh->b_page;
154 struct buffer_head *bh;
155
156 kaddr0 = kmap_atomic(spage, KM_USER0);
157 kaddr1 = kmap_atomic(dpage, KM_USER1);
158 memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
159 kunmap_atomic(kaddr1, KM_USER1);
160 kunmap_atomic(kaddr0, KM_USER0);
161
162 dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
163 dbh->b_blocknr = sbh->b_blocknr;
164 dbh->b_bdev = sbh->b_bdev;
165
166 bh = dbh;
167 bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
168 while ((bh = bh->b_this_page) != dbh) {
169 lock_buffer(bh);
170 bits &= bh->b_state;
171 unlock_buffer(bh);
172 }
173 if (bits & (1UL << BH_Uptodate))
174 SetPageUptodate(dpage);
175 else
176 ClearPageUptodate(dpage);
177 if (bits & (1UL << BH_Mapped))
178 SetPageMappedToDisk(dpage);
179 else
180 ClearPageMappedToDisk(dpage);
181}
182
183/**
184 * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
185 * @page: page to be checked
186 *
187 * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
188 * Otherwise, it returns a non-zero value.
189 */
190int nilfs_page_buffers_clean(struct page *page)
191{
192 struct buffer_head *bh, *head;
193
194 bh = head = page_buffers(page);
195 do {
196 if (buffer_dirty(bh))
197 return 0;
198 bh = bh->b_this_page;
199 } while (bh != head);
200 return 1;
201}
202
203void nilfs_page_bug(struct page *page)
204{
205 struct address_space *m;
206 unsigned long ino = 0;
207
208 if (unlikely(!page)) {
209 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
210 return;
211 }
212
213 m = page->mapping;
214 if (m) {
215 struct inode *inode = NILFS_AS_I(m);
216 if (inode != NULL)
217 ino = inode->i_ino;
218 }
219 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
220 "mapping=%p ino=%lu\n",
221 page, atomic_read(&page->_count),
222 (unsigned long long)page->index, page->flags, m, ino);
223
224 if (page_has_buffers(page)) {
225 struct buffer_head *bh, *head;
226 int i = 0;
227
228 bh = head = page_buffers(page);
229 do {
230 printk(KERN_CRIT
231 " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
232 i++, bh, atomic_read(&bh->b_count),
233 (unsigned long long)bh->b_blocknr, bh->b_state);
234 bh = bh->b_this_page;
235 } while (bh != head);
236 }
237}
238
239/**
240 * nilfs_alloc_private_page - allocate a private page with buffer heads
241 *
242 * Return Value: On success, a pointer to the allocated page is returned.
243 * On error, NULL is returned.
244 */
245struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
246 unsigned long state)
247{
248 struct buffer_head *bh, *head, *tail;
249 struct page *page;
250
251 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
252 if (unlikely(!page))
253 return NULL;
254
255 lock_page(page);
256 head = alloc_page_buffers(page, size, 0);
257 if (unlikely(!head)) {
258 unlock_page(page);
259 __free_page(page);
260 return NULL;
261 }
262
263 bh = head;
264 do {
265 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
266 tail = bh;
267 bh->b_bdev = bdev;
268 bh = bh->b_this_page;
269 } while (bh);
270
271 tail->b_this_page = head;
272 attach_page_buffers(page, head);
273
274 return page;
275}
276
277void nilfs_free_private_page(struct page *page)
278{
279 BUG_ON(!PageLocked(page));
280 BUG_ON(page->mapping);
281
282 if (page_has_buffers(page) && !try_to_free_buffers(page))
283 NILFS_PAGE_BUG(page, "failed to free page");
284
285 unlock_page(page);
286 __free_page(page);
287}
288
289/**
290 * nilfs_copy_page -- copy the page with buffers
291 * @dst: destination page
292 * @src: source page
293 * @copy_dirty: whether to copy the dirty states of the page's buffer heads
294 *
295 * This function is for both data pages and btnode pages. The dirty flag
296 * must be handled by the caller. The page must not be under I/O.
297 * Both the src and dst pages must be locked.
298 */
299static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
300{
301 struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
302 unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
303
304 BUG_ON(PageWriteback(dst));
305
306 sbh = sbufs = page_buffers(src);
307 if (!page_has_buffers(dst))
308 create_empty_buffers(dst, sbh->b_size, 0);
309
310 if (copy_dirty)
311 mask |= (1UL << BH_Dirty);
312
313 dbh = dbufs = page_buffers(dst);
314 do {
315 lock_buffer(sbh);
316 lock_buffer(dbh);
317 dbh->b_state = sbh->b_state & mask;
318 dbh->b_blocknr = sbh->b_blocknr;
319 dbh->b_bdev = sbh->b_bdev;
320 sbh = sbh->b_this_page;
321 dbh = dbh->b_this_page;
322 } while (dbh != dbufs);
323
324 copy_highpage(dst, src);
325
326 if (PageUptodate(src) && !PageUptodate(dst))
327 SetPageUptodate(dst);
328 else if (!PageUptodate(src) && PageUptodate(dst))
329 ClearPageUptodate(dst);
330 if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
331 SetPageMappedToDisk(dst);
332 else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
333 ClearPageMappedToDisk(dst);
334
335 do {
336 unlock_buffer(sbh);
337 unlock_buffer(dbh);
338 sbh = sbh->b_this_page;
339 dbh = dbh->b_this_page;
340 } while (dbh != dbufs);
341}
342
343int nilfs_copy_dirty_pages(struct address_space *dmap,
344 struct address_space *smap)
345{
346 struct pagevec pvec;
347 unsigned int i;
348 pgoff_t index = 0;
349 int err = 0;
350
351 pagevec_init(&pvec, 0);
352repeat:
353 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
354 PAGEVEC_SIZE))
355 return 0;
356
357 for (i = 0; i < pagevec_count(&pvec); i++) {
358 struct page *page = pvec.pages[i], *dpage;
359
360 lock_page(page);
361 if (unlikely(!PageDirty(page)))
362 NILFS_PAGE_BUG(page, "inconsistent dirty state");
363
364 dpage = grab_cache_page(dmap, page->index);
365 if (unlikely(!dpage)) {
366 /* No empty page is added to the page cache */
367 err = -ENOMEM;
368 unlock_page(page);
369 break;
370 }
371 if (unlikely(!page_has_buffers(page)))
372 NILFS_PAGE_BUG(page,
373 "found empty page in dat page cache");
374
375 nilfs_copy_page(dpage, page, 1);
376 __set_page_dirty_nobuffers(dpage);
377
378 unlock_page(dpage);
379 page_cache_release(dpage);
380 unlock_page(page);
381 }
382 pagevec_release(&pvec);
383 cond_resched();
384
385 if (likely(!err))
386 goto repeat;
387 return err;
388}
389
390/**
391 * nilfs_copy_back_pages -- copy back pages to the original cache from the shadow cache
392 * @dmap: destination page cache
393 * @smap: source page cache
394 *
395 * No pages must be added to the cache during this process.
396 * This must be ensured by the caller.
397 */
398void nilfs_copy_back_pages(struct address_space *dmap,
399 struct address_space *smap)
400{
401 struct pagevec pvec;
402 unsigned int i, n;
403 pgoff_t index = 0;
404 int err;
405
406 pagevec_init(&pvec, 0);
407repeat:
408 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
409 if (!n)
410 return;
411 index = pvec.pages[n - 1]->index + 1;
412
413 for (i = 0; i < pagevec_count(&pvec); i++) {
414 struct page *page = pvec.pages[i], *dpage;
415 pgoff_t offset = page->index;
416
417 lock_page(page);
418 dpage = find_lock_page(dmap, offset);
419 if (dpage) {
420 /* override existing page on the destination cache */
421 WARN_ON(PageDirty(dpage));
422 nilfs_copy_page(dpage, page, 0);
423 unlock_page(dpage);
424 page_cache_release(dpage);
425 } else {
426 struct page *page2;
427
428 /* move the page to the destination cache */
429 spin_lock_irq(&smap->tree_lock);
430 page2 = radix_tree_delete(&smap->page_tree, offset);
431 WARN_ON(page2 != page);
432
433 smap->nrpages--;
434 spin_unlock_irq(&smap->tree_lock);
435
436 spin_lock_irq(&dmap->tree_lock);
437 err = radix_tree_insert(&dmap->page_tree, offset, page);
438 if (unlikely(err < 0)) {
439 WARN_ON(err == -EEXIST);
440 page->mapping = NULL;
441 page_cache_release(page); /* for cache */
442 } else {
443 page->mapping = dmap;
444 dmap->nrpages++;
445 if (PageDirty(page))
446 radix_tree_tag_set(&dmap->page_tree,
447 offset,
448 PAGECACHE_TAG_DIRTY);
449 }
450 spin_unlock_irq(&dmap->tree_lock);
451 }
452 unlock_page(page);
453 }
454 pagevec_release(&pvec);
455 cond_resched();
456
457 goto repeat;
458}
459
460void nilfs_clear_dirty_pages(struct address_space *mapping)
461{
462 struct pagevec pvec;
463 unsigned int i;
464 pgoff_t index = 0;
465
466 pagevec_init(&pvec, 0);
467
468 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
469 PAGEVEC_SIZE)) {
470 for (i = 0; i < pagevec_count(&pvec); i++) {
471 struct page *page = pvec.pages[i];
472 struct buffer_head *bh, *head;
473
474 lock_page(page);
475 ClearPageUptodate(page);
476 ClearPageMappedToDisk(page);
477 bh = head = page_buffers(page);
478 do {
479 lock_buffer(bh);
480 clear_buffer_dirty(bh);
481 clear_buffer_nilfs_volatile(bh);
482 clear_buffer_uptodate(bh);
483 clear_buffer_mapped(bh);
484 unlock_buffer(bh);
485 bh = bh->b_this_page;
486 } while (bh != head);
487
488 __nilfs_clear_page_dirty(page);
489 unlock_page(page);
490 }
491 pagevec_release(&pvec);
492 cond_resched();
493 }
494}
495
496unsigned nilfs_page_count_clean_buffers(struct page *page,
497 unsigned from, unsigned to)
498{
499 unsigned block_start, block_end;
500 struct buffer_head *bh, *head;
501 unsigned nc = 0;
502
503 for (bh = head = page_buffers(page), block_start = 0;
504 bh != head || !block_start;
505 block_start = block_end, bh = bh->b_this_page) {
506 block_end = block_start + bh->b_size;
507 if (block_end > from && block_start < to && !buffer_dirty(bh))
508 nc++;
509 }
510 return nc;
511}
512
513/*
514 * NILFS2 needs clear_page_dirty() in the following two cases:
515 *
516 * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
517 * page dirty flags when it copies back pages from the shadow cache
518 * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
519 * (dat->{i_mapping,i_btnode_cache}).
520 *
521 * 2) Some B-tree operations like insertion or deletion may dispose buffers
522 * in dirty state, and this needs to cancel the dirty state of their pages.
523 */
524int __nilfs_clear_page_dirty(struct page *page)
525{
526 struct address_space *mapping = page->mapping;
527
528 if (mapping) {
529 spin_lock_irq(&mapping->tree_lock);
530 if (test_bit(PG_dirty, &page->flags)) {
531 radix_tree_tag_clear(&mapping->page_tree,
532 page_index(page),
533 PAGECACHE_TAG_DIRTY);
534 spin_unlock_irq(&mapping->tree_lock);
535 return clear_page_dirty_for_io(page);
536 }
537 spin_unlock_irq(&mapping->tree_lock);
538 return 0;
539 }
540 return TestClearPageDirty(page);
541}
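A condensed sketch of how the dirty-state helpers in this file pair up (illustrative only, not a verbatim caller from this patch):

	/* dirty a buffer whose page cache lacks a valid mapping->host */
	nilfs_mark_buffer_dirty(bh);
	/* ... */
	/* later, cancel the dirty state and drop the reference;
	   __nilfs_clear_page_dirty() runs internally once the page's
	   buffers are all clean */
	nilfs_forget_buffer(bh);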
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 000000000000..8abca4d1c1f8
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
1/*
2 * page.h - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#ifndef _NILFS_PAGE_H
25#define _NILFS_PAGE_H
26
27#include <linux/buffer_head.h>
28#include "nilfs.h"
29
30/*
31 * Extended buffer state bits
32 */
33enum {
34 BH_NILFS_Allocated = BH_PrivateStart,
35 BH_NILFS_Node,
36 BH_NILFS_Volatile,
37};
38
39BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
40BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
41BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
42
43
44void nilfs_mark_buffer_dirty(struct buffer_head *bh);
45int __nilfs_clear_page_dirty(struct page *);
46
47struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
48 unsigned long, unsigned long);
49void nilfs_forget_buffer(struct buffer_head *);
50void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
51int nilfs_page_buffers_clean(struct page *);
52void nilfs_page_bug(struct page *);
53struct page *nilfs_alloc_private_page(struct block_device *, int,
54 unsigned long);
55void nilfs_free_private_page(struct page *);
56
57int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
58void nilfs_copy_back_pages(struct address_space *, struct address_space *);
59void nilfs_clear_dirty_pages(struct address_space *);
60unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
61
62#define NILFS_PAGE_BUG(page, m, a...) \
63 do { nilfs_page_bug(page); BUG(); } while (0)
64
65static inline struct buffer_head *
66nilfs_page_get_nth_block(struct page *page, unsigned int count)
67{
68 struct buffer_head *bh = page_buffers(page);
69
70 while (count-- > 0)
71 bh = bh->b_this_page;
72 get_bh(bh);
73 return bh;
74}
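/*
 * Usage note (illustrative): the returned buffer carries an extra
 * reference from get_bh(), so callers pair this helper with brelse():
 *
 *	struct buffer_head *bh = nilfs_page_get_nth_block(page, 0);
 *	...
 *	brelse(bh);
 */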
75
76#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 000000000000..57afa9d24061
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,919 @@
1/*
2 * recovery.c - NILFS recovery logic
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/swap.h>
26#include <linux/crc32.h>
27#include "nilfs.h"
28#include "segment.h"
29#include "sufile.h"
30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h"
33
34/*
35 * Segment check result
36 */
37enum {
38 NILFS_SEG_VALID,
39 NILFS_SEG_NO_SUPER_ROOT,
40 NILFS_SEG_FAIL_IO,
41 NILFS_SEG_FAIL_MAGIC,
42 NILFS_SEG_FAIL_SEQ,
43 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
44 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
45 NILFS_SEG_FAIL_CHECKSUM_FULL,
46 NILFS_SEG_FAIL_CONSISTENCY,
47};
48
49/* work structure for recovery */
50struct nilfs_recovery_block {
51 ino_t ino; /* Inode number of the file that this block
52 belongs to */
53 sector_t blocknr; /* block number */
54 __u64 vblocknr; /* virtual block number */
55 unsigned long blkoff; /* File offset of the data block (per block) */
56 struct list_head list;
57};
58
59
60static int nilfs_warn_segment_error(int err)
61{
62 switch (err) {
63 case NILFS_SEG_FAIL_IO:
64 printk(KERN_WARNING
65 "NILFS warning: I/O error on loading last segment\n");
66 return -EIO;
67 case NILFS_SEG_FAIL_MAGIC:
68 printk(KERN_WARNING
69 "NILFS warning: Segment magic number invalid\n");
70 break;
71 case NILFS_SEG_FAIL_SEQ:
72 printk(KERN_WARNING
73 "NILFS warning: Sequence number mismatch\n");
74 break;
75 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
76 printk(KERN_WARNING
77 "NILFS warning: Checksum error in segment summary\n");
78 break;
79 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
80 printk(KERN_WARNING
81 "NILFS warning: Checksum error in super root\n");
82 break;
83 case NILFS_SEG_FAIL_CHECKSUM_FULL:
84 printk(KERN_WARNING
85 "NILFS warning: Checksum error in segment payload\n");
86 break;
87 case NILFS_SEG_FAIL_CONSISTENCY:
88 printk(KERN_WARNING
89 "NILFS warning: Inconsistent segment\n");
90 break;
91 case NILFS_SEG_NO_SUPER_ROOT:
92 printk(KERN_WARNING
93 "NILFS warning: No super root in the last segment\n");
94 break;
95 }
96 return -EINVAL;
97}
98
99static void store_segsum_info(struct nilfs_segsum_info *ssi,
100 struct nilfs_segment_summary *sum,
101 unsigned int blocksize)
102{
103 ssi->flags = le16_to_cpu(sum->ss_flags);
104 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
105 ssi->ctime = le64_to_cpu(sum->ss_create);
106 ssi->next = le64_to_cpu(sum->ss_next);
107 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
108 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
109 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
110
111 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
112 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
113}
114
115/**
116 * calc_crc_cont - compute the CRC of consecutive blocks
117 * @sbi: nilfs_sb_info
118 * @bhs: buffer head of start block
119 * @sum: place to store result
120 * @offset: offset in bytes within the first block
121 * @check_bytes: number of bytes to be checked
122 * @start: disk block number of the start block
123 * @nblock: number of blocks to be checked
124 */
125static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
126 u32 *sum, unsigned long offset, u64 check_bytes,
127 sector_t start, unsigned long nblock)
128{
129 unsigned long blocksize = sbi->s_super->s_blocksize;
130 unsigned long size;
131 u32 crc;
132
133 BUG_ON(offset >= blocksize);
134 check_bytes -= offset;
135 size = min_t(u64, check_bytes, blocksize - offset);
136 crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
137 (unsigned char *)bhs->b_data + offset, size);
138 if (--nblock > 0) {
139 do {
140 struct buffer_head *bh
141 = sb_bread(sbi->s_super, ++start);
142 if (!bh)
143 return -EIO;
144 check_bytes -= size;
145 size = min_t(u64, check_bytes, blocksize);
146 crc = crc32_le(crc, bh->b_data, size);
147 brelse(bh);
148 } while (--nblock > 0);
149 }
150 *sum = crc;
151 return 0;
152}
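
calc_crc_cont() folds one block at a time into a running CRC, so a checksum spanning many blocks never needs a contiguous buffer. The same folding pattern in isolation (a user-space sketch with illustrative names; the bit-at-a-time loop follows the recurrence of the kernel's crc32_le()):

	#include <stddef.h>
	#include <stdint.h>

	/* Bit-at-a-time CRC-32 with polynomial 0xedb88320, no extra
	 * pre-/post-inversion, mirroring the kernel's crc32_le(). */
	static uint32_t crc32_le_sketch(uint32_t crc, const unsigned char *p,
					size_t len)
	{
		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
		}
		return crc;
	}

	/* Fold fixed-size blocks into one running CRC, as calc_crc_cont()
	 * does with one buffer head at a time. */
	static uint32_t crc_over_blocks(const unsigned char *const blocks[],
					unsigned long nblock, size_t blocksize,
					uint32_t seed)
	{
		uint32_t crc = seed;

		for (unsigned long i = 0; i < nblock; i++)
			crc = crc32_le_sketch(crc, blocks[i], blocksize);
		return crc;
	}
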
153
154/**
155 * nilfs_read_super_root_block - read super root block
156 * @sb: super_block
157 * @sr_block: disk block number of the super root block
158 * @pbh: address of a buffer_head pointer to return super root buffer
159 * @check: CRC check flag
160 */
161int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
162 struct buffer_head **pbh, int check)
163{
164 struct buffer_head *bh_sr;
165 struct nilfs_super_root *sr;
166 u32 crc;
167 int ret;
168
169 *pbh = NULL;
170 bh_sr = sb_bread(sb, sr_block);
171 if (unlikely(!bh_sr)) {
172 ret = NILFS_SEG_FAIL_IO;
173 goto failed;
174 }
175
176 sr = (struct nilfs_super_root *)bh_sr->b_data;
177 if (check) {
178 unsigned bytes = le16_to_cpu(sr->sr_bytes);
179
180 if (bytes == 0 || bytes > sb->s_blocksize) {
181 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
182 goto failed_bh;
183 }
184 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
185 sizeof(sr->sr_sum), bytes, sr_block, 1)) {
186 ret = NILFS_SEG_FAIL_IO;
187 goto failed_bh;
188 }
189 if (crc != le32_to_cpu(sr->sr_sum)) {
190 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
191 goto failed_bh;
192 }
193 }
194 *pbh = bh_sr;
195 return 0;
196
197 failed_bh:
198 brelse(bh_sr);
199
200 failed:
201 return nilfs_warn_segment_error(ret);
202}
203
204/**
205 * load_segment_summary - read segment summary of the specified partial segment
206 * @sbi: nilfs_sb_info
207 * @pseg_start: start disk block number of partial segment
208 * @seg_seq: sequence number requested
209 * @ssi: pointer to nilfs_segsum_info struct to store information
210 * @full_check: full check flag
211 * (0: check only the segment summary CRC; 1: also check the data CRC)
212 */
213static int
214load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
215 u64 seg_seq, struct nilfs_segsum_info *ssi,
216 int full_check)
217{
218 struct buffer_head *bh_sum;
219 struct nilfs_segment_summary *sum;
220 unsigned long offset, nblock;
221 u64 check_bytes;
222 u32 crc, crc_sum;
223 int ret = NILFS_SEG_FAIL_IO;
224
225 bh_sum = sb_bread(sbi->s_super, pseg_start);
226 if (!bh_sum)
227 goto out;
228
229 sum = (struct nilfs_segment_summary *)bh_sum->b_data;
230
231 /* Check consistency of segment summary */
232 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
233 ret = NILFS_SEG_FAIL_MAGIC;
234 goto failed;
235 }
236 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
237 if (seg_seq != ssi->seg_seq) {
238 ret = NILFS_SEG_FAIL_SEQ;
239 goto failed;
240 }
241 if (full_check) {
242 offset = sizeof(sum->ss_datasum);
243 check_bytes =
244 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
245 nblock = ssi->nblocks;
246 crc_sum = le32_to_cpu(sum->ss_datasum);
247 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 } else { /* only checks segment summary */
249 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
250 check_bytes = ssi->sumbytes;
251 nblock = ssi->nsumblk;
252 crc_sum = le32_to_cpu(sum->ss_sumsum);
253 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
254 }
255
256 if (unlikely(nblock == 0 ||
257 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
258 /* This limits the number of blocks read in the CRC check */
259 ret = NILFS_SEG_FAIL_CONSISTENCY;
260 goto failed;
261 }
262 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
263 pseg_start, nblock)) {
264 ret = NILFS_SEG_FAIL_IO;
265 goto failed;
266 }
267 if (crc == crc_sum)
268 ret = 0;
269 failed:
270 brelse(bh_sum);
271 out:
272 return ret;
273}
274
275static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
276 unsigned int *offset, unsigned int bytes)
277{
278 void *ptr;
279 sector_t blocknr;
280
281 BUG_ON((*pbh)->b_size < *offset);
282 if (bytes > (*pbh)->b_size - *offset) {
283 blocknr = (*pbh)->b_blocknr;
284 brelse(*pbh);
285 *pbh = sb_bread(sb, blocknr + 1);
286 if (unlikely(!*pbh))
287 return NULL;
288 *offset = 0;
289 }
290 ptr = (*pbh)->b_data + *offset;
291 *offset += bytes;
292 return ptr;
293}
294
295static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
296 unsigned int *offset, unsigned int bytes,
297 unsigned long count)
298{
299 unsigned int rest_item_in_current_block
300 = ((*pbh)->b_size - *offset) / bytes;
301
302 if (count <= rest_item_in_current_block) {
303 *offset += bytes * count;
304 } else {
305 sector_t blocknr = (*pbh)->b_blocknr;
306 unsigned int nitem_per_block = (*pbh)->b_size / bytes;
307 unsigned int bcnt;
308
309 count -= rest_item_in_current_block;
310 bcnt = DIV_ROUND_UP(count, nitem_per_block);
311 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
312
313 brelse(*pbh);
314 *pbh = sb_bread(sb, blocknr + bcnt);
315 }
316}
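
The arithmetic above converts an item count into whole blocks to advance plus a byte offset within the destination block. A worked instance with illustrative values (a sketch assuming 4096-byte blocks and 8-byte items):

	unsigned int offset = 4080;		/* two 8-byte items still fit here */
	unsigned long count = 1030;		/* items to skip */
	unsigned int nitem_per_block = 4096 / 8;	/* 512 */

	count -= (4096 - offset) / 8;	/* 2 items finish this block; 1028 remain */
	unsigned int bcnt = (count + nitem_per_block - 1) / nitem_per_block;
					/* DIV_ROUND_UP -> 3 blocks forward */
	offset = 8 * (count - (bcnt - 1) * nitem_per_block);
					/* 32: 4 items into block blocknr + 3 */
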
317
318static int
319collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
320 struct nilfs_segsum_info *ssi,
321 struct list_head *head)
322{
323 struct buffer_head *bh;
324 unsigned int offset;
325 unsigned long nfinfo = ssi->nfinfo;
326 sector_t blocknr = sum_blocknr + ssi->nsumblk;
327 ino_t ino;
328 int err = -EIO;
329
330 if (!nfinfo)
331 return 0;
332
333 bh = sb_bread(sbi->s_super, sum_blocknr);
334 if (unlikely(!bh))
335 goto out;
336
337 offset = le16_to_cpu(
338 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
339 for (;;) {
340 unsigned long nblocks, ndatablk, nnodeblk;
341 struct nilfs_finfo *finfo;
342
343 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
344 if (unlikely(!finfo))
345 goto out;
346
347 ino = le64_to_cpu(finfo->fi_ino);
348 nblocks = le32_to_cpu(finfo->fi_nblocks);
349 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
350 nnodeblk = nblocks - ndatablk;
351
352 while (ndatablk-- > 0) {
353 struct nilfs_recovery_block *rb;
354 struct nilfs_binfo_v *binfo;
355
356 binfo = segsum_get(sbi->s_super, &bh, &offset,
357 sizeof(*binfo));
358 if (unlikely(!binfo))
359 goto out;
360
361 rb = kmalloc(sizeof(*rb), GFP_NOFS);
362 if (unlikely(!rb)) {
363 err = -ENOMEM;
364 goto out;
365 }
366 rb->ino = ino;
367 rb->blocknr = blocknr++;
368 rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
369 rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
370 /* INIT_LIST_HEAD(&rb->list); */
371 list_add_tail(&rb->list, head);
372 }
373 if (--nfinfo == 0)
374 break;
375 blocknr += nnodeblk; /* always 0 for the data sync segments */
376 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
377 nnodeblk);
378 if (unlikely(!bh))
379 goto out;
380 }
381 err = 0;
382 out:
383 brelse(bh); /* brelse(NULL) is just ignored */
384 return err;
385}
386
387static void dispose_recovery_list(struct list_head *head)
388{
389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb
391 = list_entry(head->next,
392 struct nilfs_recovery_block, list);
393 list_del(&rb->list);
394 kfree(rb);
395 }
396}
397
398void nilfs_dispose_segment_list(struct list_head *head)
399{
400 while (!list_empty(head)) {
401 struct nilfs_segment_entry *ent
402 = list_entry(head->next,
403 struct nilfs_segment_entry, list);
404 list_del(&ent->list);
405 nilfs_free_segment_entry(ent);
406 }
407}
408
409static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
410 struct nilfs_sb_info *sbi,
411 struct nilfs_recovery_info *ri)
412{
413 struct list_head *head = &ri->ri_used_segments;
414 struct nilfs_segment_entry *ent, *n;
415 struct inode *sufile = nilfs->ns_sufile;
416 __u64 segnum[4];
417 int err;
418 int i;
419
420 segnum[0] = nilfs->ns_segnum;
421 segnum[1] = nilfs->ns_nextnum;
422 segnum[2] = ri->ri_segnum;
423 segnum[3] = ri->ri_nextnum;
424
425 nilfs_attach_writer(nilfs, sbi);
426 /*
427 * Releasing the next segment of the latest super root.
428 * The next segment is invalidated by this recovery.
429 */
430 err = nilfs_sufile_free(sufile, segnum[1]);
431 if (unlikely(err))
432 goto failed;
433
434 err = -ENOMEM;
435 for (i = 1; i < 4; i++) {
436 ent = nilfs_alloc_segment_entry(segnum[i]);
437 if (unlikely(!ent))
438 goto failed;
439 list_add_tail(&ent->list, head);
440 }
441
442 /*
443 * Collecting segments written after the latest super root.
444 * These are marked dirty to avoid being reallocated in the next write.
445 */
446 list_for_each_entry_safe(ent, n, head, list) {
447 if (ent->segnum != segnum[0]) {
448 err = nilfs_sufile_scrap(sufile, ent->segnum);
449 if (unlikely(err))
450 goto failed;
451 }
452 list_del(&ent->list);
453 nilfs_free_segment_entry(ent);
454 }
455
456 /* Allocate new segments for recovery */
457 err = nilfs_sufile_alloc(sufile, &segnum[0]);
458 if (unlikely(err))
459 goto failed;
460
461 nilfs->ns_pseg_offset = 0;
462 nilfs->ns_seg_seq = ri->ri_seq + 2;
463 nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
464
465 failed:
466 /* No need to recover sufile because it will be destroyed on error */
467 nilfs_detach_writer(nilfs, sbi);
468 return err;
469}
470
471static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
472 struct nilfs_recovery_block *rb,
473 struct page *page)
474{
475 struct buffer_head *bh_org;
476 void *kaddr;
477
478 bh_org = sb_bread(sbi->s_super, rb->blocknr);
479 if (unlikely(!bh_org))
480 return -EIO;
481
482 kaddr = kmap_atomic(page, KM_USER0);
483 memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
484 kunmap_atomic(kaddr, KM_USER0);
485 brelse(bh_org);
486 return 0;
487}
488
489static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
490 struct list_head *head,
491 unsigned long *nr_salvaged_blocks)
492{
493 struct inode *inode;
494 struct nilfs_recovery_block *rb, *n;
495 unsigned blocksize = sbi->s_super->s_blocksize;
496 struct page *page;
497 loff_t pos;
498 int err = 0, err2 = 0;
499
500 list_for_each_entry_safe(rb, n, head, list) {
501 inode = nilfs_iget(sbi->s_super, rb->ino);
502 if (IS_ERR(inode)) {
503 err = PTR_ERR(inode);
504 inode = NULL;
505 goto failed_inode;
506 }
507
508 pos = rb->blkoff << inode->i_blkbits;
509 page = NULL;
510 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
511 0, &page, NULL, nilfs_get_block);
512 if (unlikely(err))
513 goto failed_inode;
514
515 err = nilfs_recovery_copy_block(sbi, rb, page);
516 if (unlikely(err))
517 goto failed_page;
518
519 err = nilfs_set_file_dirty(sbi, inode, 1);
520 if (unlikely(err))
521 goto failed_page;
522
523 block_write_end(NULL, inode->i_mapping, pos, blocksize,
524 blocksize, page, NULL);
525
526 unlock_page(page);
527 page_cache_release(page);
528
529 (*nr_salvaged_blocks)++;
530 goto next;
531
532 failed_page:
533 unlock_page(page);
534 page_cache_release(page);
535
536 failed_inode:
537 printk(KERN_WARNING
538 "NILFS warning: error recovering data block "
539 "(err=%d, ino=%lu, block-offset=%llu)\n",
540 err, rb->ino, (unsigned long long)rb->blkoff);
541 if (!err2)
542 err2 = err;
543 next:
544 iput(inode); /* iput(NULL) is just ignored */
545 list_del_init(&rb->list);
546 kfree(rb);
547 }
548 return err2;
549}
550
551/**
552 * nilfs_do_roll_forward - salvage logical segments newer than the latest
553 * checkpoint
554 * @nilfs: the_nilfs
555 * @sbi: nilfs_sb_info
556 * @ri: pointer to a nilfs_recovery_info
557 */
558static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
559 struct nilfs_sb_info *sbi,
560 struct nilfs_recovery_info *ri)
561{
562 struct nilfs_segsum_info ssi;
563 sector_t pseg_start;
564 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
565 unsigned long nsalvaged_blocks = 0;
566 u64 seg_seq;
567 __u64 segnum, nextnum = 0;
568 int empty_seg = 0;
569 int err = 0, ret;
570 LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */
571 enum {
572 RF_INIT_ST,
573 RF_DSYNC_ST, /* scanning data-sync segments */
574 };
575 int state = RF_INIT_ST;
576
577 nilfs_attach_writer(nilfs, sbi);
578 pseg_start = ri->ri_lsegs_start;
579 seg_seq = ri->ri_lsegs_start_seq;
580 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
581 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
582
583 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
584
585 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
586 if (ret) {
587 if (ret == NILFS_SEG_FAIL_IO) {
588 err = -EIO;
589 goto failed;
590 }
591 goto strayed;
592 }
593 if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
594 goto confused;
595
596 /* Found a valid partial segment; do recovery actions */
597 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
598 empty_seg = 0;
599 nilfs->ns_ctime = ssi.ctime;
600 if (!(ssi.flags & NILFS_SS_GC))
601 nilfs->ns_nongc_ctime = ssi.ctime;
602
603 switch (state) {
604 case RF_INIT_ST:
605 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
606 goto try_next_pseg;
607 state = RF_DSYNC_ST;
608 /* Fall through */
609 case RF_DSYNC_ST:
610 if (!NILFS_SEG_DSYNC(&ssi))
611 goto confused;
612
613 err = collect_blocks_from_segsum(
614 sbi, pseg_start, &ssi, &dsync_blocks);
615 if (unlikely(err))
616 goto failed;
617 if (NILFS_SEG_LOGEND(&ssi)) {
618 err = recover_dsync_blocks(
619 sbi, &dsync_blocks, &nsalvaged_blocks);
620 if (unlikely(err))
621 goto failed;
622 state = RF_INIT_ST;
623 }
624 break; /* Fall through to try_next_pseg */
625 }
626
627 try_next_pseg:
628 if (pseg_start == ri->ri_lsegs_end)
629 break;
630 pseg_start += ssi.nblocks;
631 if (pseg_start < seg_end)
632 continue;
633 goto feed_segment;
634
635 strayed:
636 if (pseg_start == ri->ri_lsegs_end)
637 break;
638
639 feed_segment:
640 /* Looking to the next full segment */
641 if (empty_seg++)
642 break;
643 seg_seq++;
644 segnum = nextnum;
645 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
646 pseg_start = seg_start;
647 }
648
649 if (nsalvaged_blocks) {
650 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
651 sbi->s_super->s_id, nsalvaged_blocks);
652 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
653 }
654 out:
655 dispose_recovery_list(&dsync_blocks);
656 nilfs_detach_writer(sbi->s_nilfs, sbi);
657 return err;
658
659 confused:
660 err = -EINVAL;
661 failed:
662 printk(KERN_ERR
663 "NILFS (device %s): Error roll-forwarding "
664 "(err=%d, pseg block=%llu). ",
665 sbi->s_super->s_id, err, (unsigned long long)pseg_start);
666 goto out;
667}
668
669static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
670 struct nilfs_sb_info *sbi,
671 struct nilfs_recovery_info *ri)
672{
673 struct buffer_head *bh;
674 int err;
675
676 if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
677 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
678 return;
679
680 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
681 BUG_ON(!bh);
682 memset(bh->b_data, 0, bh->b_size);
683 set_buffer_dirty(bh);
684 err = sync_dirty_buffer(bh);
685 if (unlikely(err))
686 printk(KERN_WARNING
687 "NILFS warning: buffer sync write failed during "
688 "post-cleaning of recovery.\n");
689 brelse(bh);
690}
691
692/**
693 * nilfs_recover_logical_segments - salvage logical segments written after
694 * the latest super root
695 * @nilfs: the_nilfs
696 * @sbi: nilfs_sb_info
697 * @ri: pointer to a nilfs_recovery_info struct to store search results.
698 *
699 * Return Value: On success, 0 is returned. On error, one of the following
700 * negative error codes is returned.
701 *
702 * %-EINVAL - Inconsistent filesystem state.
703 *
704 * %-EIO - I/O error
705 *
706 * %-ENOSPC - No space left on device (only in a panic state).
707 *
708 * %-ERESTARTSYS - Interrupted.
709 *
710 * %-ENOMEM - Insufficient memory available.
711 */
712int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
713 struct nilfs_sb_info *sbi,
714 struct nilfs_recovery_info *ri)
715{
716 int err;
717
718 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
719 return 0;
720
721 err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
722 if (unlikely(err)) {
723 printk(KERN_ERR
724 "NILFS: error loading the latest checkpoint.\n");
725 return err;
726 }
727
728 err = nilfs_do_roll_forward(nilfs, sbi, ri);
729 if (unlikely(err))
730 goto failed;
731
732 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
733 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri);
734 if (unlikely(err)) {
735 printk(KERN_ERR "NILFS: Error preparing segments for "
736 "recovery.\n");
737 goto failed;
738 }
739
740 err = nilfs_attach_segment_constructor(sbi);
741 if (unlikely(err))
742 goto failed;
743
744 set_nilfs_discontinued(nilfs);
745 err = nilfs_construct_segment(sbi->s_super);
746 nilfs_detach_segment_constructor(sbi);
747
748 if (unlikely(err)) {
749 printk(KERN_ERR "NILFS: Oops! recovery failed. "
750 "(err=%d)\n", err);
751 goto failed;
752 }
753
754 nilfs_finish_roll_forward(nilfs, sbi, ri);
755 }
756
757 nilfs_detach_checkpoint(sbi);
758 return 0;
759
760 failed:
761 nilfs_detach_checkpoint(sbi);
762 nilfs_mdt_clear(nilfs->ns_cpfile);
763 nilfs_mdt_clear(nilfs->ns_sufile);
764 nilfs_mdt_clear(nilfs->ns_dat);
765 return err;
766}
767
768/**
769 * nilfs_search_super_root - search the latest valid super root
770 * @nilfs: the_nilfs
771 * @sbi: nilfs_sb_info
772 * @ri: pointer to a nilfs_recovery_info struct to store search results.
773 *
774 * nilfs_search_super_root() looks for the latest super root from a partial
775 * segment pointed to by the super block. It sets up struct the_nilfs through
776 * this search. It fills in the nilfs_recovery_info (ri) required for recovery.
777 *
778 * Return Value: On success, 0 is returned. On error, one of the following
779 * negative error codes is returned.
780 *
781 * %-EINVAL - No valid segment found
782 *
783 * %-EIO - I/O error
784 */
785int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
786 struct nilfs_recovery_info *ri)
787{
788 struct nilfs_segsum_info ssi;
789 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
790 sector_t seg_start, seg_end; /* range of full segment (block number) */
791 u64 seg_seq;
792 __u64 segnum, nextnum = 0;
793 __u64 cno;
794 struct nilfs_segment_entry *ent;
795 LIST_HEAD(segments);
796 int empty_seg = 0, scan_newer = 0;
797 int ret;
798
799 pseg_start = nilfs->ns_last_pseg;
800 seg_seq = nilfs->ns_last_seq;
801 cno = nilfs->ns_last_cno;
802 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
803
804 /* Calculate range of segment */
805 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
806
807 for (;;) {
808 /* Load segment summary */
809 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
810 if (ret) {
811 if (ret == NILFS_SEG_FAIL_IO)
812 goto failed;
813 goto strayed;
814 }
815 pseg_end = pseg_start + ssi.nblocks - 1;
816 if (unlikely(pseg_end > seg_end)) {
817 ret = NILFS_SEG_FAIL_CONSISTENCY;
818 goto strayed;
819 }
820
821 /* A valid partial segment */
822 ri->ri_pseg_start = pseg_start;
823 ri->ri_seq = seg_seq;
824 ri->ri_segnum = segnum;
825 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
826 ri->ri_nextnum = nextnum;
827 empty_seg = 0;
828
829 if (!NILFS_SEG_HAS_SR(&ssi)) {
830 if (!scan_newer) {
831 /* This will never happen because a superblock
832 (last_segment) always points to a pseg
833 having a super root. */
834 ret = NILFS_SEG_FAIL_CONSISTENCY;
835 goto failed;
836 }
837 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
838 ri->ri_lsegs_start = pseg_start;
839 ri->ri_lsegs_start_seq = seg_seq;
840 }
841 if (NILFS_SEG_LOGEND(&ssi))
842 ri->ri_lsegs_end = pseg_start;
843 goto try_next_pseg;
844 }
845
846 /* A valid super root was found. */
847 ri->ri_cno = cno++;
848 ri->ri_super_root = pseg_end;
849 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
850
851 nilfs_dispose_segment_list(&segments);
852 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
853 + ssi.nblocks - seg_start;
854 nilfs->ns_seg_seq = seg_seq;
855 nilfs->ns_segnum = segnum;
856 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
857 nilfs->ns_ctime = ssi.ctime;
858 nilfs->ns_nextnum = nextnum;
859
860 if (scan_newer)
861 ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
862 else {
863 if (nilfs->ns_mount_state & NILFS_VALID_FS)
864 goto super_root_found;
865 scan_newer = 1;
866 }
867
868 /* reset region for roll-forward */
869 pseg_start += ssi.nblocks;
870 if (pseg_start < seg_end)
871 continue;
872 goto feed_segment;
873
874 try_next_pseg:
874 /* Still on course, or an inconsistent state was encountered */
876 pseg_start += ssi.nblocks;
877 if (pseg_start < seg_end)
878 continue;
879 goto feed_segment;
880
881 strayed:
882 /* Off the trail */
883 if (!scan_newer)
884 /*
885 * This can happen if a checkpoint was written without
886 * barriers, or as a result of an I/O failure.
887 */
888 goto failed;
889
890 feed_segment:
891 /* Looking to the next full segment */
892 if (empty_seg++)
893 goto super_root_found; /* found a valid super root */
894
895 ent = nilfs_alloc_segment_entry(segnum);
896 if (unlikely(!ent)) {
897 ret = -ENOMEM;
898 goto failed;
899 }
900 list_add_tail(&ent->list, &segments);
901
902 seg_seq++;
903 segnum = nextnum;
904 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
905 pseg_start = seg_start;
906 }
907
908 super_root_found:
909 /* Updating pointers relating to the latest checkpoint */
910 list_splice(&segments, ri->ri_used_segments.prev);
911 nilfs->ns_last_pseg = sr_pseg_start;
912 nilfs->ns_last_seq = nilfs->ns_seg_seq;
913 nilfs->ns_last_cno = ri->ri_cno;
914 return 0;
915
916 failed:
917 nilfs_dispose_segment_list(&segments);
918 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
919}
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 000000000000..adccd4fc654e
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs;
39struct nilfs_sc_info;
40
41/*
42 * NILFS super-block data in memory
43 */
44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */
51 unsigned long s_mount_opt;
52 uid_t s_resuid;
53 gid_t s_resgid;
54
55 unsigned long s_interval; /* construction interval */
56 unsigned long s_watermark; /* threshold of data amount
57 for the segment construction */
58
59 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63
64 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */
66 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
67 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
68 It protects the s_dirty_files list */
69
70 /* Metadata files */
71 struct inode *s_ifile; /* index file inode */
72
73 /* Inode allocator */
74 spinlock_t s_next_gen_lock;
75 u32 s_next_generation;
76};
77
78static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
79{
80 return sb->s_fs_info;
81}
82
83static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
84{
85 return sbi->s_sc_info;
86}
87
88/*
89 * Bit operations for the mount option
90 */
91#define nilfs_clear_opt(sbi, opt) \
92 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
93#define nilfs_set_opt(sbi, opt) \
94 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
95#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
96#define nilfs_write_opt(sbi, mask, opt) \
97 do { (sbi)->s_mount_opt = \
98 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
99 NILFS_MOUNT_##opt); \
100 } while (0)
101
102#endif /* _NILFS_SB */
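
The nilfs_*_opt() macros above paste the given name onto the NILFS_MOUNT_ prefix via token concatenation, so call sites spell only the option suffix. An illustrative use (the EXAMPLE option bit is hypothetical):

	if (!nilfs_test_opt(sbi, EXAMPLE))	/* s_mount_opt & NILFS_MOUNT_EXAMPLE */
		nilfs_set_opt(sbi, EXAMPLE);	/* sets the NILFS_MOUNT_EXAMPLE bit */
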
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000000..1e68821b4a9b
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
1/*
2 * segbuf.c - NILFS segment buffer
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/writeback.h>
26#include <linux/crc32.h>
27#include "page.h"
28#include "segbuf.h"
29#include "seglist.h"
30
31
32static struct kmem_cache *nilfs_segbuf_cachep;
33
34static void nilfs_segbuf_init_once(void *obj)
35{
36 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
37}
38
39int __init nilfs_init_segbuf_cache(void)
40{
41 nilfs_segbuf_cachep =
42 kmem_cache_create("nilfs2_segbuf_cache",
43 sizeof(struct nilfs_segment_buffer),
44 0, SLAB_RECLAIM_ACCOUNT,
45 nilfs_segbuf_init_once);
46
47 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
48}
49
50void nilfs_destroy_segbuf_cache(void)
51{
52 kmem_cache_destroy(nilfs_segbuf_cachep);
53}
54
55struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
56{
57 struct nilfs_segment_buffer *segbuf;
58
59 segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
60 if (unlikely(!segbuf))
61 return NULL;
62
63 segbuf->sb_super = sb;
64 INIT_LIST_HEAD(&segbuf->sb_list);
65 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
66 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
67 return segbuf;
68}
69
70void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
71{
72 kmem_cache_free(nilfs_segbuf_cachep, segbuf);
73}
74
75void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
76 unsigned long offset, struct the_nilfs *nilfs)
77{
78 segbuf->sb_segnum = segnum;
79 nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
80 &segbuf->sb_fseg_end);
81
82 segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
83 segbuf->sb_rest_blocks =
84 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
85}
86
87void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
88 __u64 nextnum, struct the_nilfs *nilfs)
89{
90 segbuf->sb_nextnum = nextnum;
91 segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
92}
93
94int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
95{
96 struct buffer_head *bh;
97
98 bh = sb_getblk(segbuf->sb_super,
99 segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
100 if (unlikely(!bh))
101 return -ENOMEM;
102
103 nilfs_segbuf_add_segsum_buffer(segbuf, bh);
104 return 0;
105}
106
107int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111
112 bh = sb_getblk(segbuf->sb_super,
113 segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
114 if (unlikely(!bh))
115 return -ENOMEM;
116
117 nilfs_segbuf_add_payload_buffer(segbuf, bh);
118 *bhp = bh;
119 return 0;
120}
121
122int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
123 time_t ctime)
124{
125 int err;
126
127 segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
128 err = nilfs_segbuf_extend_segsum(segbuf);
129 if (unlikely(err))
130 return err;
131
132 segbuf->sb_sum.flags = flags;
133 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
134 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
135 segbuf->sb_sum.ctime = ctime;
136
137 segbuf->sb_io_error = 0;
138 return 0;
139}
140
141/*
142 * Set up segment summary
143 */
144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
145{
146 struct nilfs_segment_summary *raw_sum;
147 struct buffer_head *bh_sum;
148
149 bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
150 struct buffer_head, b_assoc_buffers);
151 raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
152
153 raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC);
154 raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum));
155 raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags);
156 raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq);
157 raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime);
158 raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next);
159 raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks);
160 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
161 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
162 raw_sum->ss_pad = 0;
163}
164
165/*
166 * CRC calculation routines
167 */
168void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
169 u32 seed)
170{
171 struct buffer_head *bh;
172 struct nilfs_segment_summary *raw_sum;
173 unsigned long size, bytes = segbuf->sb_sum.sumbytes;
174 u32 crc;
175
176 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
177 b_assoc_buffers);
178
179 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
180 size = min_t(unsigned long, bytes, bh->b_size);
181 crc = crc32_le(seed,
182 (unsigned char *)raw_sum +
183 sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
184 size - (sizeof(raw_sum->ss_datasum) +
185 sizeof(raw_sum->ss_sumsum)));
186
187 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
188 b_assoc_buffers) {
189 bytes -= size;
190 size = min_t(unsigned long, bytes, bh->b_size);
191 crc = crc32_le(crc, bh->b_data, size);
192 }
193 raw_sum->ss_sumsum = cpu_to_le32(crc);
194}
195
196void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
197 u32 seed)
198{
199 struct buffer_head *bh;
200 struct nilfs_segment_summary *raw_sum;
201 void *kaddr;
202 u32 crc;
203
204 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
205 b_assoc_buffers);
206 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
207 crc = crc32_le(seed,
208 (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
209 bh->b_size - sizeof(raw_sum->ss_datasum));
210
211 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
212 b_assoc_buffers) {
213 crc = crc32_le(crc, bh->b_data, bh->b_size);
214 }
215 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
216 kaddr = kmap_atomic(bh->b_page, KM_USER0);
217 crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
218 kunmap_atomic(kaddr, KM_USER0);
219 }
220 raw_sum->ss_datasum = cpu_to_le32(crc);
221}
222
223void nilfs_release_buffers(struct list_head *list)
224{
225 struct buffer_head *bh, *n;
226
227 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
228 list_del_init(&bh->b_assoc_buffers);
229 if (buffer_nilfs_allocated(bh)) {
230 struct page *clone_page = bh->b_page;
231
232 /* remove clone page */
233 brelse(bh);
234 page_cache_release(clone_page); /* for each bh */
235 if (page_count(clone_page) <= 2) {
236 lock_page(clone_page);
237 nilfs_free_private_page(clone_page);
238 }
239 continue;
240 }
241 brelse(bh);
242 }
243}
244
245/*
246 * BIO operations
247 */
248static void nilfs_end_bio_write(struct bio *bio, int err)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct nilfs_write_info *wi = bio->bi_private;
252
253 if (err == -EOPNOTSUPP) {
254 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
255 bio_put(bio);
256 /* to be detected by nilfs_submit_seg_bio() */
257 }
258
259 if (!uptodate)
260 atomic_inc(&wi->err);
261
262 bio_put(bio);
263 complete(&wi->bio_event);
264}
265
266static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
267{
268 struct bio *bio = wi->bio;
269 int err;
270
271 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
272 wait_for_completion(&wi->bio_event);
273 wi->nbio--;
274 if (unlikely(atomic_read(&wi->err))) {
275 bio_put(bio);
276 err = -EIO;
277 goto failed;
278 }
279 }
280
281 bio->bi_end_io = nilfs_end_bio_write;
282 bio->bi_private = wi;
283 bio_get(bio);
284 submit_bio(mode, bio);
285 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
286 bio_put(bio);
287 err = -EOPNOTSUPP;
288 goto failed;
289 }
290 wi->nbio++;
291 bio_put(bio);
292
293 wi->bio = NULL;
294 wi->rest_blocks -= wi->end - wi->start;
295 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
296 wi->start = wi->end;
297 return 0;
298
299 failed:
300 wi->bio = NULL;
301 return err;
302}
303
304/**
305 * nilfs_alloc_seg_bio - allocate a bio for writing a segment.
306 * @sb: super block
307 * @start: beginning disk block number of this BIO.
308 * @nr_vecs: requested size of the page vector.
309 *
310 * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
311 *
312 * Return Value: On success, a pointer to the struct bio is returned.
313 * On error, NULL is returned.
314 */
315static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316 int nr_vecs)
317{
318 struct bio *bio;
319
320 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
321 if (bio == NULL) {
322 while (!bio && (nr_vecs >>= 1))
323 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
324 }
325 if (likely(bio)) {
326 bio->bi_bdev = sb->s_bdev;
327 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
328 }
329 return bio;
330}
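
The halving loop above is a graceful-degradation pattern: under memory pressure a smaller bio is better than none, so the requested vector count is halved until an allocation succeeds or the count reaches zero. The same back-off in isolation (a user-space sketch, not kernel API):

	#include <stdlib.h>

	/* Retry an allocation with a progressively halved element count. */
	static void *alloc_with_backoff(size_t nr, size_t size, size_t *granted)
	{
		void *p = malloc(nr * size);

		while (!p && (nr >>= 1))
			p = malloc(nr * size);
		*granted = p ? nr : 0;
		return p;
	}
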
331
332void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
333 struct nilfs_write_info *wi)
334{
335 wi->bio = NULL;
336 wi->rest_blocks = segbuf->sb_sum.nblocks;
337 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
338 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
339 wi->start = wi->end = 0;
340 wi->nbio = 0;
341 wi->blocknr = segbuf->sb_pseg_start;
342
343 atomic_set(&wi->err, 0);
344 init_completion(&wi->bio_event);
345}
346
347static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
348 int mode)
349{
350 int len, err;
351
352 BUG_ON(wi->nr_vecs <= 0);
353 repeat:
354 if (!wi->bio) {
355 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
356 wi->nr_vecs);
357 if (unlikely(!wi->bio))
358 return -ENOMEM;
359 }
360
361 len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (len == bh->b_size) {
363 wi->end++;
364 return 0;
365 }
366 /* bio is FULL */
367 err = nilfs_submit_seg_bio(wi, mode);
368 /* never submit current bh */
369 if (likely(!err))
370 goto repeat;
371 return err;
372}
373
374int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
375 struct nilfs_write_info *wi)
376{
377 struct buffer_head *bh;
378 int res, rw = WRITE;
379
380 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
381 res = nilfs_submit_bh(wi, bh, rw);
382 if (unlikely(res))
383 goto failed_bio;
384 }
385
386 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
387 res = nilfs_submit_bh(wi, bh, rw);
388 if (unlikely(res))
389 goto failed_bio;
390 }
391
392 if (wi->bio) {
393 /*
394 * Last BIO is always sent through the following
395 * submission.
396 */
397 rw |= (1 << BIO_RW_SYNCIO);
398 res = nilfs_submit_seg_bio(wi, rw);
399 if (unlikely(res))
400 goto failed_bio;
401 }
402
403 res = 0;
404 out:
405 return res;
406
407 failed_bio:
408 atomic_inc(&wi->err);
409 goto out;
410}
411
412/**
413 * nilfs_segbuf_wait - wait for completion of requested BIOs
414 * @wi: nilfs_write_info
415 *
416 * Return Value: On success, 0 is returned. On error, one of the following
417 * negative error codes is returned.
418 *
419 * %-EIO - I/O error
420 */
421int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
422 struct nilfs_write_info *wi)
423{
424 int err = 0;
425
426 if (!wi->nbio)
427 return 0;
428
429 do {
430 wait_for_completion(&wi->bio_event);
431 } while (--wi->nbio > 0);
432
433 if (unlikely(atomic_read(&wi->err) > 0)) {
434 printk(KERN_ERR "NILFS: IO error writing segment\n");
435 err = -EIO;
436 segbuf->sb_io_error = 1;
437 }
438 return err;
439}
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000000..0c3076f4e592
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
1/*
2 * segbuf.h - NILFS Segment buffer prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGBUF_H
24#define _NILFS_SEGBUF_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/bio.h>
29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31
32/**
33 * struct nilfs_segsum_info - On-memory segment summary
34 * @flags: Flags
35 * @nfinfo: Number of file information structures
36 * @nblocks: Number of blocks included in the partial segment
37 * @nsumblk: Number of summary blocks
38 * @sumbytes: Byte count of segment summary
39 * @nfileblk: Total number of file blocks
40 * @seg_seq: Segment sequence number
41 * @ctime: Creation time
42 * @next: Block number of the next full segment
43 */
44struct nilfs_segsum_info {
45 unsigned int flags;
46 unsigned long nfinfo;
47 unsigned long nblocks;
48 unsigned long nsumblk;
49 unsigned long sumbytes;
50 unsigned long nfileblk;
51 u64 seg_seq;
52 time_t ctime;
53 sector_t next;
54};
55
56/* macro for the flags */
57#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
58#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
59#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
60#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
61#define NILFS_SEG_SIMPLEX(sum) \
62 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
63 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
64
65#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
66
67/**
68 * struct nilfs_segment_buffer - Segment buffer
69 * @sb_super: back pointer to a superblock struct
70 * @sb_list: List head to chain this structure
71 * @sb_sum: On-memory segment summary
72 * @sb_segnum: Index number of the full segment
73 * @sb_nextnum: Index number of the next full segment
74 * @sb_fseg_start: Start block number of the full segment
75 * @sb_fseg_end: End block number of the full segment
76 * @sb_pseg_start: Disk block number of partial segment
77 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status
81 */
82struct nilfs_segment_buffer {
83 struct super_block *sb_super;
84 struct list_head sb_list;
85
86 /* Segment information */
87 struct nilfs_segsum_info sb_sum;
88 __u64 sb_segnum;
89 __u64 sb_nextnum;
90 sector_t sb_fseg_start, sb_fseg_end;
91 sector_t sb_pseg_start;
92 unsigned sb_rest_blocks;
93
94 /* Buffers */
95 struct list_head sb_segsum_buffers;
96 struct list_head sb_payload_buffers; /* including super root */
97
98 /* io status */
99 int sb_io_error;
100};
101
102#define NILFS_LIST_SEGBUF(head) \
103 list_entry((head), struct nilfs_segment_buffer, sb_list)
104#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
105#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
106#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev)
107#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next)
108#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head))
109
110#define nilfs_for_each_segbuf_before(s, t, h) \
111 for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
112 (s) = NILFS_NEXT_SEGBUF(s))
113
114#define NILFS_SEGBUF_FIRST_BH(head) \
115 (list_entry((head)->next, struct buffer_head, b_assoc_buffers))
116#define NILFS_SEGBUF_NEXT_BH(bh) \
117 (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
118 b_assoc_buffers))
119#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
120
121
122int __init nilfs_init_segbuf_cache(void);
123void nilfs_destroy_segbuf_cache(void);
124struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
131int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
132int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
133 struct buffer_head **);
134void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
135void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
136void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
137
138static inline void
139nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
140 struct buffer_head *bh)
141{
142 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
143 segbuf->sb_sum.nblocks++;
144 segbuf->sb_sum.nsumblk++;
145}
146
147static inline void
148nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
149 struct buffer_head *bh)
150{
151 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
152 segbuf->sb_sum.nblocks++;
153}
154
155static inline void
156nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
157 struct buffer_head *bh)
158{
159 get_bh(bh);
160 nilfs_segbuf_add_payload_buffer(segbuf, bh);
161 segbuf->sb_sum.nfileblk++;
162}
163
164void nilfs_release_buffers(struct list_head *);
165
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
167{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170}
171
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 000000000000..d39df9144e99
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
1/*
2 * seglist.h - expedient structure and routines to handle a list of segments
3 * (to be removed in a future release)
4 *
5 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Written by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24#ifndef _NILFS_SEGLIST_H
25#define _NILFS_SEGLIST_H
26
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sufile.h"
31
32struct nilfs_segment_entry {
33 __u64 segnum;
34
35#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisionally.
36 It must be cancelled if
37 construction is aborted */
38
39 unsigned flags;
40 struct list_head list;
41 struct buffer_head *bh_su;
42 struct nilfs_segment_usage *raw_su;
43};
44
45
46void nilfs_dispose_segment_list(struct list_head *);
47
48static inline struct nilfs_segment_entry *
49nilfs_alloc_segment_entry(__u64 segnum)
50{
51 struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
52
53 if (likely(ent)) {
54 ent->segnum = segnum;
55 ent->flags = 0;
56 ent->bh_su = NULL;
57 ent->raw_su = NULL;
58 INIT_LIST_HEAD(&ent->list);
59 }
60 return ent;
61}
62
63static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
64 struct inode *sufile)
65{
66 return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
67 &ent->raw_su, &ent->bh_su);
68}
69
70static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
71 struct inode *sufile)
72{
73 if (!ent->bh_su)
74 return;
75 nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
76 ent->bh_su = NULL;
77 ent->raw_su = NULL;
78}
79
80static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
81{
82 kfree(ent);
83}
84
85#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 000000000000..22c7f65c2403
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2978 @@
1/*
2 * segment.c - NILFS segment constructor.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/pagemap.h>
25#include <linux/buffer_head.h>
26#include <linux/writeback.h>
27#include <linux/bio.h>
28#include <linux/completion.h>
29#include <linux/blkdev.h>
30#include <linux/backing-dev.h>
31#include <linux/freezer.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
34#include <linux/pagevec.h>
35#include "nilfs.h"
36#include "btnode.h"
37#include "page.h"
38#include "segment.h"
39#include "sufile.h"
40#include "cpfile.h"
41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h"
44
45
46/*
47 * Segment constructor
48 */
49#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
50
51#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
52 appended in collection retry loop */
53
54/* Construction mode */
55enum {
56 SC_LSEG_SR = 1, /* Make a logical segment having a super root */
57 SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
58 a logical segment without a super root */
59 SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
60 creating a checkpoint */
61 SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
62 a checkpoint */
63};
64
65/* Stage numbers of dirty block collection */
66enum {
67 NILFS_ST_INIT = 0,
68 NILFS_ST_GC, /* Collecting dirty blocks for GC */
69 NILFS_ST_FILE,
70 NILFS_ST_IFILE,
71 NILFS_ST_CPFILE,
72 NILFS_ST_SUFILE,
73 NILFS_ST_DAT,
74 NILFS_ST_SR, /* Super root */
75 NILFS_ST_DSYNC, /* Data sync blocks */
76 NILFS_ST_DONE,
77};
78
79/* State flags of collection */
80#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
81#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
82#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED)
83
84/* Operations depending on the construction mode and file type */
85struct nilfs_sc_operations {
86 int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
87 struct inode *);
88 int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
89 struct inode *);
90 int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
91 struct inode *);
92 void (*write_data_binfo)(struct nilfs_sc_info *,
93 struct nilfs_segsum_pointer *,
94 union nilfs_binfo *);
95 void (*write_node_binfo)(struct nilfs_sc_info *,
96 struct nilfs_segsum_pointer *,
97 union nilfs_binfo *);
98};
99
100/*
101 * Other definitions
102 */
103static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
104static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
105static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
106static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
107 int);
108
109#define nilfs_cnt32_gt(a, b) \
110 (typecheck(__u32, a) && typecheck(__u32, b) && \
111 ((__s32)(b) - (__s32)(a) < 0))
112#define nilfs_cnt32_ge(a, b) \
113 (typecheck(__u32, a) && typecheck(__u32, b) && \
114 ((__s32)(a) - (__s32)(b) >= 0))
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
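
These comparisons implement serial-number arithmetic: interpreting the 32-bit difference as signed gives the correct ordering even across a counter wraparound, provided the two values are less than 2^31 apart. A self-contained check of that property (illustrative, user space):

	#include <assert.h>
	#include <stdint.h>

	/* a is "later" than b modulo 2^32 when the signed difference b - a
	 * is negative, mirroring nilfs_cnt32_gt(). */
	static int cnt32_gt(uint32_t a, uint32_t b)
	{
		return (int32_t)(b - a) < 0;
	}

	int main(void)
	{
		assert(cnt32_gt(5, 0xfffffffeu));	/* 5 is 7 steps past 0xfffffffe */
		assert(!cnt32_gt(0xfffffffeu, 5));
		return 0;
	}
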
117
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error codes is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{
156 struct nilfs_transaction_info *cur_ti = current->journal_info;
157 void *save = NULL;
158
159 if (cur_ti) {
160 if (cur_ti->ti_magic == NILFS_TI_MAGIC)
161 return ++cur_ti->ti_count;
162 else {
163 /*
164 * If journal_info field is occupied by other FS,
165 * it is saved and will be restored on
166 * nilfs_transaction_commit().
167 */
168 printk(KERN_WARNING
169 "NILFS warning: journal info from a different "
170 "FS\n");
171 save = current->journal_info;
172 }
173 }
174 if (!ti) {
175 ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
176 if (!ti)
177 return -ENOMEM;
178 ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
179 } else {
180 ti->ti_flags = 0;
181 }
182 ti->ti_count = 0;
183 ti->ti_save = save;
184 ti->ti_magic = NILFS_TI_MAGIC;
185 current->journal_info = ti;
186 return 0;
187}
188
189/**
190 * nilfs_transaction_begin - start indivisible file operations.
191 * @sb: super block
192 * @ti: nilfs_transaction_info
193 * @vacancy_check: flags for vacancy rate checks
194 *
195 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
196 * the segment semaphore, to make segment construction and write tasks
197 * exclusive. The function is used in pairs with nilfs_transaction_commit().
198 * The region enclosed by these two functions can be nested. To avoid a
199 * deadlock, the semaphore is only acquired or released in the outermost call.
200 *
201 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; otherwise a new struct is allocated from a slab.
205 *
206 * When the @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if capacity is low.
208 *
209 * Return Value: On success, 0 is returned. On error, one of the following
210 * negative error codes is returned.
211 *
212 * %-ENOMEM - Insufficient memory available.
213 *
214 * %-ENOSPC - No space left on device
215 */
216int nilfs_transaction_begin(struct super_block *sb,
217 struct nilfs_transaction_info *ti,
218 int vacancy_check)
219{
220 struct nilfs_sb_info *sbi;
221 struct the_nilfs *nilfs;
222 int ret = nilfs_prepare_segment_lock(ti);
223
224 if (unlikely(ret < 0))
225 return ret;
226 if (ret > 0)
227 return 0;
228
229 sbi = NILFS_SB(sb);
230 nilfs = sbi->s_nilfs;
231 down_read(&nilfs->ns_segctor_sem);
232 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
233 up_read(&nilfs->ns_segctor_sem);
234 ret = -ENOSPC;
235 goto failed;
236 }
237 return 0;
238
239 failed:
240 ti = current->journal_info;
241 current->journal_info = ti->ti_save;
242 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
243 kmem_cache_free(nilfs_transaction_cachep, ti);
244 return ret;
245}
246
247/**
248 * nilfs_transaction_commit - commit indivisible file operations.
249 * @sb: super block
250 *
251 * nilfs_transaction_commit() releases the read semaphore which is
252 * acquired by nilfs_transaction_begin(). This is only performed
253 * in outermost call of this function. If a commit flag is set,
254 * nilfs_transaction_commit() sets a timer to start the segment
255 * constructor. If a sync flag is set, it starts construction
256 * directly.
257 */
258int nilfs_transaction_commit(struct super_block *sb)
259{
260 struct nilfs_transaction_info *ti = current->journal_info;
261 struct nilfs_sb_info *sbi;
262 struct nilfs_sc_info *sci;
263 int err = 0;
264
265 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
266 ti->ti_flags |= NILFS_TI_COMMIT;
267 if (ti->ti_count > 0) {
268 ti->ti_count--;
269 return 0;
270 }
271 sbi = NILFS_SB(sb);
272 sci = NILFS_SC(sbi);
273 if (sci != NULL) {
274 if (ti->ti_flags & NILFS_TI_COMMIT)
275 nilfs_segctor_start_timer(sci);
276 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
277 sci->sc_watermark)
278 nilfs_segctor_do_flush(sci, 0);
279 }
280 up_read(&sbi->s_nilfs->ns_segctor_sem);
281 current->journal_info = ti->ti_save;
282
283 if (ti->ti_flags & NILFS_TI_SYNC)
284 err = nilfs_construct_segment(sb);
285 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
286 kmem_cache_free(nilfs_transaction_cachep, ti);
287 return err;
288}
289
290void nilfs_transaction_abort(struct super_block *sb)
291{
292 struct nilfs_transaction_info *ti = current->journal_info;
293
294 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
295 if (ti->ti_count > 0) {
296 ti->ti_count--;
297 return;
298 }
299 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
300
301 current->journal_info = ti->ti_save;
302 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
303 kmem_cache_free(nilfs_transaction_cachep, ti);
304}
305
306void nilfs_relax_pressure_in_lock(struct super_block *sb)
307{
308 struct nilfs_sb_info *sbi = NILFS_SB(sb);
309 struct nilfs_sc_info *sci = NILFS_SC(sbi);
310 struct the_nilfs *nilfs = sbi->s_nilfs;
311
312 if (!sci || !sci->sc_flush_request)
313 return;
314
315 set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
316 up_read(&nilfs->ns_segctor_sem);
317
318 down_write(&nilfs->ns_segctor_sem);
319 if (sci->sc_flush_request &&
320 test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
321 struct nilfs_transaction_info *ti = current->journal_info;
322
323 ti->ti_flags |= NILFS_TI_WRITER;
324 nilfs_segctor_do_immediate_flush(sci);
325 ti->ti_flags &= ~NILFS_TI_WRITER;
326 }
327 downgrade_write(&nilfs->ns_segctor_sem);
328}
329
330static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
331 struct nilfs_transaction_info *ti,
332 int gcflag)
333{
334 struct nilfs_transaction_info *cur_ti = current->journal_info;
335
336 WARN_ON(cur_ti);
337 ti->ti_flags = NILFS_TI_WRITER;
338 ti->ti_count = 0;
339 ti->ti_save = cur_ti;
340 ti->ti_magic = NILFS_TI_MAGIC;
341 INIT_LIST_HEAD(&ti->ti_garbage);
342 current->journal_info = ti;
343
344 for (;;) {
345 down_write(&sbi->s_nilfs->ns_segctor_sem);
346 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
347 break;
348
349 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
350
351 up_write(&sbi->s_nilfs->ns_segctor_sem);
352 yield();
353 }
354 if (gcflag)
355 ti->ti_flags |= NILFS_TI_GC;
356}
357
358static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
359{
360 struct nilfs_transaction_info *ti = current->journal_info;
361
362 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
363 BUG_ON(ti->ti_count > 0);
364
365 up_write(&sbi->s_nilfs->ns_segctor_sem);
366 current->journal_info = ti->ti_save;
367 if (!list_empty(&ti->ti_garbage))
368 nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
369}
370
371static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
372 struct nilfs_segsum_pointer *ssp,
373 unsigned bytes)
374{
375 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
376 unsigned blocksize = sci->sc_super->s_blocksize;
377 void *p;
378
379 if (unlikely(ssp->offset + bytes > blocksize)) {
380 ssp->offset = 0;
381 BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
382 &segbuf->sb_segsum_buffers));
383 ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
384 }
385 p = ssp->bh->b_data + ssp->offset;
386 ssp->offset += bytes;
387 return p;
388}
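/*
 * Editor's note: a worked example of the overflow handling above,
 * assuming a 4096-byte block size. A request to map a 24-byte entry at
 * ssp->offset == 4080 would cross the block end (4080 + 24 > 4096), so
 * ssp->bh is advanced to the next segment summary buffer and the entry
 * is mapped at offset 0 of that block instead.
 */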
389
390/**
391 * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
392 * @sci: nilfs_sc_info
393 */
394static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
395{
396 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
397 struct buffer_head *sumbh;
398 unsigned sumbytes;
399 unsigned flags = 0;
400 int err;
401
402 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
405 if (unlikely(err))
406 return err;
407
408 sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
409 sumbytes = segbuf->sb_sum.sumbytes;
410 sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes;
411 sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes;
412 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
413 return 0;
414}
415
416static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
417{
418 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
419 if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
420 return -E2BIG; /* The current segment is filled up
421 (internal error code) */
422 sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
423 return nilfs_segctor_reset_segment_buffer(sci);
424}
425
426static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
427{
428 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
429 int err;
430
431 if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
432 err = nilfs_segctor_feed_segment(sci);
433 if (err)
434 return err;
435 segbuf = sci->sc_curseg;
436 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
438 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err;
441}
442
443/*
444 * Functions for making segment summary and payloads
445 */
446static int nilfs_segctor_segsum_block_required(
447 struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
448 unsigned binfo_size)
449{
450 unsigned blocksize = sci->sc_super->s_blocksize;
451 /* The sizes of finfo and binfo are small enough relative to blocksize */
452
453 return ssp->offset + binfo_size +
454 (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
455 blocksize;
456}
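/*
 * Editor's note: a worked example of the test above, assuming a
 * 4096-byte block size. With ssp->offset == 4040 and binfo_size == 64,
 * 4040 + 64 = 4104 > 4096, so a new segment summary block is required;
 * when sc_blk_cnt == 0 (first block of a file), sizeof(struct
 * nilfs_finfo) is added to the left-hand side as well.
 */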
457
458static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
459 struct inode *inode)
460{
461 sci->sc_curseg->sb_sum.nfinfo++;
462 sci->sc_binfo_ptr = sci->sc_finfo_ptr;
463 nilfs_segctor_map_segsum_entry(
464 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
465
466 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
467 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
468 /* skip finfo */
469}
470
471static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
472 struct inode *inode)
473{
474 struct nilfs_finfo *finfo;
475 struct nilfs_inode_info *ii;
476 struct nilfs_segment_buffer *segbuf;
477
478 if (sci->sc_blk_cnt == 0)
479 return;
480
481 ii = NILFS_I(inode);
482 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
483 sizeof(*finfo));
484 finfo->fi_ino = cpu_to_le64(inode->i_ino);
485 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
486 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
487 finfo->fi_cno = cpu_to_le64(ii->i_cno);
488
489 segbuf = sci->sc_curseg;
490 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
491 sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
492 sci->sc_finfo_ptr = sci->sc_binfo_ptr;
493 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
494}
495
496static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
497 struct buffer_head *bh,
498 struct inode *inode,
499 unsigned binfo_size)
500{
501 struct nilfs_segment_buffer *segbuf;
502 int required, err = 0;
503
504 retry:
505 segbuf = sci->sc_curseg;
506 required = nilfs_segctor_segsum_block_required(
507 sci, &sci->sc_binfo_ptr, binfo_size);
508 if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
509 nilfs_segctor_end_finfo(sci, inode);
510 err = nilfs_segctor_feed_segment(sci);
511 if (err)
512 return err;
513 goto retry;
514 }
515 if (unlikely(required)) {
516 err = nilfs_segbuf_extend_segsum(segbuf);
517 if (unlikely(err))
518 goto failed;
519 }
520 if (sci->sc_blk_cnt == 0)
521 nilfs_segctor_begin_finfo(sci, inode);
522
523 nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
524 /* Assignment of vblocknr is delayed until update_blocknr() */
525 nilfs_segbuf_add_file_buffer(segbuf, bh);
526 sci->sc_blk_cnt++;
527 failed:
528 return err;
529}
530
531static int nilfs_handle_bmap_error(int err, const char *fname,
532 struct inode *inode, struct super_block *sb)
533{
534 if (err == -EINVAL) {
535 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
536 inode->i_ino);
537 err = -EIO;
538 }
539 return err;
540}
541
542/*
543 * Callback functions that enumerate, mark, and collect dirty blocks
544 */
545static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
546 struct buffer_head *bh, struct inode *inode)
547{
548 int err;
549
550 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
551 if (unlikely(err < 0))
552 return nilfs_handle_bmap_error(err, __func__, inode,
553 sci->sc_super);
554
555 err = nilfs_segctor_add_file_block(sci, bh, inode,
556 sizeof(struct nilfs_binfo_v));
557 if (!err)
558 sci->sc_datablk_cnt++;
559 return err;
560}
561
562static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
563 struct buffer_head *bh,
564 struct inode *inode)
565{
566 int err;
567
568 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
569 if (unlikely(err < 0))
570 return nilfs_handle_bmap_error(err, __func__, inode,
571 sci->sc_super);
572 return 0;
573}
574
575static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
576 struct buffer_head *bh,
577 struct inode *inode)
578{
579 WARN_ON(!buffer_dirty(bh));
580 return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
581}
582
583static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
584 struct nilfs_segsum_pointer *ssp,
585 union nilfs_binfo *binfo)
586{
587 struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
588 sci, ssp, sizeof(*binfo_v));
589 *binfo_v = binfo->bi_v;
590}
591
592static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
593 struct nilfs_segsum_pointer *ssp,
594 union nilfs_binfo *binfo)
595{
596 __le64 *vblocknr = nilfs_segctor_map_segsum_entry(
597 sci, ssp, sizeof(*vblocknr));
598 *vblocknr = binfo->bi_v.bi_vblocknr;
599}
600
601struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap,
605 .write_data_binfo = nilfs_write_file_data_binfo,
606 .write_node_binfo = nilfs_write_file_node_binfo,
607};
608
609static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
610 struct buffer_head *bh, struct inode *inode)
611{
612 int err;
613
614 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
615 if (unlikely(err < 0))
616 return nilfs_handle_bmap_error(err, __func__, inode,
617 sci->sc_super);
618
619 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
620 if (!err)
621 sci->sc_datablk_cnt++;
622 return err;
623}
624
625static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
626 struct buffer_head *bh, struct inode *inode)
627{
628 WARN_ON(!buffer_dirty(bh));
629 return nilfs_segctor_add_file_block(sci, bh, inode,
630 sizeof(struct nilfs_binfo_dat));
631}
632
633static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
634 struct nilfs_segsum_pointer *ssp,
635 union nilfs_binfo *binfo)
636{
637 __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
638 sizeof(*blkoff));
639 *blkoff = binfo->bi_dat.bi_blkoff;
640}
641
642static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
643 struct nilfs_segsum_pointer *ssp,
644 union nilfs_binfo *binfo)
645{
646 struct nilfs_binfo_dat *binfo_dat =
647 nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
648 *binfo_dat = binfo->bi_dat;
649}
650
651struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap,
655 .write_data_binfo = nilfs_write_dat_data_binfo,
656 .write_node_binfo = nilfs_write_dat_node_binfo,
657};
658
659struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL,
662 .collect_bmap = NULL,
663 .write_data_binfo = nilfs_write_file_data_binfo,
664 .write_node_binfo = NULL,
665};
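/*
 * Editor's note: these three operation tables select how dirty blocks
 * are collected and how their block information is written into the
 * segment summary: nilfs_sc_file_ops serves regular files and most
 * metadata files, nilfs_sc_dat_ops serves the DAT, and
 * nilfs_sc_dsync_ops serves data-sync writes, which collect data blocks
 * only (collect_node and collect_bmap are NULL).
 */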
666
667static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
668 struct list_head *listp,
669 size_t nlimit,
670 loff_t start, loff_t end)
671{
672 struct address_space *mapping = inode->i_mapping;
673 struct pagevec pvec;
674 pgoff_t index = 0, last = ULONG_MAX;
675 size_t ndirties = 0;
676 int i;
677
678 if (unlikely(start != 0 || end != LLONG_MAX)) {
679 /*
680 * A valid range is given for syncing data pages. The
681 * range is rounded to page boundaries; extra dirty
682 * buffers may be included if blocksize < pagesize.
683 */
684 index = start >> PAGE_SHIFT;
685 last = end >> PAGE_SHIFT;
686 }
687 pagevec_init(&pvec, 0);
688 repeat:
689 if (unlikely(index > last) ||
690 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
691 min_t(pgoff_t, last - index,
692 PAGEVEC_SIZE - 1) + 1))
693 return ndirties;
694
695 for (i = 0; i < pagevec_count(&pvec); i++) {
696 struct buffer_head *bh, *head;
697 struct page *page = pvec.pages[i];
698
699 if (unlikely(page->index > last))
700 break;
701
702 if (mapping->host) {
703 lock_page(page);
704 if (!page_has_buffers(page))
705 create_empty_buffers(page,
706 1 << inode->i_blkbits, 0);
707 unlock_page(page);
708 }
709
710 bh = head = page_buffers(page);
711 do {
712 if (!buffer_dirty(bh))
713 continue;
714 get_bh(bh);
715 list_add_tail(&bh->b_assoc_buffers, listp);
716 ndirties++;
717 if (unlikely(ndirties >= nlimit)) {
718 pagevec_release(&pvec);
719 cond_resched();
720 return ndirties;
721 }
722 } while (bh = bh->b_this_page, bh != head);
723 }
724 pagevec_release(&pvec);
725 cond_resched();
726 goto repeat;
727}
728
729static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
730 struct list_head *listp)
731{
732 struct nilfs_inode_info *ii = NILFS_I(inode);
733 struct address_space *mapping = &ii->i_btnode_cache;
734 struct pagevec pvec;
735 struct buffer_head *bh, *head;
736 unsigned int i;
737 pgoff_t index = 0;
738
739 pagevec_init(&pvec, 0);
740
741 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
742 PAGEVEC_SIZE)) {
743 for (i = 0; i < pagevec_count(&pvec); i++) {
744 bh = head = page_buffers(pvec.pages[i]);
745 do {
746 if (buffer_dirty(bh)) {
747 get_bh(bh);
748 list_add_tail(&bh->b_assoc_buffers,
749 listp);
750 }
751 bh = bh->b_this_page;
752 } while (bh != head);
753 }
754 pagevec_release(&pvec);
755 cond_resched();
756 }
757}
758
759static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
760 struct list_head *head, int force)
761{
762 struct nilfs_inode_info *ii, *n;
763 struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
764 unsigned nv = 0;
765
766 while (!list_empty(head)) {
767 spin_lock(&sbi->s_inode_lock);
768 list_for_each_entry_safe(ii, n, head, i_dirty) {
769 list_del_init(&ii->i_dirty);
770 if (force) {
771 if (unlikely(ii->i_bh)) {
772 brelse(ii->i_bh);
773 ii->i_bh = NULL;
774 }
775 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
776 set_bit(NILFS_I_QUEUED, &ii->i_state);
777 list_add_tail(&ii->i_dirty,
778 &sbi->s_dirty_files);
779 continue;
780 }
781 ivec[nv++] = ii;
782 if (nv == SC_N_INODEVEC)
783 break;
784 }
785 spin_unlock(&sbi->s_inode_lock);
786
787 for (pii = ivec; nv > 0; pii++, nv--)
788 iput(&(*pii)->vfs_inode);
789 }
790}
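/*
 * Editor's note: inodes are drained in batches of up to SC_N_INODEVEC so
 * that iput(), which may sleep, is always called after s_inode_lock has
 * been released.
 */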
791
792static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
793{
794 struct the_nilfs *nilfs = sbi->s_nilfs;
795 int ret = 0;
796
797 if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
798 ret++;
799 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
800 ret++;
801 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
802 ret++;
803 if (ret || nilfs_doing_gc())
804 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
805 ret++;
806 return ret;
807}
808
809static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
810{
811 return list_empty(&sci->sc_dirty_files) &&
812 !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
813 list_empty(&sci->sc_cleaning_segments) &&
814 (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
815}
816
817static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
818{
819 struct nilfs_sb_info *sbi = sci->sc_sbi;
820 int ret = 0;
821
822 if (nilfs_test_metadata_dirty(sbi))
823 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
824
825 spin_lock(&sbi->s_inode_lock);
826 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
827 ret++;
828
829 spin_unlock(&sbi->s_inode_lock);
830 return ret;
831}
832
833static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
834{
835 struct nilfs_sb_info *sbi = sci->sc_sbi;
836 struct the_nilfs *nilfs = sbi->s_nilfs;
837
838 nilfs_mdt_clear_dirty(sbi->s_ifile);
839 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
840 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
841 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
842}
843
844static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
845{
846 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
847 struct buffer_head *bh_cp;
848 struct nilfs_checkpoint *raw_cp;
849 int err;
850
851 /* XXX: this interface will be changed */
852 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
853 &raw_cp, &bh_cp);
854 if (likely(!err)) {
855 /* The following code duplicates part of cpfile; however, it
856 is needed to collect the checkpoint even if it was not
857 newly created. */
858 nilfs_mdt_mark_buffer_dirty(bh_cp);
859 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
860 nilfs_cpfile_put_checkpoint(
861 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
862 } else
863 WARN_ON(err == -EINVAL || err == -ENOENT);
864
865 return err;
866}
867
868static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
869{
870 struct nilfs_sb_info *sbi = sci->sc_sbi;
871 struct the_nilfs *nilfs = sbi->s_nilfs;
872 struct buffer_head *bh_cp;
873 struct nilfs_checkpoint *raw_cp;
874 int err;
875
876 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
877 &raw_cp, &bh_cp);
878 if (unlikely(err)) {
879 WARN_ON(err == -EINVAL || err == -ENOENT);
880 goto failed_ibh;
881 }
882 raw_cp->cp_snapshot_list.ssl_next = 0;
883 raw_cp->cp_snapshot_list.ssl_prev = 0;
884 raw_cp->cp_inodes_count =
885 cpu_to_le64(atomic_read(&sbi->s_inodes_count));
886 raw_cp->cp_blocks_count =
887 cpu_to_le64(atomic_read(&sbi->s_blocks_count));
888 raw_cp->cp_nblk_inc =
889 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
890 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
891 raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
892
893 if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
894 nilfs_checkpoint_clear_minor(raw_cp);
895 else
896 nilfs_checkpoint_set_minor(raw_cp);
897
898 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
899 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
900 return 0;
901
902 failed_ibh:
903 return err;
904}
905
906static void nilfs_fill_in_file_bmap(struct inode *ifile,
907 struct nilfs_inode_info *ii)
908
909{
910 struct buffer_head *ibh;
911 struct nilfs_inode *raw_inode;
912
913 if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
914 ibh = ii->i_bh;
915 BUG_ON(!ibh);
916 raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
917 ibh);
918 nilfs_bmap_write(ii->i_bmap, raw_inode);
919 nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
920 }
921}
922
923static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
924 struct inode *ifile)
925{
926 struct nilfs_inode_info *ii;
927
928 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
929 nilfs_fill_in_file_bmap(ifile, ii);
930 set_bit(NILFS_I_COLLECTED, &ii->i_state);
931 }
932}
933
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs)
965{
966 struct buffer_head *bh_sr = sci->sc_super_root;
967 struct nilfs_super_root *raw_sr =
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size;
970
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ?
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0;
976
977 nilfs_mdt_write_inode_direct(
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
979 nilfs_mdt_write_inode_direct(
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
981 nilfs_mdt_write_inode_direct(
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
983}
984
985static void nilfs_redirty_inodes(struct list_head *head)
986{
987 struct nilfs_inode_info *ii;
988
989 list_for_each_entry(ii, head, i_dirty) {
990 if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
991 clear_bit(NILFS_I_COLLECTED, &ii->i_state);
992 }
993}
994
995static void nilfs_drop_collected_inodes(struct list_head *head)
996{
997 struct nilfs_inode_info *ii;
998
999 list_for_each_entry(ii, head, i_dirty) {
1000 if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
1001 continue;
1002
1003 clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
1004 set_bit(NILFS_I_UPDATED, &ii->i_state);
1005 }
1006}
1007
1008static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
1009 struct inode *sufile)
1010
1011{
1012 struct list_head *head = &sci->sc_cleaning_segments;
1013 struct nilfs_segment_entry *ent;
1014 int err;
1015
1016 list_for_each_entry(ent, head, list) {
1017 if (!(ent->flags & NILFS_SLH_FREED))
1018 break;
1019 err = nilfs_sufile_cancel_free(sufile, ent->segnum);
1020 WARN_ON(err); /* should not happen */
1021 ent->flags &= ~NILFS_SLH_FREED;
1022 }
1023}
1024
1025static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
1026 struct inode *sufile)
1027{
1028 struct list_head *head = &sci->sc_cleaning_segments;
1029 struct nilfs_segment_entry *ent;
1030 int err;
1031
1032 list_for_each_entry(ent, head, list) {
1033 err = nilfs_sufile_free(sufile, ent->segnum);
1034 if (unlikely(err))
1035 return err;
1036 ent->flags |= NILFS_SLH_FREED;
1037 }
1038 return 0;
1039}
1040
1041static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
1042{
1043 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
1044}
1045
1046static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
1047 struct inode *inode,
1048 struct list_head *listp,
1049 int (*collect)(struct nilfs_sc_info *,
1050 struct buffer_head *,
1051 struct inode *))
1052{
1053 struct buffer_head *bh, *n;
1054 int err = 0;
1055
1056 if (collect) {
1057 list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
1058 list_del_init(&bh->b_assoc_buffers);
1059 err = collect(sci, bh, inode);
1060 brelse(bh);
1061 if (unlikely(err))
1062 goto dispose_buffers;
1063 }
1064 return 0;
1065 }
1066
1067 dispose_buffers:
1068 while (!list_empty(listp)) {
1069 bh = list_entry(listp->next, struct buffer_head,
1070 b_assoc_buffers);
1071 list_del_init(&bh->b_assoc_buffers);
1072 brelse(bh);
1073 }
1074 return err;
1075}
1076
1077static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
1078{
1079 /* Remaining number of blocks within segment buffer */
1080 return sci->sc_segbuf_nblocks -
1081 (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
1082}
1083
1084static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
1085 struct inode *inode,
1086 struct nilfs_sc_operations *sc_ops)
1087{
1088 LIST_HEAD(data_buffers);
1089 LIST_HEAD(node_buffers);
1090 int err;
1091
1092 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1093 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1094
1095 n = nilfs_lookup_dirty_data_buffers(
1096 inode, &data_buffers, rest + 1, 0, LLONG_MAX);
1097 if (n > rest) {
1098 err = nilfs_segctor_apply_buffers(
1099 sci, inode, &data_buffers,
1100 sc_ops->collect_data);
1101 BUG_ON(!err); /* we always receive -E2BIG or a real error */
1102 goto break_or_fail;
1103 }
1104 }
1105 nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
1106
1107 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1108 err = nilfs_segctor_apply_buffers(
1109 sci, inode, &data_buffers, sc_ops->collect_data);
1110 if (unlikely(err)) {
1111 /* dispose node list */
1112 nilfs_segctor_apply_buffers(
1113 sci, inode, &node_buffers, NULL);
1114 goto break_or_fail;
1115 }
1116 sci->sc_stage.flags |= NILFS_CF_NODE;
1117 }
1118 /* Collect node */
1119 err = nilfs_segctor_apply_buffers(
1120 sci, inode, &node_buffers, sc_ops->collect_node);
1121 if (unlikely(err))
1122 goto break_or_fail;
1123
1124 nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
1125 err = nilfs_segctor_apply_buffers(
1126 sci, inode, &node_buffers, sc_ops->collect_bmap);
1127 if (unlikely(err))
1128 goto break_or_fail;
1129
1130 nilfs_segctor_end_finfo(sci, inode);
1131 sci->sc_stage.flags &= ~NILFS_CF_NODE;
1132
1133 break_or_fail:
1134 return err;
1135}
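/*
 * Editor's note: the data phase above looks up at most one block more
 * than fits in the remaining space (rest + 1), so an overflowing file
 * reliably makes the collect callback return -E2BIG. The NILFS_CF_NODE
 * flag is set once the data phase completes, letting a resumed call skip
 * straight to node and bmap collection.
 */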
1136
1137static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1138 struct inode *inode)
1139{
1140 LIST_HEAD(data_buffers);
1141 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1142 int err;
1143
1144 n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
1145 sci->sc_dsync_start,
1146 sci->sc_dsync_end);
1147
1148 err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
1149 nilfs_collect_file_data);
1150 if (!err) {
1151 nilfs_segctor_end_finfo(sci, inode);
1152 BUG_ON(n > rest);
1153 /* we would always receive -E2BIG or a real error if n > rest */
1154 }
1155 return err;
1156}
1157
1158static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1159{
1160 struct nilfs_sb_info *sbi = sci->sc_sbi;
1161 struct the_nilfs *nilfs = sbi->s_nilfs;
1162 struct list_head *head;
1163 struct nilfs_inode_info *ii;
1164 int err = 0;
1165
1166 switch (sci->sc_stage.scnt) {
1167 case NILFS_ST_INIT:
1168 /* Pre-processes */
1169 sci->sc_stage.flags = 0;
1170
1171 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
1172 sci->sc_nblk_inc = 0;
1173 sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
1174 if (mode == SC_LSEG_DSYNC) {
1175 sci->sc_stage.scnt = NILFS_ST_DSYNC;
1176 goto dsync_mode;
1177 }
1178 }
1179
1180 sci->sc_stage.dirty_file_ptr = NULL;
1181 sci->sc_stage.gc_inode_ptr = NULL;
1182 if (mode == SC_FLUSH_DAT) {
1183 sci->sc_stage.scnt = NILFS_ST_DAT;
1184 goto dat_stage;
1185 }
1186 sci->sc_stage.scnt++; /* Fall through */
1187 case NILFS_ST_GC:
1188 if (nilfs_doing_gc()) {
1189 head = &sci->sc_gc_inodes;
1190 ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
1191 head, i_dirty);
1192 list_for_each_entry_continue(ii, head, i_dirty) {
1193 err = nilfs_segctor_scan_file(
1194 sci, &ii->vfs_inode,
1195 &nilfs_sc_file_ops);
1196 if (unlikely(err)) {
1197 sci->sc_stage.gc_inode_ptr = list_entry(
1198 ii->i_dirty.prev,
1199 struct nilfs_inode_info,
1200 i_dirty);
1201 goto break_or_fail;
1202 }
1203 set_bit(NILFS_I_COLLECTED, &ii->i_state);
1204 }
1205 sci->sc_stage.gc_inode_ptr = NULL;
1206 }
1207 sci->sc_stage.scnt++; /* Fall through */
1208 case NILFS_ST_FILE:
1209 head = &sci->sc_dirty_files;
1210 ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
1211 i_dirty);
1212 list_for_each_entry_continue(ii, head, i_dirty) {
1213 clear_bit(NILFS_I_DIRTY, &ii->i_state);
1214
1215 err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
1216 &nilfs_sc_file_ops);
1217 if (unlikely(err)) {
1218 sci->sc_stage.dirty_file_ptr =
1219 list_entry(ii->i_dirty.prev,
1220 struct nilfs_inode_info,
1221 i_dirty);
1222 goto break_or_fail;
1223 }
1224 /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
1225 /* XXX: required ? */
1226 }
1227 sci->sc_stage.dirty_file_ptr = NULL;
1228 if (mode == SC_FLUSH_FILE) {
1229 sci->sc_stage.scnt = NILFS_ST_DONE;
1230 return 0;
1231 }
1232 sci->sc_stage.scnt++;
1233 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1234 /* Fall through */
1235 case NILFS_ST_IFILE:
1236 err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
1237 &nilfs_sc_file_ops);
1238 if (unlikely(err))
1239 break;
1240 sci->sc_stage.scnt++;
1241 /* Creating a checkpoint */
1242 err = nilfs_segctor_create_checkpoint(sci);
1243 if (unlikely(err))
1244 break;
1245 /* Fall through */
1246 case NILFS_ST_CPFILE:
1247 err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
1248 &nilfs_sc_file_ops);
1249 if (unlikely(err))
1250 break;
1251 sci->sc_stage.scnt++; /* Fall through */
1252 case NILFS_ST_SUFILE:
1253 err = nilfs_segctor_prepare_free_segments(sci,
1254 nilfs->ns_sufile);
1255 if (unlikely(err))
1256 break;
1257 err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
1258 &nilfs_sc_file_ops);
1259 if (unlikely(err))
1260 break;
1261 sci->sc_stage.scnt++; /* Fall through */
1262 case NILFS_ST_DAT:
1263 dat_stage:
1264 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
1265 &nilfs_sc_dat_ops);
1266 if (unlikely(err))
1267 break;
1268 if (mode == SC_FLUSH_DAT) {
1269 sci->sc_stage.scnt = NILFS_ST_DONE;
1270 return 0;
1271 }
1272 sci->sc_stage.scnt++; /* Fall through */
1273 case NILFS_ST_SR:
1274 if (mode == SC_LSEG_SR) {
1275 /* Appending a super root */
1276 err = nilfs_segctor_add_super_root(sci);
1277 if (unlikely(err))
1278 break;
1279 }
1280 /* End of a logical segment */
1281 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1282 sci->sc_stage.scnt = NILFS_ST_DONE;
1283 return 0;
1284 case NILFS_ST_DSYNC:
1285 dsync_mode:
1286 sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
1287 ii = sci->sc_dsync_inode;
1288 if (!test_bit(NILFS_I_BUSY, &ii->i_state))
1289 break;
1290
1291 err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
1292 if (unlikely(err))
1293 break;
1294 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1295 sci->sc_stage.scnt = NILFS_ST_DONE;
1296 return 0;
1297 case NILFS_ST_DONE:
1298 return 0;
1299 default:
1300 BUG();
1301 }
1302
1303 break_or_fail:
1304 return err;
1305}
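/*
 * Editor's note: the switch above implements a resumable stage machine.
 * A full construction with super root (SC_LSEG_SR) proceeds through
 *
 *	INIT -> GC -> FILE -> IFILE -> CPFILE -> SUFILE -> DAT -> SR -> DONE
 *
 * while SC_FLUSH_FILE stops after FILE, SC_FLUSH_DAT jumps from INIT to
 * DAT, and SC_LSEG_DSYNC jumps from INIT to DSYNC. On failure the saved
 * scnt and per-stage pointers allow the next call to resume where
 * collection stopped.
 */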
1306
1307static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
1308{
1309 struct buffer_head *bh_su;
1310 struct nilfs_segment_usage *raw_su;
1311 int err;
1312
1313 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1314 if (unlikely(err))
1315 return err;
1316 nilfs_mdt_mark_buffer_dirty(bh_su);
1317 nilfs_mdt_mark_dirty(sufile);
1318 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1319 return 0;
1320}
1321
1322static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1323 struct the_nilfs *nilfs)
1324{
1325 struct nilfs_segment_buffer *segbuf, *n;
1326 __u64 nextnum;
1327 int err;
1328
1329 if (list_empty(&sci->sc_segbufs)) {
1330 segbuf = nilfs_segbuf_new(sci->sc_super);
1331 if (unlikely(!segbuf))
1332 return -ENOMEM;
1333 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1334 } else
1335 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1336
1337 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
1338 nilfs);
1339
1340 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1341 nilfs_shift_to_next_segment(nilfs);
1342 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1343 }
1344 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1345
1346 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
1347 if (unlikely(err))
1348 return err;
1349
1350 if (nilfs->ns_segnum == nilfs->ns_nextnum) {
1351 /* Start from the head of a new full segment */
1352 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1353 if (unlikely(err))
1354 return err;
1355 } else
1356 nextnum = nilfs->ns_nextnum;
1357
1358 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1359 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1360
1361 /* truncating segment buffers */
1362 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1363 sb_list) {
1364 list_del_init(&segbuf->sb_list);
1365 nilfs_segbuf_free(segbuf);
1366 }
1367 return 0;
1368}
1369
1370static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1371 struct the_nilfs *nilfs, int nadd)
1372{
1373 struct nilfs_segment_buffer *segbuf, *prev, *n;
1374 struct inode *sufile = nilfs->ns_sufile;
1375 __u64 nextnextnum;
1376 LIST_HEAD(list);
1377 int err, ret, i;
1378
1379 prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
1380 /*
1381 * Since the segment specified by nextnum might have been allocated
1382 * during the previous construction, the buffer containing its segment
1383 * usage may not be dirty. The following call ensures that the buffer
1384 * is dirty and pins it in memory until the sufile is written.
1385 */
1386 err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
1387 if (unlikely(err))
1388 return err;
1389
1390 for (i = 0; i < nadd; i++) {
1391 /* extend segment info */
1392 err = -ENOMEM;
1393 segbuf = nilfs_segbuf_new(sci->sc_super);
1394 if (unlikely(!segbuf))
1395 goto failed;
1396
1397 /* map this buffer to region of segment on-disk */
1398 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1399 sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
1400
1401 /* allocate the next next full segment */
1402 err = nilfs_sufile_alloc(sufile, &nextnextnum);
1403 if (unlikely(err))
1404 goto failed_segbuf;
1405
1406 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
1407 nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
1408
1409 list_add_tail(&segbuf->sb_list, &list);
1410 prev = segbuf;
1411 }
1412 list_splice(&list, sci->sc_segbufs.prev);
1413 return 0;
1414
1415 failed_segbuf:
1416 nilfs_segbuf_free(segbuf);
1417 failed:
1418 list_for_each_entry_safe(segbuf, n, &list, sb_list) {
1419 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1420 WARN_ON(ret); /* never fails */
1421 list_del_init(&segbuf->sb_list);
1422 nilfs_segbuf_free(segbuf);
1423 }
1424 return err;
1425}
1426
1427static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1428 struct the_nilfs *nilfs)
1429{
1430 struct nilfs_segment_buffer *segbuf;
1431 int ret, done = 0;
1432
1433 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1434 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1435 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1436 WARN_ON(ret); /* never fails */
1437 }
1438 if (segbuf->sb_io_error) {
1439 /* Case 1: The first segment failed */
1440 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1441 /* Case 1a: Partial segment appended into an existing
1442 segment */
1443 nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
1444 segbuf->sb_fseg_end);
1445 else /* Case 1b: New full segment */
1446 set_nilfs_discontinued(nilfs);
1447 done++;
1448 }
1449
1450 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1451 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1452 WARN_ON(ret); /* never fails */
1453 if (!done && segbuf->sb_io_error) {
1454 if (segbuf->sb_segnum != nilfs->ns_nextnum)
1455 /* Case 2: extended segment (!= next) failed */
1456 nilfs_sufile_set_error(nilfs->ns_sufile,
1457 segbuf->sb_segnum);
1458 done++;
1459 }
1460 }
1461}
1462
1463static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1464{
1465 struct nilfs_segment_buffer *segbuf;
1466
1467 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1468 nilfs_segbuf_clear(segbuf);
1469 sci->sc_super_root = NULL;
1470}
1471
1472static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1473{
1474 struct nilfs_segment_buffer *segbuf;
1475
1476 while (!list_empty(&sci->sc_segbufs)) {
1477 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1478 list_del_init(&segbuf->sb_list);
1479 nilfs_segbuf_free(segbuf);
1480 }
1481 /* sci->sc_curseg = NULL; */
1482}
1483
1484static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1485 struct the_nilfs *nilfs, int err)
1486{
1487 if (unlikely(err)) {
1488 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1489 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1490 }
1491 nilfs_segctor_clear_segment_buffers(sci);
1492}
1493
1494static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1495 struct inode *sufile)
1496{
1497 struct nilfs_segment_buffer *segbuf;
1498 struct buffer_head *bh_su;
1499 struct nilfs_segment_usage *raw_su;
1500 unsigned long live_blocks;
1501 int ret;
1502
1503 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1504 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1505 &raw_su, &bh_su);
1506 WARN_ON(ret); /* always succeeds because bh_su is dirty */
1507 live_blocks = segbuf->sb_sum.nblocks +
1508 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1509 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
1510 raw_su->su_nblocks = cpu_to_le32(live_blocks);
1511 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1512 bh_su);
1513 }
1514}
1515
1516static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
1517 struct inode *sufile)
1518{
1519 struct nilfs_segment_buffer *segbuf;
1520 struct buffer_head *bh_su;
1521 struct nilfs_segment_usage *raw_su;
1522 int ret;
1523
1524 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1525 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1526 &raw_su, &bh_su);
1527 WARN_ON(ret); /* always succeeds because bh_su is dirty */
1528 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
1529 segbuf->sb_fseg_start);
1530 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1531
1532 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1533 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1534 &raw_su, &bh_su);
1535 WARN_ON(ret); /* always succeeds */
1536 raw_su->su_nblocks = 0;
1537 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1538 bh_su);
1539 }
1540}
1541
1542static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1543 struct nilfs_segment_buffer *last,
1544 struct inode *sufile)
1545{
1546 struct nilfs_segment_buffer *segbuf = last, *n;
1547 int ret;
1548
1549 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1550 sb_list) {
1551 list_del_init(&segbuf->sb_list);
1552 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1553 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1554 WARN_ON(ret);
1555 nilfs_segbuf_free(segbuf);
1556 }
1557}
1558
1559
1560static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1561 struct the_nilfs *nilfs, int mode)
1562{
1563 struct nilfs_cstage prev_stage = sci->sc_stage;
1564 int err, nadd = 1;
1565
1566 /* Collection retry loop */
1567 for (;;) {
1568 sci->sc_super_root = NULL;
1569 sci->sc_nblk_this_inc = 0;
1570 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1571
1572 err = nilfs_segctor_reset_segment_buffer(sci);
1573 if (unlikely(err))
1574 goto failed;
1575
1576 err = nilfs_segctor_collect_blocks(sci, mode);
1577 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
1578 if (!err)
1579 break;
1580
1581 if (unlikely(err != -E2BIG))
1582 goto failed;
1583
1584 /* The current segment is filled up */
1585 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1586 break;
1587
1588 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1589 nilfs_segctor_clear_segment_buffers(sci);
1590
1591 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1592 if (unlikely(err))
1593 return err;
1594
1595 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1596 sci->sc_stage = prev_stage;
1597 }
1598 nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
1599 return 0;
1600
1601 failed:
1602 return err;
1603}
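/*
 * Editor's note: when collection overflows the prepared segments
 * (-E2BIG), the loop above extends the segment buffer list by nadd
 * segments, doubling nadd on each retry (1, 2, 4, ...) up to
 * SC_MAX_SEGDELTA, and restores the saved collection stage before
 * trying again.
 */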
1604
1605static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
1606 struct buffer_head *new_bh)
1607{
1608 BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
1609
1610 list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
1611 /* The caller must release old_bh */
1612}
1613
1614static int
1615nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1616 struct nilfs_segment_buffer *segbuf,
1617 int mode)
1618{
1619 struct inode *inode = NULL;
1620 sector_t blocknr;
1621 unsigned long nfinfo = segbuf->sb_sum.nfinfo;
1622 unsigned long nblocks = 0, ndatablk = 0;
1623 struct nilfs_sc_operations *sc_op = NULL;
1624 struct nilfs_segsum_pointer ssp;
1625 struct nilfs_finfo *finfo = NULL;
1626 union nilfs_binfo binfo;
1627 struct buffer_head *bh, *bh_org;
1628 ino_t ino = 0;
1629 int err = 0;
1630
1631 if (!nfinfo)
1632 goto out;
1633
1634 blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
1635 ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
1636 ssp.offset = sizeof(struct nilfs_segment_summary);
1637
1638 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1639 if (bh == sci->sc_super_root)
1640 break;
1641 if (!finfo) {
1642 finfo = nilfs_segctor_map_segsum_entry(
1643 sci, &ssp, sizeof(*finfo));
1644 ino = le64_to_cpu(finfo->fi_ino);
1645 nblocks = le32_to_cpu(finfo->fi_nblocks);
1646 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1647
1648 if (buffer_nilfs_node(bh))
1649 inode = NILFS_BTNC_I(bh->b_page->mapping);
1650 else
1651 inode = NILFS_AS_I(bh->b_page->mapping);
1652
1653 if (mode == SC_LSEG_DSYNC)
1654 sc_op = &nilfs_sc_dsync_ops;
1655 else if (ino == NILFS_DAT_INO)
1656 sc_op = &nilfs_sc_dat_ops;
1657 else /* file blocks */
1658 sc_op = &nilfs_sc_file_ops;
1659 }
1660 bh_org = bh;
1661 get_bh(bh_org);
1662 err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
1663 &binfo);
1664 if (bh != bh_org)
1665 nilfs_list_replace_buffer(bh_org, bh);
1666 brelse(bh_org);
1667 if (unlikely(err))
1668 goto failed_bmap;
1669
1670 if (ndatablk > 0)
1671 sc_op->write_data_binfo(sci, &ssp, &binfo);
1672 else
1673 sc_op->write_node_binfo(sci, &ssp, &binfo);
1674
1675 blocknr++;
1676 if (--nblocks == 0) {
1677 finfo = NULL;
1678 if (--nfinfo == 0)
1679 break;
1680 } else if (ndatablk > 0)
1681 ndatablk--;
1682 }
1683 out:
1684 return 0;
1685
1686 failed_bmap:
1687 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1688 return err;
1689}
1690
1691static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1692{
1693 struct nilfs_segment_buffer *segbuf;
1694 int err;
1695
1696 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1697 err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
1698 if (unlikely(err))
1699 return err;
1700 nilfs_segbuf_fill_in_segsum(segbuf);
1701 }
1702 return 0;
1703}
1704
1705static int
1706nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1707{
1708 struct page *clone_page;
1709 struct buffer_head *bh, *head, *bh2;
1710 void *kaddr;
1711
1712 bh = head = page_buffers(page);
1713
1714 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1715 if (unlikely(!clone_page))
1716 return -ENOMEM;
1717
1718 bh2 = page_buffers(clone_page);
1719 kaddr = kmap_atomic(page, KM_USER0);
1720 do {
1721 if (list_empty(&bh->b_assoc_buffers))
1722 continue;
1723 get_bh(bh2);
1724 page_cache_get(clone_page); /* for each bh */
1725 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1726 bh2->b_blocknr = bh->b_blocknr;
1727 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1728 list_add_tail(&bh->b_assoc_buffers, out);
1729 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1730 kunmap_atomic(kaddr, KM_USER0);
1731
1732 if (!TestSetPageWriteback(clone_page))
1733 inc_zone_page_state(clone_page, NR_WRITEBACK);
1734 unlock_page(clone_page);
1735
1736 return 0;
1737}
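/*
 * Editor's note: this routine snapshots the dirty buffers of a page into
 * a private clone page; the clone's buffer heads take the originals'
 * places on the payload list via list_replace(), while the originals are
 * queued on @out (the copied-buffers list) for cleanup after write-out.
 * The application can thus redirty the original page while the snapshot
 * is being written.
 */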
1738
1739static int nilfs_test_page_to_be_frozen(struct page *page)
1740{
1741 struct address_space *mapping = page->mapping;
1742
1743 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1744 return 0;
1745
1746 if (page_mapped(page)) {
1747 ClearPageChecked(page);
1748 return 1;
1749 }
1750 return PageChecked(page);
1751}
1752
1753static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1754{
1755 if (!page || PageWriteback(page))
1756 /* For split b-tree node pages, this function may be called
1757 more than once; this check ignores the second and later calls. */
1758 return 0;
1759
1760 lock_page(page);
1761 clear_page_dirty_for_io(page);
1762 set_page_writeback(page);
1763 unlock_page(page);
1764
1765 if (nilfs_test_page_to_be_frozen(page)) {
1766 int err = nilfs_copy_replace_page_buffers(page, out);
1767 if (unlikely(err))
1768 return err;
1769 }
1770 return 0;
1771}
1772
1773static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1774 struct page **failed_page)
1775{
1776 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL;
1778 struct list_head *list = &sci->sc_copied_buffers;
1779 int err;
1780
1781 *failed_page = NULL;
1782 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1783 struct buffer_head *bh;
1784
1785 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1786 b_assoc_buffers) {
1787 if (bh->b_page != bd_page) {
1788 if (bd_page) {
1789 lock_page(bd_page);
1790 clear_page_dirty_for_io(bd_page);
1791 set_page_writeback(bd_page);
1792 unlock_page(bd_page);
1793 }
1794 bd_page = bh->b_page;
1795 }
1796 }
1797
1798 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1799 b_assoc_buffers) {
1800 if (bh == sci->sc_super_root) {
1801 if (bh->b_page != bd_page) {
1802 lock_page(bd_page);
1803 clear_page_dirty_for_io(bd_page);
1804 set_page_writeback(bd_page);
1805 unlock_page(bd_page);
1806 bd_page = bh->b_page;
1807 }
1808 break;
1809 }
1810 if (bh->b_page != fs_page) {
1811 err = nilfs_begin_page_io(fs_page, list);
1812 if (unlikely(err)) {
1813 *failed_page = fs_page;
1814 goto out;
1815 }
1816 fs_page = bh->b_page;
1817 }
1818 }
1819 }
1820 if (bd_page) {
1821 lock_page(bd_page);
1822 clear_page_dirty_for_io(bd_page);
1823 set_page_writeback(bd_page);
1824 unlock_page(bd_page);
1825 }
1826 err = nilfs_begin_page_io(fs_page, list);
1827 if (unlikely(err))
1828 *failed_page = fs_page;
1829 out:
1830 return err;
1831}
1832
1833static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1834 struct backing_dev_info *bdi)
1835{
1836 struct nilfs_segment_buffer *segbuf;
1837 struct nilfs_write_info wi;
1838 int err, res;
1839
1840 wi.sb = sci->sc_super;
1841 wi.bh_sr = sci->sc_super_root;
1842 wi.bdi = bdi;
1843
1844 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1845 nilfs_segbuf_prepare_write(segbuf, &wi);
1846 err = nilfs_segbuf_write(segbuf, &wi);
1847
1848 res = nilfs_segbuf_wait(segbuf, &wi);
1849 err = unlikely(err) ? : res;
1850 if (unlikely(err))
1851 return err;
1852 }
1853 return 0;
1854}
1855
1856static int nilfs_page_has_uncleared_buffer(struct page *page)
1857{
1858 struct buffer_head *head, *bh;
1859
1860 head = bh = page_buffers(page);
1861 do {
1862 if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
1863 return 1;
1864 bh = bh->b_this_page;
1865 } while (bh != head);
1866 return 0;
1867}
1868
1869static void __nilfs_end_page_io(struct page *page, int err)
1870{
1871 if (!err) {
1872 if (!nilfs_page_buffers_clean(page))
1873 __set_page_dirty_nobuffers(page);
1874 ClearPageError(page);
1875 } else {
1876 __set_page_dirty_nobuffers(page);
1877 SetPageError(page);
1878 }
1879
1880 if (buffer_nilfs_allocated(page_buffers(page))) {
1881 if (TestClearPageWriteback(page))
1882 dec_zone_page_state(page, NR_WRITEBACK);
1883 } else
1884 end_page_writeback(page);
1885}
1886
1887static void nilfs_end_page_io(struct page *page, int err)
1888{
1889 if (!page)
1890 return;
1891
1892 if (buffer_nilfs_node(page_buffers(page)) &&
1893 nilfs_page_has_uncleared_buffer(page))
1894 /* For b-tree node pages, this function may be called more than
1895 once because their buffers may be split in a segment.
1896 This check ensures that cleanup has been done for all
1897 buffers in a split btnode page. */
1898 return;
1899
1900 __nilfs_end_page_io(page, err);
1901}
1902
1903static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1904{
1905 struct buffer_head *bh, *head;
1906 struct page *page;
1907
1908 while (!list_empty(list)) {
1909 bh = list_entry(list->next, struct buffer_head,
1910 b_assoc_buffers);
1911 page = bh->b_page;
1912 page_cache_get(page);
1913 head = bh = page_buffers(page);
1914 do {
1915 if (!list_empty(&bh->b_assoc_buffers)) {
1916 list_del_init(&bh->b_assoc_buffers);
1917 if (!err) {
1918 set_buffer_uptodate(bh);
1919 clear_buffer_dirty(bh);
1920 clear_buffer_nilfs_volatile(bh);
1921 }
1922 brelse(bh); /* for b_assoc_buffers */
1923 }
1924 } while ((bh = bh->b_this_page) != head);
1925
1926 __nilfs_end_page_io(page, err);
1927 page_cache_release(page);
1928 }
1929}
1930
1931static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1932 struct page *failed_page, int err)
1933{
1934 struct nilfs_segment_buffer *segbuf;
1935 struct page *bd_page = NULL, *fs_page = NULL;
1936
1937 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1938 struct buffer_head *bh;
1939
1940 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1941 b_assoc_buffers) {
1942 if (bh->b_page != bd_page) {
1943 if (bd_page)
1944 end_page_writeback(bd_page);
1945 bd_page = bh->b_page;
1946 }
1947 }
1948
1949 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1950 b_assoc_buffers) {
1951 if (bh == sci->sc_super_root) {
1952 if (bh->b_page != bd_page) {
1953 end_page_writeback(bd_page);
1954 bd_page = bh->b_page;
1955 }
1956 break;
1957 }
1958 if (bh->b_page != fs_page) {
1959 nilfs_end_page_io(fs_page, err);
1960 if (unlikely(fs_page == failed_page))
1961 goto done;
1962 fs_page = bh->b_page;
1963 }
1964 }
1965 }
1966 if (bd_page)
1967 end_page_writeback(bd_page);
1968
1969 nilfs_end_page_io(fs_page, err);
1970 done:
1971 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1972}
1973
1974static void nilfs_set_next_segment(struct the_nilfs *nilfs,
1975 struct nilfs_segment_buffer *segbuf)
1976{
1977 nilfs->ns_segnum = segbuf->sb_segnum;
1978 nilfs->ns_nextnum = segbuf->sb_nextnum;
1979 nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
1980 + segbuf->sb_sum.nblocks;
1981 nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
1982 nilfs->ns_ctime = segbuf->sb_sum.ctime;
1983}
1984
1985static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1986{
1987 struct nilfs_segment_buffer *segbuf;
1988 struct page *bd_page = NULL, *fs_page = NULL;
1989 struct nilfs_sb_info *sbi = sci->sc_sbi;
1990 struct the_nilfs *nilfs = sbi->s_nilfs;
1991 int update_sr = (sci->sc_super_root != NULL);
1992
1993 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1994 struct buffer_head *bh;
1995
1996 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1997 b_assoc_buffers) {
1998 set_buffer_uptodate(bh);
1999 clear_buffer_dirty(bh);
2000 if (bh->b_page != bd_page) {
2001 if (bd_page)
2002 end_page_writeback(bd_page);
2003 bd_page = bh->b_page;
2004 }
2005 }
2006 /*
2007 * We assume that buffers which belong to the same page are
2008 * contiguous in the buffer list.
2009 * Under this assumption, the last BH of each page is
2010 * identifiable by the discontinuity of bh->b_page
2011 * (page != fs_page).
2012 *
2013 * For B-tree node blocks, however, this assumption is not
2014 * guaranteed. The cleanup code for B-tree node pages needs
2015 * special care.
2016 */
2017 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
2018 b_assoc_buffers) {
2019 set_buffer_uptodate(bh);
2020 clear_buffer_dirty(bh);
2021 clear_buffer_nilfs_volatile(bh);
2022 if (bh == sci->sc_super_root) {
2023 if (bh->b_page != bd_page) {
2024 end_page_writeback(bd_page);
2025 bd_page = bh->b_page;
2026 }
2027 break;
2028 }
2029 if (bh->b_page != fs_page) {
2030 nilfs_end_page_io(fs_page, 0);
2031 fs_page = bh->b_page;
2032 }
2033 }
2034
2035 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
2036 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
2037 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2038 sci->sc_lseg_stime = jiffies;
2039 }
2040 if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
2041 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2042 }
2043 }
2044 /*
2045 * Since pages may continue over multiple segment buffers,
2046 * end of the last page must be checked outside of the loop.
2047 */
2048 if (bd_page)
2049 end_page_writeback(bd_page);
2050
2051 nilfs_end_page_io(fs_page, 0);
2052
2053 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
2054
2055 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
2056
2057 if (nilfs_doing_gc()) {
2058 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
2059 if (update_sr)
2060 nilfs_commit_gcdat_inode(nilfs);
2061 } else
2062 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
2063
2064 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2065
2066 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
2067 nilfs_set_next_segment(nilfs, segbuf);
2068
2069 if (update_sr) {
2070 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2071 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2072 sbi->s_super->s_dirt = 1;
2073
2074 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2075 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2076 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2077 } else
2078 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2079}
2080
2081static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2082 struct nilfs_sb_info *sbi)
2083{
2084 struct nilfs_inode_info *ii, *n;
2085 __u64 cno = sbi->s_nilfs->ns_cno;
2086
2087 spin_lock(&sbi->s_inode_lock);
2088 retry:
2089 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
2090 if (!ii->i_bh) {
2091 struct buffer_head *ibh;
2092 int err;
2093
2094 spin_unlock(&sbi->s_inode_lock);
2095 err = nilfs_ifile_get_inode_block(
2096 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
2097 if (unlikely(err)) {
2098 nilfs_warning(sbi->s_super, __func__,
2099 "failed to get inode block.\n");
2100 return err;
2101 }
2102 nilfs_mdt_mark_buffer_dirty(ibh);
2103 nilfs_mdt_mark_dirty(sbi->s_ifile);
2104 spin_lock(&sbi->s_inode_lock);
2105 if (likely(!ii->i_bh))
2106 ii->i_bh = ibh;
2107 else
2108 brelse(ibh);
2109 goto retry;
2110 }
2111 ii->i_cno = cno;
2112
2113 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2114 set_bit(NILFS_I_BUSY, &ii->i_state);
2115 list_del(&ii->i_dirty);
2116 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2117 }
2118 spin_unlock(&sbi->s_inode_lock);
2119
2120 NILFS_I(sbi->s_ifile)->i_cno = cno;
2121
2122 return 0;
2123}
2124
2125static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2126 struct nilfs_sb_info *sbi)
2127{
2128 struct nilfs_transaction_info *ti = current->journal_info;
2129 struct nilfs_inode_info *ii, *n;
2130 __u64 cno = sbi->s_nilfs->ns_cno;
2131
2132 spin_lock(&sbi->s_inode_lock);
2133 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2134 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2135 test_bit(NILFS_I_DIRTY, &ii->i_state)) {
2136 /* The current checkpoint number (=nilfs->ns_cno) is
2137 changed between check-in and check-out only if the
2138 super root is written out. So, we can update i_cno
2139 for the inodes that remain in the dirty list. */
2140 ii->i_cno = cno;
2141 continue;
2142 }
2143 clear_bit(NILFS_I_BUSY, &ii->i_state);
2144 brelse(ii->i_bh);
2145 ii->i_bh = NULL;
2146 list_del(&ii->i_dirty);
2147 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2148 }
2149 spin_unlock(&sbi->s_inode_lock);
2150}
2151
2152/*
2153 * Main procedure of segment constructor
2154 */
2155static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2156{
2157 struct nilfs_sb_info *sbi = sci->sc_sbi;
2158 struct the_nilfs *nilfs = sbi->s_nilfs;
2159 struct page *failed_page;
2160 int err, has_sr = 0;
2161
2162 sci->sc_stage.scnt = NILFS_ST_INIT;
2163
2164 err = nilfs_segctor_check_in_files(sci, sbi);
2165 if (unlikely(err))
2166 goto out;
2167
2168 if (nilfs_test_metadata_dirty(sbi))
2169 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2170
2171 if (nilfs_segctor_clean(sci))
2172 goto out;
2173
2174 do {
2175 sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
2176
2177 err = nilfs_segctor_begin_construction(sci, nilfs);
2178 if (unlikely(err))
2179 goto out;
2180
2181 /* Update time stamp */
2182 sci->sc_seg_ctime = get_seconds();
2183
2184 err = nilfs_segctor_collect(sci, nilfs, mode);
2185 if (unlikely(err))
2186 goto failed;
2187
2188 has_sr = (sci->sc_super_root != NULL);
2189
2190 /* Avoid empty segment */
2191 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2192 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2193 nilfs_segctor_end_construction(sci, nilfs, 1);
2194 goto out;
2195 }
2196
2197 err = nilfs_segctor_assign(sci, mode);
2198 if (unlikely(err))
2199 goto failed;
2200
2201 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2202 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2203
2204 if (has_sr) {
2205 err = nilfs_segctor_fill_in_checkpoint(sci);
2206 if (unlikely(err))
2207 goto failed_to_make_up;
2208
2209 nilfs_segctor_fill_in_super_root(sci, nilfs);
2210 }
2211 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2212
2213 /* Write partial segments */
2214 err = nilfs_segctor_prepare_write(sci, &failed_page);
2215 if (unlikely(err))
2216 goto failed_to_write;
2217
2218 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2219
2220 err = nilfs_segctor_write(sci, nilfs->ns_bdi);
2221 if (unlikely(err))
2222 goto failed_to_write;
2223
2224 nilfs_segctor_complete_write(sci);
2225
2226 /* Commit segments */
2227 if (has_sr) {
2228 nilfs_segctor_commit_free_segments(sci);
2229 nilfs_segctor_clear_metadata_dirty(sci);
2230 }
2231
2232 nilfs_segctor_end_construction(sci, nilfs, 0);
2233
2234 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2235
2236 out:
2237 nilfs_segctor_destroy_segment_buffers(sci);
2238 nilfs_segctor_check_out_files(sci, sbi);
2239 return err;
2240
2241 failed_to_write:
2242 nilfs_segctor_abort_write(sci, failed_page, err);
2243 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2244
2245 failed_to_make_up:
2246 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2247 nilfs_redirty_inodes(&sci->sc_dirty_files);
2248
2249 failed:
2250 if (nilfs_doing_gc())
2251 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2252 nilfs_segctor_end_construction(sci, nilfs, err);
2253 goto out;
2254}
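
Condensed, each pass of the loop above is: begin construction, collect dirty blocks, assign disk addresses, fill in the checkpoint and super root when one is due, update segment usage, then prepare, checksum, write and commit the partial segments, until the stage machine reaches NILFS_ST_DONE. The skeleton below is a restatement of the function for orientation only, with error handling and the empty-segment check elided:

	do {
		nilfs_segctor_begin_construction(sci, nilfs);
		nilfs_segctor_collect(sci, nilfs, mode);
		nilfs_segctor_assign(sci, mode);
		if (sci->sc_super_root) {	/* checkpoint + super root */
			nilfs_segctor_fill_in_checkpoint(sci);
			nilfs_segctor_fill_in_super_root(sci, nilfs);
		}
		nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
		nilfs_segctor_prepare_write(sci, &failed_page);
		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
		nilfs_segctor_write(sci, nilfs->ns_bdi);
		nilfs_segctor_complete_write(sci);
		nilfs_segctor_end_construction(sci, nilfs, 0);
	} while (sci->sc_stage.scnt != NILFS_ST_DONE);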
2255
2256/**
2257 * nilfs_segctor_start_timer - set timer of background write
2258 * @sci: nilfs_sc_info
2259 *
2260 * If the timer has already been set, it ignores the new request.
2261 * This function MUST be called within a section locking the segment
2262 * semaphore.
2263 */
2264static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2265{
2266 spin_lock(&sci->sc_state_lock);
2267 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2268 sci->sc_timer->expires = jiffies + sci->sc_interval;
2269 add_timer(sci->sc_timer);
2270 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2271 }
2272 spin_unlock(&sci->sc_state_lock);
2273}
2274
2275static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2276{
2277 spin_lock(&sci->sc_state_lock);
2278 if (!(sci->sc_flush_request & (1 << bn))) {
2279 unsigned long prev_req = sci->sc_flush_request;
2280
2281 sci->sc_flush_request |= (1 << bn);
2282 if (!prev_req)
2283 wake_up(&sci->sc_wait_daemon);
2284 }
2285 spin_unlock(&sci->sc_state_lock);
2286}
2287
2288/**
2289 * nilfs_flush_segment - trigger a segment construction for resource control
2290 * @sb: super block
2291 * @ino: inode number of the file to be flushed out.
2292 */
2293void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2294{
2295 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2296 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2297
2298 if (!sci || nilfs_doing_construction())
2299 return;
2300 nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
2301 /* assign bit 0 to data files */
2302}
2303
2304int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
2305 __u64 *segnum, size_t nsegs)
2306{
2307 struct nilfs_segment_entry *ent;
2308 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2309 struct inode *sufile = nilfs->ns_sufile;
2310 LIST_HEAD(list);
2311 __u64 *pnum;
2312 size_t i;
2313 int err;
2314
2315 for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
2316 ent = nilfs_alloc_segment_entry(*pnum);
2317 if (unlikely(!ent)) {
2318 err = -ENOMEM;
2319 goto failed;
2320 }
2321 list_add_tail(&ent->list, &list);
2322
2323 err = nilfs_open_segment_entry(ent, sufile);
2324 if (unlikely(err))
2325 goto failed;
2326
2327 if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
2328 printk(KERN_WARNING "NILFS: unused segment is "
2329 "requested to be cleaned (segnum=%llu)\n",
2330 (unsigned long long)ent->segnum);
2331 nilfs_close_segment_entry(ent, sufile);
2332 }
2333 list_splice(&list, sci->sc_cleaning_segments.prev);
2334 return 0;
2335
2336 failed:
2337 nilfs_dispose_segment_list(&list);
2338 return err;
2339}
2340
2341void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
2342{
2343 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2344}
2345
2346struct nilfs_segctor_wait_request {
2347 wait_queue_t wq;
2348 __u32 seq;
2349 int err;
2350 atomic_t done;
2351};
2352
2353static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
2354{
2355 struct nilfs_segctor_wait_request wait_req;
2356 int err = 0;
2357
2358 spin_lock(&sci->sc_state_lock);
2359 init_wait(&wait_req.wq);
2360 wait_req.err = 0;
2361 atomic_set(&wait_req.done, 0);
2362 wait_req.seq = ++sci->sc_seq_request;
2363 spin_unlock(&sci->sc_state_lock);
2364
2365 init_waitqueue_entry(&wait_req.wq, current);
2366 add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
2367 set_current_state(TASK_INTERRUPTIBLE);
2368 wake_up(&sci->sc_wait_daemon);
2369
2370 for (;;) {
2371 if (atomic_read(&wait_req.done)) {
2372 err = wait_req.err;
2373 break;
2374 }
2375 if (!signal_pending(current)) {
2376 schedule();
2377 continue;
2378 }
2379 err = -ERESTARTSYS;
2380 break;
2381 }
2382 finish_wait(&sci->sc_wait_request, &wait_req.wq);
2383 return err;
2384}
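
The sync path is a ticket handshake: each waiter takes a ticket by bumping sc_seq_request under sc_state_lock, sleeps on sc_wait_request, and nilfs_segctor_wakeup() below releases every waiter whose ticket is covered by sc_seq_done (nilfs_cnt32_ge() is the wraparound-safe comparison). A minimal user-space analogue of the same pattern, using pthreads purely for illustration; none of these names exist in NILFS:

	#include <pthread.h>
	#include <stdint.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
	static uint32_t seq_request, seq_done;

	static int sync_wait(void)			/* waiter side */
	{
		uint32_t my_seq;

		pthread_mutex_lock(&lock);
		my_seq = ++seq_request;			/* take a ticket */
		/* a real caller would wake the daemon here */
		while ((int32_t)(seq_done - my_seq) < 0)	/* !cnt32_ge */
			pthread_cond_wait(&done_cv, &lock);
		pthread_mutex_unlock(&lock);
		return 0;
	}

	static void sync_complete(uint32_t new_done)	/* daemon side */
	{
		pthread_mutex_lock(&lock);
		seq_done = new_done;
		pthread_cond_broadcast(&done_cv);
		pthread_mutex_unlock(&lock);
	}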
2385
2386static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2387{
2388 struct nilfs_segctor_wait_request *wrq, *n;
2389 unsigned long flags;
2390
2391 spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
2392 list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
2393 wq.task_list) {
2394 if (!atomic_read(&wrq->done) &&
2395 nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
2396 wrq->err = err;
2397 atomic_set(&wrq->done, 1);
2398 }
2399 if (atomic_read(&wrq->done)) {
2400 wrq->wq.func(&wrq->wq,
2401 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2402 0, NULL);
2403 }
2404 }
2405 spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
2406}
2407
2408/**
2409 * nilfs_construct_segment - construct a logical segment
2410 * @sb: super block
2411 *
2412 * Return Value: On success, 0 is returned. On error, one of the following
2413 * negative error codes is returned.
2414 *
2415 * %-EROFS - Read only filesystem.
2416 *
2417 * %-EIO - I/O error
2418 *
2419 * %-ENOSPC - No space left on device (only in a panic state).
2420 *
2421 * %-ERESTARTSYS - Interrupted.
2422 *
2423 * %-ENOMEM - Insufficient memory available.
2424 */
2425int nilfs_construct_segment(struct super_block *sb)
2426{
2427 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2428 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2429 struct nilfs_transaction_info *ti;
2430 int err;
2431
2432 if (!sci)
2433 return -EROFS;
2434
2435 /* A call inside transactions causes a deadlock. */
2436 BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
2437
2438 err = nilfs_segctor_sync(sci);
2439 return err;
2440}
2441
2442/**
2443 * nilfs_construct_dsync_segment - construct a data-only logical segment
2444 * @sb: super block
2445 * @inode: inode whose data blocks should be written out
2446 * @start: start byte offset
2447 * @end: end byte offset (inclusive)
2448 *
2449 * Return Value: On success, 0 is returned. On error, one of the following
2450 * negative error codes is returned.
2451 *
2452 * %-EROFS - Read only filesystem.
2453 *
2454 * %-EIO - I/O error
2455 *
2456 * %-ENOSPC - No space left on device (only in a panic state).
2457 *
2458 * %-ERESTARTSYS - Interrupted.
2459 *
2460 * %-ENOMEM - Insufficient memory available.
2461 */
2462int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2463 loff_t start, loff_t end)
2464{
2465 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2466 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2467 struct nilfs_inode_info *ii;
2468 struct nilfs_transaction_info ti;
2469 int err = 0;
2470
2471 if (!sci)
2472 return -EROFS;
2473
2474 nilfs_transaction_lock(sbi, &ti, 0);
2475
2476 ii = NILFS_I(inode);
2477 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2478 nilfs_test_opt(sbi, STRICT_ORDER) ||
2479 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2480 nilfs_discontinued(sbi->s_nilfs)) {
2481 nilfs_transaction_unlock(sbi);
2482 err = nilfs_segctor_sync(sci);
2483 return err;
2484 }
2485
2486 spin_lock(&sbi->s_inode_lock);
2487 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2488 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2489 spin_unlock(&sbi->s_inode_lock);
2490 nilfs_transaction_unlock(sbi);
2491 return 0;
2492 }
2493 spin_unlock(&sbi->s_inode_lock);
2494 sci->sc_dsync_inode = ii;
2495 sci->sc_dsync_start = start;
2496 sci->sc_dsync_end = end;
2497
2498 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2499
2500 nilfs_transaction_unlock(sbi);
2501 return err;
2502}
2503
2504struct nilfs_segctor_req {
2505 int mode;
2506 __u32 seq_accepted;
2507 int sc_err; /* construction failure */
2508 int sb_err; /* super block writeback failure */
2509};
2510
2511#define FLUSH_FILE_BIT (0x1) /* data file only */
2512#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
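
sc_flush_request therefore doubles as a small bitmap keyed by inode number: bit 0 aggregates all ordinary data files, while the DAT metadata file owns the bit matching its inode number. Assuming NILFS_DAT_INO == 3 (its value in nilfs2_fs.h), the masks work out as follows:

	/*
	 * FLUSH_FILE_BIT = 0x1            (bit 0: any regular data file)
	 * FLUSH_DAT_BIT  = 1 << 3 = 0x8   (assuming NILFS_DAT_INO == 3)
	 * nilfs_flush_segment() above selects the bit with
	 *	NILFS_MDT_INODE(sb, ino) ? ino : 0
	 */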
2513
2514static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
2515 struct nilfs_segctor_req *req)
2516{
2517 req->sc_err = req->sb_err = 0;
2518 spin_lock(&sci->sc_state_lock);
2519 req->seq_accepted = sci->sc_seq_request;
2520 spin_unlock(&sci->sc_state_lock);
2521
2522 if (sci->sc_timer)
2523 del_timer_sync(sci->sc_timer);
2524}
2525
2526static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2527 struct nilfs_segctor_req *req)
2528{
2529 /* Clear requests (even when the construction failed) */
2530 spin_lock(&sci->sc_state_lock);
2531
2532 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2533
2534 if (req->mode == SC_LSEG_SR) {
2535 sci->sc_seq_done = req->seq_accepted;
2536 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2537 sci->sc_flush_request = 0;
2538 } else if (req->mode == SC_FLUSH_FILE)
2539 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2540 else if (req->mode == SC_FLUSH_DAT)
2541 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2542
2543 spin_unlock(&sci->sc_state_lock);
2544}
2545
2546static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2547 struct nilfs_segctor_req *req)
2548{
2549 struct nilfs_sb_info *sbi = sci->sc_sbi;
2550 struct the_nilfs *nilfs = sbi->s_nilfs;
2551 int err = 0;
2552
2553 if (nilfs_discontinued(nilfs))
2554 req->mode = SC_LSEG_SR;
2555 if (!nilfs_segctor_confirm(sci)) {
2556 err = nilfs_segctor_do_construct(sci, req->mode);
2557 req->sc_err = err;
2558 }
2559 if (likely(!err)) {
2560 if (req->mode != SC_FLUSH_DAT)
2561 atomic_set(&nilfs->ns_ndirtyblks, 0);
2562 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2563 nilfs_discontinued(nilfs)) {
2564 down_write(&nilfs->ns_sem);
2565 req->sb_err = nilfs_commit_super(sbi, 0);
2566 up_write(&nilfs->ns_sem);
2567 }
2568 }
2569 return err;
2570}
2571
2572static void nilfs_construction_timeout(unsigned long data)
2573{
2574 struct task_struct *p = (struct task_struct *)data;
2575 wake_up_process(p);
2576}
2577
2578static void
2579nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2580{
2581 struct nilfs_inode_info *ii, *n;
2582
2583 list_for_each_entry_safe(ii, n, head, i_dirty) {
2584 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2585 continue;
2586 hlist_del_init(&ii->vfs_inode.i_hash);
2587 list_del_init(&ii->i_dirty);
2588 nilfs_clear_gcinode(&ii->vfs_inode);
2589 }
2590}
2591
2592int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2593 void **kbufs)
2594{
2595 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2596 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2597 struct the_nilfs *nilfs = sbi->s_nilfs;
2598 struct nilfs_transaction_info ti;
2599 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2600 int err;
2601
2602 if (unlikely(!sci))
2603 return -EROFS;
2604
2605 nilfs_transaction_lock(sbi, &ti, 1);
2606
2607 err = nilfs_init_gcdat_inode(nilfs);
2608 if (unlikely(err))
2609 goto out_unlock;
2610 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2611 if (unlikely(err))
2612 goto out_unlock;
2613
2614 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
2615
2616 for (;;) {
2617 nilfs_segctor_accept(sci, &req);
2618 err = nilfs_segctor_construct(sci, &req);
2619 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2620 nilfs_segctor_notify(sci, &req);
2621
2622 if (likely(!err))
2623 break;
2624
2625 nilfs_warning(sb, __func__,
2626 "segment construction failed. (err=%d)", err);
2627 set_current_state(TASK_INTERRUPTIBLE);
2628 schedule_timeout(sci->sc_interval);
2629 }
2630
2631 out_unlock:
2632 nilfs_clear_gcdat_inode(nilfs);
2633 nilfs_transaction_unlock(sbi);
2634 return err;
2635}
2636
2637static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2638{
2639 struct nilfs_sb_info *sbi = sci->sc_sbi;
2640 struct nilfs_transaction_info ti;
2641 struct nilfs_segctor_req req = { .mode = mode };
2642
2643 nilfs_transaction_lock(sbi, &ti, 0);
2644
2645 nilfs_segctor_accept(sci, &req);
2646 nilfs_segctor_construct(sci, &req);
2647 nilfs_segctor_notify(sci, &req);
2648
2649 /*
2650 * An unclosed segment should be retried; the timeout of sc_timer
2651 * triggers a complete construction, which closes the current
2652 * logical segment.
2653 */
2654 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2655 nilfs_segctor_start_timer(sci);
2656
2657 nilfs_transaction_unlock(sbi);
2658}
2659
2660static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
2661{
2662 int mode = 0;
2663 int err;
2664
2665 spin_lock(&sci->sc_state_lock);
2666 mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
2667 SC_FLUSH_DAT : SC_FLUSH_FILE;
2668 spin_unlock(&sci->sc_state_lock);
2669
2670 if (mode) {
2671 err = nilfs_segctor_do_construct(sci, mode);
2672
2673 spin_lock(&sci->sc_state_lock);
2674 sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
2675 ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
2676 spin_unlock(&sci->sc_state_lock);
2677 }
2678 clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
2679}
2680
2681static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2682{
2683 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2684 time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
2685 if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
2686 return SC_FLUSH_FILE;
2687 else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
2688 return SC_FLUSH_DAT;
2689 }
2690 return SC_LSEG_SR;
2691}
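
The decision above reads as a small truth table; restated for orientation:

	/*
	 * unclosed segment older than sc_mjcp_freq	-> SC_LSEG_SR
	 * pending bits within FLUSH_FILE_BIT only	-> SC_FLUSH_FILE
	 * pending bits within FLUSH_DAT_BIT only	-> SC_FLUSH_DAT
	 * any other combination of pending bits	-> SC_LSEG_SR
	 */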
2692
2693/**
2694 * nilfs_segctor_thread - main loop of the segment constructor thread.
2695 * @arg: pointer to a struct nilfs_sc_info.
2696 *
2697 * nilfs_segctor_thread() initializes a timer and serves as a daemon
2698 * to execute segment constructions.
2699 */
2700static int nilfs_segctor_thread(void *arg)
2701{
2702 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2703 struct timer_list timer;
2704 int timeout = 0;
2705
2706 init_timer(&timer);
2707 timer.data = (unsigned long)current;
2708 timer.function = nilfs_construction_timeout;
2709 sci->sc_timer = &timer;
2710
2711 /* start sync. */
2712 sci->sc_task = current;
2713 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
2714 printk(KERN_INFO
2715 "segctord starting. Construction interval = %lu seconds, "
2716 "CP frequency < %lu seconds\n",
2717 sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
2718
2719 spin_lock(&sci->sc_state_lock);
2720 loop:
2721 for (;;) {
2722 int mode;
2723
2724 if (sci->sc_state & NILFS_SEGCTOR_QUIT)
2725 goto end_thread;
2726
2727 if (timeout || sci->sc_seq_request != sci->sc_seq_done)
2728 mode = SC_LSEG_SR;
2729 else if (!sci->sc_flush_request)
2730 break;
2731 else
2732 mode = nilfs_segctor_flush_mode(sci);
2733
2734 spin_unlock(&sci->sc_state_lock);
2735 nilfs_segctor_thread_construct(sci, mode);
2736 spin_lock(&sci->sc_state_lock);
2737 timeout = 0;
2738 }
2739
2740
2741 if (freezing(current)) {
2742 spin_unlock(&sci->sc_state_lock);
2743 refrigerator();
2744 spin_lock(&sci->sc_state_lock);
2745 } else {
2746 DEFINE_WAIT(wait);
2747 int should_sleep = 1;
2748
2749 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2750 TASK_INTERRUPTIBLE);
2751
2752 if (sci->sc_seq_request != sci->sc_seq_done)
2753 should_sleep = 0;
2754 else if (sci->sc_flush_request)
2755 should_sleep = 0;
2756 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2757 should_sleep = time_before(jiffies,
2758 sci->sc_timer->expires);
2759
2760 if (should_sleep) {
2761 spin_unlock(&sci->sc_state_lock);
2762 schedule();
2763 spin_lock(&sci->sc_state_lock);
2764 }
2765 finish_wait(&sci->sc_wait_daemon, &wait);
2766 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2767 time_after_eq(jiffies, sci->sc_timer->expires));
2768 }
2769 goto loop;
2770
2771 end_thread:
2772 spin_unlock(&sci->sc_state_lock);
2773 del_timer_sync(sci->sc_timer);
2774 sci->sc_timer = NULL;
2775
2776 /* end sync. */
2777 sci->sc_task = NULL;
2778 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
2779 return 0;
2780}
2781
2782static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2783{
2784 struct task_struct *t;
2785
2786 t = kthread_run(nilfs_segctor_thread, sci, "segctord");
2787 if (IS_ERR(t)) {
2788 int err = PTR_ERR(t);
2789
2790 printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
2791 err);
2792 return err;
2793 }
2794 wait_event(sci->sc_wait_task, sci->sc_task != NULL);
2795 return 0;
2796}
2797
2798static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2799{
2800 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2801
2802 while (sci->sc_task) {
2803 wake_up(&sci->sc_wait_daemon);
2804 spin_unlock(&sci->sc_state_lock);
2805 wait_event(sci->sc_wait_task, sci->sc_task == NULL);
2806 spin_lock(&sci->sc_state_lock);
2807 }
2808}
2809
2810static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2811{
2812 sci->sc_seq_done = sci->sc_seq_request;
2813
2814 return nilfs_segctor_start_thread(sci);
2815}
2816
2817/*
2818 * Setup & clean-up functions
2819 */
2820static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2821{
2822 struct nilfs_sc_info *sci;
2823
2824 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2825 if (!sci)
2826 return NULL;
2827
2828 sci->sc_sbi = sbi;
2829 sci->sc_super = sbi->s_super;
2830
2831 init_waitqueue_head(&sci->sc_wait_request);
2832 init_waitqueue_head(&sci->sc_wait_daemon);
2833 init_waitqueue_head(&sci->sc_wait_task);
2834 spin_lock_init(&sci->sc_state_lock);
2835 INIT_LIST_HEAD(&sci->sc_dirty_files);
2836 INIT_LIST_HEAD(&sci->sc_segbufs);
2837 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2838 INIT_LIST_HEAD(&sci->sc_cleaning_segments);
2839 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2840
2841 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2842 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2843 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2844
2845 if (sbi->s_interval)
2846 sci->sc_interval = sbi->s_interval;
2847 if (sbi->s_watermark)
2848 sci->sc_watermark = sbi->s_watermark;
2849 return sci;
2850}
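
The timing knobs are kept in jiffies. As a worked example, with a (hypothetical) HZ of 250:

	/*
	 * sc_interval  = 250 * NILFS_SC_DEFAULT_TIMEOUT (5)  = 1250 jiffies
	 * sc_mjcp_freq = 250 * NILFS_SC_DEFAULT_SR_FREQ (30) = 7500 jiffies
	 * Non-zero s_interval / s_watermark mount parameters override the
	 * defaults verbatim.
	 */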
2851
2852static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2853{
2854 int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
2855
2856 /* The segctord thread was stopped and its timer was removed.
2857 But pending construction requests may remain. */
2858 do {
2859 struct nilfs_sb_info *sbi = sci->sc_sbi;
2860 struct nilfs_transaction_info ti;
2861 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2862
2863 nilfs_transaction_lock(sbi, &ti, 0);
2864 nilfs_segctor_accept(sci, &req);
2865 ret = nilfs_segctor_construct(sci, &req);
2866 nilfs_segctor_notify(sci, &req);
2867 nilfs_transaction_unlock(sbi);
2868
2869 } while (ret && retrycount-- > 0);
2870}
2871
2872/**
2873 * nilfs_segctor_destroy - destroy the segment constructor.
2874 * @sci: nilfs_sc_info
2875 *
2876 * nilfs_segctor_destroy() kills the segctord thread and frees
2877 * the nilfs_sc_info struct.
2878 * Caller must hold the segment semaphore.
2879 */
2880static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2881{
2882 struct nilfs_sb_info *sbi = sci->sc_sbi;
2883 int flag;
2884
2885 up_write(&sbi->s_nilfs->ns_segctor_sem);
2886
2887 spin_lock(&sci->sc_state_lock);
2888 nilfs_segctor_kill_thread(sci);
2889 flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
2890 || sci->sc_seq_request != sci->sc_seq_done);
2891 spin_unlock(&sci->sc_state_lock);
2892
2893 if (flag || nilfs_segctor_confirm(sci))
2894 nilfs_segctor_write_out(sci);
2895
2896 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2897
2898 if (!list_empty(&sci->sc_dirty_files)) {
2899 nilfs_warning(sbi->s_super, __func__,
2900 "dirty file(s) after the final construction\n");
2901 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
2902 }
2903
2904 if (!list_empty(&sci->sc_cleaning_segments))
2905 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2906
2907 WARN_ON(!list_empty(&sci->sc_segbufs));
2908
2909 down_write(&sbi->s_nilfs->ns_segctor_sem);
2910
2911 kfree(sci);
2912}
2913
2914/**
2915 * nilfs_attach_segment_constructor - attach a segment constructor
2916 * @sbi: nilfs_sb_info
2917 *
2918 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2919 * initializes it, and starts the segment constructor.
2920 *
2921 * Return Value: On success, 0 is returned. On error, one of the following
2922 * negative error codes is returned.
2923 *
2924 * %-ENOMEM - Insufficient memory available.
2925 */
2926int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2927{
2928 struct the_nilfs *nilfs = sbi->s_nilfs;
2929 int err;
2930
2931 /* Each field of nilfs_sc_info starts out zeroed; nilfs_segctor_new()
2932 allocates it with kzalloc() */
2933 sbi->s_sc_info = nilfs_segctor_new(sbi);
2934 if (!sbi->s_sc_info)
2935 return -ENOMEM;
2936
2937 nilfs_attach_writer(nilfs, sbi);
2938 err = nilfs_segctor_init(NILFS_SC(sbi));
2939 if (err) {
2940 nilfs_detach_writer(nilfs, sbi);
2941 kfree(sbi->s_sc_info);
2942 sbi->s_sc_info = NULL;
2943 }
2944 return err;
2945}
2946
2947/**
2948 * nilfs_detach_segment_constructor - destroy the segment constructor
2949 * @sbi: nilfs_sb_info
2950 *
2951 * nilfs_detach_segment_constructor() kills the segment constructor daemon,
2952 * frees the struct nilfs_sc_info, and destroys the dirty file list.
2953 */
2954void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2955{
2956 struct the_nilfs *nilfs = sbi->s_nilfs;
2957 LIST_HEAD(garbage_list);
2958
2959 down_write(&nilfs->ns_segctor_sem);
2960 if (NILFS_SC(sbi)) {
2961 nilfs_segctor_destroy(NILFS_SC(sbi));
2962 sbi->s_sc_info = NULL;
2963 }
2964
2965 /* Forcibly free the list of dirty files */
2966 spin_lock(&sbi->s_inode_lock);
2967 if (!list_empty(&sbi->s_dirty_files)) {
2968 list_splice_init(&sbi->s_dirty_files, &garbage_list);
2969 nilfs_warning(sbi->s_super, __func__,
2970 "Non empty dirty list after the last "
2971 "segment construction\n");
2972 }
2973 spin_unlock(&sbi->s_inode_lock);
2974 up_write(&nilfs->ns_segctor_sem);
2975
2976 nilfs_dispose_list(sbi, &garbage_list, 1);
2977 nilfs_detach_writer(nilfs, sbi);
2978}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 000000000000..476bdd5df5be
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,244 @@
1/*
2 * segment.h - NILFS Segment constructor prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGMENT_H
24#define _NILFS_SEGMENT_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sb.h"
31
32/**
33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint
37 * @ri_lsegs_start: Region for roll-forwarding (start block number)
38 * @ri_lsegs_end: Region for roll-forwarding (end block number)
39 * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
40 * @ri_used_segments: List of segments to be marked active
41 * @ri_pseg_start: Block number of the last partial segment
42 * @ri_seq: Sequence number on the last partial segment
43 * @ri_segnum: Segment number on the last partial segment
44 * @ri_nextnum: Next segment number on the last partial segment
45 */
46struct nilfs_recovery_info {
47 int ri_need_recovery;
48 sector_t ri_super_root;
49 __u64 ri_cno;
50
51 sector_t ri_lsegs_start;
52 sector_t ri_lsegs_end;
53 u64 ri_lsegs_start_seq;
54 struct list_head ri_used_segments;
55 sector_t ri_pseg_start;
56 u64 ri_seq;
57 __u64 ri_segnum;
58 __u64 ri_nextnum;
59};
60
61/* ri_need_recovery */
62#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */
63#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */
64
65/**
66 * struct nilfs_cstage - Context of collection stage
67 * @scnt: Stage count
68 * @flags: State flags
69 * @dirty_file_ptr: Pointer into the dirty_files list, or inode of a target file
70 * @gc_inode_ptr: Pointer into the list of gc-inodes
71 */
72struct nilfs_cstage {
73 int scnt;
74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr;
77};
78
79struct nilfs_segment_buffer;
80
81struct nilfs_segsum_pointer {
82 struct buffer_head *bh;
83 unsigned offset; /* offset in bytes */
84};
85
86/**
87 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct
90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive)
98 * @sc_segbufs: List of segment buffers
99 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
100 * @sc_curseg: Current segment buffer
101 * @sc_super_root: Pointer to the super root buffer
102 * @sc_stage: Collection stage
103 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
104 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
105 * @sc_blk_cnt: Block count of a file
106 * @sc_datablk_cnt: Data block count of a file
107 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
108 * @sc_seg_ctime: Creation time
109 * @sc_flags: Internal flags
110 * @sc_state_lock: spinlock for sc_state and so on
111 * @sc_state: Segctord state flags
112 * @sc_flush_request: inode bitmap of metadata files to be flushed
113 * @sc_wait_request: Client request queue
114 * @sc_wait_daemon: Daemon wait queue
115 * @sc_wait_task: Start/end wait queue to control segctord task
116 * @sc_seq_request: Request counter
117 * @sc_seq_done: Completion counter
118 * @sc_sync: Request of explicit sync operation
119 * @sc_interval: Timeout value of background construction
120 * @sc_mjcp_freq: Frequency of creating checkpoints
121 * @sc_lseg_stime: Start time of the latest logical segment
122 * @sc_watermark: Watermark for the number of dirty buffers
123 * @sc_timer: Timer for segctord
124 * @sc_task: current thread of segctord
125 */
126struct nilfs_sc_info {
127 struct super_block *sc_super;
128 struct nilfs_sb_info *sc_sbi;
129
130 unsigned long sc_nblk_inc;
131
132 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers;
136
137 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start;
139 loff_t sc_dsync_end;
140
141 /* Segment buffers */
142 struct list_head sc_segbufs;
143 unsigned long sc_segbuf_nblocks;
144 struct nilfs_segment_buffer *sc_curseg;
145 struct buffer_head *sc_super_root;
146
147 struct nilfs_cstage sc_stage;
148
149 struct nilfs_segsum_pointer sc_finfo_ptr;
150 struct nilfs_segsum_pointer sc_binfo_ptr;
151 unsigned long sc_blk_cnt;
152 unsigned long sc_datablk_cnt;
153 unsigned long sc_nblk_this_inc;
154 time_t sc_seg_ctime;
155
156 unsigned long sc_flags;
157
158 spinlock_t sc_state_lock;
159 unsigned long sc_state;
160 unsigned long sc_flush_request;
161
162 wait_queue_head_t sc_wait_request;
163 wait_queue_head_t sc_wait_daemon;
164 wait_queue_head_t sc_wait_task;
165
166 __u32 sc_seq_request;
167 __u32 sc_seq_done;
168
169 int sc_sync;
170 unsigned long sc_interval;
171 unsigned long sc_mjcp_freq;
172 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
173 unsigned long sc_watermark;
174
175 struct timer_list *sc_timer;
176 struct task_struct *sc_task;
177};
178
179/* sc_flags */
180enum {
181 NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
182 NILFS_SC_UNCLOSED, /* Logical segment is not closed */
183 NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
184 NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
185 checkpoint */
186 NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
187 other than DAT, cpfile, sufile, or files
188 moved by GC */
189};
190
191/* sc_state */
192#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
193#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
194
195/*
196 * Constant parameters
197 */
198#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
199 destroying segctord */
200
201/*
202 * Default values of timeout, in seconds.
203 */
204#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
205 It triggers construction of a
206 logical segment with a super root */
207#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
208 creation */
209
210/*
211 * The default threshold amount of data, in block counts.
212 */
213#define NILFS_SC_DEFAULT_WATERMARK 3600
214
215
216/* segment.c */
217extern int nilfs_init_transaction_cache(void);
218extern void nilfs_destroy_transaction_cache(void);
219extern void nilfs_relax_pressure_in_lock(struct super_block *);
220
221extern int nilfs_construct_segment(struct super_block *);
222extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
223 loff_t, loff_t);
224extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
226 void **);
227
228extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
229 __u64 *, size_t);
230extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
231
232extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
233extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
234
235/* recovery.c */
236extern int nilfs_read_super_root_block(struct super_block *, sector_t,
237 struct buffer_head **, int);
238extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
239 struct nilfs_recovery_info *);
240extern int nilfs_recover_logical_segments(struct the_nilfs *,
241 struct nilfs_sb_info *,
242 struct nilfs_recovery_info *);
243
244#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 000000000000..98e68677f045
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,558 @@
1/*
2 * sufile.c - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "sufile.h"
31
32
33static inline unsigned long
34nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
35{
36 return NILFS_MDT(sufile)->mi_entries_per_block;
37}
38
39static unsigned long
40nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
41{
42 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
43 do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
44 return (unsigned long)t;
45}
46
47static unsigned long
48nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
49{
50 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
51 return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
52}
53
54static unsigned long
55nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
56 __u64 max)
57{
58 return min_t(unsigned long,
59 nilfs_sufile_segment_usages_per_block(sufile) -
60 nilfs_sufile_get_offset(sufile, curr),
61 max - curr + 1);
62}
63
64static inline struct nilfs_sufile_header *
65nilfs_sufile_block_get_header(const struct inode *sufile,
66 struct buffer_head *bh,
67 void *kaddr)
68{
69 return kaddr + bh_offset(bh);
70}
71
72static struct nilfs_segment_usage *
73nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
74 struct buffer_head *bh, void *kaddr)
75{
76 return kaddr + bh_offset(bh) +
77 nilfs_sufile_get_offset(sufile, segnum) *
78 NILFS_MDT(sufile)->mi_entry_size;
79}
80
81static inline int nilfs_sufile_get_header_block(struct inode *sufile,
82 struct buffer_head **bhp)
83{
84 return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
85}
86
87static inline int
88nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
89 int create, struct buffer_head **bhp)
90{
91 return nilfs_mdt_get_block(sufile,
92 nilfs_sufile_get_blkoff(sufile, segnum),
93 create, NULL, bhp);
94}
95
96static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
97 u64 ncleanadd, u64 ndirtyadd)
98{
99 struct nilfs_sufile_header *header;
100 void *kaddr;
101
102 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
103 header = kaddr + bh_offset(header_bh);
104 le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
105 le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
106 kunmap_atomic(kaddr, KM_USER0);
107
108 nilfs_mdt_mark_buffer_dirty(header_bh);
109}
110
111int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
112 void (*dofunc)(struct inode *, __u64,
113 struct buffer_head *,
114 struct buffer_head *))
115{
116 struct buffer_head *header_bh, *bh;
117 int ret;
118
119 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
120 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
121 __func__, (unsigned long long)segnum);
122 return -EINVAL;
123 }
124 down_write(&NILFS_MDT(sufile)->mi_sem);
125
126 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
127 if (ret < 0)
128 goto out_sem;
129
130 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
131 if (!ret) {
132 dofunc(sufile, segnum, header_bh, bh);
133 brelse(bh);
134 }
135 brelse(header_bh);
136
137 out_sem:
138 up_write(&NILFS_MDT(sufile)->mi_sem);
139 return ret;
140}
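
nilfs_sufile_update() is the shared read-modify-write skeleton: validate @segnum, take mi_sem, read the header block and the usage block, and hand both to the operation-specific @dofunc. The entry points in sufile.h (added later in this patch) are one-line wrappers around it, e.g.:

	err = nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
	err = nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
		/* create == 1 allocates the block holding the usage
		   entry if it does not exist yet */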
141
142/**
143 * nilfs_sufile_alloc - allocate a segment
144 * @sufile: inode of segment usage file
145 * @segnump: pointer to segment number
146 *
147 * Description: nilfs_sufile_alloc() allocates a clean segment.
148 *
149 * Return Value: On success, 0 is returned and the segment number of the
150 * allocated segment is stored in the place pointed by @segnump. On error, one
151 * of the following negative error codes is returned.
152 *
153 * %-EIO - I/O error.
154 *
155 * %-ENOMEM - Insufficient amount of memory available.
156 *
157 * %-ENOSPC - No clean segment left.
158 */
159int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
160{
161 struct buffer_head *header_bh, *su_bh;
162 struct nilfs_sufile_header *header;
163 struct nilfs_segment_usage *su;
164 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
165 __u64 segnum, maxsegnum, last_alloc;
166 void *kaddr;
167 unsigned long nsegments, ncleansegs, nsus;
168 int ret, i, j;
169
170 down_write(&NILFS_MDT(sufile)->mi_sem);
171
172 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
173 if (ret < 0)
174 goto out_sem;
175 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
176 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
177 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
178 last_alloc = le64_to_cpu(header->sh_last_alloc);
179 kunmap_atomic(kaddr, KM_USER0);
180
181 nsegments = nilfs_sufile_get_nsegments(sufile);
182 segnum = last_alloc + 1;
183 maxsegnum = nsegments - 1;
184 for (i = 0; i < nsegments; i += nsus) {
185 if (segnum >= nsegments) {
186 /* wrap around */
187 segnum = 0;
188 maxsegnum = last_alloc;
189 }
190 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
191 &su_bh);
192 if (ret < 0)
193 goto out_header;
194 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
195 su = nilfs_sufile_block_get_segment_usage(
196 sufile, segnum, su_bh, kaddr);
197
198 nsus = nilfs_sufile_segment_usages_in_block(
199 sufile, segnum, maxsegnum);
200 for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
201 if (!nilfs_segment_usage_clean(su))
202 continue;
203 /* found a clean segment */
204 nilfs_segment_usage_set_dirty(su);
205 kunmap_atomic(kaddr, KM_USER0);
206
207 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
208 header = nilfs_sufile_block_get_header(
209 sufile, header_bh, kaddr);
210 le64_add_cpu(&header->sh_ncleansegs, -1);
211 le64_add_cpu(&header->sh_ndirtysegs, 1);
212 header->sh_last_alloc = cpu_to_le64(segnum);
213 kunmap_atomic(kaddr, KM_USER0);
214
215 nilfs_mdt_mark_buffer_dirty(header_bh);
216 nilfs_mdt_mark_buffer_dirty(su_bh);
217 nilfs_mdt_mark_dirty(sufile);
218 brelse(su_bh);
219 *segnump = segnum;
220 goto out_header;
221 }
222
223 kunmap_atomic(kaddr, KM_USER0);
224 brelse(su_bh);
225 }
226
227 /* no segments left */
228 ret = -ENOSPC;
229
230 out_header:
231 brelse(header_bh);
232
233 out_sem:
234 up_write(&NILFS_MDT(sufile)->mi_sem);
235 return ret;
236}
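
The allocation loop is a circular first-fit scan that starts just past sh_last_alloc and wraps once through segment 0. A stripped-down user-space sketch of the same scan over a plain byte array, for illustration only (no such helper exists in NILFS):

	/* Circular first-fit, analogous to the loop in nilfs_sufile_alloc(). */
	static long alloc_clean(const unsigned char *clean, unsigned long nsegs,
				unsigned long last_alloc)
	{
		unsigned long segnum = last_alloc + 1;
		unsigned long i;

		for (i = 0; i < nsegs; i++, segnum++) {
			if (segnum >= nsegs)
				segnum = 0;	/* wrap around */
			if (clean[segnum])
				return segnum;	/* first clean segment */
		}
		return -1;			/* none left: -ENOSPC analogue */
	}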
237
238void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
239 struct buffer_head *header_bh,
240 struct buffer_head *su_bh)
241{
242 struct nilfs_segment_usage *su;
243 void *kaddr;
244
245 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
246 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
247 if (unlikely(!nilfs_segment_usage_clean(su))) {
248 printk(KERN_WARNING "%s: segment %llu must be clean\n",
249 __func__, (unsigned long long)segnum);
250 kunmap_atomic(kaddr, KM_USER0);
251 return;
252 }
253 nilfs_segment_usage_set_dirty(su);
254 kunmap_atomic(kaddr, KM_USER0);
255
256 nilfs_sufile_mod_counter(header_bh, -1, 1);
257 nilfs_mdt_mark_buffer_dirty(su_bh);
258 nilfs_mdt_mark_dirty(sufile);
259}
260
261void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
262 struct buffer_head *header_bh,
263 struct buffer_head *su_bh)
264{
265 struct nilfs_segment_usage *su;
266 void *kaddr;
267 int clean, dirty;
268
269 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
270 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
271 if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
272 su->su_nblocks == cpu_to_le32(0)) {
273 kunmap_atomic(kaddr, KM_USER0);
274 return;
275 }
276 clean = nilfs_segment_usage_clean(su);
277 dirty = nilfs_segment_usage_dirty(su);
278
279 /* make the segment garbage */
280 su->su_lastmod = cpu_to_le64(0);
281 su->su_nblocks = cpu_to_le32(0);
282 su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
283 kunmap_atomic(kaddr, KM_USER0);
284
285 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
286 nilfs_mdt_mark_buffer_dirty(su_bh);
287 nilfs_mdt_mark_dirty(sufile);
288}
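
The counter update above covers each possible starting state of the segment in one call; in tabular form (ncleanadd, ndirtyadd):

	/*
	 * previously clean          : (-1, +1)
	 * previously dirty          : ( 0,  0)
	 * neither (e.g. error flag) : ( 0, +1)
	 */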
289
290void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
291 struct buffer_head *header_bh,
292 struct buffer_head *su_bh)
293{
294 struct nilfs_segment_usage *su;
295 void *kaddr;
296 int sudirty;
297
298 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
299 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
300 if (nilfs_segment_usage_clean(su)) {
301 printk(KERN_WARNING "%s: segment %llu is already clean\n",
302 __func__, (unsigned long long)segnum);
303 kunmap_atomic(kaddr, KM_USER0);
304 return;
305 }
306 WARN_ON(nilfs_segment_usage_error(su));
307 WARN_ON(!nilfs_segment_usage_dirty(su));
308
309 sudirty = nilfs_segment_usage_dirty(su);
310 nilfs_segment_usage_set_clean(su);
311 kunmap_atomic(kaddr, KM_USER0);
312 nilfs_mdt_mark_buffer_dirty(su_bh);
313
314 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
315 nilfs_mdt_mark_dirty(sufile);
316}
317
318/**
319 * nilfs_sufile_get_segment_usage - get a segment usage
320 * @sufile: inode of segment usage file
321 * @segnum: segment number
322 * @sup: pointer to segment usage
323 * @bhp: pointer to buffer head
324 *
325 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
326 * specified by @segnum.
327 *
328 * Return Value: On success, 0 is returned, and the segment usage and the
329 * buffer head of the buffer on which the segment usage is located are stored
330 * in the place pointed by @sup and @bhp, respectively. On error, one of the
331 * following negative error codes is returned.
332 *
333 * %-EIO - I/O error.
334 *
335 * %-ENOMEM - Insufficient amount of memory available.
336 *
337 * %-EINVAL - Invalid segment usage number.
338 */
339int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
340 struct nilfs_segment_usage **sup,
341 struct buffer_head **bhp)
342{
343 struct buffer_head *bh;
344 struct nilfs_segment_usage *su;
345 void *kaddr;
346 int ret;
347
348 /* segnum is zero-based */
349 if (segnum >= nilfs_sufile_get_nsegments(sufile))
350 return -EINVAL;
351 down_write(&NILFS_MDT(sufile)->mi_sem);
352 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
353 if (ret < 0)
354 goto out_sem;
355 kaddr = kmap(bh->b_page);
356 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
357 if (nilfs_segment_usage_error(su)) {
358 kunmap(bh->b_page);
359 brelse(bh);
360 ret = -EINVAL;
361 goto out_sem;
362 }
363
364 if (sup != NULL)
365 *sup = su;
366 *bhp = bh;
367
368 out_sem:
369 up_write(&NILFS_MDT(sufile)->mi_sem);
370 return ret;
371}
372
373/**
374 * nilfs_sufile_put_segment_usage - put a segment usage
375 * @sufile: inode of segment usage file
376 * @segnum: segment number
377 * @bh: buffer head
378 *
379 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
380 * specified by @segnum. @bh must be the buffer head which has been returned
381 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
382 */
383void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
384 struct buffer_head *bh)
385{
386 kunmap(bh->b_page);
387 brelse(bh);
388}
389
390/**
391 * nilfs_sufile_get_stat - get segment usage statistics
392 * @sufile: inode of segment usage file
393 * @stat: pointer to a structure of segment usage statistics
394 *
395 * Description: nilfs_sufile_get_stat() returns information about segment
396 * usage.
397 *
398 * Return Value: On success, 0 is returned, and segment usage information is
399 * stored in the place pointed by @stat. On error, one of the following
400 * negative error codes is returned.
401 *
402 * %-EIO - I/O error.
403 *
404 * %-ENOMEM - Insufficient amount of memory available.
405 */
406int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
407{
408 struct buffer_head *header_bh;
409 struct nilfs_sufile_header *header;
410 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
411 void *kaddr;
412 int ret;
413
414 down_read(&NILFS_MDT(sufile)->mi_sem);
415
416 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
417 if (ret < 0)
418 goto out_sem;
419
420 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
421 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
422 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
423 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
424 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
425 sustat->ss_ctime = nilfs->ns_ctime;
426 sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
427 spin_lock(&nilfs->ns_last_segment_lock);
428 sustat->ss_prot_seq = nilfs->ns_prot_seq;
429 spin_unlock(&nilfs->ns_last_segment_lock);
430 kunmap_atomic(kaddr, KM_USER0);
431 brelse(header_bh);
432
433 out_sem:
434 up_read(&NILFS_MDT(sufile)->mi_sem);
435 return ret;
436}
437
438/**
439 * nilfs_sufile_get_ncleansegs - get the number of clean segments
440 * @sufile: inode of segment usage file
441 * @nsegsp: pointer to the number of clean segments
442 *
443 * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
444 * segments.
445 *
446 * Return Value: On success, 0 is returned and the number of clean segments is
447 * stored in the place pointed by @nsegsp. On error, one of the following
448 * negative error codes is returned.
449 *
450 * %-EIO - I/O error.
451 *
452 * %-ENOMEM - Insufficient amount of memory available.
453 */
454int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
455{
456 struct nilfs_sustat sustat;
457 int ret;
458
459 ret = nilfs_sufile_get_stat(sufile, &sustat);
460 if (ret == 0)
461 *nsegsp = sustat.ss_ncleansegs;
462 return ret;
463}
464
465void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
466 struct buffer_head *header_bh,
467 struct buffer_head *su_bh)
468{
469 struct nilfs_segment_usage *su;
470 void *kaddr;
471 int suclean;
472
473 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
474 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
475 if (nilfs_segment_usage_error(su)) {
476 kunmap_atomic(kaddr, KM_USER0);
477 return;
478 }
479 suclean = nilfs_segment_usage_clean(su);
480 nilfs_segment_usage_set_error(su);
481 kunmap_atomic(kaddr, KM_USER0);
482
483 if (suclean)
484 nilfs_sufile_mod_counter(header_bh, -1, 0);
485 nilfs_mdt_mark_buffer_dirty(su_bh);
486 nilfs_mdt_mark_dirty(sufile);
487}
488
489/**
490 * nilfs_sufile_get_suinfo - fetch segment usage information
491 * @sufile: inode of segment usage file
492 * @segnum: segment number to start looking
493 * @si: array of suinfo
494 * @nsi: size of suinfo array
495 *
496 * Description: nilfs_sufile_get_suinfo() copies the usage information of
497 * up to @nsi segments into the @si array, starting at @segnum.
498 * Return Value: On success, the number of segment usages stored in @si is
499 * returned (which may be less than @nsi). On error, one of the following negative error codes is returned.
500 *
501 * %-EIO - I/O error.
502 *
503 * %-ENOMEM - Insufficient amount of memory available.
504 */
505ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
506 struct nilfs_suinfo *si, size_t nsi)
507{
508 struct buffer_head *su_bh;
509 struct nilfs_segment_usage *su;
510 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
511 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
512 void *kaddr;
513 unsigned long nsegs, segusages_per_block;
514 ssize_t n;
515 int ret, i, j;
516
517 down_read(&NILFS_MDT(sufile)->mi_sem);
518
519 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
520 nsegs = min_t(unsigned long,
521 nilfs_sufile_get_nsegments(sufile) - segnum,
522 nsi);
523 for (i = 0; i < nsegs; i += n, segnum += n) {
524 n = min_t(unsigned long,
525 segusages_per_block -
526 nilfs_sufile_get_offset(sufile, segnum),
527 nsegs - i);
528 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
529 &su_bh);
530 if (ret < 0) {
531 if (ret != -ENOENT)
532 goto out;
533 /* hole */
534 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
535 continue;
536 }
537
538 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
539 su = nilfs_sufile_block_get_segment_usage(
540 sufile, segnum, su_bh, kaddr);
541 for (j = 0; j < n; j++, su = (void *)su + susz) {
542 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
543 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
544 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
545 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
546 if (nilfs_segment_is_active(nilfs, segnum + j))
547 si[i + j].sui_flags |=
548 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
549 }
550 kunmap_atomic(kaddr, KM_USER0);
551 brelse(su_bh);
552 }
553 ret = nsegs;
554
555 out:
556 up_read(&NILFS_MDT(sufile)->mi_sem);
557 return ret;
558}
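
Callers typically drain the whole sufile in fixed-size chunks, advancing the start segment by the returned count. A hedged sketch of such a loop; the chunk size of 64 is arbitrary:

	struct nilfs_suinfo si[64];
	__u64 segnum = 0;
	ssize_t n;

	while ((n = nilfs_sufile_get_suinfo(sufile, segnum, si, 64)) > 0) {
		/* consume si[0] .. si[n - 1] */
		segnum += n;
	}
	/* n == 0: every segment visited; n < 0: -EIO or -ENOMEM */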
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 000000000000..a2e2efd4ade1
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,125 @@
1/*
2 * sufile.h - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_SUFILE_H
24#define _NILFS_SUFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{
35 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
36}
37
38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_get_segment_usage(struct inode *, __u64,
40 struct nilfs_segment_usage **,
41 struct buffer_head **);
42void nilfs_sufile_put_segment_usage(struct inode *, __u64,
43 struct buffer_head *);
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
47 size_t);
48
49int nilfs_sufile_update(struct inode *, __u64, int,
50 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *,
52 struct buffer_head *));
53void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
54 struct buffer_head *);
55void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
56 struct buffer_head *);
57void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
58 struct buffer_head *);
59void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
60 struct buffer_head *);
61
62/**
63 * nilfs_sufile_cancel_free - cancel freeing of a segment
64 * @sufile: inode of segment usage file
65 * @segnum: segment number
66 *
67 * Description: nilfs_sufile_cancel_free() turns the clean segment @segnum back into a dirty segment, cancelling its deallocation.
68 *
69 * Return Value: On success, 0 is returned. On error, one of the following
70 * negative error codes is returned.
71 *
72 * %-EIO - I/O error.
73 *
74 * %-ENOMEM - Insufficient amount of memory available.
75 */
76static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
77{
78 return nilfs_sufile_update(sufile, segnum, 0,
79 nilfs_sufile_do_cancel_free);
80}
81
82/**
83 * nilfs_sufile_scrap - turn a segment into garbage
84 * @sufile: inode of segment usage file
85 * @segnum: segment number to be scrapped
86 */
87static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
88{
89 return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
90}
91
92/**
93 * nilfs_sufile_free - free segment
94 * @sufile: inode of segment usage file
95 * @segnum: segment number to be freed
96 */
97static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
98{
99 return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
100}
101
102/**
103 * nilfs_sufile_set_error - mark a segment as erroneous
104 * @sufile: inode of segment usage file
105 * @segnum: segment number
106 *
107 * Description: nilfs_sufile_set_error() marks the segment specified by
108 * @segnum as erroneous. The error segment will never be used again.
109 *
110 * Return Value: On success, 0 is returned. On error, one of the following
111 * negative error codes is returned.
112 *
113 * %-EIO - I/O error.
114 *
115 * %-ENOMEM - Insufficient amount of memory available.
116 *
117 * %-EINVAL - Invalid segment usage number.
118 */
119static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
120{
121 return nilfs_sufile_update(sufile, segnum, 0,
122 nilfs_sufile_do_set_error);
123}
124
125#endif /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 000000000000..6989b03e97ab
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1326 @@
1/*
2 * super.c - NILFS module and super block management.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22/*
23 * linux/fs/ext2/super.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/inode.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * Big-endian to little-endian byte-swapping/bitmaps by
37 * David S. Miller (davem@caip.rutgers.edu), 1995
38 */
39
40#include <linux/module.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/init.h>
44#include <linux/blkdev.h>
45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h>
50#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include "nilfs.h"
54#include "mdt.h"
55#include "alloc.h"
56#include "page.h"
57#include "cpfile.h"
58#include "ifile.h"
59#include "dat.h"
60#include "segment.h"
61#include "segbuf.h"
62
63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)");
66MODULE_LICENSE("GPL");
67
68static int nilfs_remount(struct super_block *sb, int *flags, char *data);
69static int test_exclusive_mount(struct file_system_type *fs_type,
70 struct block_device *bdev, int flags);
71
72/**
73 * nilfs_error() - report failure condition on a filesystem
74 *
75 * nilfs_error() sets an ERROR_FS flag on the superblock as well as
76 * reporting an error message. It should be called when NILFS detects
77 * inconsistencies or defects in on-disk metadata. For recoverable
78 * errors such as a single-shot I/O error, nilfs_warning() or the printk()
79 * function should be used instead.
80 *
81 * The segment constructor must not call this function because it can
82 * kill itself.
83 */
84void nilfs_error(struct super_block *sb, const char *function,
85 const char *fmt, ...)
86{
87 struct nilfs_sb_info *sbi = NILFS_SB(sb);
88 va_list args;
89
90 va_start(args, fmt);
91 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
92 vprintk(fmt, args);
93 printk("\n");
94 va_end(args);
95
96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs;
98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS;
105 nilfs->ns_sbp[0]->s_state |=
106 cpu_to_le16(NILFS_ERROR_FS);
107 nilfs_commit_super(sbi, 1);
108 }
109 up_write(&nilfs->ns_sem);
110
111 if (nilfs_test_opt(sbi, ERRORS_RO)) {
112 printk(KERN_CRIT "Remounting filesystem read-only\n");
113 sb->s_flags |= MS_RDONLY;
114 }
115 }
116
117 if (nilfs_test_opt(sbi, ERRORS_PANIC))
118 panic("NILFS (device %s): panic forced after error\n",
119 sb->s_id);
120}
121
122void nilfs_warning(struct super_block *sb, const char *function,
123 const char *fmt, ...)
124{
125 va_list args;
126
127 va_start(args, fmt);
128 printk(KERN_WARNING "NILFS warning (device %s): %s: ",
129 sb->s_id, function);
130 vprintk(fmt, args);
131 printk("\n");
132 va_end(args);
133}
134
135static struct kmem_cache *nilfs_inode_cachep;
136
137struct inode *nilfs_alloc_inode(struct super_block *sb)
138{
139 struct nilfs_inode_info *ii;
140
141 ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
142 if (!ii)
143 return NULL;
144 ii->i_bh = NULL;
145 ii->i_state = 0;
146 ii->vfs_inode.i_version = 1;
147 nilfs_btnode_cache_init(&ii->i_btnode_cache);
148 return &ii->vfs_inode;
149}
150
151void nilfs_destroy_inode(struct inode *inode)
152{
153 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
154}
155
156static void init_once(void *obj)
157{
158 struct nilfs_inode_info *ii = obj;
159
160 INIT_LIST_HEAD(&ii->i_dirty);
161#ifdef CONFIG_NILFS_XATTR
162 init_rwsem(&ii->xattr_sem);
163#endif
164 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
165 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
166 inode_init_once(&ii->vfs_inode);
167}
168
169static int nilfs_init_inode_cache(void)
170{
171 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
172 sizeof(struct nilfs_inode_info),
173 0, SLAB_RECLAIM_ACCOUNT,
174 init_once);
175
176 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
177}
178
179static inline void nilfs_destroy_inode_cache(void)
180{
181 kmem_cache_destroy(nilfs_inode_cachep);
182}
183
184static void nilfs_clear_inode(struct inode *inode)
185{
186 struct nilfs_inode_info *ii = NILFS_I(inode);
187
188#ifdef CONFIG_NILFS_POSIX_ACL
189 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
190 posix_acl_release(ii->i_acl);
191 ii->i_acl = NILFS_ACL_NOT_CACHED;
192 }
193 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
194 posix_acl_release(ii->i_default_acl);
195 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
196 }
197#endif
198 /*
199 * Free resources allocated in nilfs_read_inode(), here.
200 */
201 BUG_ON(!list_empty(&ii->i_dirty));
202 brelse(ii->i_bh);
203 ii->i_bh = NULL;
204
205 if (test_bit(NILFS_I_BMAP, &ii->i_state))
206 nilfs_bmap_clear(ii->i_bmap);
207
208 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
209}
210
211static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
212{
213 struct the_nilfs *nilfs = sbi->s_nilfs;
214 int err;
215 int barrier_done = 0;
216
217 if (nilfs_test_opt(sbi, BARRIER)) {
218 set_buffer_ordered(nilfs->ns_sbh[0]);
219 barrier_done = 1;
220 }
221 retry:
222 set_buffer_dirty(nilfs->ns_sbh[0]);
223 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
224 if (err == -EOPNOTSUPP && barrier_done) {
225 nilfs_warning(sbi->s_super, __func__,
226 "barrier-based sync failed. "
227 "disabling barriers\n");
228 nilfs_clear_opt(sbi, BARRIER);
229 barrier_done = 0;
230 clear_buffer_ordered(nilfs->ns_sbh[0]);
231 goto retry;
232 }
233 if (unlikely(err)) {
234 printk(KERN_ERR
235 "NILFS: unable to write superblock (err=%d)\n", err);
236 if (err == -EIO && nilfs->ns_sbh[1]) {
237 nilfs_fall_back_super_block(nilfs);
238 goto retry;
239 }
240 } else {
241 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
242
243 /*
244 * The latest segment becomes traceable from the position
245 * recorded in the superblock.
246 */
247 clear_nilfs_discontinued(nilfs);
248
249 /* update GC protection for recent segments */
250 if (nilfs->ns_sbh[1]) {
251 sbp = NULL;
252 if (dupsb) {
253 set_buffer_dirty(nilfs->ns_sbh[1]);
254 if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
255 sbp = nilfs->ns_sbp[1];
256 }
257 }
258 if (sbp) {
259 spin_lock(&nilfs->ns_last_segment_lock);
260 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
261 spin_unlock(&nilfs->ns_last_segment_lock);
262 }
263 }
264
265 return err;
266}
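
/*
 * A note on the retry logic above: with the "barrier" option enabled,
 * the primary superblock buffer is flagged for an ordered (barrier)
 * write. If the device rejects that with -EOPNOTSUPP, the option is
 * cleared and the write retried as a plain sync; if the primary copy
 * fails with -EIO and a spare exists, the spare is promoted via
 * nilfs_fall_back_super_block() and the write retried once more.
 */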
267
268int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
269{
270 struct the_nilfs *nilfs = sbi->s_nilfs;
271 struct nilfs_super_block **sbp = nilfs->ns_sbp;
272 sector_t nfreeblocks;
273 time_t t;
274 int err;
275
276 /* nilfs->sem must be locked by the caller. */
277 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
278 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
279 nilfs_swap_super_block(nilfs);
280 else {
281 printk(KERN_CRIT "NILFS: superblock is broken on dev %s\n",
282 sbi->s_super->s_id);
283 return -EIO;
284 }
285 }
286 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
287 if (unlikely(err)) {
288 printk(KERN_ERR "NILFS: failed to count free blocks\n");
289 return err;
290 }
291 spin_lock(&nilfs->ns_last_segment_lock);
292 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
293 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
294 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
295 spin_unlock(&nilfs->ns_last_segment_lock);
296
297 t = get_seconds();
298 nilfs->ns_sbwtime[0] = t;
299 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
300 sbp[0]->s_wtime = cpu_to_le64(t);
301 sbp[0]->s_sum = 0;
302 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
303 (unsigned char *)sbp[0],
304 nilfs->ns_sbsize));
305 if (dupsb && sbp[1]) {
306 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
307 nilfs->ns_sbwtime[1] = t;
308 }
309 sbi->s_super->s_dirt = 0;
310 return nilfs_sync_super(sbi, dupsb);
311}
312
313static void nilfs_put_super(struct super_block *sb)
314{
315 struct nilfs_sb_info *sbi = NILFS_SB(sb);
316 struct the_nilfs *nilfs = sbi->s_nilfs;
317
318 nilfs_detach_segment_constructor(sbi);
319
320 if (!(sb->s_flags & MS_RDONLY)) {
321 down_write(&nilfs->ns_sem);
322 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
323 nilfs_commit_super(sbi, 1);
324 up_write(&nilfs->ns_sem);
325 }
326
327 nilfs_detach_checkpoint(sbi);
328 put_nilfs(sbi->s_nilfs);
329 sbi->s_super = NULL;
330 sb->s_fs_info = NULL;
331 kfree(sbi);
332}
333
334/**
335 * nilfs_write_super - write super block(s) of NILFS
336 * @sb: super_block
337 *
338 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
339 * clears s_dirt. This function is called in the section protected by
340 * lock_super().
341 *
342 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
343 * of the struct the_nilfs. Lock order must be as follows:
344 *
345 * 1. lock_super()
346 * 2. down_write(&nilfs->ns_sem)
347 *
348 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
349 * of the super block (nilfs->ns_sbp[]).
350 *
351 * In most cases, VFS functions call lock_super() before calling these
352 * methods. So we must be careful not to bring on deadlocks when using
353 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
354 *
355 * Note that the order of lock_kernel() and lock_super() depends on the
356 * VFS call context. Also, lock_kernel() may be taken again inside a
357 * section it already protects; only the outermost call has an effect.
358 */
359static void nilfs_write_super(struct super_block *sb)
360{
361 struct nilfs_sb_info *sbi = NILFS_SB(sb);
362 struct the_nilfs *nilfs = sbi->s_nilfs;
363
364 down_write(&nilfs->ns_sem);
365 if (!(sb->s_flags & MS_RDONLY)) {
366 struct nilfs_super_block **sbp = nilfs->ns_sbp;
367 u64 t = get_seconds();
368 int dupsb;
369
370 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
371 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
372 up_write(&nilfs->ns_sem);
373 return;
374 }
375 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
376 nilfs_commit_super(sbi, dupsb);
377 }
378 sb->s_dirt = 0;
379 up_write(&nilfs->ns_sem);
380}
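
/*
 * Timing of the rewrite above: the superblock is left untouched if the
 * log is still continuous and the primary copy was written less than
 * NILFS_SB_FREQ (10 s) ago; the spare copy is refreshed only when it
 * is older than NILFS_ALTSB_FREQ (60 s). See the definitions in
 * the_nilfs.h.
 */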
381
382static int nilfs_sync_fs(struct super_block *sb, int wait)
383{
384 int err = 0;
385
386 /* This function is called when super block should be written back */
387 if (wait)
388 err = nilfs_construct_segment(sb);
389 return err;
390}
391
392int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
393{
394 struct the_nilfs *nilfs = sbi->s_nilfs;
395 struct nilfs_checkpoint *raw_cp;
396 struct buffer_head *bh_cp;
397 int err;
398
399 down_write(&nilfs->ns_sem);
400 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_sem);
402
403 sbi->s_ifile = nilfs_mdt_new(
404 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
405 if (!sbi->s_ifile)
406 return -ENOMEM;
407
408 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
409 if (unlikely(err))
410 goto failed;
411
412 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
413 &bh_cp);
414 if (unlikely(err)) {
415 if (err == -ENOENT || err == -EINVAL) {
416 printk(KERN_ERR
417 "NILFS: Invalid checkpoint "
418 "(checkpoint number=%llu)\n",
419 (unsigned long long)cno);
420 err = -EINVAL;
421 }
422 goto failed;
423 }
424 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
425 if (unlikely(err))
426 goto failed_bh;
427 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
428 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
429
430 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
431 return 0;
432
433 failed_bh:
434 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
435 failed:
436 nilfs_mdt_destroy(sbi->s_ifile);
437 sbi->s_ifile = NULL;
438
439 down_write(&nilfs->ns_sem);
440 list_del_init(&sbi->s_list);
441 up_write(&nilfs->ns_sem);
442
443 return err;
444}
445
446void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
447{
448 struct the_nilfs *nilfs = sbi->s_nilfs;
449
450 nilfs_mdt_clear(sbi->s_ifile);
451 nilfs_mdt_destroy(sbi->s_ifile);
452 sbi->s_ifile = NULL;
453 down_write(&nilfs->ns_sem);
454 list_del_init(&sbi->s_list);
455 up_write(&nilfs->ns_sem);
456}
457
458static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
459{
460 struct the_nilfs *nilfs = sbi->s_nilfs;
461 int err = 0;
462
463 down_write(&nilfs->ns_sem);
464 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
465 nilfs->ns_mount_state |= NILFS_VALID_FS;
466 err = nilfs_commit_super(sbi, 1);
467 if (likely(!err))
468 printk(KERN_INFO "NILFS: recovery complete.\n");
469 }
470 up_write(&nilfs->ns_sem);
471 return err;
472}
473
474static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
475{
476 struct super_block *sb = dentry->d_sb;
477 struct nilfs_sb_info *sbi = NILFS_SB(sb);
478 struct the_nilfs *nilfs = sbi->s_nilfs;
479 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
480 unsigned long long blocks;
481 unsigned long overhead;
482 unsigned long nrsvblocks;
483 sector_t nfreeblocks;
484 int err;
485
486 /*
487 * Compute all of the segment blocks
488 *
489 * The blocks before the first segment and after the last segment
490 * are excluded.
491 */
492 blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
493 - nilfs->ns_first_data_block;
494 nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
495
496 /*
497 * Compute the overhead
498 *
499 * When metadata blocks are distributed outside the segment
500 * structure, we must count them as overhead.
501 */
502 overhead = 0;
503
504 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
505 if (unlikely(err))
506 return err;
507
508 buf->f_type = NILFS_SUPER_MAGIC;
509 buf->f_bsize = sb->s_blocksize;
510 buf->f_blocks = blocks - overhead;
511 buf->f_bfree = nfreeblocks;
512 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
513 (buf->f_bfree - nrsvblocks) : 0;
514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN;
517 buf->f_fsid.val[0] = (u32)id;
518 buf->f_fsid.val[1] = (u32)(id >> 32);
519
520 return 0;
521}
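
/*
 * Worked example of the computation above: with 2048 blocks per
 * segment, ns_nsegments = 1000, ns_first_data_block = 1 and
 * ns_nrsvsegs = 50, we get blocks = 2048 * 1000 - 1 = 2047999 and
 * nrsvblocks = 50 * 2048 = 102400; f_bavail then reports the free
 * blocks minus the reserved ones, clamped at zero.
 */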
522
523static struct super_operations nilfs_sops = {
524 .alloc_inode = nilfs_alloc_inode,
525 .destroy_inode = nilfs_destroy_inode,
526 .dirty_inode = nilfs_dirty_inode,
527 /* .write_inode = nilfs_write_inode, */
528 /* .put_inode = nilfs_put_inode, */
529 /* .drop_inode = nilfs_drop_inode, */
530 .delete_inode = nilfs_delete_inode,
531 .put_super = nilfs_put_super,
532 .write_super = nilfs_write_super,
533 .sync_fs = nilfs_sync_fs,
534 /* .write_super_lockfs */
535 /* .unlockfs */
536 .statfs = nilfs_statfs,
537 .remount_fs = nilfs_remount,
538 .clear_inode = nilfs_clear_inode,
539 /* .umount_begin */
540 /* .show_options */
541};
542
543static struct inode *
544nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
545{
546 struct inode *inode;
547
548 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
549 ino != NILFS_SKETCH_INO)
550 return ERR_PTR(-ESTALE);
551
552 inode = nilfs_iget(sb, ino);
553 if (IS_ERR(inode))
554 return ERR_CAST(inode);
555 if (generation && inode->i_generation != generation) {
556 iput(inode);
557 return ERR_PTR(-ESTALE);
558 }
559
560 return inode;
561}
562
563static struct dentry *
564nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
565 int fh_type)
566{
567 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
568 nilfs_nfs_get_inode);
569}
570
571static struct dentry *
572nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
573 int fh_type)
574{
575 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
576 nilfs_nfs_get_inode);
577}
578
579static struct export_operations nilfs_export_ops = {
580 .fh_to_dentry = nilfs_fh_to_dentry,
581 .fh_to_parent = nilfs_fh_to_parent,
582 .get_parent = nilfs_get_parent,
583};
584
585enum {
586 Opt_err_cont, Opt_err_panic, Opt_err_ro,
587 Opt_barrier, Opt_snapshot, Opt_order,
588 Opt_err,
589};
590
591static match_table_t tokens = {
592 {Opt_err_cont, "errors=continue"},
593 {Opt_err_panic, "errors=panic"},
594 {Opt_err_ro, "errors=remount-ro"},
595 {Opt_barrier, "barrier=%s"},
596 {Opt_snapshot, "cp=%u"},
597 {Opt_order, "order=%s"},
598 {Opt_err, NULL}
599};
600
601static int match_bool(substring_t *s, int *result)
602{
603 int len = s->to - s->from;
604
605 if (strncmp(s->from, "on", len) == 0)
606 *result = 1;
607 else if (strncmp(s->from, "off", len) == 0)
608 *result = 0;
609 else
610 return 1;
611 return 0;
612}
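
/*
 * Note that match_bool() compares only the length of the supplied
 * substring, so a prefix such as "barrier=o" is accepted as "on" and
 * "barrier=of" as "off"; exact spellings are not enforced here.
 */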
613
614static int parse_options(char *options, struct super_block *sb)
615{
616 struct nilfs_sb_info *sbi = NILFS_SB(sb);
617 char *p;
618 substring_t args[MAX_OPT_ARGS];
619 int option;
620
621 if (!options)
622 return 1;
623
624 while ((p = strsep(&options, ",")) != NULL) {
625 int token;
626 if (!*p)
627 continue;
628
629 token = match_token(p, tokens, args);
630 switch (token) {
631 case Opt_barrier:
632 if (match_bool(&args[0], &option))
633 return 0;
634 if (option)
635 nilfs_set_opt(sbi, BARRIER);
636 else
637 nilfs_clear_opt(sbi, BARRIER);
638 break;
639 case Opt_order:
640 if (strcmp(args[0].from, "relaxed") == 0)
641 /* Ordered data semantics */
642 nilfs_clear_opt(sbi, STRICT_ORDER);
643 else if (strcmp(args[0].from, "strict") == 0)
644 /* Strict in-order semantics */
645 nilfs_set_opt(sbi, STRICT_ORDER);
646 else
647 return 0;
648 break;
649 case Opt_err_panic:
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
651 break;
652 case Opt_err_ro:
653 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
654 break;
655 case Opt_err_cont:
656 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
657 break;
658 case Opt_snapshot:
659 if (match_int(&args[0], &option) || option <= 0)
660 return 0;
661 if (!(sb->s_flags & MS_RDONLY))
662 return 0;
663 sbi->s_snapshot_cno = option;
664 nilfs_set_opt(sbi, SNAPSHOT);
665 break;
666 default:
667 printk(KERN_ERR
668 "NILFS: Unrecognized mount option \"%s\"\n", p);
669 return 0;
670 }
671 }
672 return 1;
673}
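
/*
 * Examples of option strings accepted by the parser above:
 * "errors=remount-ro,barrier=off,order=strict" on any mount, and
 * "cp=7" to mount checkpoint number 7, which is only permitted
 * together with MS_RDONLY (a read-only mount).
 */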
674
675static inline void
676nilfs_set_default_options(struct nilfs_sb_info *sbi,
677 struct nilfs_super_block *sbp)
678{
679 sbi->s_mount_opt =
680 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
681}
682
683static int nilfs_setup_super(struct nilfs_sb_info *sbi)
684{
685 struct the_nilfs *nilfs = sbi->s_nilfs;
686 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
687 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
688 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
689
690 /* nilfs->sem must be locked by the caller. */
691 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
692 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
693 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
694 printk(KERN_WARNING
695 "NILFS warning: mounting fs with errors\n");
696#if 0
697 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
698 printk(KERN_WARNING
699 "NILFS warning: maximal mount count reached\n");
700#endif
701 }
702 if (!max_mnt_count)
703 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
704
705 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
706 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
707 sbp->s_mtime = cpu_to_le64(get_seconds());
708 return nilfs_commit_super(sbi, 1);
709}
710
711struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
712 u64 pos, int blocksize,
713 struct buffer_head **pbh)
714{
715 unsigned long long sb_index = pos;
716 unsigned long offset;
717
718 offset = do_div(sb_index, blocksize);
719 *pbh = sb_bread(sb, sb_index);
720 if (!*pbh)
721 return NULL;
722 return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
723}
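
/*
 * The do_div() call above splits a 64-bit byte position into a block
 * index and an intra-block offset: do_div(n, base) divides n in place
 * and returns the remainder. For example, pos = 5120 with a 4096-byte
 * block size yields sb_index = 1 and offset = 1024.
 */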
724
725int nilfs_store_magic_and_option(struct super_block *sb,
726 struct nilfs_super_block *sbp,
727 char *data)
728{
729 struct nilfs_sb_info *sbi = NILFS_SB(sb);
730
731 sb->s_magic = le16_to_cpu(sbp->s_magic);
732
733 /* FS independent flags */
734#ifdef NILFS_ATIME_DISABLE
735 sb->s_flags |= MS_NOATIME;
736#endif
737
738 nilfs_set_default_options(sbi, sbp);
739
740 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
741 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
742 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
743 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
744
745 return !parse_options(data, sb) ? -EINVAL : 0 ;
746}
747
748/**
749 * nilfs_fill_super() - initialize a super block instance
750 * @sb: super_block
751 * @data: mount options
752 * @silent: silent mode flag
753 * @nilfs: the_nilfs struct
754 *
755 * This function is called exclusively while bd_mount_sem is held.
756 * So, the recovery process is protected from other simultaneous mounts.
757 */
758static int
759nilfs_fill_super(struct super_block *sb, void *data, int silent,
760 struct the_nilfs *nilfs)
761{
762 struct nilfs_sb_info *sbi;
763 struct inode *root;
764 __u64 cno;
765 int err;
766
767 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
768 if (!sbi)
769 return -ENOMEM;
770
771 sb->s_fs_info = sbi;
772
773 get_nilfs(nilfs);
774 sbi->s_nilfs = nilfs;
775 sbi->s_super = sb;
776
777 err = init_nilfs(nilfs, sbi, (char *)data);
778 if (err)
779 goto failed_sbi;
780
781 spin_lock_init(&sbi->s_inode_lock);
782 INIT_LIST_HEAD(&sbi->s_dirty_files);
783 INIT_LIST_HEAD(&sbi->s_list);
784
785 /*
786 * The following initializations are redundant because the
787 * nilfs_sb_info structure was zeroed on allocation. We keep
788 * them explicit for readability and to stay ready for future
789 * changes.
790 */
791 get_random_bytes(&sbi->s_next_generation,
792 sizeof(sbi->s_next_generation));
793 spin_lock_init(&sbi->s_next_gen_lock);
794
795 sb->s_op = &nilfs_sops;
796 sb->s_export_op = &nilfs_export_ops;
797 sb->s_root = NULL;
798 sb->s_time_gran = 1;
799
800 if (!nilfs_loaded(nilfs)) {
801 err = load_nilfs(nilfs, sbi);
802 if (err)
803 goto failed_sbi;
804 }
805 cno = nilfs_last_cno(nilfs);
806
807 if (sb->s_flags & MS_RDONLY) {
808 if (nilfs_test_opt(sbi, SNAPSHOT)) {
809 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
810 sbi->s_snapshot_cno);
811 if (err < 0)
812 goto failed_sbi;
813 if (!err) {
814 printk(KERN_ERR
815 "NILFS: The specified checkpoint is "
816 "not a snapshot "
817 "(checkpoint number=%llu).\n",
818 (unsigned long long)sbi->s_snapshot_cno);
819 err = -EINVAL;
820 goto failed_sbi;
821 }
822 cno = sbi->s_snapshot_cno;
823 } else
824 /* Read-only mount */
825 sbi->s_snapshot_cno = cno;
826 }
827
828 err = nilfs_attach_checkpoint(sbi, cno);
829 if (err) {
830 printk(KERN_ERR "NILFS: error loading a checkpoint"
831 " (checkpoint number=%llu).\n", (unsigned long long)cno);
832 goto failed_sbi;
833 }
834
835 if (!(sb->s_flags & MS_RDONLY)) {
836 err = nilfs_attach_segment_constructor(sbi);
837 if (err)
838 goto failed_checkpoint;
839 }
840
841 root = nilfs_iget(sb, NILFS_ROOT_INO);
842 if (IS_ERR(root)) {
843 printk(KERN_ERR "NILFS: get root inode failed\n");
844 err = PTR_ERR(root);
845 goto failed_segctor;
846 }
847 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
848 iput(root);
849 printk(KERN_ERR "NILFS: corrupt root inode.\n");
850 err = -EINVAL;
851 goto failed_segctor;
852 }
853 sb->s_root = d_alloc_root(root);
854 if (!sb->s_root) {
855 iput(root);
856 printk(KERN_ERR "NILFS: get root dentry failed\n");
857 err = -ENOMEM;
858 goto failed_segctor;
859 }
860
861 if (!(sb->s_flags & MS_RDONLY)) {
862 down_write(&nilfs->ns_sem);
863 nilfs_setup_super(sbi);
864 up_write(&nilfs->ns_sem);
865 }
866
867 err = nilfs_mark_recovery_complete(sbi);
868 if (unlikely(err)) {
869 printk(KERN_ERR "NILFS: recovery failed.\n");
870 goto failed_root;
871 }
872
873 return 0;
874
875 failed_root:
876 dput(sb->s_root);
877 sb->s_root = NULL;
878
879 failed_segctor:
880 nilfs_detach_segment_constructor(sbi);
881
882 failed_checkpoint:
883 nilfs_detach_checkpoint(sbi);
884
885 failed_sbi:
886 put_nilfs(nilfs);
887 sb->s_fs_info = NULL;
888 kfree(sbi);
889 return err;
890}
891
892static int nilfs_remount(struct super_block *sb, int *flags, char *data)
893{
894 struct nilfs_sb_info *sbi = NILFS_SB(sb);
895 struct nilfs_super_block *sbp;
896 struct the_nilfs *nilfs = sbi->s_nilfs;
897 unsigned long old_sb_flags;
898 struct nilfs_mount_options old_opts;
899 int err;
900
901 old_sb_flags = sb->s_flags;
902 old_opts.mount_opt = sbi->s_mount_opt;
903 old_opts.snapshot_cno = sbi->s_snapshot_cno;
904
905 if (!parse_options(data, sb)) {
906 err = -EINVAL;
907 goto restore_opts;
908 }
909 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
910
911 if ((*flags & MS_RDONLY) &&
912 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
913 printk(KERN_WARNING "NILFS (device %s): couldn't "
914 "remount to a different snapshot. \n",
915 sb->s_id);
916 err = -EINVAL;
917 goto restore_opts;
918 }
919
920 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
921 goto out;
922 if (*flags & MS_RDONLY) {
923 /* Shutting down the segment constructor */
924 nilfs_detach_segment_constructor(sbi);
925 sb->s_flags |= MS_RDONLY;
926
927 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
928 /* nilfs_set_opt(sbi, SNAPSHOT); */
929
930 /*
931 * Remounting a valid RW partition RDONLY, so set
932 * the RDONLY flag and then mark the partition as valid again.
933 */
934 down_write(&nilfs->ns_sem);
935 sbp = nilfs->ns_sbp[0];
936 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
937 (nilfs->ns_mount_state & NILFS_VALID_FS))
938 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
939 sbp->s_mtime = cpu_to_le64(get_seconds());
940 nilfs_commit_super(sbi, 1);
941 up_write(&nilfs->ns_sem);
942 } else {
943 /*
944 * Mounting a RDONLY partition read-write, so reread and
945 * store the current valid flag. (It may have been changed
946 * by fsck since we originally mounted the partition.)
947 */
948 down(&sb->s_bdev->bd_mount_sem);
949 /* Check existing RW-mount */
950 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
951 printk(KERN_WARNING "NILFS (device %s): couldn't "
952 "remount because a RW-mount exists.\n",
953 sb->s_id);
954 err = -EBUSY;
955 goto rw_remount_failed;
956 }
957 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
958 printk(KERN_WARNING "NILFS (device %s): couldn't "
959 "remount because the current RO-mount is not "
960 "the latest one.\n",
961 sb->s_id);
962 err = -EINVAL;
963 goto rw_remount_failed;
964 }
965 sb->s_flags &= ~MS_RDONLY;
966 nilfs_clear_opt(sbi, SNAPSHOT);
967 sbi->s_snapshot_cno = 0;
968
969 err = nilfs_attach_segment_constructor(sbi);
970 if (err)
971 goto rw_remount_failed;
972
973 down_write(&nilfs->ns_sem);
974 nilfs_setup_super(sbi);
975 up_write(&nilfs->ns_sem);
976
977 up(&sb->s_bdev->bd_mount_sem);
978 }
979 out:
980 return 0;
981
982 rw_remount_failed:
983 up(&sb->s_bdev->bd_mount_sem);
984 restore_opts:
985 sb->s_flags = old_sb_flags;
986 sbi->s_mount_opt = old_opts.mount_opt;
987 sbi->s_snapshot_cno = old_opts.snapshot_cno;
988 return err;
989}
990
991struct nilfs_super_data {
992 struct block_device *bdev;
993 __u64 cno;
994 int flags;
995};
996
997/**
998 * nilfs_identify - pre-read mount options needed to identify mount instance
999 * @data: mount options
1000 * @sd: nilfs_super_data
1001 */
1002static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1003{
1004 char *p, *options = data;
1005 substring_t args[MAX_OPT_ARGS];
1006 int option, token;
1007 int ret = 0;
1008
1009 do {
1010 p = strsep(&options, ",");
1011 if (p != NULL && *p) {
1012 token = match_token(p, tokens, args);
1013 if (token == Opt_snapshot) {
1014 if (!(sd->flags & MS_RDONLY))
1015 ret++;
1016 else {
1017 ret = match_int(&args[0], &option);
1018 if (!ret) {
1019 if (option > 0)
1020 sd->cno = option;
1021 else
1022 ret++;
1023 }
1024 }
1025 }
1026 if (ret)
1027 printk(KERN_ERR
1028 "NILFS: invalid mount option: %s\n", p);
1029 }
1030 if (!options)
1031 break;
1032 BUG_ON(options == data);
1033 *(options - 1) = ',';
1034 } while (!ret);
1035 return ret;
1036}
1037
1038static int nilfs_set_bdev_super(struct super_block *s, void *data)
1039{
1040 struct nilfs_super_data *sd = data;
1041
1042 s->s_bdev = sd->bdev;
1043 s->s_dev = s->s_bdev->bd_dev;
1044 return 0;
1045}
1046
1047static int nilfs_test_bdev_super(struct super_block *s, void *data)
1048{
1049 struct nilfs_super_data *sd = data;
1050
1051 return s->s_bdev == sd->bdev;
1052}
1053
1054static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1055{
1056 struct nilfs_super_data *sd = data;
1057 int ret;
1058
1059 if (s->s_bdev != sd->bdev)
1060 return 0;
1061
1062 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1063 return 1; /* Reuse an old R/W-mode super_block */
1064
1065 if (s->s_flags & sd->flags & MS_RDONLY) {
1066 if (down_read_trylock(&s->s_umount)) {
1067 ret = s->s_root &&
1068 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1069 up_read(&s->s_umount);
1070 /*
1071 * This path is locked with sb_lock by sget().
1072 * So, drop_super() would cause a deadlock.
1073 */
1074 return ret;
1075 }
1076 }
1077 return 0;
1078}
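
/*
 * Summary of the match above: two R/W mounts of the same device share
 * one super_block; an R/W and an R/O mount never match; two R/O mounts
 * match only when they refer to the same snapshot checkpoint number.
 */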
1079
1080static int
1081nilfs_get_sb(struct file_system_type *fs_type, int flags,
1082 const char *dev_name, void *data, struct vfsmount *mnt)
1083{
1084 struct nilfs_super_data sd;
1085 struct super_block *s, *s2;
1086 struct the_nilfs *nilfs = NULL;
1087 int err, need_to_close = 1;
1088
1089 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
1090 if (IS_ERR(sd.bdev))
1091 return PTR_ERR(sd.bdev);
1092
1093 /*
1094 * To get a mount instance through the sget() VFS routine, NILFS
1095 * needs more information than ordinary filesystems to identify a
1096 * mount instance. For snapshot mounts, not only the mount type
1097 * (ro-mount or rw-mount) but also a checkpoint number is required.
1098 * The results are passed to sget() via nilfs_super_data.
1099 */
1100 sd.cno = 0;
1101 sd.flags = flags;
1102 if (nilfs_identify((char *)data, &sd)) {
1103 err = -EINVAL;
1104 goto failed;
1105 }
1106
1107 /*
1108 * once the super is inserted into the list by sget, s_umount
1109 * will protect the lockfs code from trying to start a snapshot
1110 * while we are mounting
1111 */
1112 down(&sd.bdev->bd_mount_sem);
1113 if (!sd.cno &&
1114 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1115 err = (err < 0) ? : -EBUSY;
1116 goto failed_unlock;
1117 }
1118
1119 /*
1120 * Phase-1: search any existent instance and get the_nilfs
1121 */
1122 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1123 if (IS_ERR(s))
1124 goto error_s;
1125
1126 if (!s->s_root) {
1127 err = -ENOMEM;
1128 nilfs = alloc_nilfs(sd.bdev);
1129 if (!nilfs)
1130 goto cancel_new;
1131 } else {
1132 struct nilfs_sb_info *sbi = NILFS_SB(s);
1133
1134 /*
1135 * s_umount protects super_block from unmount process;
1136 * It covers pointers of nilfs_sb_info and the_nilfs.
1137 */
1138 nilfs = sbi->s_nilfs;
1139 get_nilfs(nilfs);
1140 up_write(&s->s_umount);
1141
1142 /*
1143 * Phase-2: search specified snapshot or R/W mode super_block
1144 */
1145 if (!sd.cno)
1146 /* trying to get the latest checkpoint. */
1147 sd.cno = nilfs_last_cno(nilfs);
1148
1149 s2 = sget(fs_type, nilfs_test_bdev_super2,
1150 nilfs_set_bdev_super, &sd);
1151 deactivate_super(s);
1152 /*
1153 * deactivate_super() invokes close_bdev_exclusive() through
1154 * kill_block_super(), but here s is an existing mount, so we
1155 * need one more close_bdev_exclusive() call.
1156 */
1157 s = s2;
1158 if (IS_ERR(s))
1159 goto error_s;
1160 }
1161
1162 if (!s->s_root) {
1163 char b[BDEVNAME_SIZE];
1164
1165 s->s_flags = flags;
1166 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1167 sb_set_blocksize(s, block_size(sd.bdev));
1168
1169 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
1170 if (err)
1171 goto cancel_new;
1172
1173 s->s_flags |= MS_ACTIVE;
1174 need_to_close = 0;
1175 } else if (!(s->s_flags & MS_RDONLY)) {
1176 err = -EBUSY;
1177 }
1178
1179 up(&sd.bdev->bd_mount_sem);
1180 put_nilfs(nilfs);
1181 if (need_to_close)
1182 close_bdev_exclusive(sd.bdev, flags);
1183 simple_set_mnt(mnt, s);
1184 return 0;
1185
1186 error_s:
1187 up(&sd.bdev->bd_mount_sem);
1188 if (nilfs)
1189 put_nilfs(nilfs);
1190 close_bdev_exclusive(sd.bdev, flags);
1191 return PTR_ERR(s);
1192
1193 failed_unlock:
1194 up(&sd.bdev->bd_mount_sem);
1195 failed:
1196 close_bdev_exclusive(sd.bdev, flags);
1197
1198 return err;
1199
1200 cancel_new:
1201 /* Abandoning the newly allocated superblock */
1202 up(&sd.bdev->bd_mount_sem);
1203 if (nilfs)
1204 put_nilfs(nilfs);
1205 up_write(&s->s_umount);
1206 deactivate_super(s);
1207 /*
1208 * deactivate_super() invokes close_bdev_exclusive().
1209 * We must finish all remaining cleanup before this call;
1210 * put_nilfs() and unlocking bd_mount_sem need the block device.
1211 */
1212 return err;
1213}
1214
1215static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1216{
1217 struct nilfs_super_data *sd = data;
1218 int ret;
1219
1220 if (s->s_bdev != sd->bdev)
1221 return 0;
1222 if (down_read_trylock(&s->s_umount)) {
1223 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1224 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1225 up_read(&s->s_umount);
1226 if (ret)
1227 return 0; /* ignore snapshot mounts */
1228 }
1229 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1230}
1231
1232static int __false_bdev_super(struct super_block *s, void *data)
1233{
1234#if 0 /* XXX: workaround for lock debug. This is not a good idea. */
1235 up_write(&s->s_umount);
1236#endif
1237 return -EFAULT;
1238}
1239
1240/**
1241 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1242 * @fs_type: filesystem type
1243 * @bdev: block device
1244 * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1245 *
1246 * Return Value: 1 if such a mount exists, 0 if not, or a negative error
1247 * code. This function must be called with bd_mount_sem held.
1248 */
1249static int test_exclusive_mount(struct file_system_type *fs_type,
1250 struct block_device *bdev, int flags)
1251{
1252 struct super_block *s;
1253 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1254
1255 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1256 if (IS_ERR(s)) {
1257 if (PTR_ERR(s) != -EFAULT)
1258 return PTR_ERR(s);
1259 return 0; /* Not found */
1260 }
1261 up_write(&s->s_umount);
1262 deactivate_super(s);
1263 return 1; /* Found */
1264}
1265
1266struct file_system_type nilfs_fs_type = {
1267 .owner = THIS_MODULE,
1268 .name = "nilfs2",
1269 .get_sb = nilfs_get_sb,
1270 .kill_sb = kill_block_super,
1271 .fs_flags = FS_REQUIRES_DEV,
1272};
1273
1274static int __init init_nilfs_fs(void)
1275{
1276 int err;
1277
1278 err = nilfs_init_inode_cache();
1279 if (err)
1280 goto failed;
1281
1282 err = nilfs_init_transaction_cache();
1283 if (err)
1284 goto failed_inode_cache;
1285
1286 err = nilfs_init_segbuf_cache();
1287 if (err)
1288 goto failed_transaction_cache;
1289
1290 err = nilfs_btree_path_cache_init();
1291 if (err)
1292 goto failed_segbuf_cache;
1293
1294 err = register_filesystem(&nilfs_fs_type);
1295 if (err)
1296 goto failed_btree_path_cache;
1297
1298 return 0;
1299
1300 failed_btree_path_cache:
1301 nilfs_btree_path_cache_destroy();
1302
1303 failed_segbuf_cache:
1304 nilfs_destroy_segbuf_cache();
1305
1306 failed_transaction_cache:
1307 nilfs_destroy_transaction_cache();
1308
1309 failed_inode_cache:
1310 nilfs_destroy_inode_cache();
1311
1312 failed:
1313 return err;
1314}
1315
1316static void __exit exit_nilfs_fs(void)
1317{
1318 nilfs_destroy_segbuf_cache();
1319 nilfs_destroy_transaction_cache();
1320 nilfs_destroy_inode_cache();
1321 nilfs_btree_path_cache_destroy();
1322 unregister_filesystem(&nilfs_fs_type);
1323}
1324
1325module_init(init_nilfs_fs)
1326module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000000..7f65b3be4aa9
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,641 @@
1/*
2 * the_nilfs.c - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/slab.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/crc32.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "alloc.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h"
37
38void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno)
40{
41 spin_lock(&nilfs->ns_last_segment_lock);
42 nilfs->ns_last_pseg = start_blocknr;
43 nilfs->ns_last_seq = seq;
44 nilfs->ns_last_cno = cno;
45 spin_unlock(&nilfs->ns_last_segment_lock);
46}
47
48/**
49 * alloc_nilfs - allocate the_nilfs structure
50 * @bdev: block device to which the_nilfs is related
51 *
52 * alloc_nilfs() allocates memory for the_nilfs and
53 * initializes its reference count and locks.
54 *
55 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned.
57 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{
60 struct the_nilfs *nilfs;
61
62 nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
63 if (!nilfs)
64 return NULL;
65
66 nilfs->ns_bdev = bdev;
67 atomic_set(&nilfs->ns_count, 1);
68 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem);
71 mutex_init(&nilfs->ns_writer_mutex);
72 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL;
75 init_rwsem(&nilfs->ns_segctor_sem);
76
77 return nilfs;
78}
79
80/**
81 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released
83 *
84 * put_nilfs() decrements a reference counter of the_nilfs.
85 * If the reference count reaches zero, the_nilfs is freed.
86 */
87void put_nilfs(struct the_nilfs *nilfs)
88{
89 if (!atomic_dec_and_test(&nilfs->ns_count))
90 return;
91 /*
92 * Increment of ns_count never occurs below because the caller
93 * of get_nilfs() holds at least one reference to the_nilfs.
94 * Thus its exclusion control is not required here.
95 */
96 might_sleep();
97 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile);
99 nilfs_mdt_destroy(nilfs->ns_sufile);
100 nilfs_mdt_clear(nilfs->ns_cpfile);
101 nilfs_mdt_destroy(nilfs->ns_cpfile);
102 nilfs_mdt_clear(nilfs->ns_dat);
103 nilfs_mdt_destroy(nilfs->ns_dat);
104 /* XXX: how and when to clear nilfs->ns_gc_dat? */
105 nilfs_mdt_destroy(nilfs->ns_gc_dat);
106 }
107 if (nilfs_init(nilfs)) {
108 nilfs_destroy_gccache(nilfs);
109 brelse(nilfs->ns_sbh[0]);
110 brelse(nilfs->ns_sbh[1]);
111 }
112 kfree(nilfs);
113}
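
/*
 * get_nilfs(), the counterpart of put_nilfs(), is not shown in this
 * hunk; given the atomic_set(&nilfs->ns_count, 1) in alloc_nilfs() it
 * is presumably a plain atomic increment, roughly:
 *
 *	static inline void get_nilfs(struct the_nilfs *nilfs)
 *	{
 *		atomic_inc(&nilfs->ns_count);
 *	}
 */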
114
115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{
118 static struct lock_class_key dat_lock_key;
119 struct buffer_head *bh_sr;
120 struct nilfs_super_root *raw_sr;
121 struct nilfs_super_block **sbp = nilfs->ns_sbp;
122 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
123 unsigned inode_size;
124 int err;
125
126 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
127 if (unlikely(err))
128 return err;
129
130 down_read(&nilfs->ns_sem);
131 dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
132 checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
133 segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
134 up_read(&nilfs->ns_sem);
135
136 inode_size = nilfs->ns_inode_size;
137
138 err = -ENOMEM;
139 nilfs->ns_dat = nilfs_mdt_new(
140 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
141 if (unlikely(!nilfs->ns_dat))
142 goto failed;
143
144 nilfs->ns_gc_dat = nilfs_mdt_new(
145 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
146 if (unlikely(!nilfs->ns_gc_dat))
147 goto failed_dat;
148
149 nilfs->ns_cpfile = nilfs_mdt_new(
150 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
151 if (unlikely(!nilfs->ns_cpfile))
152 goto failed_gc_dat;
153
154 nilfs->ns_sufile = nilfs_mdt_new(
155 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
156 if (unlikely(!nilfs->ns_sufile))
157 goto failed_cpfile;
158
159 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
160 if (unlikely(err))
161 goto failed_sufile;
162
163 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
164 if (unlikely(err))
165 goto failed_sufile;
166
167 lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
168 lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
169
170 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
171 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
172 sizeof(struct nilfs_cpfile_header));
173 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
174 sizeof(struct nilfs_sufile_header));
175
176 err = nilfs_mdt_read_inode_direct(
177 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
178 if (unlikely(err))
179 goto failed_sufile;
180
181 err = nilfs_mdt_read_inode_direct(
182 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
183 if (unlikely(err))
184 goto failed_sufile;
185
186 err = nilfs_mdt_read_inode_direct(
187 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
188 if (unlikely(err))
189 goto failed_sufile;
190
191 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
192 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
193
194 failed:
195 brelse(bh_sr);
196 return err;
197
198 failed_sufile:
199 nilfs_mdt_destroy(nilfs->ns_sufile);
200
201 failed_cpfile:
202 nilfs_mdt_destroy(nilfs->ns_cpfile);
203
204 failed_gc_dat:
205 nilfs_mdt_destroy(nilfs->ns_gc_dat);
206
207 failed_dat:
208 nilfs_mdt_destroy(nilfs->ns_dat);
209 goto failed;
210}
211
212static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
213{
214 memset(ri, 0, sizeof(*ri));
215 INIT_LIST_HEAD(&ri->ri_used_segments);
216}
217
218static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
219{
220 nilfs_dispose_segment_list(&ri->ri_used_segments);
221}
222
223/**
224 * load_nilfs - load and recover the nilfs
225 * @nilfs: the_nilfs structure to be loaded
226 * @sbi: nilfs_sb_info used to recover past segment
227 *
228 * load_nilfs() searches for and loads the latest super root,
229 * attaches the last segment, and does recovery if needed.
230 * The caller must serialize calls to this function for simultaneous mounts.
231 */
232int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
233{
234 struct nilfs_recovery_info ri;
235 unsigned int s_flags = sbi->s_super->s_flags;
236 int really_read_only = bdev_read_only(nilfs->ns_bdev);
237 unsigned valid_fs;
238 int err = 0;
239
240 nilfs_init_recovery_info(&ri);
241
242 down_write(&nilfs->ns_sem);
243 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
244 up_write(&nilfs->ns_sem);
245
246 if (!valid_fs && (s_flags & MS_RDONLY)) {
247 printk(KERN_INFO "NILFS: recovery "
248 "required for readonly filesystem.\n");
249 if (really_read_only) {
250 printk(KERN_ERR "NILFS: write access "
251 "unavailable, cannot proceed.\n");
252 err = -EROFS;
253 goto failed;
254 }
255 printk(KERN_INFO "NILFS: write access will "
256 "be enabled during recovery.\n");
257 sbi->s_super->s_flags &= ~MS_RDONLY;
258 }
259
260 err = nilfs_search_super_root(nilfs, sbi, &ri);
261 if (unlikely(err)) {
262 printk(KERN_ERR "NILFS: error searching super root.\n");
263 goto failed;
264 }
265
266 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
267 if (unlikely(err)) {
268 printk(KERN_ERR "NILFS: error loading super root.\n");
269 goto failed;
270 }
271
272 if (!valid_fs) {
273 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
274 if (unlikely(err)) {
275 nilfs_mdt_destroy(nilfs->ns_cpfile);
276 nilfs_mdt_destroy(nilfs->ns_sufile);
277 nilfs_mdt_destroy(nilfs->ns_dat);
278 goto failed;
279 }
280 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
281 sbi->s_super->s_dirt = 1;
282 }
283
284 set_nilfs_loaded(nilfs);
285
286 failed:
287 nilfs_clear_recovery_info(&ri);
288 sbi->s_super->s_flags = s_flags;
289 return err;
290}
291
292static unsigned long long nilfs_max_size(unsigned int blkbits)
293{
294 unsigned int max_bits;
295 unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
296
297 max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
298 if (max_bits < 64)
299 res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
300 return res;
301}
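
/*
 * In other words, the file size limit is
 * min(MAX_LFS_FILESIZE, 2^(blkbits + NILFS_BMAP_KEY_BIT) - 1),
 * where the bmap term only binds when blkbits plus the key width is
 * below 64; e.g. a hypothetical 40-bit key with 4 KiB blocks
 * (blkbits = 12) would cap files at 2^52 - 1 bytes.
 */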
302
303static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
304 struct nilfs_super_block *sbp)
305{
306 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
307 printk(KERN_ERR "NILFS: revision mismatch "
308 "(superblock rev.=%d.%d, current rev.=%d.%d). "
309 "Please check the version of mkfs.nilfs.\n",
310 le32_to_cpu(sbp->s_rev_level),
311 le16_to_cpu(sbp->s_minor_rev_level),
312 NILFS_CURRENT_REV, NILFS_MINOR_REV);
313 return -EINVAL;
314 }
315 nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
316 if (nilfs->ns_sbsize > BLOCK_SIZE)
317 return -EINVAL;
318
319 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
320 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
321
322 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
323 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
324 printk(KERN_ERR "NILFS: segment is too short.\n");
325 return -EINVAL;
326 }
327
328 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
329 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
330 nilfs->ns_r_segments_percentage =
331 le32_to_cpu(sbp->s_r_segments_percentage);
332 nilfs->ns_nrsvsegs =
333 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
334 DIV_ROUND_UP(nilfs->ns_nsegments *
335 nilfs->ns_r_segments_percentage, 100));
336 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
337 return 0;
338}
339
340static int nilfs_valid_sb(struct nilfs_super_block *sbp)
341{
342 static unsigned char sum[4];
343 const int sumoff = offsetof(struct nilfs_super_block, s_sum);
344 size_t bytes;
345 u32 crc;
346
347 if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
348 return 0;
349 bytes = le16_to_cpu(sbp->s_bytes);
350 if (bytes > BLOCK_SIZE)
351 return 0;
352 crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
353 sumoff);
354 crc = crc32_le(crc, sum, 4);
355 crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
356 bytes - sumoff - 4);
357 return crc == le32_to_cpu(sbp->s_sum);
358}
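
/*
 * This is the read-side counterpart of the checksumming done in
 * nilfs_commit_super(): there s_sum is zeroed before the CRC is taken
 * over the whole block, while here the stored s_sum is skipped and the
 * four bytes of the zero-initialized sum[] array are fed in its place,
 * so both sides checksum identical byte streams.
 */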
359
360static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
361{
362 return offset < ((le64_to_cpu(sbp->s_nsegments) *
363 le32_to_cpu(sbp->s_blocks_per_segment)) <<
364 (le32_to_cpu(sbp->s_log_block_size) + 10));
365}
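
/*
 * Here s_log_block_size is the block size as a power of two relative
 * to 1 KiB, hence the "+ 10" to obtain a byte shift. An sb2 offset is
 * considered bad when it falls inside the area covered by segments,
 * i.e. the secondary superblock must lie beyond the last segment.
 */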
366
367static void nilfs_release_super_block(struct the_nilfs *nilfs)
368{
369 int i;
370
371 for (i = 0; i < 2; i++) {
372 if (nilfs->ns_sbp[i]) {
373 brelse(nilfs->ns_sbh[i]);
374 nilfs->ns_sbh[i] = NULL;
375 nilfs->ns_sbp[i] = NULL;
376 }
377 }
378}
379
380void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
381{
382 brelse(nilfs->ns_sbh[0]);
383 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
384 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
385 nilfs->ns_sbh[1] = NULL;
386 nilfs->ns_sbp[1] = NULL;
387}
388
389void nilfs_swap_super_block(struct the_nilfs *nilfs)
390{
391 struct buffer_head *tsbh = nilfs->ns_sbh[0];
392 struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
393
394 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
395 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
396 nilfs->ns_sbh[1] = tsbh;
397 nilfs->ns_sbp[1] = tsbp;
398}
399
400static int nilfs_load_super_block(struct the_nilfs *nilfs,
401 struct super_block *sb, int blocksize,
402 struct nilfs_super_block **sbpp)
403{
404 struct nilfs_super_block **sbp = nilfs->ns_sbp;
405 struct buffer_head **sbh = nilfs->ns_sbh;
406 u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
407 int valid[2], swp = 0;
408
409 sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
410 &sbh[0]);
411 sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
412
413 if (!sbp[0]) {
414 if (!sbp[1]) {
415 printk(KERN_ERR "NILFS: unable to read superblock\n");
416 return -EIO;
417 }
418 printk(KERN_WARNING
419 "NILFS warning: unable to read primary superblock\n");
420 } else if (!sbp[1])
421 printk(KERN_WARNING
422 "NILFS warning: unable to read secondary superblock\n");
423
424 valid[0] = nilfs_valid_sb(sbp[0]);
425 valid[1] = nilfs_valid_sb(sbp[1]);
426 swp = valid[1] &&
427 (!valid[0] ||
428 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
429
430 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
431 brelse(sbh[1]);
432 sbh[1] = NULL;
433 sbp[1] = NULL;
434 swp = 0;
435 }
436 if (!valid[swp]) {
437 nilfs_release_super_block(nilfs);
438 printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
439 sb->s_id);
440 return -EINVAL;
441 }
442
443 if (swp) {
444 printk(KERN_WARNING "NILFS warning: broken superblock. "
445 "using spare superblock.\n");
446 nilfs_swap_super_block(nilfs);
447 }
448
449 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
450 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
451 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
452 *sbpp = sbp[0];
453 return 0;
454}
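
/*
 * Selection of the working copy above, as a small truth table
 * (subject to the sb2 offset sanity check):
 *
 *	valid[0] valid[1]  result
 *	   1        0      use sbp[0]
 *	   0        1      swap, use the spare
 *	   1        1      use whichever has the newer s_wtime
 *	   0        0      fail with -EINVAL
 */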
455
456/**
457 * init_nilfs - initialize a NILFS instance.
458 * @nilfs: the_nilfs structure
459 * @sbi: nilfs_sb_info
460 * @sb: super block
461 * @data: mount options
462 *
463 * init_nilfs() performs common initialization per block device (e.g.
464 * reading the super block, getting disk layout information, initializing
465 * shared fields in the_nilfs). It takes on some portion of the jobs
466 * typically done by a fill_super() routine. This division arises
467 * because multiple NILFS instances may be mounted simultaneously
468 * on the same device.
469 * For multiple mounts on the same device, only the first mount
470 * performs these tasks.
471 *
472 * Return Value: On success, 0 is returned. On error, a negative error
473 * code is returned.
474 */
475int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
476{
477 struct super_block *sb = sbi->s_super;
478 struct nilfs_super_block *sbp;
479 struct backing_dev_info *bdi;
480 int blocksize;
481 int err;
482
483 down_write(&nilfs->ns_sem);
484 if (nilfs_init(nilfs)) {
485 /* Load values from existing the_nilfs */
486 sbp = nilfs->ns_sbp[0];
487 err = nilfs_store_magic_and_option(sb, sbp, data);
488 if (err)
489 goto out;
490
491 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
492 if (sb->s_blocksize != blocksize &&
493 !sb_set_blocksize(sb, blocksize)) {
494 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
495 blocksize);
496 err = -EINVAL;
497 }
498 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
499 goto out;
500 }
501
502 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
503 if (!blocksize) {
504 printk(KERN_ERR "NILFS: unable to set blocksize\n");
505 err = -EINVAL;
506 goto out;
507 }
508 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
509 if (err)
510 goto out;
511
512 err = nilfs_store_magic_and_option(sb, sbp, data);
513 if (err)
514 goto failed_sbh;
515
516 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
517 if (sb->s_blocksize != blocksize) {
518 int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
519
520 if (blocksize < hw_blocksize) {
521 printk(KERN_ERR
522 "NILFS: blocksize %d too small for device "
523 "(sector-size = %d).\n",
524 blocksize, hw_blocksize);
525 err = -EINVAL;
526 goto failed_sbh;
527 }
528 nilfs_release_super_block(nilfs);
529 sb_set_blocksize(sb, blocksize);
530
531 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
532 if (err)
533 goto out;
534 /* not failed_sbh; sbh is released automatically
535 when reloading fails. */
536 }
537 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
538
539 err = nilfs_store_disk_layout(nilfs, sbp);
540 if (err)
541 goto failed_sbh;
542
543 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
544
545 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
546
547 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
548 if (!bdi)
549 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
550 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
551
552 /* Finding last segment */
553 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
554 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
555 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
556
557 nilfs->ns_seg_seq = nilfs->ns_last_seq;
558 nilfs->ns_segnum =
559 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
560 nilfs->ns_cno = nilfs->ns_last_cno + 1;
561 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
562 printk(KERN_ERR "NILFS: invalid last segment number.\n");
563 err = -EINVAL;
564 goto failed_sbh;
565 }
566 /* Dummy values */
567 nilfs->ns_free_segments_count =
568 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
569
570 /* Initialize gcinode cache */
571 err = nilfs_init_gccache(nilfs);
572 if (err)
573 goto failed_sbh;
574
575 set_nilfs_init(nilfs);
576 err = 0;
577 out:
578 up_write(&nilfs->ns_sem);
579 return err;
580
581 failed_sbh:
582 nilfs_release_super_block(nilfs);
583 goto out;
584}
585
586int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
587{
588 struct inode *dat = nilfs_dat_inode(nilfs);
589 unsigned long ncleansegs;
590 int err;
591
592 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
593 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
594 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
595 if (likely(!err))
596 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
597 return err;
598}
599
600int nilfs_near_disk_full(struct the_nilfs *nilfs)
601{
602 struct inode *sufile = nilfs->ns_sufile;
603 unsigned long ncleansegs, nincsegs;
604 int ret;
605
606 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
607 if (likely(!ret)) {
608 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
609 nilfs->ns_blocks_per_segment + 1;
610 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
611 ret++;
612 }
613 return ret;
614}
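
/*
 * Worked example: with 2048-block segments and 3000 dirty blocks
 * pending, nincsegs = 3000 / 2048 + 1 = 2, so the filesystem is
 * reported as nearly full once no more than ns_nrsvsegs + 2 clean
 * segments remain.
 */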
615
616int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
617 int snapshot_mount)
618{
619 struct nilfs_sb_info *sbi;
620 int ret = 0;
621
622 down_read(&nilfs->ns_sem);
623 if (cno == 0 || cno > nilfs->ns_cno)
624 goto out_unlock;
625
626 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
627 if (sbi->s_snapshot_cno == cno &&
628 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
629 /* exclude read-only mounts */
630 ret++;
631 break;
632 }
633 }
634 /* for protecting recent checkpoints */
635 if (cno >= nilfs_last_cno(nilfs))
636 ret++;
637
638 out_unlock:
639 up_read(&nilfs->ns_sem);
640 return ret;
641}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000000..30fe58778d05
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
1/*
2 * the_nilfs.h - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _THE_NILFS_H
25#define _THE_NILFS_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/blkdev.h>
31#include <linux/backing-dev.h>
32#include "sb.h"
33
34/* the_nilfs struct */
35enum {
36 THE_NILFS_INIT = 0, /* Information from super_block is set */
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has been done and
38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain is broken */
40};
41
42/**
43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags
45 * @ns_count: reference count
46 * @ns_bdev: block device
47 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer
52 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks
55 * @ns_sbsize: size of valid data in super block
56 * @ns_supers: list of nilfs super block structs
57 * @ns_seg_seq: segment sequence counter
58 * @ns_segnum: index number of the latest full segment.
59 * @ns_nextnum: index number of the full segment index to be used next
60 * @ns_pseg_offset: offset of next partial segment in the current full segment
61 * @ns_cno: next checkpoint number
62 * @ns_ctime: write time of the last segment
63 * @ns_nongc_ctime: write time of the last segment not for cleaner operation
64 * @ns_ndirtyblks: Number of dirty data blocks
65 * @ns_last_segment_lock: lock protecting fields for the latest segment
66 * @ns_last_pseg: start block number of the latest segment
67 * @ns_last_seq: sequence value of the latest segment
68 * @ns_last_cno: checkpoint number of the latest segment
69 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
70 * @ns_free_segments_count: counter of free segments
71 * @ns_segctor_sem: segment constructor semaphore
72 * @ns_dat: DAT file inode
73 * @ns_cpfile: checkpoint file inode
74 * @ns_sufile: segusage file inode
75 * @ns_gc_dat: shadow inode of the DAT file inode for GC
76 * @ns_gc_inodes: dummy inodes to keep live blocks
77 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
78 * @ns_blocksize_bits: bit length of block size
79 * @ns_nsegments: number of segments in filesystem
80 * @ns_blocks_per_segment: number of blocks per segment
81 * @ns_r_segments_percentage: reserved segments percentage
82 * @ns_nrsvsegs: number of reserved segments
83 * @ns_first_data_block: block number of first data block
84 * @ns_inode_size: size of on-disk inode
85 * @ns_first_ino: first not-special inode number
86 * @ns_crc_seed: seed value of CRC32 calculation
87 */
88struct the_nilfs {
89 unsigned long ns_flags;
90 atomic_t ns_count;
91
92 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem;
96 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount;
98
99 /*
100 * used for
101 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment.
103 * - protecting s_dirt in the super_block struct
104 * (see nilfs_write_super) and the following fields.
105 */
106 struct buffer_head *ns_sbh[2];
107 struct nilfs_super_block *ns_sbp[2];
108 time_t ns_sbwtime[2];
109 unsigned ns_sbsize;
110 unsigned ns_mount_state;
111 struct list_head ns_supers;
112
113 /*
114 * Following fields are dedicated to a writable FS-instance.
115 * Except for the period seeking checkpoint, code outside the segment
116 * constructor must lock a segment semaphore while accessing these
117 * fields.
118 * The writable FS-instance is sole during a lifetime of the_nilfs.
119 */
120 u64 ns_seg_seq;
121 __u64 ns_segnum;
122 __u64 ns_nextnum;
123 unsigned long ns_pseg_offset;
124 __u64 ns_cno;
125 time_t ns_ctime;
126 time_t ns_nongc_ctime;
127 atomic_t ns_ndirtyblks;
128
129 /*
130 * The following fields hold information on the latest partial segment
131 * written to disk with a super root. These fields are protected by
132 * ns_last_segment_lock.
133 */
134 spinlock_t ns_last_segment_lock;
135 sector_t ns_last_pseg;
136 u64 ns_last_seq;
137 __u64 ns_last_cno;
138 u64 ns_prot_seq;
139 unsigned long ns_free_segments_count;
140
141 struct rw_semaphore ns_segctor_sem;
142
143 /*
144 * Following fields are lock free except for the period before
145 * the_nilfs is initialized.
146 */
147 struct inode *ns_dat;
148 struct inode *ns_cpfile;
149 struct inode *ns_sufile;
150 struct inode *ns_gc_dat;
151
152 /* GC inode list and hash table head */
153 struct list_head ns_gc_inodes;
154 struct hlist_head *ns_gc_inodes_h;
155
156 /* Disk layout information (static) */
157 unsigned int ns_blocksize_bits;
158 unsigned long ns_nsegments;
159 unsigned long ns_blocks_per_segment;
160 unsigned long ns_r_segments_percentage;
161 unsigned long ns_nrsvsegs;
162 unsigned long ns_first_data_block;
163 int ns_inode_size;
164 int ns_first_ino;
165 u32 ns_crc_seed;
166};
167
168#define NILFS_GCINODE_HASH_BITS 8
169#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
170
171#define THE_NILFS_FNS(bit, name) \
172static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
173{ \
174 set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
175} \
176static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \
177{ \
178 clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
179} \
180static inline int nilfs_##name(struct the_nilfs *nilfs) \
181{ \
182 return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
183}
184
185THE_NILFS_FNS(INIT, init)
186THE_NILFS_FNS(LOADED, loaded)
187THE_NILFS_FNS(DISCONTINUED, discontinued)
188
189/* Minimum interval of periodical update of superblocks (in seconds) */
190#define NILFS_SB_FREQ 10
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *);
202void nilfs_swap_super_block(struct the_nilfs *);
203
204
205static inline void get_nilfs(struct the_nilfs *nilfs)
206{
207 /* Caller must have at least one reference of the_nilfs. */
208 atomic_inc(&nilfs->ns_count);
209}
210
211static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
212{
213 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
214 mutex_lock(&nilfs->ns_writer_mutex);
215 return nilfs->ns_writer;
216}
217
218static inline void nilfs_put_writer(struct the_nilfs *nilfs)
219{
220 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
221 mutex_unlock(&nilfs->ns_writer_mutex);
222}
223
224static inline void
225nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
226{
227 mutex_lock(&nilfs->ns_writer_mutex);
228 nilfs->ns_writer = sbi;
229 mutex_unlock(&nilfs->ns_writer_mutex);
230}
231
232static inline void
233nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
234{
235 mutex_lock(&nilfs->ns_writer_mutex);
236 if (sbi == nilfs->ns_writer)
237 nilfs->ns_writer = NULL;
238 mutex_unlock(&nilfs->ns_writer_mutex);
239}
240
241static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end)
244{
245 *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
246 *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
247 if (segnum == 0)
248 *seg_start = nilfs->ns_first_data_block;
249}
250
251static inline sector_t
252nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
253{
254 return (segnum == 0) ? nilfs->ns_first_data_block :
255 (sector_t)nilfs->ns_blocks_per_segment * segnum;
256}
257
258static inline __u64
259nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
260{
261 sector_t segnum = blocknr;
262
263 sector_div(segnum, nilfs->ns_blocks_per_segment);
264 return segnum;
265}
266
267static inline void
268nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
269 sector_t seg_end)
270{
271 /* terminate the current full segment (used in case of I/O-error) */
272 nilfs->ns_pseg_offset = seg_end - seg_start + 1;
273}
274
275static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
276{
277 /* move forward with a full segment */
278 nilfs->ns_segnum = nilfs->ns_nextnum;
279 nilfs->ns_pseg_offset = 0;
280 nilfs->ns_seg_seq++;
281}
282
283static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
284{
285 __u64 cno;
286
287 spin_lock(&nilfs->ns_last_segment_lock);
288 cno = nilfs->ns_last_cno;
289 spin_unlock(&nilfs->ns_last_segment_lock);
290 return cno;
291}
292
293static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
294{
295 return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
296}
297
298#endif /* _THE_NILFS_H */
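
The THE_NILFS_FNS macro above token-pastes one set/clear/test triple per state bit. A minimal userspace sketch of the same pattern (names hypothetical; plain bitwise operations standing in for the kernel's atomic set_bit/clear_bit/test_bit):

	#include <stdio.h>

	/* Generate set_/clear_/test_ accessors for one flag bit,
	 * mirroring the THE_NILFS_FNS token-pasting pattern
	 * (non-atomic stand-in for illustration only). */
	#define FLAG_FNS(bit, name)					\
	static inline void set_##name(unsigned long *flags)		\
	{								\
		*flags |= 1UL << (bit);					\
	}								\
	static inline void clear_##name(unsigned long *flags)		\
	{								\
		*flags &= ~(1UL << (bit));				\
	}								\
	static inline int test_##name(unsigned long *flags)		\
	{								\
		return (*flags >> (bit)) & 1;				\
	}

	FLAG_FNS(0, init)
	FLAG_FNS(1, loaded)

	int main(void)
	{
		unsigned long flags = 0;

		set_init(&flags);
		set_loaded(&flags);
		clear_init(&flags);
		printf("init=%d loaded=%d\n",
		       test_init(&flags), test_loaded(&flags));
		return 0;
	}

One macro invocation per enum constant keeps the accessors and the bit definitions from drifting apart, which is why the header instantiates the triple for INIT, LOADED, and DISCONTINUED rather than hand-writing nine functions.
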
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bed766e435b5..1634319e2404 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -220,7 +220,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
 		rem = 0;
 	}
 
-	kevent->name = kmalloc(len + rem, GFP_KERNEL);
+	kevent->name = kmalloc(len + rem, GFP_NOFS);
 	if (unlikely(!kevent->name)) {
 		kmem_cache_free(event_cachep, kevent);
 		return NULL;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 7d604480557a..b574431a031d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -290,6 +290,21 @@ out_attach:
 	else
 		mlog_errno(ret);
 
+	/*
+	 * In case of error, manually free the allocation and do the iput().
+	 * We need to do this because error here means no d_instantiate(),
+	 * which means iput() will not be called during dput(dentry).
+	 */
+	if (ret < 0 && !alias) {
+		ocfs2_lock_res_free(&dl->dl_lockres);
+		BUG_ON(dl->dl_count != 1);
+		spin_lock(&dentry_attach_lock);
+		dentry->d_fsdata = NULL;
+		spin_unlock(&dentry_attach_lock);
+		kfree(dl);
+		iput(inode);
+	}
+
 	dput(alias);
 
 	return ret;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e71160cda110..c5752305627c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2697,7 +2697,7 @@ static int ocfs2_dx_dir_index_block(struct inode *dir,
 				       u32 *num_dx_entries,
 				       struct buffer_head *dirent_bh)
 {
-	int ret, namelen, i;
+	int ret = 0, namelen, i;
 	char *de_buf, *limit;
 	struct ocfs2_dir_entry *de;
 	struct buffer_head *dx_leaf_bh;
@@ -2934,7 +2934,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 */
 	BUG_ON(alloc > 2);
 
-	ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
+	ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index de3da8eb558c..15713cbb865c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -100,7 +100,8 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 
 	/* If the inode allocator bit is clear, this inode must be stale */
 	if (!set) {
-		mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+		mlog(0, "inode %llu suballoc bit is clear\n",
+		     (unsigned long long)blkno);
 		status = -ESTALE;
 		goto unlock_nfs_sync;
 	}
@@ -114,7 +115,7 @@ check_err:
 	if (status < 0) {
 		if (status == -ESTALE) {
 			mlog(0, "stale inode ino: %llu generation: %u\n",
-			     blkno, handle->ih_generation);
+			     (unsigned long long)blkno, handle->ih_generation);
 		}
 		result = ERR_PTR(status);
 		goto bail;
@@ -129,8 +130,8 @@ check_err:
 check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
-		     handle->ih_generation);
+		mlog(0, "stale inode ino: %llu generation: %u\n",
+		     (unsigned long long)blkno, handle->ih_generation);
 		result = ERR_PTR(-ESTALE);
 		goto bail;
 	}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8a..c2a87c885b73 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1912,6 +1912,22 @@ out_sems:
 	return written ? written : ret;
 }
 
+static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
+				struct file *out,
+				struct splice_desc *sd)
+{
+	int ret;
+
+	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+					    sd->total_len, 0, NULL);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return splice_from_pipe_feed(pipe, sd, pipe_to_file);
+}
+
 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 				       struct file *out,
 				       loff_t *ppos,
@@ -1919,34 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 				       unsigned int flags)
 {
 	int ret;
-	struct inode *inode = out->f_path.dentry->d_inode;
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.file = out,
+	};
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
 		   (unsigned int)len,
 		   out->f_path.dentry->d_name.len,
 		   out->f_path.dentry->d_name.name);
 
-	inode_double_lock(inode, pipe->inode);
+	if (pipe->inode)
+		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
 
-	ret = ocfs2_rw_lock(inode, 1);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
+	splice_from_pipe_begin(&sd);
+	do {
+		ret = splice_from_pipe_next(pipe, &sd);
+		if (ret <= 0)
+			break;
 
-	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
-					    NULL);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		ret = ocfs2_rw_lock(inode, 1);
+		if (ret < 0)
+			mlog_errno(ret);
+		else {
+			ret = ocfs2_splice_to_file(pipe, out, &sd);
+			ocfs2_rw_unlock(inode, 1);
+		}
+		mutex_unlock(&inode->i_mutex);
+	} while (ret > 0);
+	splice_from_pipe_end(pipe, &sd);
 
-	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
 
-out_unlock:
-	ocfs2_rw_unlock(inode, 1);
-out:
-	inode_double_unlock(inode, pipe->inode);
+	if (sd.num_spliced)
+		ret = sd.num_spliced;
+
+	if (ret > 0) {
+		unsigned long nr_pages;
+
+		*ppos += ret;
+		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+		/*
+		 * If file or inode is SYNC and we actually wrote some data,
+		 * sync it.
+		 */
+		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			int err;
+
+			mutex_lock(&inode->i_mutex);
+			err = ocfs2_rw_lock(inode, 1);
+			if (err < 0) {
+				mlog_errno(err);
+			} else {
+				err = generic_osync_inode(inode, mapping,
+						  OSYNC_METADATA|OSYNC_DATA);
+				ocfs2_rw_unlock(inode, 1);
+			}
+			mutex_unlock(&inode->i_mutex);
+
+			if (err)
+				ret = err;
+		}
+		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+	}
 
 	mlog_exit(ret);
 	return ret;
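
The splice path above rounds the written byte count up to whole pages before calling balance_dirty_pages_ratelimited_nr(). A small standalone sketch of that rounding idiom (4 KiB pages assumed here purely for illustration):

	#include <stdio.h>

	/* Round a byte count up to whole pages, as in
	 * (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT. */
	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	static unsigned long bytes_to_pages(unsigned long bytes)
	{
		return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}

	int main(void)
	{
		printf("%lu\n", bytes_to_pages(1));	/* 1 */
		printf("%lu\n", bytes_to_pages(4096));	/* 1 */
		printf("%lu\n", bytes_to_pages(4097));	/* 2 */
		return 0;
	}

Adding PAGE_SIZE - 1 before the shift turns truncating division into ceiling division, so a partial trailing page is still counted.
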
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 619dd7f6c053..eb7b76331eb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -437,8 +437,9 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
 }
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor + orphan dir index leaf */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
+ * inode alloc group descriptor + orphan dir index root +
+ * orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
 
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2220f93f668b..33464c6b60a2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1025,10 +1025,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_dinode *newfe = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-	struct buffer_head *orphan_entry_bh = NULL;
 	struct buffer_head *newfe_bh = NULL;
 	struct buffer_head *old_inode_bh = NULL;
-	struct buffer_head *insert_entry_bh = NULL;
 	struct ocfs2_super *osb = NULL;
 	u64 newfe_blkno, old_de_ino;
 	handle_t *handle = NULL;
@@ -1455,8 +1453,6 @@ bail:
 	brelse(old_inode_bh);
 	brelse(old_dir_bh);
 	brelse(new_dir_bh);
-	brelse(orphan_entry_bh);
-	brelse(insert_entry_bh);
 
 	mlog_exit(status);
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b4ca5911caaf..8439f6b324b9 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2197,26 +2197,29 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
 	struct buffer_head *inode_bh = NULL;
 	struct ocfs2_dinode *inode_fe;
 
-	mlog_entry("blkno: %llu\n", blkno);
+	mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
 
 	/* dirty read disk */
 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
 	if (status < 0) {
-		mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
+		mlog(ML_ERROR, "read block %llu failed %d\n",
+		     (unsigned long long)blkno, status);
 		goto bail;
 	}
 
 	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
-		mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
+		mlog(ML_ERROR, "invalid inode %llu requested\n",
+		     (unsigned long long)blkno);
 		status = -EINVAL;
 		goto bail;
 	}
 
-	if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
+	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
 	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
 		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
-		     blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+		     (unsigned long long)blkno,
+		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
 		status = -EINVAL;
 		goto bail;
 	}
@@ -2251,7 +2254,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 	u64 bg_blkno;
 	int status;
 
-	mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
+	mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
+		   (unsigned int)bit);
 
 	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
@@ -2266,7 +2270,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 	status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
 					     &group_bh);
 	if (status < 0) {
-		mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
+		mlog(ML_ERROR, "read group %llu failed %d\n",
+		     (unsigned long long)bg_blkno, status);
 		goto bail;
 	}
 
@@ -2300,7 +2305,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	struct inode *inode_alloc_inode;
 	struct buffer_head *alloc_bh = NULL;
 
-	mlog_entry("blkno: %llu", blkno);
+	mlog_entry("blkno: %llu", (unsigned long long)blkno);
 
 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
 					     &suballoc_bit);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ed0a0cfd68d2..579dd1b1110f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/utsname.h>
+#include <linux/namei.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -54,26 +55,6 @@
 
 #include "buffer_head_io.h"
 
-static char *ocfs2_page_getlink(struct dentry * dentry,
-				struct page **ppage);
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
-					struct buffer_head **bh);
-
-/* get the link contents into pagecache */
-static char *ocfs2_page_getlink(struct dentry * dentry,
-				struct page **ppage)
-{
-	struct page * page;
-	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_mapping_page(mapping, 0, NULL);
-	if (IS_ERR(page))
-		goto sync_fail;
-	*ppage = page;
-	return kmap(page);
-
-sync_fail:
-	return (char*)page;
-}
 
 static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 					struct buffer_head **bh)
@@ -128,40 +109,55 @@ out:
 	return ret;
 }
 
-static void *ocfs2_follow_link(struct dentry *dentry,
-			       struct nameidata *nd)
+static void *ocfs2_fast_follow_link(struct dentry *dentry,
+				    struct nameidata *nd)
 {
-	int status;
-	char *link;
+	int status = 0;
+	int len;
+	char *target, *link = ERR_PTR(-ENOMEM);
 	struct inode *inode = dentry->d_inode;
-	struct page *page = NULL;
 	struct buffer_head *bh = NULL;
 
-	if (ocfs2_inode_is_fast_symlink(inode))
-		link = ocfs2_fast_symlink_getlink(inode, &bh);
-	else
-		link = ocfs2_page_getlink(dentry, &page);
-	if (IS_ERR(link)) {
-		status = PTR_ERR(link);
+	mlog_entry_void();
+
+	BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
+	target = ocfs2_fast_symlink_getlink(inode, &bh);
+	if (IS_ERR(target)) {
+		status = PTR_ERR(target);
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = vfs_follow_link(nd, link);
+	/* Fast symlinks can't be large */
+	len = strlen(target);
+	link = kzalloc(len + 1, GFP_NOFS);
+	if (!link) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	memcpy(link, target, len);
+	nd_set_link(nd, link);
 
 bail:
-	if (page) {
-		kunmap(page);
-		page_cache_release(page);
-	}
 	brelse(bh);
 
-	return ERR_PTR(status);
+	mlog_exit(status);
+	return status ? ERR_PTR(status) : link;
+}
+
+static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	char *link = cookie;
+
+	kfree(link);
 }
 
 const struct inode_operations ocfs2_symlink_inode_operations = {
 	.readlink	= page_readlink,
-	.follow_link	= ocfs2_follow_link,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
 	.setxattr	= generic_setxattr,
@@ -171,7 +167,8 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
 };
 const struct inode_operations ocfs2_fast_symlink_inode_operations = {
 	.readlink	= ocfs2_readlink,
-	.follow_link	= ocfs2_follow_link,
+	.follow_link	= ocfs2_fast_follow_link,
+	.put_link	= ocfs2_fast_put_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
 	.setxattr	= generic_setxattr,
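
The follow_link/put_link pairing above hands the VFS a heap-allocated copy of the target as a cookie, which put_link later frees. A userspace stand-in for that ownership pattern (function names illustrative only):

	#include <stdlib.h>
	#include <string.h>
	#include <stdio.h>

	/* follow_link analogue: return a heap copy of the target;
	 * the caller (the VFS in the real code) keeps it as a cookie. */
	static char *fast_follow_link(const char *target)
	{
		size_t len = strlen(target);	/* fast symlinks can't be large */
		char *link = calloc(1, len + 1);

		if (!link)
			return NULL;
		memcpy(link, target, len);
		return link;
	}

	/* put_link analogue: release the cookie handed out above. */
	static void fast_put_link(void *cookie)
	{
		free(cookie);
	}

	int main(void)
	{
		char *link = fast_follow_link("/some/target");

		if (link)
			printf("%s\n", link);
		fast_put_link(link);
		return 0;
	}

Splitting allocation and release across the two callbacks lets the VFS keep the link text alive for the whole path walk without the filesystem guessing its lifetime.
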
diff --git a/fs/open.c b/fs/open.c
index 377eb25b6abf..bdfbf03615a4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1033,7 +1033,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd_flags(flags);
 		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, flags, mode);
+			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
 			if (IS_ERR(f)) {
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 38e337d51ced..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
+#include <linux/blktrace_api.h>
 
 #include "check.h"
 
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
 
 static struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	&blk_trace_attr_group,
+#endif
 	NULL
 };
 
diff --git a/fs/pipe.c b/fs/pipe.c
index 4af7aa521813..13414ec45b8d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -37,6 +37,42 @@
  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  */
 
+static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+{
+	if (pipe->inode)
+		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
+}
+
+void pipe_lock(struct pipe_inode_info *pipe)
+{
+	/*
+	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
+	 */
+	pipe_lock_nested(pipe, I_MUTEX_PARENT);
+}
+EXPORT_SYMBOL(pipe_lock);
+
+void pipe_unlock(struct pipe_inode_info *pipe)
+{
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
+}
+EXPORT_SYMBOL(pipe_unlock);
+
+void pipe_double_lock(struct pipe_inode_info *pipe1,
+		      struct pipe_inode_info *pipe2)
+{
+	BUG_ON(pipe1 == pipe2);
+
+	if (pipe1 < pipe2) {
+		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
+		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
+	} else {
+		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
+		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
+	}
+}
+
 /* Drop the inode semaphore and wait for a pipe event, atomically */
 void pipe_wait(struct pipe_inode_info *pipe)
 {
@@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe)
 	 * is considered a noninteractive wait:
 	 */
 	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
+	pipe_unlock(pipe);
 	schedule();
 	finish_wait(&pipe->wait, &wait);
-	if (pipe->inode)
-		mutex_lock(&pipe->inode->i_mutex);
+	pipe_lock(pipe);
 }
 
 static int
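
pipe_double_lock() above avoids ABBA deadlock by always acquiring the two mutexes in a globally consistent order, here the order of their addresses. The same idea in a standalone pthreads sketch (the lockdep subclass annotations have no userspace equivalent and are omitted):

	#include <pthread.h>
	#include <stdio.h>

	/* Take both mutexes in address order, so two threads locking the
	 * same pair in opposite argument order cannot deadlock. */
	static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a < b) {
			pthread_mutex_lock(a);
			pthread_mutex_lock(b);
		} else {
			pthread_mutex_lock(b);
			pthread_mutex_lock(a);
		}
	}

	int main(void)
	{
		pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
		pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

		/* Both argument orders acquire in the same real order. */
		double_lock(&m1, &m2);
		pthread_mutex_unlock(&m1);
		pthread_mutex_unlock(&m2);
		double_lock(&m2, &m1);
		pthread_mutex_unlock(&m1);
		pthread_mutex_unlock(&m2);
		puts("no deadlock");
		return 0;
	}

Any total order works; addresses are convenient because they need no extra bookkeeping.
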
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7e4877d9dcb5..725a650bbbb8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,6 +80,7 @@
 #include <linux/delayacct.h>
 #include <linux/seq_file.h>
 #include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
 #include <linux/tracehook.h>
 
 #include <asm/pgtable.h>
@@ -352,6 +353,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	char state;
 	pid_t ppid = 0, pgid = -1, sid = -1;
 	int num_threads = 0;
+	int permitted;
 	struct mm_struct *mm;
 	unsigned long long start_time;
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
@@ -364,11 +366,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
+	permitted = ptrace_may_access(task, PTRACE_MODE_READ);
 	mm = get_task_mm(task);
 	if (mm) {
 		vsize = task_vsize(mm);
-		eip = KSTK_EIP(task);
-		esp = KSTK_ESP(task);
+		if (permitted) {
+			eip = KSTK_EIP(task);
+			esp = KSTK_ESP(task);
+		}
 	}
 
 	get_task_comm(tcomm, task);
@@ -424,7 +429,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		unlock_task_sighand(task, &flags);
 	}
 
-	if (!whole || num_threads < 2)
+	if (permitted && (!whole || num_threads < 2))
 		wchan = get_wchan(task);
 	if (!whole) {
 		min_flt = task->min_flt;
@@ -476,7 +481,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
-		mm ? mm->start_stack : 0,
+		(permitted && mm) ? mm->start_stack : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f71559784bfb..fb45615943c2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -322,7 +322,10 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 	wchan = get_wchan(task);
 
 	if (lookup_symbol_name(wchan, symname) < 0)
-		return sprintf(buffer, "%lu", wchan);
+		if (!ptrace_may_access(task, PTRACE_MODE_READ))
+			return 0;
+		else
+			return sprintf(buffer, "%lu", wchan);
 	else
 		return sprintf(buffer, "%s", symname);
 }
@@ -648,14 +651,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
 	struct proc_mounts *p = file->private_data;
 	struct mnt_namespace *ns = p->ns;
-	unsigned res = 0;
+	unsigned res = POLLIN | POLLRDNORM;
 
 	poll_wait(file, &ns->poll, wait);
 
 	spin_lock(&vfsmount_lock);
 	if (p->event != ns->event) {
 		p->event = ns->event;
-		res = POLLERR;
+		res |= POLLERR | POLLPRI;
 	}
 	spin_unlock(&vfsmount_lock);
 
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 74ea974f5ca6..c6b0302af4c4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	si_meminfo(&i);
 	si_swapinfo(&i);
-	committed = atomic_long_read(&vm_committed_space);
+	committed = percpu_counter_read_positive(&vm_committed_as);
 	allowed = ((totalram_pages - hugetlb_total_pages())
 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1e15a2b176e8..b080b791d9e3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -67,8 +67,7 @@ static int proc_get_sb(struct file_system_type *fs_type,
 	sb->s_flags = flags;
 	err = proc_fill_super(sb);
 	if (err) {
-		up_write(&sb->s_umount);
-		deactivate_super(sb);
+		deactivate_locked_super(sb);
 		return err;
 	}
 
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f75efa22df5e..81e4eb60972e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,6 +18,9 @@
 #ifndef arch_irq_stat
 #define arch_irq_stat() 0
 #endif
+#ifndef arch_idle_time
+#define arch_idle_time(cpu) 0
+#endif
 
 static int show_stat(struct seq_file *p, void *v)
 {
@@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v)
 		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
 		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+		idle = cputime64_add(idle, arch_idle_time(i));
 		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
@@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v)
 		nice = kstat_cpu(i).cpustat.nice;
 		system = kstat_cpu(i).cpustat.system;
 		idle = kstat_cpu(i).cpustat.idle;
+		idle = cputime64_add(idle, arch_idle_time(i));
 		iowait = kstat_cpu(i).cpustat.iowait;
 		irq = kstat_cpu(i).cpustat.irq;
 		softirq = kstat_cpu(i).cpustat.softirq;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801f..6f61b7cc32e0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	struct file *file = vma->vm_file;
 	int flags = vma->vm_flags;
 	unsigned long ino = 0;
+	unsigned long long pgoff = 0;
 	dev_t dev = 0;
 	int len;
 
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
+		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
 
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 			flags & VM_WRITE ? 'w' : '-',
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
-			((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
+			pgoff,
 			MAJOR(dev), MINOR(dev), ino, &len);
 
 	/*
@@ -663,6 +665,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		goto out_task;
 
 	ret = 0;
+
+	if (!count)
+		goto out_task;
+
 	mm = get_task_mm(task);
 	if (!mm)
 		goto out_task;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 863464d5519c..64a72e2e7650 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -126,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 	struct file *file;
 	dev_t dev = 0;
 	int flags, len;
+	unsigned long long pgoff = 0;
 
 	flags = vma->vm_flags;
 	file = vma->vm_file;
@@ -134,6 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
+		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	}
 
 	seq_printf(m,
@@ -144,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
+		   pgoff,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
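
Both task_mmu.c and task_nommu.c hoist the page-offset computation into a 64-bit pgoff, and the cast to loff_t must happen before the shift: shifting the 32-bit page index first would silently drop the high bits on 32-bit builds. A small sketch of the difference (constants illustrative; unsigned arithmetic used so both variants are well defined):

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12

	int main(void)
	{
		unsigned pgoff = 0x00300000u;	/* page index past 4 GiB in bytes */

		/* Shifting first wraps modulo 2^32; widening first, as the
		 * show_map_vma() change does, keeps the full byte offset. */
		uint64_t wrong = (uint64_t)(pgoff << PAGE_SHIFT);
		uint64_t right = (uint64_t)pgoff << PAGE_SHIFT;

		printf("wrong=0x%llx right=0x%llx\n",
		       (unsigned long long)wrong, (unsigned long long)right);
		return 0;
	}

Here wrong prints 0x0 while right prints 0x300000000, which is exactly the class of truncation the patch removes from the seq_printf() argument list.
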
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 385a0831cc99..68d4f6dc0578 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -1,12 +1,3 @@
-#
-# Makefile for the Linux filesystems.
-#
-# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
-# Rewritten to use lists instead of if-statements.
-#
-
-obj-y :=
-
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a404fb88e456..3a6b193d8444 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	save_mount_options(sb, data);
 
 	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+	sb->s_fs_info = fsi;
 	if (!fsi) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	sb->s_fs_info = fsi;
 
 	err = ramfs_parse_options(data, &fsi->mount_opts);
 	if (err)
 		goto fail;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = RAMFS_MAGIC;
 	sb->s_op = &ramfs_ops;
 	sb->s_time_gran = 1;
+
 	inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
 	if (!inode) {
 		err = -ENOMEM;
@@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	}
 
 	root = d_alloc_root(inode);
+	sb->s_root = root;
 	if (!root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	sb->s_root = root;
+
 	return 0;
 fail:
 	kfree(fsi);
+	sb->s_fs_info = NULL;
 	iput(inode);
 	return err;
 }
diff --git a/fs/read_write.c b/fs/read_write.c
index 6d5d8ff238aa..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,10 +731,16 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
+static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
+{
+#define HALF_LONG_BITS (BITS_PER_LONG / 2)
+	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
+}
+
 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
-		unsigned long, vlen, u32, pos_high, u32, pos_low)
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 {
-	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
 	struct file *file;
 	ssize_t ret = -EBADF;
 	int fput_needed;
@@ -757,9 +763,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 }
 
 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
-		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 {
-	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
 	struct file *file;
 	ssize_t ret = -EBADF;
 	int fput_needed;
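
The double shift in pos_from_hilo() is deliberate: the syscall now takes the position as two unsigned longs, and on 64-bit a single `(loff_t)high << BITS_PER_LONG` would shift a 64-bit value by its full width, which is undefined behaviour in C. Two half-width shifts are always defined and make the high word drop out cleanly where the low word already carries the whole offset. A standalone mirror of the helper:

	#include <stdint.h>
	#include <stdio.h>

	#define BITS_PER_LONG  (8 * (int)sizeof(unsigned long))
	#define HALF_LONG_BITS (BITS_PER_LONG / 2)

	/* Two half-width shifts stay well defined on every word size,
	 * unlike one shift by the full BITS_PER_LONG. */
	static int64_t pos_from_hilo(unsigned long high, unsigned long low)
	{
		return (((int64_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
	}

	int main(void)
	{
		/* 32-bit build: high:low form one 64-bit offset.
		 * 64-bit build: low is the full offset, high shifts to zero. */
		printf("%lld\n", (long long)pos_from_hilo(1, 0x10));
		return 0;
	}
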
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 67a80d7e59e2..45ee3d357c70 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -41,6 +41,18 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 
 #define store_ih(where,what) copy_item_head (where, what)
 
+static inline bool is_privroot_deh(struct dentry *dir,
+				   struct reiserfs_de_head *deh)
+{
+	int ret = 0;
+#ifdef CONFIG_REISERFS_FS_XATTR
+	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
+	ret = (dir == dir->d_parent && privroot->d_inode &&
+	       deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
+#endif
+	return ret;
+}
+
 int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			    filldir_t filldir, loff_t *pos)
 {
@@ -138,18 +150,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			}
 
 			/* Ignore the .reiserfs_priv entry */
-			if (reiserfs_xattrs(inode->i_sb) &&
-			    !old_format_only(inode->i_sb) &&
-			    dentry == inode->i_sb->s_root &&
-			    REISERFS_SB(inode->i_sb)->priv_root &&
-			    REISERFS_SB(inode->i_sb)->priv_root->d_inode
-			    && deh_objectid(deh) ==
-			    le32_to_cpu(INODE_PKEY
-					(REISERFS_SB(inode->i_sb)->
-					 priv_root->d_inode)->
-					k_objectid)) {
+			if (is_privroot_deh(dentry, deh))
 				continue;
-			}
 
 			d_off = deh_offset(deh);
 			*pos = d_off;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index efd4d720718e..271579128634 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -338,21 +338,8 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 					&path_to_entry, &de);
 	pathrelse(&path_to_entry);
 	if (retval == NAME_FOUND) {
-		/* Hide the .reiserfs_priv directory */
-		if (reiserfs_xattrs(dir->i_sb) &&
-		    !old_format_only(dir->i_sb) &&
-		    REISERFS_SB(dir->i_sb)->priv_root &&
-		    REISERFS_SB(dir->i_sb)->priv_root->d_inode &&
-		    de.de_objectid ==
-		    le32_to_cpu(INODE_PKEY
-				(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->
-				k_objectid)) {
-			reiserfs_write_unlock(dir->i_sb);
-			return ERR_PTR(-EACCES);
-		}
-
-		inode =
-		    reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
+		inode = reiserfs_iget(dir->i_sb,
+				      (struct cpu_key *)&(de.de_dir_id));
 		if (!inode || IS_ERR(inode)) {
 			reiserfs_write_unlock(dir->i_sb);
 			return ERR_PTR(-EACCES);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0ae6486d9046..3567fb9e3fb1 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -448,13 +448,11 @@ int remove_save_link(struct inode *inode, int truncate)
 static void reiserfs_kill_sb(struct super_block *s)
 {
 	if (REISERFS_SB(s)) {
-#ifdef CONFIG_REISERFS_FS_XATTR
 		if (REISERFS_SB(s)->xattr_root) {
 			d_invalidate(REISERFS_SB(s)->xattr_root);
 			dput(REISERFS_SB(s)->xattr_root);
 			REISERFS_SB(s)->xattr_root = NULL;
 		}
-#endif
 		if (REISERFS_SB(s)->priv_root) {
 			d_invalidate(REISERFS_SB(s)->priv_root);
 			dput(REISERFS_SB(s)->priv_root);
@@ -1316,8 +1314,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	}
 
 out_ok:
-	kfree(s->s_options);
-	s->s_options = new_opts;
+	replace_mount_options(s, new_opts);
 	return 0;
 
 out_err:
@@ -1842,7 +1839,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 		goto error;
 	}
 
-	if ((errval = reiserfs_xattr_init(s, s->s_flags))) {
+	if ((errval = reiserfs_lookup_privroot(s)) ||
+	    (errval = reiserfs_xattr_init(s, s->s_flags))) {
 		dput(s->s_root);
 		s->s_root = NULL;
 		goto error;
@@ -1855,7 +1853,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 		reiserfs_info(s, "using 3.5.x disk format\n");
 	}
 
-	if ((errval = reiserfs_xattr_init(s, s->s_flags))) {
+	if ((errval = reiserfs_lookup_privroot(s)) ||
+	    (errval = reiserfs_xattr_init(s, s->s_flags))) {
 		dput(s->s_root);
 		s->s_root = NULL;
 		goto error;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f83f52bae390..8e7deb0e6964 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -113,41 +113,30 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 
 #define xattr_may_create(flags)	(!flags || flags & XATTR_CREATE)
 
-/* Returns and possibly creates the xattr dir. */
-static struct dentry *lookup_or_create_dir(struct dentry *parent,
-					    const char *name, int flags)
+static struct dentry *open_xa_root(struct super_block *sb, int flags)
 {
-	struct dentry *dentry;
-	BUG_ON(!parent);
-
-	dentry = lookup_one_len(name, parent, strlen(name));
-	if (IS_ERR(dentry))
-		return dentry;
-	else if (!dentry->d_inode) {
-		int err = -ENODATA;
+	struct dentry *privroot = REISERFS_SB(sb)->priv_root;
+	struct dentry *xaroot;
+	if (!privroot->d_inode)
+		return ERR_PTR(-ENODATA);
 
-		if (xattr_may_create(flags)) {
-			mutex_lock_nested(&parent->d_inode->i_mutex,
-					  I_MUTEX_XATTR);
-			err = xattr_mkdir(parent->d_inode, dentry, 0700);
-			mutex_unlock(&parent->d_inode->i_mutex);
-		}
+	mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
 
+	xaroot = dget(REISERFS_SB(sb)->xattr_root);
+	if (!xaroot)
+		xaroot = ERR_PTR(-ENODATA);
+	else if (!xaroot->d_inode) {
+		int err = -ENODATA;
+		if (xattr_may_create(flags))
+			err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
 		if (err) {
-			dput(dentry);
-			dentry = ERR_PTR(err);
+			dput(xaroot);
+			xaroot = ERR_PTR(err);
 		}
 	}
 
-	return dentry;
-}
-
-static struct dentry *open_xa_root(struct super_block *sb, int flags)
-{
-	struct dentry *privroot = REISERFS_SB(sb)->priv_root;
-	if (!privroot)
-		return ERR_PTR(-ENODATA);
-	return lookup_or_create_dir(privroot, XAROOT_NAME, flags);
+	mutex_unlock(&privroot->d_inode->i_mutex);
+	return xaroot;
 }
 
 static struct dentry *open_xa_dir(const struct inode *inode, int flags)
@@ -163,10 +152,22 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
 		le32_to_cpu(INODE_PKEY(inode)->k_objectid),
 		inode->i_generation);
 
-	xadir = lookup_or_create_dir(xaroot, namebuf, flags);
+	mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR);
+
+	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
+	if (!IS_ERR(xadir) && !xadir->d_inode) {
+		int err = -ENODATA;
+		if (xattr_may_create(flags))
+			err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
+		if (err) {
+			dput(xadir);
+			xadir = ERR_PTR(err);
+		}
+	}
+
+	mutex_unlock(&xaroot->d_inode->i_mutex);
 	dput(xaroot);
 	return xadir;
-
 }
 
 /* The following are side effects of other operations that aren't explicitly
@@ -184,6 +185,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
 {
 	struct reiserfs_dentry_buf *dbuf = buf;
 	struct dentry *dentry;
+	WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
 
 	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
 		return -ENOSPC;
@@ -349,6 +351,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
 	if (IS_ERR(xadir))
 		return ERR_CAST(xadir);
 
+	mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
 	xafile = lookup_one_len(name, xadir, strlen(name));
 	if (IS_ERR(xafile)) {
 		err = PTR_ERR(xafile);
@@ -360,18 +363,15 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
 
 	if (!xafile->d_inode) {
 		err = -ENODATA;
-		if (xattr_may_create(flags)) {
-			mutex_lock_nested(&xadir->d_inode->i_mutex,
-					  I_MUTEX_XATTR);
+		if (xattr_may_create(flags))
 			err = xattr_create(xadir->d_inode, xafile,
 					   0700|S_IFREG);
-			mutex_unlock(&xadir->d_inode->i_mutex);
-		}
 	}
 
 	if (err)
 		dput(xafile);
 out:
+	mutex_unlock(&xadir->d_inode->i_mutex);
 	dput(xadir);
 	if (err)
 		return ERR_PTR(err);
@@ -435,6 +435,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 	if (IS_ERR(xadir))
 		return PTR_ERR(xadir);
 
+	mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
 	dentry = lookup_one_len(name, xadir, strlen(name));
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -442,14 +443,13 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 	}
 
 	if (dentry->d_inode) {
-		mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
 		err = xattr_unlink(xadir->d_inode, dentry);
-		mutex_unlock(&xadir->d_inode->i_mutex);
 		update_ctime(inode);
 	}
 
 	dput(dentry);
 out_dput:
+	mutex_unlock(&xadir->d_inode->i_mutex);
 	dput(xadir);
 	return err;
 }
@@ -687,20 +687,6 @@ out:
 	return err;
 }
 
-/* Actual operations that are exported to VFS-land */
-struct xattr_handler *reiserfs_xattr_handlers[] = {
-	&reiserfs_xattr_user_handler,
-	&reiserfs_xattr_trusted_handler,
-#ifdef CONFIG_REISERFS_FS_SECURITY
-	&reiserfs_xattr_security_handler,
-#endif
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	&reiserfs_posix_acl_access_handler,
-	&reiserfs_posix_acl_default_handler,
-#endif
-	NULL
-};
-
 /*
  * In order to implement different sets of xattr operations for each xattr
  * prefix with the generic xattr API, a filesystem should create a
@@ -843,7 +829,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	if (!dentry->d_inode)
 		return -EINVAL;
 
-	if (!reiserfs_xattrs(dentry->d_sb) ||
+	if (!dentry->d_sb->s_xattr ||
 	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
@@ -885,42 +871,50 @@ static int reiserfs_check_acl(struct inode *inode, int mask)
 	return error;
 }
 
-int reiserfs_permission(struct inode *inode, int mask)
-{
-	/*
-	 * We don't do permission checks on the internal objects.
-	 * Permissions are determined by the "owning" object.
-	 */
-	if (IS_PRIVATE(inode))
-		return 0;
-	/*
-	 * Stat data v1 doesn't support ACLs.
-	 */
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		return generic_permission(inode, mask, NULL);
-	else
-		return generic_permission(inode, mask, reiserfs_check_acl);
-}
-
 static int create_privroot(struct dentry *dentry)
 {
 	int err;
 	struct inode *inode = dentry->d_parent->d_inode;
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
+	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+
 	err = xattr_mkdir(inode, dentry, 0700);
-	mutex_unlock(&inode->i_mutex);
-	if (err) {
-		dput(dentry);
-		dentry = NULL;
+	if (err || !dentry->d_inode) {
+		reiserfs_warning(dentry->d_sb, "jdm-20006",
+				 "xattrs/ACLs enabled and couldn't "
+				 "find/create .reiserfs_priv. "
+				 "Failing mount.");
+		return -EOPNOTSUPP;
 	}
 
-	if (dentry && dentry->d_inode)
-		reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
-			      "storage.\n", PRIVROOT_NAME);
+	dentry->d_inode->i_flags |= S_PRIVATE;
+	reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
+		      "storage.\n", PRIVROOT_NAME);
 
-	return err;
+	return 0;
 }
 
+#else
+int __init reiserfs_xattr_register_handlers(void) { return 0; }
+void reiserfs_xattr_unregister_handlers(void) {}
+static int create_privroot(struct dentry *dentry) { return 0; }
+#endif
+
+/* Actual operations that are exported to VFS-land */
+struct xattr_handler *reiserfs_xattr_handlers[] = {
+#ifdef CONFIG_REISERFS_FS_XATTR
+	&reiserfs_xattr_user_handler,
+	&reiserfs_xattr_trusted_handler,
+#endif
+#ifdef CONFIG_REISERFS_FS_SECURITY
+	&reiserfs_xattr_security_handler,
+#endif
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+	&reiserfs_posix_acl_access_handler,
+	&reiserfs_posix_acl_default_handler,
+#endif
+	NULL
+};
+
 static int xattr_mount_check(struct super_block *s)
 {
@@ -940,21 +934,33 @@ static int xattr_mount_check(struct super_block *s)
940 return 0; 934 return 0;
941} 935}
942 936
943#else 937int reiserfs_permission(struct inode *inode, int mask)
944int __init reiserfs_xattr_register_handlers(void) { return 0; } 938{
945void reiserfs_xattr_unregister_handlers(void) {} 939 /*
940 * We don't do permission checks on the internal objects.
941 * Permissions are determined by the "owning" object.
942 */
943 if (IS_PRIVATE(inode))
944 return 0;
945
946#ifdef CONFIG_REISERFS_FS_XATTR
947 /*
948 * Stat data v1 doesn't support ACLs.
949 */
950 if (get_inode_sd_version(inode) != STAT_DATA_V1)
951 return generic_permission(inode, mask, reiserfs_check_acl);
946#endif 952#endif
953 return generic_permission(inode, mask, NULL);
954}
947 955
948/* This will catch lookups from the fs root to .reiserfs_priv */ 956/* This will catch lookups from the fs root to .reiserfs_priv */
949static int 957static int
950xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) 958xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
951{ 959{
952 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; 960 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
953 if (name->len == priv_root->d_name.len && 961 if (container_of(q1, struct dentry, d_name) == priv_root)
954 name->hash == priv_root->d_name.hash &&
955 !memcmp(name->name, priv_root->d_name.name, name->len)) {
956 return -ENOENT; 962 return -ENOENT;
957 } else if (q1->len == name->len && 963 if (q1->len == name->len &&
958 !memcmp(q1->name, name->name, name->len)) 964 !memcmp(q1->name, name->name, name->len))
959 return 0; 965 return 0;
960 return 1; 966 return 1;
@@ -964,73 +970,71 @@ static const struct dentry_operations xattr_lookup_poison_ops = {
964 .d_compare = xattr_lookup_poison, 970 .d_compare = xattr_lookup_poison,
965}; 971};
966 972
973int reiserfs_lookup_privroot(struct super_block *s)
974{
975 struct dentry *dentry;
976 int err = 0;
977
978 /* If we don't have the privroot located yet - go find it */
979 mutex_lock(&s->s_root->d_inode->i_mutex);
980 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
981 strlen(PRIVROOT_NAME));
982 if (!IS_ERR(dentry)) {
983 REISERFS_SB(s)->priv_root = dentry;
984 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else
988 err = PTR_ERR(dentry);
989 mutex_unlock(&s->s_root->d_inode->i_mutex);
990
991 return err;
992}
993
967/* We need to take a copy of the mount flags since things like 994/* We need to take a copy of the mount flags since things like
968 * MS_RDONLY don't get set until *after* we're called. 995 * MS_RDONLY don't get set until *after* we're called.
969 * mount_flags != mount_options */ 996 * mount_flags != mount_options */
970int reiserfs_xattr_init(struct super_block *s, int mount_flags) 997int reiserfs_xattr_init(struct super_block *s, int mount_flags)
971{ 998{
972 int err = 0; 999 int err = 0;
1000 struct dentry *privroot = REISERFS_SB(s)->priv_root;
973 1001
974#ifdef CONFIG_REISERFS_FS_XATTR
975 err = xattr_mount_check(s); 1002 err = xattr_mount_check(s);
976 if (err) 1003 if (err)
977 goto error; 1004 goto error;
978#endif
979 1005
980 /* If we don't have the privroot located yet - go find it */ 1006 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
981 if (!REISERFS_SB(s)->priv_root) { 1007 mutex_lock(&s->s_root->d_inode->i_mutex);
982 struct dentry *dentry; 1008 err = create_privroot(REISERFS_SB(s)->priv_root);
983 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, 1009 mutex_unlock(&s->s_root->d_inode->i_mutex);
984 strlen(PRIVROOT_NAME));
985 if (!IS_ERR(dentry)) {
986#ifdef CONFIG_REISERFS_FS_XATTR
987 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode)
988 err = create_privroot(dentry);
989#endif
990 if (!dentry->d_inode) {
991 dput(dentry);
992 dentry = NULL;
993 }
994 } else
995 err = PTR_ERR(dentry);
996
997 if (!err && dentry) {
998 s->s_root->d_op = &xattr_lookup_poison_ops;
999 dentry->d_inode->i_flags |= S_PRIVATE;
1000 REISERFS_SB(s)->priv_root = dentry;
1001#ifdef CONFIG_REISERFS_FS_XATTR
1002 /* xattrs are unavailable */
1003 } else if (!(mount_flags & MS_RDONLY)) {
1004 /* If we're read-only it just means that the dir
1005 * hasn't been created. Not an error -- just no
1006 * xattrs on the fs. We'll check again if we
1007 * go read-write */
1008 reiserfs_warning(s, "jdm-20006",
1009 "xattrs/ACLs enabled and couldn't "
1010 "find/create .reiserfs_priv. "
1011 "Failing mount.");
1012 err = -EOPNOTSUPP;
1013#endif
1014 }
1015 } 1010 }
1016 1011
1017#ifdef CONFIG_REISERFS_FS_XATTR 1012 if (privroot->d_inode) {
1018 if (!err)
1019 s->s_xattr = reiserfs_xattr_handlers; 1013 s->s_xattr = reiserfs_xattr_handlers;
1014 mutex_lock(&privroot->d_inode->i_mutex);
1015 if (!REISERFS_SB(s)->xattr_root) {
1016 struct dentry *dentry;
1017 dentry = lookup_one_len(XAROOT_NAME, privroot,
1018 strlen(XAROOT_NAME));
1019 if (!IS_ERR(dentry))
1020 REISERFS_SB(s)->xattr_root = dentry;
1021 else
1022 err = PTR_ERR(dentry);
1023 }
1024 mutex_unlock(&privroot->d_inode->i_mutex);
1025 }
1020 1026
1021error: 1027error:
1022 if (err) { 1028 if (err) {
1023 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); 1029 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
1024 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); 1030 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
1025 } 1031 }
1026#endif
1027 1032
1028 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ 1033 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
1029 s->s_flags = s->s_flags & ~MS_POSIXACL;
1030#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1031 if (reiserfs_posixacl(s)) 1034 if (reiserfs_posixacl(s))
1032 s->s_flags |= MS_POSIXACL; 1035 s->s_flags |= MS_POSIXACL;
1033#endif 1036 else
1037 s->s_flags &= ~MS_POSIXACL;
1034 1038
1035 return err; 1039 return err;
1036} 1040}
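
The table this file exports as reiserfs_xattr_handlers (and that the hunk above wires into s->s_xattr) is what the generic xattr layer consults per name prefix, so the refactor's effect can be checked from plain user space. A minimal sketch, assuming a reiserfs mount; the path is illustrative:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char *argv[])
{
	const char *path = argc > 1 ? argv[1] : "/mnt/file"; /* illustrative */
	char names[4096];
	ssize_t len, i;

	len = listxattr(path, names, sizeof(names));
	if (len < 0) {
		/* EOPNOTSUPP here mirrors the -EOPNOTSUPP paths above */
		perror("listxattr");
		return 1;
	}
	/* the buffer holds the attribute names, NUL-separated */
	for (i = 0; i < len; i += (ssize_t)strlen(names + i) + 1) {
		ssize_t vlen = getxattr(path, names + i, NULL, 0);
		printf("%s (%zd bytes)\n", names + i, vlen);
	}
	return 0;
}
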
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 4d3c20e787c3..a92c8792c0f6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -55,8 +55,16 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
55 struct reiserfs_security_handle *sec) 55 struct reiserfs_security_handle *sec)
56{ 56{
57 int blocks = 0; 57 int blocks = 0;
58 int error = security_inode_init_security(inode, dir, &sec->name, 58 int error;
59 &sec->value, &sec->length); 59
60 sec->name = NULL;
61
62 /* Don't add selinux attributes on xattrs - they'll never get used */
63 if (IS_PRIVATE(dir))
64 return 0;
65
66 error = security_inode_init_security(inode, dir, &sec->name,
67 &sec->value, &sec->length);
60 if (error) { 68 if (error) {
61 if (error == -EOPNOTSUPP) 69 if (error == -EOPNOTSUPP)
62 error = 0; 70 error = 0;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
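
Both the deleted driver above and its replacement below parse the same on-disk format: a 16-byte superblock header ("-rom1fs-" magic, big-endian image size and checksum) followed by the NUL-padded volume name. A minimal user-space sketch of the sanity check romfs_fill_super() performs; on a valid image the checksum over the first min(size, 512) bytes sums to zero:

#include <arpa/inet.h>	/* ntohl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* same algorithm as the kernel's romfs_checksum() */
static uint32_t romfs_checksum(const void *data, int size)
{
	const unsigned char *p = data;
	uint32_t sum = 0, word;

	for (; size >= 4; size -= 4, p += 4) {
		memcpy(&word, p, 4);
		sum += ntohl(word);
	}
	return sum;
}

int main(int argc, char *argv[])
{
	unsigned char buf[512];
	uint32_t be, size;
	size_t n;
	FILE *f;

	if (argc != 2 || !(f = fopen(argv[1], "rb"))) {
		fprintf(stderr, "usage: %s <romfs-image>\n", argv[0]);
		return 1;
	}
	n = fread(buf, 1, sizeof(buf), f);
	fclose(f);
	if (n < 16 || memcmp(buf, "-rom1fs-", 8) != 0) {
		fprintf(stderr, "no romfs magic\n");
		return 1;
	}
	memcpy(&be, buf + 8, 4);
	size = ntohl(be);
	printf("volume '%s', %u bytes, checksum %s\n", (char *)buf + 16,
	       size, romfs_checksum(buf, size < n ? size : n) ? "BAD" : "ok");
	return 0;
}

Build with "cc -o romchk romchk.c" and point it at an image, e.g. one made by genromfs.
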
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..95217b830118
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
18 * - only supported for NOMMU at the moment (MMU can't; it doesn't copy
19 * private mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
60 * permit an R/O mapping to be made directly onto an MTD device if
61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
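
Whether a mapping lands directly on the MTD is visible from user space only through mmap() semantics: romfs_mmap() above accepts shared mappings and rejects private ones with -ENOSYS (which, on NOMMU, makes the kernel fall back to copying). A small sketch; the mount point and file path are made up:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	void *p;
	int fd;

	fd = open("/mnt/romfs/etc/motd", O_RDONLY);	/* made-up path */
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}
	/* MAP_SHARED is the case romfs_mmap() permits; on NOMMU with a
	 * directly addressable MTD the returned pointer may reference
	 * the flash itself (XIP) rather than a copy */
	p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	fwrite(p, 1, st.st_size, stdout);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}
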
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..b3208adf8e71
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,293 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
25 * read data from a romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[17];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time, and attempt to grab the
78 * trailing NUL whilst we're at it */
79 buf[0] = 0xff;
80
81 while (size > 0) {
82 segment = min_t(size_t, size + 1, 17);
83 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
84 if (ret < 0)
85 return ret;
86 len--;
87 if (memcmp(buf, str, len) != 0)
88 return 0;
89 buf[0] = buf[len];
90 size -= len;
91 pos += len;
92 str += len;
93 }
94
95 /* check the trailing NUL was there */
96 if (buf[0])
97 return 0;
98
99 return 1;
100}
101#endif /* CONFIG_ROMFS_ON_MTD */
102
103#ifdef CONFIG_ROMFS_ON_BLOCK
104/*
105 * read data from a romfs image on a block device
106 */
107static int romfs_blk_read(struct super_block *sb, unsigned long pos,
108 void *buf, size_t buflen)
109{
110 struct buffer_head *bh;
111 unsigned long offset;
112 size_t segment;
113
114 /* copy the string up to blocksize bytes at a time */
115 while (buflen > 0) {
116 offset = pos & (ROMBSIZE - 1);
117 segment = min_t(size_t, buflen, ROMBSIZE - offset);
118 bh = sb_bread(sb, pos >> ROMBSBITS);
119 if (!bh)
120 return -EIO;
121 memcpy(buf, bh->b_data + offset, segment);
122 brelse(bh);
123 buf += segment;
124 buflen -= segment;
125 pos += segment;
126 }
127
128 return 0;
129}
130
131/*
132 * determine the length of a string in romfs on a block device
133 */
134static ssize_t romfs_blk_strnlen(struct super_block *sb,
135 unsigned long pos, size_t limit)
136{
137 struct buffer_head *bh;
138 unsigned long offset;
139 ssize_t n = 0;
140 size_t segment;
141 u_char *buf, *p;
142
143 /* scan the string up to blocksize bytes at a time */
144 while (limit > 0) {
145 offset = pos & (ROMBSIZE - 1);
146 segment = min_t(size_t, limit, ROMBSIZE - offset);
147 bh = sb_bread(sb, pos >> ROMBSBITS);
148 if (!bh)
149 return -EIO;
150 buf = bh->b_data + offset;
151 p = memchr(buf, 0, segment);
152 brelse(bh);
153 if (p)
154 return n + (p - buf);
155 limit -= segment;
156 pos += segment;
157 n += segment;
158 }
159
160 return n;
161}
162
163/*
164 * compare a string to one in a romfs image on a block device
165 * - return 1 if matched, 0 if differ, -ve if error
166 */
167static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos,
168 const char *str, size_t size)
169{
170 struct buffer_head *bh;
171 unsigned long offset;
172 size_t segment;
173 bool matched, terminated = false;
174
175 /* compare string up to a block at a time */
176 while (size > 0) {
177 offset = pos & (ROMBSIZE - 1);
178 segment = min_t(size_t, size, ROMBSIZE - offset);
179 bh = sb_bread(sb, pos >> ROMBSBITS);
180 if (!bh)
181 return -EIO;
182 matched = (memcmp(bh->b_data + offset, str, segment) == 0);
183
184 size -= segment;
185 pos += segment;
186 str += segment;
187 if (matched && size == 0 && offset + segment < ROMBSIZE) {
188 if (!bh->b_data[offset + segment])
189 terminated = true;
190 else
191 matched = false;
192 }
193 brelse(bh);
194 if (!matched)
195 return 0;
196 }
197
198 if (!terminated) {
199 /* the terminating NUL must be on the first byte of the next
200 * block */
201 BUG_ON((pos & (ROMBSIZE - 1)) != 0);
202 bh = sb_bread(sb, pos >> ROMBSBITS);
203 if (!bh)
204 return -EIO;
205 matched = !bh->b_data[0];
206 brelse(bh);
207 if (!matched)
208 return 0;
209 }
210
211 return 1;
212}
213#endif /* CONFIG_ROMFS_ON_BLOCK */
214
215/*
216 * read data from the romfs image
217 */
218int romfs_dev_read(struct super_block *sb, unsigned long pos,
219 void *buf, size_t buflen)
220{
221 size_t limit;
222
223 limit = romfs_maxsize(sb);
224 if (pos >= limit)
225 return -EIO;
226 if (buflen > limit - pos)
227 buflen = limit - pos;
228
229#ifdef CONFIG_ROMFS_ON_MTD
230 if (sb->s_mtd)
231 return romfs_mtd_read(sb, pos, buf, buflen);
232#endif
233#ifdef CONFIG_ROMFS_ON_BLOCK
234 if (sb->s_bdev)
235 return romfs_blk_read(sb, pos, buf, buflen);
236#endif
237 return -EIO;
238}
239
240/*
241 * determine the length of a string in romfs
242 */
243ssize_t romfs_dev_strnlen(struct super_block *sb,
244 unsigned long pos, size_t maxlen)
245{
246 size_t limit;
247
248 limit = romfs_maxsize(sb);
249 if (pos >= limit)
250 return -EIO;
251 if (maxlen > limit - pos)
252 maxlen = limit - pos;
253
254#ifdef CONFIG_ROMFS_ON_MTD
255 if (sb->s_mtd)
256 return romfs_mtd_strnlen(sb, pos, maxlen);
257#endif
258#ifdef CONFIG_ROMFS_ON_BLOCK
259 if (sb->s_bdev)
260 return romfs_blk_strnlen(sb, pos, maxlen);
261#endif
262 return -EIO;
263}
264
265/*
266 * compare a string to one in romfs
267 * - the string to be compared to, str, may not be NUL-terminated; instead the
268 * string is of the specified size
269 * - return 1 if matched, 0 if differ, -ve if error
270 */
271int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
272 const char *str, size_t size)
273{
274 size_t limit;
275
276 limit = romfs_maxsize(sb);
277 if (pos >= limit)
278 return -EIO;
279 if (size > ROMFS_MAXFN)
280 return -ENAMETOOLONG;
281 if (size + 1 > limit - pos)
282 return -EIO;
283
284#ifdef CONFIG_ROMFS_ON_MTD
285 if (sb->s_mtd)
286 return romfs_mtd_strcmp(sb, pos, str, size);
287#endif
288#ifdef CONFIG_ROMFS_ON_BLOCK
289 if (sb->s_bdev)
290 return romfs_blk_strcmp(sb, pos, str, size);
291#endif
292 return -EIO;
293}
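
The 1/0/-errno convention of romfs_dev_strcmp() keeps the lookup loop in super.c (below) straightforward. The 16-byte file headers these primitives walk are simple enough to follow from user space too: next holds the offset of the following header in its upper bits and the file type in its low four, while spec points a directory at its first entry. A hedged sketch that lists the root directory of an image loaded into memory (it assumes a well-formed image, e.g. one produced by genromfs):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ROMFH_SIZE	16
#define ROMFH_MASK	(~15UL)
#define ROMFH_TYPE	7

static uint32_t be32_at(const unsigned char *p)
{
	uint32_t v;
	memcpy(&v, p, 4);
	return ntohl(v);
}

int main(int argc, char *argv[])
{
	static const char *types[] = { "hardlink", "dir", "file", "symlink",
				       "blockdev", "chardev", "socket", "fifo" };
	unsigned char *img;
	unsigned long pos;
	long len;
	FILE *f;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 1;
	fseek(f, 0, SEEK_END);
	len = ftell(f);
	rewind(f);
	img = malloc(len);
	if (!img || fread(img, 1, len, f) != (size_t)len)
		return 1;
	fclose(f);

	/* the root directory follows the superblock header + volume name,
	 * rounded up to 16 bytes: the same formula as romfs_fill_super() */
	pos = (ROMFH_SIZE + strlen((char *)img + 16) + 1 + 15) & ROMFH_MASK;
	/* the root header's spec field points at its first entry */
	pos = be32_at(img + pos + 4) & ROMFH_MASK;

	while (pos && pos < (unsigned long)len) {
		uint32_t next = be32_at(img + pos);

		printf("%8lu  %-8s  %10u  %s\n", pos,
		       types[next & ROMFH_TYPE], be32_at(img + pos + 8),
		       (char *)img + pos + ROMFH_SIZE);
		pos = next & ROMFH_MASK;
	}
	free(img);
	return 0;
}
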
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..4ab3c03d8f95
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,654 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * in readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
77static struct kmem_cache *romfs_inode_cachep;
78
79static const umode_t romfs_modemap[8] = {
80 0, /* hard link */
81 S_IFDIR | 0644, /* directory */
82 S_IFREG | 0644, /* regular file */
83 S_IFLNK | 0777, /* symlink */
84 S_IFBLK | 0600, /* blockdev */
85 S_IFCHR | 0600, /* chardev */
86 S_IFSOCK | 0644, /* socket */
87 S_IFIFO | 0644 /* FIFO */
88};
89
90static const unsigned char romfs_dtype_table[] = {
91 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
92};
93
94static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
141static const struct address_space_operations romfs_aops = {
142 .readpage = romfs_readpage
143};
144
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
270 * (Although as far as I can see, it only matters on writable file
271 * systems).
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301 unsigned nextfh;
302 int ret;
303 umode_t mode;
304
305 /* we might have to traverse a chain of "hard link" file entries to get
306 * to the actual file */
307 for (;;) {
308 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
309 if (ret < 0)
310 goto error;
311
312 /* XXX: do romfs_checksum here too (with name) */
313
314 nextfh = be32_to_cpu(ri.next);
315 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
316 break;
317
318 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
319 }
320
321 /* determine the length of the filename */
322 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
323 if (IS_ERR_VALUE(nlen))
324 goto eio;
325
326 /* get an inode for this image position */
327 i = iget_locked(sb, pos);
328 if (!i)
329 return ERR_PTR(-ENOMEM);
330
331 if (!(i->i_state & I_NEW))
332 return i;
333
334 /* precalculate the data offset */
335 inode = ROMFS_I(i);
336 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
337 inode->i_dataoffset = pos + inode->i_metasize;
338
339 i->i_nlink = 1; /* Hard to decide.. */
340 i->i_size = be32_to_cpu(ri.size);
341 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
342 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
343
344 /* set up mode and ops */
345 mode = romfs_modemap[nextfh & ROMFH_TYPE];
346
347 switch (nextfh & ROMFH_TYPE) {
348 case ROMFH_DIR:
349 i->i_size = ROMFS_I(i)->i_metasize;
350 i->i_op = &romfs_dir_inode_operations;
351 i->i_fop = &romfs_dir_operations;
352 if (nextfh & ROMFH_EXEC)
353 mode |= S_IXUGO;
354 break;
355 case ROMFH_REG:
356 i->i_fop = &romfs_ro_fops;
357 i->i_data.a_ops = &romfs_aops;
358 if (i->i_sb->s_mtd)
359 i->i_data.backing_dev_info =
360 i->i_sb->s_mtd->backing_dev_info;
361 if (nextfh & ROMFH_EXEC)
362 mode |= S_IXUGO;
363 break;
364 case ROMFH_SYM:
365 i->i_op = &page_symlink_inode_operations;
366 i->i_data.a_ops = &romfs_aops;
367 mode |= S_IRWXUGO;
368 break;
369 default:
370 /* depending on MBZ for sock/fifos */
371 nextfh = be32_to_cpu(ri.spec);
372 init_special_inode(i, mode, MKDEV(nextfh >> 16,
373 nextfh & 0xffff));
374 break;
375 }
376
377 i->i_mode = mode;
378
379 unlock_new_inode(i);
380 return i;
381
382eio:
383 ret = -EIO;
384error:
385 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
386 return ERR_PTR(ret);
387}
388
389/*
390 * allocate a new inode
391 */
392static struct inode *romfs_alloc_inode(struct super_block *sb)
393{
394 struct romfs_inode_info *inode;
395 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
396 return inode ? &inode->vfs_inode : NULL;
397}
398
399/*
400 * return a spent inode to the slab cache
401 */
402static void romfs_destroy_inode(struct inode *inode)
403{
404 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
405}
406
407/*
408 * get filesystem statistics
409 */
410static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
411{
412 struct super_block *sb = dentry->d_sb;
413 u64 id = sb->s_bdev ? huge_encode_dev(sb->s_bdev->bd_dev) : 0;
414
415 buf->f_type = ROMFS_MAGIC;
416 buf->f_namelen = ROMFS_MAXFN;
417 buf->f_bsize = ROMBSIZE;
418 buf->f_bfree = buf->f_bavail = buf->f_ffree;
419 buf->f_blocks =
420 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
421 buf->f_fsid.val[0] = (u32)id;
422 buf->f_fsid.val[1] = (u32)(id >> 32);
423 return 0;
424}
425
426/*
427 * remounting must involve read-only
428 */
429static int romfs_remount(struct super_block *sb, int *flags, char *data)
430{
431 *flags |= MS_RDONLY;
432 return 0;
433}
434
435static const struct super_operations romfs_super_ops = {
436 .alloc_inode = romfs_alloc_inode,
437 .destroy_inode = romfs_destroy_inode,
438 .statfs = romfs_statfs,
439 .remount_fs = romfs_remount,
440};
441
442/*
443 * checksum check on part of a romfs filesystem
444 */
445static __u32 romfs_checksum(const void *data, int size)
446{
447 const __be32 *ptr = data;
448 __u32 sum;
449
450 sum = 0;
451 size >>= 2;
452 while (size > 0) {
453 sum += be32_to_cpu(*ptr++);
454 size--;
455 }
456 return sum;
457}
458
459/*
460 * fill in the superblock
461 */
462static int romfs_fill_super(struct super_block *sb, void *data, int silent)
463{
464 struct romfs_super_block *rsb;
465 struct inode *root;
466 unsigned long pos, img_size;
467 const char *storage;
468 size_t len;
469 int ret;
470
471#ifdef CONFIG_BLOCK
472 if (!sb->s_mtd) {
473 sb_set_blocksize(sb, ROMBSIZE);
474 } else {
475 sb->s_blocksize = ROMBSIZE;
476 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
477 }
478#endif
479
480 sb->s_maxbytes = 0xFFFFFFFF;
481 sb->s_magic = ROMFS_MAGIC;
482 sb->s_flags |= MS_RDONLY | MS_NOATIME;
483 sb->s_op = &romfs_super_ops;
484
485 /* read the image superblock and check it */
486 rsb = kmalloc(512, GFP_KERNEL);
487 if (!rsb)
488 return -ENOMEM;
489
490 sb->s_fs_info = (void *) 512;
491 ret = romfs_dev_read(sb, 0, rsb, 512);
492 if (ret < 0)
493 goto error_rsb;
494
495 img_size = be32_to_cpu(rsb->size);
496
497 if (sb->s_mtd && img_size > sb->s_mtd->size)
498 goto error_rsb_inval;
499
500 sb->s_fs_info = (void *) img_size;
501
502 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
503 img_size < ROMFH_SIZE) {
504 if (!silent)
505 printk(KERN_WARNING "VFS:"
506 " Can't find a romfs filesystem on dev %s.\n",
507 sb->s_id);
508 goto error_rsb_inval;
509 }
510
511 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
512 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
513 sb->s_id);
514 goto error_rsb_inval;
515 }
516
517 storage = sb->s_mtd ? "MTD" : "the block layer";
518
519 len = strnlen(rsb->name, ROMFS_MAXFN);
520 if (!silent)
521 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
522 (unsigned) len, (unsigned) len, rsb->name, storage);
523
524 kfree(rsb);
525 rsb = NULL;
526
527 /* find the root directory */
528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
529
530 root = romfs_iget(sb, pos);
531 if (IS_ERR(root))
532 goto error;
533
534 sb->s_root = d_alloc_root(root);
535 if (!sb->s_root)
536 goto error_i;
537
538 return 0;
539
540error_i:
541 iput(root);
542error:
543 return -EINVAL;
544error_rsb_inval:
545 ret = -EINVAL;
546error_rsb:
547 return ret;
548}
549
550/*
551 * get a superblock for mounting
552 */
553static int romfs_get_sb(struct file_system_type *fs_type,
554 int flags, const char *dev_name,
555 void *data, struct vfsmount *mnt)
556{
557 int ret = -EINVAL;
558
559#ifdef CONFIG_ROMFS_ON_MTD
560 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
561 mnt);
562#endif
563#ifdef CONFIG_ROMFS_ON_BLOCK
564 if (ret == -EINVAL)
565 ret = get_sb_bdev(fs_type, flags, dev_name, data,
566 romfs_fill_super, mnt);
567#endif
568 return ret;
569}
570
571/*
572 * destroy a romfs superblock in the appropriate manner
573 */
574static void romfs_kill_sb(struct super_block *sb)
575{
576#ifdef CONFIG_ROMFS_ON_MTD
577 if (sb->s_mtd) {
578 kill_mtd_super(sb);
579 return;
580 }
581#endif
582#ifdef CONFIG_ROMFS_ON_BLOCK
583 if (sb->s_bdev) {
584 kill_block_super(sb);
585 return;
586 }
587#endif
588}
589
590static struct file_system_type romfs_fs_type = {
591 .owner = THIS_MODULE,
592 .name = "romfs",
593 .get_sb = romfs_get_sb,
594 .kill_sb = romfs_kill_sb,
595 .fs_flags = FS_REQUIRES_DEV,
596};
597
598/*
599 * inode storage initialiser
600 */
601static void romfs_i_init_once(void *_inode)
602{
603 struct romfs_inode_info *inode = _inode;
604
605 inode_init_once(&inode->vfs_inode);
606}
607
608/*
609 * romfs module initialisation
610 */
611static int __init init_romfs_fs(void)
612{
613 int ret;
614
615 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
616
617 romfs_inode_cachep =
618 kmem_cache_create("romfs_i",
619 sizeof(struct romfs_inode_info), 0,
620 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
621 romfs_i_init_once);
622
623 if (!romfs_inode_cachep) {
624 printk(KERN_ERR
625 "ROMFS error: Failed to initialise inode cache\n");
626 return -ENOMEM;
627 }
628 ret = register_filesystem(&romfs_fs_type);
629 if (ret) {
630 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
631 goto error_register;
632 }
633 return 0;
634
635error_register:
636 kmem_cache_destroy(romfs_inode_cachep);
637 return ret;
638}
639
640/*
641 * romfs module removal
642 */
643static void __exit exit_romfs_fs(void)
644{
645 unregister_filesystem(&romfs_fs_type);
646 kmem_cache_destroy(romfs_inode_cachep);
647}
648
649module_init(init_romfs_fs);
650module_exit(exit_romfs_fs);
651
652MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
653MODULE_AUTHOR("Red Hat, Inc.");
654MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for this tag */
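
romfs_iget() above derives i_metasize and i_dataoffset purely from the header position and the padded name length. A worked example with assumed numbers (a file header at offset 96 whose name is the 11-character "hello_world"):

#include <stdio.h>

int main(void)
{
	unsigned long pos = 96;		/* assumed header position */
	unsigned long namelen = 11;	/* strlen("hello_world") */
	/* header + name + NUL, rounded up to the next 16-byte boundary */
	unsigned long metasize = (16 + namelen + 1 + 15) & ~15UL;
	unsigned long dataoffset = pos + metasize;

	/* prints "metasize=32 dataoffset=128" */
	printf("metasize=%lu dataoffset=%lu\n", metasize, dataoffset);
	return 0;
}
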
diff --git a/fs/splice.c b/fs/splice.c
index dd727d43e5b7..666953d59a35 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
182 do_wakeup = 0; 182 do_wakeup = 0;
183 page_nr = 0; 183 page_nr = 0;
184 184
185 if (pipe->inode) 185 pipe_lock(pipe);
186 mutex_lock(&pipe->inode->i_mutex);
187 186
188 for (;;) { 187 for (;;) {
189 if (!pipe->readers) { 188 if (!pipe->readers) {
@@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
245 pipe->waiting_writers--; 244 pipe->waiting_writers--;
246 } 245 }
247 246
248 if (pipe->inode) { 247 pipe_unlock(pipe);
249 mutex_unlock(&pipe->inode->i_mutex);
250 248
251 if (do_wakeup) { 249 if (do_wakeup) {
252 smp_mb(); 250 smp_mb();
253 if (waitqueue_active(&pipe->wait)) 251 if (waitqueue_active(&pipe->wait))
254 wake_up_interruptible(&pipe->wait); 252 wake_up_interruptible(&pipe->wait);
255 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 253 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
256 }
257 } 254 }
258 255
259 while (page_nr < spd_pages) 256 while (page_nr < spd_pages)
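
The two hunks above swap the open-coded conditional i_mutex handling for the pipe_lock()/pipe_unlock() helpers that came in via the other side of this merge. Their bodies are not part of this diff; presumably they centralize exactly the conditional being removed, roughly as sketched here (kernel context, not verbatim fs/pipe.c):

#include <linux/mutex.h>
#include <linux/pipe_fs_i.h>

void pipe_lock(struct pipe_inode_info *pipe)
{
	/* splice's internal pipes have no backing inode, in which case
	 * there is nothing to lock */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);
}

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
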
@@ -555,8 +552,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
555 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
556 * a new page in the output file page cache and fill/dirty that. 553 * a new page in the output file page cache and fill/dirty that.
557 */ 554 */
558static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 555int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
559 struct splice_desc *sd) 556 struct splice_desc *sd)
560{ 557{
561 struct file *file = sd->u.file; 558 struct file *file = sd->u.file;
562 struct address_space *mapping = file->f_mapping; 559 struct address_space *mapping = file->f_mapping;
@@ -600,108 +597,177 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
600out: 597out:
601 return ret; 598 return ret;
602} 599}
600EXPORT_SYMBOL(pipe_to_file);
601
602static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
603{
604 smp_mb();
605 if (waitqueue_active(&pipe->wait))
606 wake_up_interruptible(&pipe->wait);
607 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
608}
603 609
604/** 610/**
605 * __splice_from_pipe - splice data from a pipe to given actor 611 * splice_from_pipe_feed - feed available data from a pipe to a file
606 * @pipe: pipe to splice from 612 * @pipe: pipe to splice from
607 * @sd: information to @actor 613 * @sd: information to @actor
608 * @actor: handler that splices the data 614 * @actor: handler that splices the data
609 * 615 *
610 * Description: 616 * Description:
611 * This function does little more than loop over the pipe and call 617 * This function loops over the pipe and calls @actor to do the
612 * @actor to do the actual moving of a single struct pipe_buffer to 618 * actual moving of a single struct pipe_buffer to the desired
613 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 619 * destination. It returns when there's no more buffers left in
614 * pipe_to_user. 620 * the pipe or if the requested number of bytes (@sd->total_len)
621 * have been copied. It returns a positive number (one) if the
622 * pipe needs to be filled with more data, zero if the required
623 * number of bytes have been copied and -errno on error.
615 * 624 *
625 * This, together with splice_from_pipe_{begin,end,next}, may be
626 * used to implement the functionality of __splice_from_pipe() when
627 * locking is required around copying the pipe buffers to the
628 * destination.
616 */ 629 */
617ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 630int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
618 splice_actor *actor) 631 splice_actor *actor)
619{ 632{
620 int ret, do_wakeup, err; 633 int ret;
621
622 ret = 0;
623 do_wakeup = 0;
624
625 for (;;) {
626 if (pipe->nrbufs) {
627 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
628 const struct pipe_buf_operations *ops = buf->ops;
629 634
630 sd->len = buf->len; 635 while (pipe->nrbufs) {
631 if (sd->len > sd->total_len) 636 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
632 sd->len = sd->total_len; 637 const struct pipe_buf_operations *ops = buf->ops;
633 638
634 err = actor(pipe, buf, sd); 639 sd->len = buf->len;
635 if (err <= 0) { 640 if (sd->len > sd->total_len)
636 if (!ret && err != -ENODATA) 641 sd->len = sd->total_len;
637 ret = err;
638 642
639 break; 643 ret = actor(pipe, buf, sd);
640 } 644 if (ret <= 0) {
645 if (ret == -ENODATA)
646 ret = 0;
647 return ret;
648 }
649 buf->offset += ret;
650 buf->len -= ret;
641 651
642 ret += err; 652 sd->num_spliced += ret;
643 buf->offset += err; 653 sd->len -= ret;
644 buf->len -= err; 654 sd->pos += ret;
655 sd->total_len -= ret;
645 656
646 sd->len -= err; 657 if (!buf->len) {
647 sd->pos += err; 658 buf->ops = NULL;
648 sd->total_len -= err; 659 ops->release(pipe, buf);
649 if (sd->len) 660 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
650 continue; 661 pipe->nrbufs--;
662 if (pipe->inode)
663 sd->need_wakeup = true;
664 }
651 665
652 if (!buf->len) { 666 if (!sd->total_len)
653 buf->ops = NULL; 667 return 0;
654 ops->release(pipe, buf); 668 }
655 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
656 pipe->nrbufs--;
657 if (pipe->inode)
658 do_wakeup = 1;
659 }
660 669
661 if (!sd->total_len) 670 return 1;
662 break; 671}
663 } 672EXPORT_SYMBOL(splice_from_pipe_feed);
664 673
665 if (pipe->nrbufs) 674/**
666 continue; 675 * splice_from_pipe_next - wait for some data to splice from
676 * @pipe: pipe to splice from
677 * @sd: information about the splice operation
678 *
679 * Description:
680 * This function will wait for some data and return a positive
681 * value (one) if pipe buffers are available. It will return zero
682 * or -errno if no more data needs to be spliced.
683 */
684int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
685{
686 while (!pipe->nrbufs) {
667 if (!pipe->writers) 687 if (!pipe->writers)
668 break; 688 return 0;
669 if (!pipe->waiting_writers) {
670 if (ret)
671 break;
672 }
673 689
674 if (sd->flags & SPLICE_F_NONBLOCK) { 690 if (!pipe->waiting_writers && sd->num_spliced)
675 if (!ret) 691 return 0;
676 ret = -EAGAIN;
677 break;
678 }
679 692
680 if (signal_pending(current)) { 693 if (sd->flags & SPLICE_F_NONBLOCK)
681 if (!ret) 694 return -EAGAIN;
682 ret = -ERESTARTSYS;
683 break;
684 }
685 695
686 if (do_wakeup) { 696 if (signal_pending(current))
687 smp_mb(); 697 return -ERESTARTSYS;
688 if (waitqueue_active(&pipe->wait)) 698
689 wake_up_interruptible_sync(&pipe->wait); 699 if (sd->need_wakeup) {
690 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 700 wakeup_pipe_writers(pipe);
691 do_wakeup = 0; 701 sd->need_wakeup = false;
692 } 702 }
693 703
694 pipe_wait(pipe); 704 pipe_wait(pipe);
695 } 705 }
696 706
697 if (do_wakeup) { 707 return 1;
698 smp_mb(); 708}
699 if (waitqueue_active(&pipe->wait)) 709EXPORT_SYMBOL(splice_from_pipe_next);
700 wake_up_interruptible(&pipe->wait);
701 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
702 }
703 710
704 return ret; 711/**
712 * splice_from_pipe_begin - start splicing from pipe
713 * @sd: information about the splice operation
714 *
715 * Description:
716 * This function should be called before a loop containing
717 * splice_from_pipe_next() and splice_from_pipe_feed() to
718 * initialize the necessary fields of @sd.
719 */
720void splice_from_pipe_begin(struct splice_desc *sd)
721{
722 sd->num_spliced = 0;
723 sd->need_wakeup = false;
724}
725EXPORT_SYMBOL(splice_from_pipe_begin);
726
727/**
728 * splice_from_pipe_end - finish splicing from pipe
729 * @pipe: pipe to splice from
730 * @sd: information about the splice operation
731 *
732 * Description:
733 * This function will wake up pipe writers if necessary. It should
734 * be called after a loop containing splice_from_pipe_next() and
735 * splice_from_pipe_feed().
736 */
737void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
738{
739 if (sd->need_wakeup)
740 wakeup_pipe_writers(pipe);
741}
742EXPORT_SYMBOL(splice_from_pipe_end);
743
744/**
745 * __splice_from_pipe - splice data from a pipe to given actor
746 * @pipe: pipe to splice from
747 * @sd: information to @actor
748 * @actor: handler that splices the data
749 *
750 * Description:
751 * This function does little more than loop over the pipe and call
752 * @actor to do the actual moving of a single struct pipe_buffer to
753 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
754 * pipe_to_user.
755 *
756 */
757ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
758 splice_actor *actor)
759{
760 int ret;
761
762 splice_from_pipe_begin(sd);
763 do {
764 ret = splice_from_pipe_next(pipe, sd);
765 if (ret > 0)
766 ret = splice_from_pipe_feed(pipe, sd, actor);
767 } while (ret > 0);
768 splice_from_pipe_end(pipe, sd);
769
770 return sd->num_spliced ? sd->num_spliced : ret;
705} 771}
706EXPORT_SYMBOL(__splice_from_pipe); 772EXPORT_SYMBOL(__splice_from_pipe);
707 773
@@ -715,7 +781,7 @@ EXPORT_SYMBOL(__splice_from_pipe);
715 * @actor: handler that splices the data 781 * @actor: handler that splices the data
716 * 782 *
717 * Description: 783 * Description:
718 * See __splice_from_pipe. This function locks the input and output inodes, 784 * See __splice_from_pipe. This function locks the pipe inode,
719 * otherwise it's identical to __splice_from_pipe(). 785 * otherwise it's identical to __splice_from_pipe().
720 * 786 *
721 */ 787 */
@@ -724,7 +790,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
724 splice_actor *actor) 790 splice_actor *actor)
725{ 791{
726 ssize_t ret; 792 ssize_t ret;
727 struct inode *inode = out->f_mapping->host;
728 struct splice_desc sd = { 793 struct splice_desc sd = {
729 .total_len = len, 794 .total_len = len,
730 .flags = flags, 795 .flags = flags,
@@ -732,21 +797,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
732 .u.file = out, 797 .u.file = out,
733 }; 798 };
734 799
735 /* 800 pipe_lock(pipe);
736 * The actor worker might be calling ->write_begin and
737 * ->write_end. Most of the time, these expect i_mutex to
738 * be held. Since this may result in an ABBA deadlock with
739 * pipe->inode, we have to order lock acquiry here.
740 */
741 inode_double_lock(inode, pipe->inode);
742 ret = __splice_from_pipe(pipe, &sd, actor); 801 ret = __splice_from_pipe(pipe, &sd, actor);
743 inode_double_unlock(inode, pipe->inode); 802 pipe_unlock(pipe);
744 803
745 return ret; 804 return ret;
746} 805}
747 806
748/** 807/**
749 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 808 * generic_file_splice_write - splice data from a pipe to a file
750 * @pipe: pipe info 809 * @pipe: pipe info
751 * @out: file to write to 810 * @out: file to write to
752 * @ppos: position in @out 811 * @ppos: position in @out
@@ -755,13 +814,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
755 * 814 *
756 * Description: 815 * Description:
757 * Will either move or copy pages (determined by @flags options) from 816 * Will either move or copy pages (determined by @flags options) from
758 * the given pipe inode to the given file. The caller is responsible 817 * the given pipe inode to the given file.
759 * for acquiring i_mutex on both inodes.
760 * 818 *
761 */ 819 */
762ssize_t 820ssize_t
763generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 821generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
764 loff_t *ppos, size_t len, unsigned int flags) 822 loff_t *ppos, size_t len, unsigned int flags)
765{ 823{
766 struct address_space *mapping = out->f_mapping; 824 struct address_space *mapping = out->f_mapping;
767 struct inode *inode = mapping->host; 825 struct inode *inode = mapping->host;
@@ -772,70 +830,28 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
772 .u.file = out, 830 .u.file = out,
773 }; 831 };
774 ssize_t ret; 832 ssize_t ret;
775 int err;
776
777 err = file_remove_suid(out);
778 if (unlikely(err))
779 return err;
780
781 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
782 if (ret > 0) {
783 unsigned long nr_pages;
784 833
785 *ppos += ret; 834 pipe_lock(pipe);
786 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
787 835
788 /* 836 splice_from_pipe_begin(&sd);
789 * If file or inode is SYNC and we actually wrote some data, 837 do {
790 * sync it. 838 ret = splice_from_pipe_next(pipe, &sd);
791 */ 839 if (ret <= 0)
792 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 840 break;
793 err = generic_osync_inode(inode, mapping,
794 OSYNC_METADATA|OSYNC_DATA);
795
796 if (err)
797 ret = err;
798 }
799 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
800 }
801 841
802 return ret; 842 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
803} 843 ret = file_remove_suid(out);
844 if (!ret)
845 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
846 mutex_unlock(&inode->i_mutex);
847 } while (ret > 0);
848 splice_from_pipe_end(pipe, &sd);
804 849
805EXPORT_SYMBOL(generic_file_splice_write_nolock); 850 pipe_unlock(pipe);
806 851
807/** 852 if (sd.num_spliced)
808 * generic_file_splice_write - splice data from a pipe to a file 853 ret = sd.num_spliced;
809 * @pipe: pipe info
810 * @out: file to write to
811 * @ppos: position in @out
812 * @len: number of bytes to splice
813 * @flags: splice modifier flags
814 *
815 * Description:
816 * Will either move or copy pages (determined by @flags options) from
817 * the given pipe inode to the given file.
818 *
819 */
820ssize_t
821generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
822 loff_t *ppos, size_t len, unsigned int flags)
823{
824 struct address_space *mapping = out->f_mapping;
825 struct inode *inode = mapping->host;
826 struct splice_desc sd = {
827 .total_len = len,
828 .flags = flags,
829 .pos = *ppos,
830 .u.file = out,
831 };
832 ssize_t ret;
833 854
834 inode_double_lock(inode, pipe->inode);
835 ret = file_remove_suid(out);
836 if (likely(!ret))
837 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
838 inode_double_unlock(inode, pipe->inode);
839 if (ret > 0) { 855 if (ret > 0) {
840 unsigned long nr_pages; 856 unsigned long nr_pages;
841 857
@@ -1324,8 +1340,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1324 if (!pipe) 1340 if (!pipe)
1325 return -EBADF; 1341 return -EBADF;
1326 1342
1327 if (pipe->inode) 1343 pipe_lock(pipe);
1328 mutex_lock(&pipe->inode->i_mutex);
1329 1344
1330 error = ret = 0; 1345 error = ret = 0;
1331 while (nr_segs) { 1346 while (nr_segs) {
@@ -1380,8 +1395,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1380 iov++; 1395 iov++;
1381 } 1396 }
1382 1397
1383 if (pipe->inode) 1398 pipe_unlock(pipe);
1384 mutex_unlock(&pipe->inode->i_mutex);
1385 1399
1386 if (!ret) 1400 if (!ret)
1387 ret = error; 1401 ret = error;
@@ -1509,7 +1523,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1509 return 0; 1523 return 0;
1510 1524
1511 ret = 0; 1525 ret = 0;
1512 mutex_lock(&pipe->inode->i_mutex); 1526 pipe_lock(pipe);
1513 1527
1514 while (!pipe->nrbufs) { 1528 while (!pipe->nrbufs) {
1515 if (signal_pending(current)) { 1529 if (signal_pending(current)) {
@@ -1527,7 +1541,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1527 pipe_wait(pipe); 1541 pipe_wait(pipe);
1528 } 1542 }
1529 1543
1530 mutex_unlock(&pipe->inode->i_mutex); 1544 pipe_unlock(pipe);
1531 return ret; 1545 return ret;
1532} 1546}
1533 1547
@@ -1547,7 +1561,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1547 return 0; 1561 return 0;
1548 1562
1549 ret = 0; 1563 ret = 0;
1550 mutex_lock(&pipe->inode->i_mutex); 1564 pipe_lock(pipe);
1551 1565
1552 while (pipe->nrbufs >= PIPE_BUFFERS) { 1566 while (pipe->nrbufs >= PIPE_BUFFERS) {
1553 if (!pipe->readers) { 1567 if (!pipe->readers) {
@@ -1568,7 +1582,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1568 pipe->waiting_writers--; 1582 pipe->waiting_writers--;
1569 } 1583 }
1570 1584
1571 mutex_unlock(&pipe->inode->i_mutex); 1585 pipe_unlock(pipe);
1572 return ret; 1586 return ret;
1573} 1587}
1574 1588
@@ -1584,10 +1598,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1584 1598
1585 /* 1599 /*
1586 * Potential ABBA deadlock, work around it by ordering lock 1600 * Potential ABBA deadlock, work around it by ordering lock
1587 * grabbing by inode address. Otherwise two different processes 1601 * grabbing by pipe info address. Otherwise two different processes
1588 * could deadlock (one doing tee from A -> B, the other from B -> A). 1602 * could deadlock (one doing tee from A -> B, the other from B -> A).
1589 */ 1603 */
1590 inode_double_lock(ipipe->inode, opipe->inode); 1604 pipe_double_lock(ipipe, opipe);
1591 1605
1592 do { 1606 do {
1593 if (!opipe->readers) { 1607 if (!opipe->readers) {
@@ -1638,7 +1652,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1638 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1652 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1639 ret = -EAGAIN; 1653 ret = -EAGAIN;
1640 1654
1641 inode_double_unlock(ipipe->inode, opipe->inode); 1655 pipe_unlock(ipipe);
1656 pipe_unlock(opipe);
1642 1657
1643 /* 1658 /*
1644 * If we put data in the output pipe, wakeup any potential readers. 1659 * If we put data in the output pipe, wakeup any potential readers.
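
The hunks above replace the ad-hoc i_mutex locking with pipe_lock()/pipe_unlock() and split __splice_from_pipe() into begin/next/feed/end helpers so callers can take their own locks around the copy. A hedged sketch of the resulting caller-side loop (the actor and function names are hypothetical; the shape follows generic_file_splice_write() above):

/* Sketch only: composing the new helpers with a caller-held lock. */
static ssize_t example_splice_write(struct pipe_inode_info *pipe,
				    struct splice_desc *sd,
				    splice_actor *example_actor)
{
	ssize_t ret;

	pipe_lock(pipe);
	splice_from_pipe_begin(sd);
	do {
		ret = splice_from_pipe_next(pipe, sd);	/* wait for buffers */
		if (ret <= 0)
			break;
		/* take the subsystem lock here, then move the data */
		ret = splice_from_pipe_feed(pipe, sd, example_actor);
		/* drop the subsystem lock here */
	} while (ret > 0);
	splice_from_pipe_end(pipe, sd);			/* wake writers */
	pipe_unlock(pipe);

	return sd->num_spliced ? sd->num_spliced : ret;
}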
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 8258cf9a0317..70e3244fa30f 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,4 +5,3 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o
-#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1c4739e33af6..40c98fa6b5d6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -252,6 +252,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
 	cache->entries = entries;
 	cache->block_size = block_size;
 	cache->pages = block_size >> PAGE_CACHE_SHIFT;
+	cache->pages = cache->pages ? cache->pages : 1;
 	cache->name = name;
 	cache->num_waiters = 0;
 	spin_lock_init(&cache->lock);
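
The added clamp guards the case where the filesystem block size is smaller than the system page size, in which the right shift yields zero pages. Illustrative arithmetic, assuming 4K pages (PAGE_CACHE_SHIFT == 12):

/* Illustration only: why at least one page must be kept. */
static unsigned int cache_pages(unsigned int block_size)
{
	unsigned int pages = block_size >> 12;	/* e.g. 2048 >> 12 == 0 */

	return pages ? pages : 1;		/* the cache needs >= 1 page */
}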
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
 #include <linux/zlib.h>
+#include <linux/slab.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index ffa6edcd2d0c..0adc624c956f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -157,6 +157,16 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
 		goto failed_mount;
 
+	/*
+	 * Check the system page size is not larger than the filesystem
+	 * block size (by default 128K).  This is currently not supported.
+	 */
+	if (PAGE_CACHE_SIZE > msblk->block_size) {
+		ERROR("Page size > filesystem block size (%d).  This is "
+			"currently not supported!\n", msblk->block_size);
+		goto failed_mount;
+	}
+
 	msblk->block_log = le16_to_cpu(sblk->block_log);
 	if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
 		goto failed_mount;
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a0cfb5..075694e31d8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
+int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
-	struct path path;
-	int error;
+	struct file *f = fget(fd);
+	int error = -EBADF;
 
-	error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
-	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
-		path_put(&path);
+	if (f) {
+		error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
+		fput(f);
 	}
 	return error;
 }
+EXPORT_SYMBOL(vfs_fstat);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
 {
-	return vfs_stat_fd(AT_FDCWD, name, stat);
-}
+	struct path path;
+	int error = -EINVAL;
+	int lookup_flags = 0;
 
-EXPORT_SYMBOL(vfs_stat);
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
 
-int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
-{
-	struct path path;
-	int error;
+	if (!(flag & AT_SYMLINK_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
 
-	error = user_path_at(dfd, name, 0, &path);
-	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
-		path_put(&path);
-	}
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	error = vfs_getattr(path.mnt, path.dentry, stat);
+	path_put(&path);
+out:
 	return error;
 }
+EXPORT_SYMBOL(vfs_fstatat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_stat(char __user *name, struct kstat *stat)
 {
-	return vfs_lstat_fd(AT_FDCWD, name, stat);
+	return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
+EXPORT_SYMBOL(vfs_stat);
 
-EXPORT_SYMBOL(vfs_lstat);
-
-int vfs_fstat(unsigned int fd, struct kstat *stat)
+int vfs_lstat(char __user *name, struct kstat *stat)
 {
-	struct file *f = fget(fd);
-	int error = -EBADF;
-
-	if (f) {
-		error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
-		fput(f);
-	}
-	return error;
+	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
 }
+EXPORT_SYMBOL(vfs_lstat);
 
-EXPORT_SYMBOL(vfs_fstat);
 
 #ifdef __ARCH_WANT_OLD_STAT
 
@@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
 SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_old_stat(&stat, statbuf);
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_old_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_old_stat(&stat, statbuf);
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_old_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
@@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error = vfs_stat(filename, &stat);
 
-	return error;
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_new_stat(&stat, statbuf);
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
@@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 #endif
 
@@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat64(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat64(&stat, statbuf);
 }
 #endif /* __ARCH_WANT_STAT64 */
 
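
The rewrite funnels stat, lstat, and the *statat syscalls through one helper, vfs_fstatat(), whose flag argument selects the lookup behaviour. The userspace fstatat(2) interface mirrors this exactly; a small illustrative program (the path is arbitrary):

#include <fcntl.h>	/* AT_FDCWD, AT_SYMLINK_NOFOLLOW */
#include <sys/stat.h>
#include <stdio.h>

int main(void)
{
	struct stat st;

	/* Like vfs_stat(): follow symlinks. */
	if (fstatat(AT_FDCWD, "/etc/passwd", &st, 0) == 0)
		printf("stat size: %lld\n", (long long)st.st_size);

	/* Like vfs_lstat(): do not follow symlinks. */
	if (fstatat(AT_FDCWD, "/etc/passwd", &st, AT_SYMLINK_NOFOLLOW) == 0)
		printf("lstat size: %lld\n", (long long)st.st_size);

	return 0;
}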
diff --git a/fs/super.c b/fs/super.c
index 77cb4ec919b9..1943fdf655fa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -208,6 +208,34 @@ void deactivate_super(struct super_block *s)
 EXPORT_SYMBOL(deactivate_super);
 
 /**
+ *	deactivate_locked_super	-	drop an active reference to superblock
+ *	@s: superblock to deactivate
+ *
+ *	Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
+ *	it does not unlock it until it's all over.  As the result, it's safe to
+ *	use to dispose of new superblock on ->get_sb() failure exits - nobody
+ *	will see the sucker until it's all over.  Equivalent using up_write +
+ *	deactivate_super is safe for that purpose only if superblock is either
+ *	safe to use or has NULL ->s_root when we unlock.
+ */
+void deactivate_locked_super(struct super_block *s)
+{
+	struct file_system_type *fs = s->s_type;
+	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
+		s->s_count -= S_BIAS-1;
+		spin_unlock(&sb_lock);
+		vfs_dq_off(s, 0);
+		fs->kill_sb(s);
+		put_filesystem(fs);
+		put_super(s);
+	} else {
+		up_write(&s->s_umount);
+	}
+}
+
+EXPORT_SYMBOL(deactivate_locked_super);
+
+/**
  *	grab_super - acquire an active reference
  *	@s: reference we are trying to make active
  *
@@ -771,6 +799,45 @@ void kill_litter_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_litter_super);
 
+static int ns_test_super(struct super_block *sb, void *data)
+{
+	return sb->s_fs_info == data;
+}
+
+static int ns_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct super_block *sb;
+
+	sb = sget(fs_type, ns_test_super, ns_set_super, data);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!sb->s_root) {
+		int err;
+		sb->s_flags = flags;
+		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+		if (err) {
+			deactivate_locked_super(sb);
+			return err;
+		}
+
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	simple_set_mnt(mnt, sb);
+	return 0;
+}
+
+EXPORT_SYMBOL(get_sb_ns);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
@@ -814,8 +881,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
-			up_write(&s->s_umount);
-			deactivate_super(s);
+			deactivate_locked_super(s);
 			error = -EBUSY;
 			goto error_bdev;
 		}
@@ -830,8 +896,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
-			up_write(&s->s_umount);
-			deactivate_super(s);
+			deactivate_locked_super(s);
 			goto error;
 		}
 
@@ -857,7 +922,7 @@ void kill_block_super(struct super_block *sb)
 	struct block_device *bdev = sb->s_bdev;
 	fmode_t mode = sb->s_mode;
 
-	bdev->bd_super = 0;
+	bdev->bd_super = NULL;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
 	close_bdev_exclusive(bdev, mode);
@@ -881,8 +946,7 @@ int get_sb_nodev(struct file_system_type *fs_type,
 
 	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
+		deactivate_locked_super(s);
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
@@ -912,8 +976,7 @@ int get_sb_single(struct file_system_type *fs_type,
 	s->s_flags = flags;
 	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
+		deactivate_locked_super(s);
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
@@ -966,8 +1029,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	return mnt;
 out_sb:
 	dput(mnt->mnt_root);
-	up_write(&mnt->mnt_sb->s_umount);
-	deactivate_super(mnt->mnt_sb);
+	deactivate_locked_super(mnt->mnt_sb);
 out_free_secdata:
 	free_secdata(secdata);
 out_mnt:
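
deactivate_locked_super() folds the repeated up_write(&s->s_umount); deactivate_super(s); pairs into one call that keeps s_umount held until the superblock is fully disposed of, so a half-built superblock is never visible. A hedged sketch of a ->get_sb() failure exit using it (example_fill_super and the fs type are placeholders):

static int example_get_sb(struct file_system_type *fs_type, int flags,
			  void *data, struct vfsmount *mnt)
{
	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
	int error;

	if (IS_ERR(s))
		return PTR_ERR(s);

	s->s_flags = flags;
	error = example_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
	if (error) {
		deactivate_locked_super(s);	/* unlocks s_umount itself */
		return error;
	}
	s->s_flags |= MS_ACTIVE;
	simple_set_mnt(mnt, s);
	return 0;
}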
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 93e0c0281d45..9345806c8853 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 		count = size - offs;
 	}
 
-	temp = kmalloc(count, GFP_KERNEL);
-	if (!temp)
-		return -ENOMEM;
-
-	if (copy_from_user(temp, userbuf, count)) {
-		count = -EFAULT;
-		goto out_free;
-	}
+	temp = memdup_user(userbuf, count);
+	if (IS_ERR(temp))
+		return PTR_ERR(temp);
 
 	mutex_lock(&bb->mutex);
 
@@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	if (count > 0)
 		*off = offs + count;
 
-out_free:
-	kfree(temp);
 	return count;
 }
 
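
memdup_user() bundles the kmalloc + copy_from_user sequence and reports failure through ERR_PTR, which is what lets the out_free label above disappear. A small sketch of the consuming pattern (function name hypothetical):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>

static long example_take_from_user(const void __user *ubuf, size_t len)
{
	void *tmp = memdup_user(ubuf, len);

	if (IS_ERR(tmp))
		return PTR_ERR(tmp);	/* -ENOMEM or -EFAULT */

	/* ... use the kernel-space copy ... */
	kfree(tmp);
	return 0;
}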
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 289c43a47263..b1606e07b7a3 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
 
-	return 0;
+	return DEFAULT_POLLMASK;
 
  trigger:
 	buffer->needs_read_fill = 1;
-	return POLLERR|POLLPRI;
+	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
 void sysfs_notify_dirent(struct sysfs_dirent *sd)
@@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct {
 	struct work_struct	work;
 };
 
+static struct workqueue_struct *sysfs_workqueue;
 static DEFINE_MUTEX(sysfs_workq_mutex);
 static LIST_HEAD(sysfs_workq);
 static void sysfs_schedule_callback_work(struct work_struct *work)
@@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_lock(&sysfs_workq_mutex);
 	list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
 		if (ss->kobj == kobj) {
+			module_put(owner);
 			mutex_unlock(&sysfs_workq_mutex);
 			return -EAGAIN;
 		}
 	mutex_unlock(&sysfs_workq_mutex);
 
+	if (sysfs_workqueue == NULL) {
+		sysfs_workqueue = create_workqueue("sysfsd");
+		if (sysfs_workqueue == NULL) {
+			module_put(owner);
+			return -ENOMEM;
+		}
+	}
+
 	ss = kmalloc(sizeof(*ss), GFP_KERNEL);
 	if (!ss) {
 		module_put(owner);
@@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_lock(&sysfs_workq_mutex);
 	list_add_tail(&ss->workq_list, &sysfs_workq);
 	mutex_unlock(&sysfs_workq_mutex);
-	schedule_work(&ss->work);
+	queue_work(sysfs_workqueue, &ss->work);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
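
Returning DEFAULT_POLLMASK means a sysfs attribute is always reported readable and writable to poll(); POLLERR|POLLPRI are added only when the value changed. The matching userspace pattern (path illustrative) reads the attribute first, then waits for the exceptional events:

#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

/* Illustrative: block until a sysfs attribute changes. */
static int wait_for_attr_change(const char *path)
{
	char buf[64];
	struct pollfd pfd;

	pfd.fd = open(path, O_RDONLY);
	if (pfd.fd < 0)
		return -1;

	(void)read(pfd.fd, buf, sizeof(buf));	/* arm the notification */
	pfd.events = POLLPRI;
	poll(&pfd, 1, -1);			/* returns on POLLPRI|POLLERR */

	close(pfd.fd);
	return 0;
}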
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
 }
 
 /**
- * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
 * @c: UBIFS file-system description object
 *
- * This function calculates and returns the number of eraseblocks which should
- * be kept for index usage.
+ * This function calculates and returns the number of LEBs which should be kept
+ * for index usage.
 */
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 {
-	int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+	int idx_lebs;
 	long long idx_size;
 
 	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
-
 	/* And make sure we have thrice the index size of space reserved */
-	idx_size = idx_size + (idx_size << 1);
-
+	idx_size += idx_size << 1;
 	/*
 	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
 	 * pair, nor similarly the two variables for the new index size, so we
 	 * have to do this costly 64-bit division on fast-path.
 	 */
-	idx_size += eff_leb_size - 1;
-	idx_lebs = div_u64(idx_size, eff_leb_size);
+	idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
 	/*
 	 * The index head is not available for the in-the-gaps method, so add an
 	 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
 * do_budget_space - reserve flash space for index and data growth.
 * @c: UBIFS file-system description object
 *
- * This function makes sure UBIFS has enough free eraseblocks for index growth
- * and data.
+ * This function makes sure UBIFS has enough free LEBs for index growth and
+ * data.
 *
 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
 * would take if it was consolidated and written to the flash. This guarantees
 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
 * be able to commit dirty index. So this function basically adds amount of
 * budgeted index space to the size of the current index, multiplies this by 3,
- * and makes sure this does not exceed the amount of free eraseblocks.
+ * and makes sure this does not exceed the amount of free LEBs.
 *
 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
 *   be large, because UBIFS does not do any index consolidation as long as
 *   there is free space. IOW, the index may take a lot of LEBs, but the LEBs
 *   will contain a lot of dirt.
- * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
- *   consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ *   the index may be consolidated to take up to @c->min_idx_lebs LEBs.
 *
 * This function returns zero in case of success, and %-ENOSPC in case of
 * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 * This function calculates amount of free space to report to user-space.
 *
 * Because UBIFS may introduce substantial overhead (the index, node headers,
- * alignment, wastage at the end of eraseblocks, etc), it cannot report real
- * amount of free flash space it has (well, because not all dirty space is
- * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectations about what free space is. Users seem to
- * accustomed to assume that if the file-system reports N bytes of free space,
- * they would be able to fit a file of N bytes to the FS. This almost works for
+ * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
+ * free flash space it has (well, because not all dirty space is reclaimable,
+ * UBIFS does not actually know the real amount). If UBIFS did so, it would
+ * bread user expectations about what free space is. Users seem to accustomed
+ * to assume that if the file-system reports N bytes of free space, they would
+ * be able to fit a file of N bytes to the FS. This almost works for
 * traditional file-systems, because they have way less overhead than UBIFS.
 * So, to keep users happy, UBIFS tries to take the overhead into account.
 */
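
The budgeting change above reserves three times the index size and rounds the division up, now against c->idx_leb_size rather than a locally computed effective LEB size. The arithmetic, spelled out with assumed example values:

#include <linux/math64.h>

/* Illustration of the reservation formula; idx_leb_size is assumed. */
static int example_min_idx_lebs(long long idx_size, int idx_leb_size)
{
	idx_size += idx_size << 1;	/* reserve thrice the index size */

	/* round the 64-bit division up, then add one LEB for in-the-gaps */
	return div_u64(idx_size + idx_leb_size - 1, idx_leb_size) + 1;
}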
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 			       "bad or corrupted node)");
 		else {
 			for (i = 0; i < nlen && dent->name[i]; i++)
-				printk("%c", dent->name[i]);
+				printk(KERN_CONT "%c", dent->name[i]);
 		}
-		printk("\n");
+		printk(KERN_CONT "\n");
 
 		break;
 	}
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
 
 		/*
 		 * Make sure the last key in our znode is less or
-		 * equivalent than the the key in zbranch which goes
+		 * equivalent than the key in the zbranch which goes
 		 * after our pointing zbranch.
 		 */
 		cmp = keys_cmp(c, max,
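
Without KERN_CONT, the per-character printk() calls above could each be treated as the start of a new log record; KERN_CONT marks them as continuations of the current line. The general shape:

#include <linux/kernel.h>

static void print_name(const char *name, int nlen)
{
	int i;

	printk(KERN_DEBUG "name: ");		/* starts the record */
	for (i = 0; i < nlen && name[i]; i++)
		printk(KERN_CONT "%c", name[i]);	/* continues it */
	printk(KERN_CONT "\n");			/* terminates the line */
}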
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int skipped_read = 0;
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
 			/*
 			 * We change whole page so no need to load it. But we
 			 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			 * the media.
 			 */
 			SetPageChecked(page);
-		else {
+			skipped_read = 1;
+		} else {
 			err = do_readpage(page);
 			if (err) {
 				unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err)) {
 		ubifs_assert(err == -ENOSPC);
 		/*
+		 * If we skipped reading the page because we were going to
+		 * write all of it, then it is not up to date.
+		 */
+		if (skipped_read) {
+			ClearPageChecked(page);
+			ClearPageUptodate(page);
+		}
+		/*
 		 * Budgeting failed which means it would have to force
 		 * write-back but didn't, because we set the @fast flag in the
 		 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
 * whole index and correct all inode sizes, which is long an unacceptable.
 *
 * To prevent situations like this, UBIFS writes pages back only if they are
- * within last synchronized inode size, i.e. the the size which has been
+ * within the last synchronized inode size, i.e. the size which has been
 * written to the flash media last time. Otherwise, UBIFS forces inode
 * write-back, thus making sure the on-flash inode contains current inode size,
 * and then keeps writing pages back.
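
The skipped_read fix above matters on the budgeting error path: when write_begin skipped do_readpage() because the whole page was about to be overwritten, a failed budget request must undo the optimistically set page state. A condensed, hypothetical sketch of that control flow:

#include <linux/mm.h>

static int example_write_begin(struct page *page, int whole_page,
			       int budget_err)
{
	int skipped_read = 0;

	if (whole_page) {
		SetPageChecked(page);	/* no need to read the old contents */
		skipped_read = 1;
	}

	if (budget_err) {
		if (skipped_read) {
			/* the page was never read, so it is not up to date */
			ClearPageChecked(page);
			ClearPageUptodate(page);
		}
		return budget_err;	/* caller retries the slow path */
	}
	return 0;
}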
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
 * ubifs_find_free_space - find a data LEB with free space.
 * @c: the UBIFS file-system description object
 * @min_space: minimum amount of required free space
- * @free: contains amount of free space in the LEB on exit
+ * @offs: contains offset of where free space starts on exit
 * @squeeze: whether to try to find space in a non-empty LEB first
 *
 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
 * failed to find a LEB with @min_space bytes of free space and other a negative
 * error codes in case of failure.
 */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 			  int squeeze)
 {
 	const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		spin_unlock(&c->space_lock);
 	}
 
-	*free = lprops->free;
+	*offs = c->leb_size - lprops->free;
 	ubifs_release_lprops(c);
 
-	if (*free == c->leb_size) {
+	if (*offs == 0) {
 		/*
 		 * Ensure that empty LEBs have been unmapped. They may not have
 		 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		return err;
 	}
 
-	dbg_find("found LEB %d, free %d", lnum, *free);
-	ubifs_assert(*free >= min_space);
+	dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+	ubifs_assert(*offs <= c->leb_size - min_space);
 	return lnum;
 
 out:
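
The interface change above hands callers an offset into the LEB rather than a free byte count; the two are related by offs = leb_size - free, so an entirely empty LEB is now recognised by offs == 0 instead of free == leb_size. In sketch form:

/* Illustration of the conversion used by the new interface. */
static int free_space_offset(int leb_size, int free)
{
	return leb_size - free;	/* 0 means the whole LEB is free */
}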
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
 * have to waste large pieces of free space at the end of LEB B, because nodes
 * from LEB A would not fit. And the worst situation is when all nodes are of
 * maximum size. So dark watermark is the amount of free + dirty space in LEB
- * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
 #include "ubifs.h"
 
 /*
- * GC tries to optimize the way it fit nodes to available space, and it sorts
- * nodes a little. The below constants are watermarks which define "large",
- * "medium", and "small" nodes.
- */
-#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
-#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
-
-/*
 * GC may need to move more than one LEB to make progress. The below constants
 * define "soft" and "hard" limits on the number of LEBs the garbage collector
 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
 }
 
 /**
- * joinup - bring data nodes for an inode together.
- * @c: UBIFS file-system description object
- * @sleb: describes scanned LEB
- * @inum: inode number
- * @blk: block number
- * @data: list to which to add data nodes
+ * list_sort - sort a list.
+ * @priv: private data, passed to @cmp
+ * @head: the list to sort
+ * @cmp: the elements comparison function
 *
- * This function looks at the first few nodes in the scanned LEB @sleb and adds
- * them to @data if they are data nodes from @inum and have a larger block
- * number than @blk. This function returns %0 on success and a negative error
- * code on failure.
+ * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
+ * in ascending order.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * than @b, and a positive value if @a is greater than @b. If @a and @b are
+ * equivalent, then it does not matter what this function returns.
 */
-static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
-		  unsigned int blk, struct list_head *data)
+static void list_sort(void *priv, struct list_head *head,
+		      int (*cmp)(void *priv, struct list_head *a,
+				 struct list_head *b))
 {
-	int err, cnt = 6, lnum = sleb->lnum, offs;
-	struct ubifs_scan_node *snod, *tmp;
-	union ubifs_key *key;
+	struct list_head *p, *q, *e, *list, *tail, *oldhead;
+	int insize, nmerges, psize, qsize, i;
+
+	if (list_empty(head))
+		return;
+
+	list = head->next;
+	list_del(head);
+	insize = 1;
+	for (;;) {
+		p = oldhead = list;
+		list = tail = NULL;
+		nmerges = 0;
+
+		while (p) {
+			nmerges++;
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next == oldhead ? NULL : q->next;
+				if (!q)
+					break;
+			}
 
-	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		key = &snod->key;
-		if (key_inum(c, key) == inum &&
-		    key_type(c, key) == UBIFS_DATA_KEY &&
-		    key_block(c, key) > blk) {
-			offs = snod->offs;
-			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
-			if (err < 0)
-				return err;
-			list_del(&snod->list);
-			if (err) {
-				list_add_tail(&snod->list, data);
-				blk = key_block(c, key);
-			} else
-				kfree(snod);
-			cnt = 6;
-		} else if (--cnt == 0)
+			qsize = insize;
+			while (psize > 0 || (qsize > 0 && q)) {
+				if (!psize) {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				} else if (!qsize || !q) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else if (cmp(priv, p, q) <= 0) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				}
+				if (tail)
+					tail->next = e;
+				else
+					list = e;
+				e->prev = tail;
+				tail = e;
+			}
+			p = q;
+		}
+
+		tail->next = list;
+		list->prev = tail;
+
+		if (nmerges <= 1)
 			break;
+
+		insize *= 2;
 	}
-	return 0;
+
+	head->next = list;
+	head->prev = list->prev;
+	list->prev->next = head;
+	list->prev = head;
 }
 
 /**
- * move_nodes - move nodes.
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @a: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @a: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash valuer order.
+ */
+int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	int typea, typeb;
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	typea = key_type(c, &sa->key);
+	typeb = key_type(c, &sb->key);
+	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+
+	/* Inodes go before directory entries */
+	if (typea == UBIFS_INO_KEY) {
+		if (typeb == UBIFS_INO_KEY)
+			return sb->len - sa->len;
+		return -1;
+	}
+	if (typeb == UBIFS_INO_KEY)
+		return 1;
+
+	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		uint32_t hasha = key_hash(c, &sa->key);
+		uint32_t hashb = key_hash(c, &sb->key);
+
+		if (hasha <= hashb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
 * @c: UBIFS file-system description object
- * @sleb: describes nodes to move
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
 *
- * This function moves valid nodes from data LEB described by @sleb to the GC
- * journal head. The obsolete nodes are dropped.
+ * This function sorts the list of inodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
 *
- * When moving nodes we have to deal with classical bin-packing problem: the
- * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
- * where the nodes in the @sleb->nodes list are the elements which should be
- * fit optimally to the bins. This function uses the "first fit decreasing"
- * strategy, although it does not really sort the nodes but just split them on
- * 3 classes - large, medium, and small, so they are roughly sorted.
+ * Non-data nodes are sorted as follows.
+ * o First go inode nodes - they are sorted in descending length order.
+ * o Then go directory entry nodes - they are sorted in hash order, which
+ *   should supposedly optimize 'readdir()'. Direntry nodes with lower parent
+ *   inode number go before direntry nodes with higher parent inode number,
+ *   and direntry nodes with lower name hash values go before direntry nodes
+ *   with higher name hash values.
 *
- * This function returns zero in case of success, %-EAGAIN if commit is
- * required, and other negative error codes in case of other failures.
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
 */
-static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		      struct list_head *nondata, int *min)
 {
 	struct ubifs_scan_node *snod, *tmp;
-	struct list_head data, large, medium, small;
-	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
-	int avail, err, min = INT_MAX;
-	unsigned int blk = 0;
-	ino_t inum = 0;
 
-	INIT_LIST_HEAD(&data);
-	INIT_LIST_HEAD(&large);
-	INIT_LIST_HEAD(&medium);
-	INIT_LIST_HEAD(&small);
+	*min = INT_MAX;
 
-	while (!list_empty(&sleb->nodes)) {
-		struct list_head *lst = sleb->nodes.next;
-
-		snod = list_entry(lst, struct ubifs_scan_node, list);
+	/* Separate data nodes and non-data nodes */
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		int err;
 
 		ubifs_assert(snod->type != UBIFS_IDX_NODE);
 		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
 					 snod->offs, 0);
 		if (err < 0)
-			goto out;
+			return err;
 
-		list_del(lst);
 		if (!err) {
 			/* The node is obsolete, remove it from the list */
+			list_del(&snod->list);
 			kfree(snod);
 			continue;
 		}
 
-		/*
-		 * Sort the list of nodes so that data nodes go first, large
-		 * nodes go second, and small nodes go last.
-		 */
-		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
+		if (snod->len < *min)
+			*min = snod->len;
+
+		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+			list_move_tail(&snod->list, nondata);
-			if (inum != key_inum(c, &snod->key)) {
-				if (inum) {
-					/*
-					 * Try to move data nodes from the same
-					 * inode together.
-					 */
-					err = joinup(c, sleb, inum, blk, &data);
-					if (err)
-						goto out;
-				}
-				inum = key_inum(c, &snod->key);
-				blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
361 * @snod: the node to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
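Taken together, sort_nodes() plus the loop above implement first-fit packing over two pre-sorted queues: data nodes are written strictly in order and never skipped (to preserve bulk-read locality), while the non-data queue stops when the remaining space gets too small. A toy user-space model of the first-fit pass, with an invented LEB size and node sizes (the skip-and-continue case for large inode nodes is left out for brevity):

#include <stdio.h>

#define LEB_SIZE 128  /* invented; real LEBs are tens of KiB */

/*
 * Toy model of the first-fit pass in move_nodes(): walk a pre-sorted
 * queue, write every node that still fits into the current LEB, and
 * stop at the first one that does not fit.
 */
static int first_fit(const int *sizes, int n, int *next, int *used)
{
        while (*next < n) {
                if (sizes[*next] > LEB_SIZE - *used)
                        return 0;       /* LEB full: caller switches the GC head */
                *used += sizes[(*next)++];
        }
        return 1;                       /* queue drained */
}

int main(void)
{
        int sizes[] = { 60, 50, 40, 30, 20 };   /* descending, like inode nodes */
        int next = 0, used = 0, leb = 0;

        while (!first_fit(sizes, 5, &next, &used)) {
                printf("LEB %d packed with %d bytes\n", leb++, used);
                used = 0;               /* model of switch_gc_head() */
        }
        printf("LEB %d packed with %d bytes\n", leb, used);
        return 0;
}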
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
139 * Write buffer wasn't seek'ed or there is not enough space - look for an 139
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
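The int-to-uint32_t change for key_hash()/key_hash_flash() is defensive typing: the masked UBIFS hash currently fits well inside 31 bits, so the signed return was latent rather than an active bug, but a hash with the top bit set would compare inverted if it ever travelled through an int. A tiny demonstration of that failure class (values invented, plain C):

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
        uint32_t hash = 0x9e3779b9u;  /* a 32-bit hash with the top bit set */
        int as_int = (int)hash;       /* implementation-defined; typically negative */

        printf("unsigned %" PRIu32 ", signed %d\n", hash, as_int);
        printf("0 <= hash: %d, 0 <= as_int: %d\n", 0 <= hash, 0 <= as_int);
        return 0;
}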
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes;
1766 * o %4 - wasted @len bytes;
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
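The newly documented @action protocol is easiest to read as a tiny accounting state machine. A hypothetical user-space analog (all names and the budget constant invented; the real function also drives per-LEB tracking and debug printing):

#include <assert.h>

#define LPT_SIZE 4096   /* invented budget, stands in for the real LPT size */

/* Hypothetical analog of the dbg_chk_lpt_sz() @action protocol. */
static long lpt_written, lpt_wasted;

static void chk_lpt_sz(int action, int len)
{
        switch (action) {
        case 0: lpt_written = lpt_wasted = 0; break;    /* start checking    */
        case 1: lpt_written += len; break;              /* wrote an LPT node */
        case 2:                                         /* LEB-switch waste  */
        case 4: lpt_wasted += len; break;               /* other waste       */
        case 3: assert(lpt_written + lpt_wasted <= LPT_SIZE); break;
        }
}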
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if no more data is found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
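The rewritten check leans on the is_empty() helper used elsewhere in recovery.c, whose premise is that erased flash reads back as all-0xFF bytes. A simplified stand-alone version of that predicate (sketch; the real helper has extra handling that is omitted here):

#include <stddef.h>

/*
 * Simplified version of the is_empty() idea: erased NAND/NOR flash
 * reads back as 0xFF, so a region is "empty" iff every byte is 0xFF.
 */
static int buf_is_erased(const unsigned char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                if (buf[i] != 0xFF)
                        return 0;
        return 1;
}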
483/** 459/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
147 * race with each other. This is not a problem but it does mean 147
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
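Stripped of logging, the new version gate in ubifs_read_superblock() reduces to a three-way decision on the format version, the R/O-compat version, and the mount mode. A condensed restatement (helper name invented; constants and the -EROFS/-EINVAL split follow the patch, errno values are the POSIX ones):

#include <errno.h>

static int check_media_version(int fmt, int ro_compat, int mounting_ro)
{
        if (fmt <= 4 /* UBIFS_FORMAT_VERSION */)
                return 0;                        /* known format, R/W is fine */
        if (ro_compat <= 0 /* UBIFS_RO_COMPAT_VERSION */)
                return mounting_ro ? 0 : -EROFS; /* newer, but R/O-mountable  */
        return -EINVAL;                          /* too new even for R/O      */
}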
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..e9f7a754c4f7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
@@ -2038,8 +2055,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2038 return 0; 2055 return 0;
2039 2056
2040out_deact: 2057out_deact:
2041 up_write(&sb->s_umount); 2058 deactivate_locked_super(sb);
2042 deactivate_super(sb);
2043out_close: 2059out_close:
2044 ubi_close_volume(ubi); 2060 ubi_close_volume(ubi);
2045 return err; 2061 return err;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
41 * format changes. If this happens, UBIFS will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders which only need R/O mounting. With
59 * this version it is possible to change the UBIFS format without having to
60 * update boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
57 * node uncompressed, because it'll be read faster. 79
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
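Note that the new __le32 ro_compat_version takes exactly its 4 bytes out of padding2 (3972 becomes 3968), so the on-flash superblock node size is unchanged. A compile-time check of that invariant over an abbreviated tail of the struct (C11 sketch; only the trailing fields are reproduced):

#include <stdint.h>

/* Abbreviated stand-ins: only the tail of struct ubifs_sb_node matters. */
struct sb_tail_old {
        uint8_t  uuid[16];
        uint8_t  padding2[3972];
} __attribute__((packed));

struct sb_tail_new {
        uint8_t  uuid[16];
        uint32_t ro_compat_version;   /* __le32 on flash */
        uint8_t  padding2[3968];
} __attribute__((packed));

/* 4 bytes of padding traded for one __le32: the layout size is preserved. */
_Static_assert(sizeof(struct sb_tail_old) == sizeof(struct sb_tail_new),
               "superblock node size must not change");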
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index dbbbc4668769..6321b797061b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = file_fsync, 669 .fsync = ufs_sync_file,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 625ef17c6f83..2bd3a1615714 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -30,7 +30,7 @@
30#include "ufs.h" 30#include "ufs.h"
31 31
32 32
33static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) 33int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
34{ 34{
35 struct inode *inode = dentry->d_inode; 35 struct inode *inode = dentry->d_inode;
36 int err; 36 int err;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 69b3427d7885..d0c4acd4f1f3 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -98,8 +98,8 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
98/* file.c */ 98/* file.c */
99extern const struct inode_operations ufs_file_inode_operations; 99extern const struct inode_operations ufs_file_inode_operations;
100extern const struct file_operations ufs_file_operations; 100extern const struct file_operations ufs_file_operations;
101
102extern const struct address_space_operations ufs_aops; 101extern const struct address_space_operations ufs_aops;
102extern int ufs_sync_file(struct file *, struct dentry *, int);
103 103
104/* ialloc.c */ 104/* ialloc.c */
105extern void ufs_free_inode (struct inode *inode); 105extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 197c4fcac032..d51b8f9db921 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
237 if (size) { 237 if (size) {
238 if (size > XATTR_SIZE_MAX) 238 if (size > XATTR_SIZE_MAX)
239 return -E2BIG; 239 return -E2BIG;
240 kvalue = kmalloc(size, GFP_KERNEL); 240 kvalue = memdup_user(value, size);
241 if (!kvalue) 241 if (IS_ERR(kvalue))
242 return -ENOMEM; 242 return PTR_ERR(kvalue);
243 if (copy_from_user(kvalue, value, size)) {
244 kfree(kvalue);
245 return -EFAULT;
246 }
247 } 243 }
248 244
249 error = vfs_setxattr(d, kname, kvalue, size, flags); 245 error = vfs_setxattr(d, kname, kvalue, size, flags);
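This hunk, and the two xfs ioctl hunks below, replace the open-coded kmalloc() + copy_from_user() + unwind sequence with memdup_user(), moving callers to the IS_ERR()/PTR_ERR() convention. Roughly what the helper folds together (a simplified restatement, not the kernel's implementation):

#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>

/* Simplified sketch of what memdup_user() does. */
static void *memdup_user_sketch(const void __user *src, size_t len)
{
        void *p = kmalloc(len, GFP_KERNEL);

        if (!p)
                return ERR_PTR(-ENOMEM);
        if (copy_from_user(p, src, len)) {
                kfree(p);
                return ERR_PTR(-EFAULT);
        }
        return p;
}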
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c13f67300fe7..7ec89fc05b2b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -153,23 +153,6 @@ xfs_find_bdev_for_inode(
153} 153}
154 154
155/* 155/*
156 * Schedule IO completion handling on a xfsdatad if this was
157 * the final hold on this ioend. If we are asked to wait,
158 * flush the workqueue.
159 */
160STATIC void
161xfs_finish_ioend(
162 xfs_ioend_t *ioend,
163 int wait)
164{
165 if (atomic_dec_and_test(&ioend->io_remaining)) {
166 queue_work(xfsdatad_workqueue, &ioend->io_work);
167 if (wait)
168 flush_workqueue(xfsdatad_workqueue);
169 }
170}
171
172/*
173 * We're now finished for good with this ioend structure. 156 * We're now finished for good with this ioend structure.
174 * Update the page state via the associated buffer_heads, 157 * Update the page state via the associated buffer_heads,
175 * release holds on the inode and bio, and finally free 158 * release holds on the inode and bio, and finally free
@@ -310,6 +293,27 @@ xfs_end_bio_read(
310} 293}
311 294
312/* 295/*
296 * Schedule IO completion handling on a xfsdatad if this was
297 * the final hold on this ioend. If we are asked to wait,
298 * flush the workqueue.
299 */
300STATIC void
301xfs_finish_ioend(
302 xfs_ioend_t *ioend,
303 int wait)
304{
305 if (atomic_dec_and_test(&ioend->io_remaining)) {
306 struct workqueue_struct *wq = xfsdatad_workqueue;
307 if (ioend->io_work.func == xfs_end_bio_unwritten)
308 wq = xfsconvertd_workqueue;
309
310 queue_work(wq, &ioend->io_work);
311 if (wait)
312 flush_workqueue(wq);
313 }
314}
315
316/*
313 * Allocate and initialise an IO completion structure. 317 * Allocate and initialise an IO completion structure.
314 * We need to track unwritten extent write completion here initially. 318 * We need to track unwritten extent write completion here initially.
315 * We'll need to extend this for updating the ondisk inode size later 319 * We'll need to extend this for updating the ondisk inode size later
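xfs_finish_ioend() moves below the completion handlers because it now dispatches on the work item's callback: unwritten-extent conversions go to the new xfsconvertd queue, everything else stays on xfsdatad, presumably so that conversion work cannot stall ordinary I/O completions. The shape of that dispatch-by-callback pattern (sketch with invented minimal types):

struct work { void (*func)(struct work *); };
struct wqueue { const char *name; };

static struct wqueue dataq    = { "xfsdatad" };
static struct wqueue convertq = { "xfsconvertd" };

static void convert_handler(struct work *w) { (void)w; }

/* Route by the callback the work item will run, as xfs_finish_ioend()
 * now does with xfs_end_bio_unwritten. */
static struct wqueue *pick_queue(struct work *w)
{
        return w->func == convert_handler ? &convertq : &dataq;
}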
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 1dd528849755..221b3e66ceef 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -19,6 +19,7 @@
19#define __XFS_AOPS_H__ 19#define __XFS_AOPS_H__
20 20
21extern struct workqueue_struct *xfsdatad_workqueue; 21extern struct workqueue_struct *xfsdatad_workqueue;
22extern struct workqueue_struct *xfsconvertd_workqueue;
22extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
23 24
24/* 25/*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1016bb9134..e28800a9f2b5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = {
51 51
52static struct workqueue_struct *xfslogd_workqueue; 52static struct workqueue_struct *xfslogd_workqueue;
53struct workqueue_struct *xfsdatad_workqueue; 53struct workqueue_struct *xfsdatad_workqueue;
54struct workqueue_struct *xfsconvertd_workqueue;
54 55
55#ifdef XFS_BUF_TRACE 56#ifdef XFS_BUF_TRACE
56void 57void
@@ -1775,6 +1776,7 @@ xfs_flush_buftarg(
1775 xfs_buf_t *bp, *n; 1776 xfs_buf_t *bp, *n;
1776 int pincount = 0; 1777 int pincount = 0;
1777 1778
1779 xfs_buf_runall_queues(xfsconvertd_workqueue);
1778 xfs_buf_runall_queues(xfsdatad_workqueue); 1780 xfs_buf_runall_queues(xfsdatad_workqueue);
1779 xfs_buf_runall_queues(xfslogd_workqueue); 1781 xfs_buf_runall_queues(xfslogd_workqueue);
1780 1782
@@ -1831,9 +1833,15 @@ xfs_buf_init(void)
1831 if (!xfsdatad_workqueue) 1833 if (!xfsdatad_workqueue)
1832 goto out_destroy_xfslogd_workqueue; 1834 goto out_destroy_xfslogd_workqueue;
1833 1835
1836 xfsconvertd_workqueue = create_workqueue("xfsconvertd");
1837 if (!xfsconvertd_workqueue)
1838 goto out_destroy_xfsdatad_workqueue;
1839
1834 register_shrinker(&xfs_buf_shake); 1840 register_shrinker(&xfs_buf_shake);
1835 return 0; 1841 return 0;
1836 1842
1843 out_destroy_xfsdatad_workqueue:
1844 destroy_workqueue(xfsdatad_workqueue);
1837 out_destroy_xfslogd_workqueue: 1845 out_destroy_xfslogd_workqueue:
1838 destroy_workqueue(xfslogd_workqueue); 1846 destroy_workqueue(xfslogd_workqueue);
1839 out_free_buf_zone: 1847 out_free_buf_zone:
@@ -1849,6 +1857,7 @@ void
1849xfs_buf_terminate(void) 1857xfs_buf_terminate(void)
1850{ 1858{
1851 unregister_shrinker(&xfs_buf_shake); 1859 unregister_shrinker(&xfs_buf_shake);
1860 destroy_workqueue(xfsconvertd_workqueue);
1852 destroy_workqueue(xfsdatad_workqueue); 1861 destroy_workqueue(xfsdatad_workqueue);
1853 destroy_workqueue(xfslogd_workqueue); 1862 destroy_workqueue(xfslogd_workqueue);
1854 kmem_zone_destroy(xfs_buf_zone); 1863 kmem_zone_destroy(xfs_buf_zone);
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 5aeb77776961..08be36d7326c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -74,14 +74,14 @@ xfs_flush_pages(
74 74
75 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 75 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
76 xfs_iflags_clear(ip, XFS_ITRUNCATED); 76 xfs_iflags_clear(ip, XFS_ITRUNCATED);
77 ret = filemap_fdatawrite(mapping); 77 ret = -filemap_fdatawrite(mapping);
78 if (flags & XFS_B_ASYNC)
79 return -ret;
80 ret2 = filemap_fdatawait(mapping);
81 if (!ret)
82 ret = ret2;
83 } 78 }
84 return -ret; 79 if (flags & XFS_B_ASYNC)
80 return ret;
81 ret2 = xfs_wait_on_pages(ip, first, last);
82 if (!ret)
83 ret = ret2;
84 return ret;
85} 85}
86 86
87int 87int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b499418a7d..34eaab608e6e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -489,17 +489,12 @@ xfs_attrmulti_attr_set(
489 if (len > XATTR_SIZE_MAX) 489 if (len > XATTR_SIZE_MAX)
490 return EINVAL; 490 return EINVAL;
491 491
492 kbuf = kmalloc(len, GFP_KERNEL); 492 kbuf = memdup_user(ubuf, len);
493 if (!kbuf) 493 if (IS_ERR(kbuf))
494 return ENOMEM; 494 return PTR_ERR(kbuf);
495
496 if (copy_from_user(kbuf, ubuf, len))
497 goto out_kfree;
498 495
499 error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); 496 error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
500 497
501 out_kfree:
502 kfree(kbuf);
503 return error; 498 return error;
504} 499}
505 500
@@ -540,20 +535,16 @@ xfs_attrmulti_by_handle(
540 if (!size || size > 16 * PAGE_SIZE) 535 if (!size || size > 16 * PAGE_SIZE)
541 goto out_dput; 536 goto out_dput;
542 537
543 error = ENOMEM; 538 ops = memdup_user(am_hreq.ops, size);
544 ops = kmalloc(size, GFP_KERNEL); 539 if (IS_ERR(ops)) {
545 if (!ops) 540 error = PTR_ERR(ops);
546 goto out_dput; 541 goto out_dput;
547 542 }
548 error = EFAULT;
549 if (copy_from_user(ops, am_hreq.ops, size))
550 goto out_kfree_ops;
551 543
552 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); 544 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
553 if (!attr_name) 545 if (!attr_name)
554 goto out_kfree_ops; 546 goto out_kfree_ops;
555 547
556
557 error = 0; 548 error = 0;
558 for (i = 0; i < am_hreq.opcount; i++) { 549 for (i = 0; i < am_hreq.opcount; i++) {
559 ops[i].am_error = strncpy_from_user(attr_name, 550 ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c70c4e3db790..0882d166239a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle(
427 if (!size || size > 16 * PAGE_SIZE) 427 if (!size || size > 16 * PAGE_SIZE)
428 goto out_dput; 428 goto out_dput;
429 429
430 error = ENOMEM; 430 ops = memdup_user(compat_ptr(am_hreq.ops), size);
431 ops = kmalloc(size, GFP_KERNEL); 431 if (IS_ERR(ops)) {
432 if (!ops) 432 error = PTR_ERR(ops);
433 goto out_dput; 433 goto out_dput;
434 434 }
435 error = EFAULT;
436 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
437 goto out_kfree_ops;
438 435
439 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); 436 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
440 if (!attr_name) 437 if (!attr_name)
441 goto out_kfree_ops; 438 goto out_kfree_ops;
442 439
443
444 error = 0; 440 error = 0;
445 for (i = 0; i < am_hreq.opcount; i++) { 441 for (i = 0; i < am_hreq.opcount; i++) {
446 ops[i].am_error = strncpy_from_user(attr_name, 442 ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7e90daa0d1d1..9142192ccbe6 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -751,10 +751,26 @@ start:
751 goto relock; 751 goto relock;
752 } 752 }
753 } else { 753 } else {
754 int enospc = 0;
755 ssize_t ret2 = 0;
756
757write_retry:
754 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, 758 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
755 *offset, ioflags); 759 *offset, ioflags);
756 ret = generic_file_buffered_write(iocb, iovp, segs, 760 ret2 = generic_file_buffered_write(iocb, iovp, segs,
757 pos, offset, count, ret); 761 pos, offset, count, ret);
762 /*
763 * if we just got an ENOSPC, flush the inode now we
764 * aren't holding any page locks and retry *once*
765 */
766 if (ret2 == -ENOSPC && !enospc) {
767 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
768 if (error)
769 goto out_unlock_internal;
770 enospc = 1;
771 goto write_retry;
772 }
773 ret = ret2;
758 } 774 }
759 775
760 current->backing_dev_info = NULL; 776 current->backing_dev_info = NULL;
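The comment in the hunk above carries the whole idea: on ENOSPC, flush the inode once, while no page locks are held, then retry the buffered write exactly once. The shape of that retry loop in isolation (hypothetical helpers stand in for the real calls, stubbed so the fragment compiles):

#include <errno.h>

/* Hypothetical stand-ins for the real calls in the hunk above. */
static long do_buffered_write(void) { return -ENOSPC; } /* stub */
static int flush_dirty_pages(void)  { return 0; }       /* stub */

/* Retry once on ENOSPC; never loop more than twice. */
static long write_with_enospc_retry(void)
{
        int retried = 0;
        long ret;

again:
        ret = do_buffered_write();
        if (ret == -ENOSPC && !retried) {
                if (flush_dirty_pages())
                        return -EIO;
                retried = 1;
                goto again;
        }
        return ret;
}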
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a608e72fa405..f7ba76633c29 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
62 uint32_t first_index = 0; 62 uint32_t first_index = 0;
63 int error = 0; 63 int error = 0;
64 int last_error = 0; 64 int last_error = 0;
65 int fflag = XFS_B_ASYNC;
66
67 if (flags & SYNC_DELWRI)
68 fflag = XFS_B_DELWRI;
69 if (flags & SYNC_WAIT)
70 fflag = 0; /* synchronous overrides all */
71 65
72 do { 66 do {
73 struct inode *inode; 67 struct inode *inode;
@@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
128 * If we have to flush data or wait for I/O completion 122 * If we have to flush data or wait for I/O completion
129 * we need to hold the iolock. 123 * we need to hold the iolock.
130 */ 124 */
131 if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) { 125 if (flags & SYNC_DELWRI) {
132 xfs_ilock(ip, XFS_IOLOCK_SHARED); 126 if (VN_DIRTY(inode)) {
133 lock_flags |= XFS_IOLOCK_SHARED; 127 if (flags & SYNC_TRYLOCK) {
134 error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); 128 if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
135 if (flags & SYNC_IOWAIT) 129 lock_flags |= XFS_IOLOCK_SHARED;
130 } else {
131 xfs_ilock(ip, XFS_IOLOCK_SHARED);
132 lock_flags |= XFS_IOLOCK_SHARED;
133 }
134 if (lock_flags & XFS_IOLOCK_SHARED) {
135 error = xfs_flush_pages(ip, 0, -1,
136 (flags & SYNC_WAIT) ? 0
137 : XFS_B_ASYNC,
138 FI_NONE);
139 }
140 }
141 if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
136 xfs_ioend_wait(ip); 142 xfs_ioend_wait(ip);
137 } 143 }
138 xfs_ilock(ip, XFS_ILOCK_SHARED); 144 xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -398,15 +404,17 @@ STATIC void
398xfs_syncd_queue_work( 404xfs_syncd_queue_work(
399 struct xfs_mount *mp, 405 struct xfs_mount *mp,
400 void *data, 406 void *data,
401 void (*syncer)(struct xfs_mount *, void *)) 407 void (*syncer)(struct xfs_mount *, void *),
408 struct completion *completion)
402{ 409{
403 struct bhv_vfs_sync_work *work; 410 struct xfs_sync_work *work;
404 411
405 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); 412 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
406 INIT_LIST_HEAD(&work->w_list); 413 INIT_LIST_HEAD(&work->w_list);
407 work->w_syncer = syncer; 414 work->w_syncer = syncer;
408 work->w_data = data; 415 work->w_data = data;
409 work->w_mount = mp; 416 work->w_mount = mp;
417 work->w_completion = completion;
410 spin_lock(&mp->m_sync_lock); 418 spin_lock(&mp->m_sync_lock);
411 list_add_tail(&work->w_list, &mp->m_sync_list); 419 list_add_tail(&work->w_list, &mp->m_sync_list);
412 spin_unlock(&mp->m_sync_lock); 420 spin_unlock(&mp->m_sync_lock);
@@ -420,49 +428,26 @@ xfs_syncd_queue_work(
  * heads, looking about for more room...
  */
 STATIC void
-xfs_flush_inode_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	filemap_flush(inode->i_mapping);
-	iput(inode);
-}
-
-void
-xfs_flush_inode(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = VFS_I(ip);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-	delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
+xfs_flush_inodes_work(
 	struct xfs_mount *mp,
 	void		*arg)
 {
 	struct inode	*inode = arg;
-	sync_blockdev(mp->m_super->s_bdev);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
 	iput(inode);
 }
 
 void
-xfs_flush_device(
+xfs_flush_inodes(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
+	DECLARE_COMPLETION_ONSTACK(completion);
 
 	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
-	delay(msecs_to_jiffies(500));
+	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
+	wait_for_completion(&completion);
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
@@ -497,7 +482,7 @@ xfssyncd(
 {
 	struct xfs_mount *mp = arg;
 	long		timeleft;
-	bhv_vfs_sync_work_t *work, *n;
+	xfs_sync_work_t	*work, *n;
 	LIST_HEAD	(tmp);
 
 	set_freezable();
@@ -532,6 +517,8 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &mp->m_sync_work)
 				continue;
+			if (work->w_completion)
+				complete(work->w_completion);
 			kmem_free(work);
 		}
 	}
@@ -545,6 +532,7 @@ xfs_syncd_init(
 {
 	mp->m_sync_work.w_syncer = xfs_sync_worker;
 	mp->m_sync_work.w_mount = mp;
+	mp->m_sync_work.w_completion = NULL;
 	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
 	if (IS_ERR(mp->m_sync_task))
 		return -PTR_ERR(mp->m_sync_task);
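The xfssyncd changes above replace a blind delay(msecs_to_jiffies(500)) with a struct completion: the caller queues the work item and sleeps until the sync thread has actually run it and called complete(). A rough userspace analogue of that handoff, using a pthread mutex/condvar pair in place of the kernel's completion API (all names below are illustrative):

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's struct completion. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t	cond;
	int		done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* Worker thread: do the flush, then wake the waiter. */
static void *worker(void *arg)
{
	struct completion *c = arg;

	printf("worker: flushing inodes\n");
	complete(c);	/* as xfssyncd does for work->w_completion */
	return NULL;
}

int main(void)
{
	struct completion c;
	pthread_t tid;

	init_completion(&c);
	pthread_create(&tid, NULL, worker, &c);
	wait_for_completion(&c);	/* blocks until the work item ran */
	printf("caller: flush finished\n");
	pthread_join(&tid, NULL);
	return 0;
}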
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 04f058c848ae..308d5bf6dfbd 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -21,18 +21,20 @@
 struct xfs_mount;
 struct xfs_perag;
 
-typedef struct bhv_vfs_sync_work {
+typedef struct xfs_sync_work {
 	struct list_head	w_list;
 	struct xfs_mount	*w_mount;
 	void			*w_data;	/* syncer routine argument */
 	void			(*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
+	struct completion	*w_completion;
+} xfs_sync_work_t;
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_DELWRI		0x0002	/* look at delayed writes */
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT		0x0010	/* wait for all I/O to complete */
+#define SYNC_TRYLOCK		0x0020	/* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
@@ -43,8 +45,7 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inode(struct xfs_inode *ip);
-void xfs_flush_device(struct xfs_inode *ip);
+void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3a6ed426327a..ca7c6005a487 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5880,7 +5880,7 @@ xfs_getbmap(
 	void			*arg)		/* formatter arg */
 {
 	__int64_t		bmvend;		/* last block requested */
-	int			error;		/* return value */
+	int			error = 0;	/* return value */
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
 	int			lock;		/* lock state */
@@ -5890,39 +5890,18 @@ xfs_getbmap(
 	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
-	struct getbmapx		out;		/* output structure */
+	struct getbmapx		*out;		/* output structure */
 	int			whichfork;	/* data or attr fork */
 	int			prealloced;	/* this is a file with
 						 * preallocated data space */
 	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
+	int			cur_ext = 0;
 
 	mp = ip->i_mount;
 	iflags = bmv->bmv_iflags;
-
 	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
-	/* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
-	 * generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
-	 * bit is set for the file, generate a read event in order
-	 * that the DMAPI application may do its thing before we return
-	 * the extents.  Usually this means restoring user file data to
-	 * regions of the file that look like holes.
-	 *
-	 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-	 * BMV_IF_NO_DMAPI_READ so that read events are generated.
-	 * If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
-	 * could misinterpret holes in a DMAPI file as true holes,
-	 * when in fact they may represent offline user data.
-	 */
-	if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-	    whichfork == XFS_DATA_FORK) {
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
-		if (error)
-			return XFS_ERROR(error);
-	}
-
 	if (whichfork == XFS_ATTR_FORK) {
 		if (XFS_IFORK_Q(ip)) {
 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
@@ -5936,11 +5915,37 @@ xfs_getbmap(
 				ip->i_mount);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
-	} else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
-		   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
-		   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-		return XFS_ERROR(EINVAL);
-	if (whichfork == XFS_DATA_FORK) {
+
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	} else {
+		/*
+		 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
+		 * not generate a DMAPI read event. Otherwise, if the
+		 * DM_EVENT_READ bit is set for the file, generate a read
+		 * event in order that the DMAPI application may do its thing
+		 * before we return the extents. Usually this means restoring
+		 * user file data to regions of the file that look like holes.
+		 *
+		 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
+		 * BMV_IF_NO_DMAPI_READ so that read events are generated.
+		 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
+		 * could misinterpret holes in a DMAPI file as true holes,
+		 * when in fact they may represent offline user data.
+		 */
+		if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
+		    !(iflags & BMV_IF_NO_DMAPI_READ)) {
+			error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
+					      0, 0, 0, NULL);
+			if (error)
+				return XFS_ERROR(error);
+		}
+
+		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+			return XFS_ERROR(EINVAL);
+
 		if (xfs_get_extsz_hint(ip) ||
 		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
 			prealloced = 1;
@@ -5949,42 +5954,41 @@ xfs_getbmap(
 			prealloced = 0;
 			fixlen = ip->i_size;
 		}
-	} else {
-		prealloced = 0;
-		fixlen = 1LL << 32;
 	}
 
 	if (bmv->bmv_length == -1) {
 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
-		bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset),
-				      (__int64_t)0);
-	} else if (bmv->bmv_length < 0)
-		return XFS_ERROR(EINVAL);
-	if (bmv->bmv_length == 0) {
+		bmv->bmv_length =
+			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+	} else if (bmv->bmv_length == 0) {
 		bmv->bmv_entries = 0;
 		return 0;
+	} else if (bmv->bmv_length < 0) {
+		return XFS_ERROR(EINVAL);
 	}
+
 	nex = bmv->bmv_count - 1;
 	if (nex <= 0)
 		return XFS_ERROR(EINVAL);
 	bmvend = bmv->bmv_offset + bmv->bmv_length;
 
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (((iflags & BMV_IF_DELALLOC) == 0) &&
-	    (whichfork == XFS_DATA_FORK) &&
-	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
-		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
-		error = xfs_flush_pages(ip, (xfs_off_t)0,
-					-1, 0, FI_REMAPF);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return error;
+	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+		return XFS_ERROR(ENOMEM);
+	out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
+	if (!out)
+		return XFS_ERROR(ENOMEM);
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
+		if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+			error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+			if (error)
+				goto out_unlock_iolock;
 		}
-	}
 
-	ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
-	       ip->i_delayed_blks == 0);
+		ASSERT(ip->i_delayed_blks == 0);
+	}
 
 	lock = xfs_ilock_map_shared(ip);
 
@@ -5995,23 +5999,25 @@ xfs_getbmap(
 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 
-	bmapi_flags = xfs_bmapi_aflag(whichfork) |
-			((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
+	bmapi_flags = xfs_bmapi_aflag(whichfork);
+	if (!(iflags & BMV_IF_PREALLOC))
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
 	/*
 	 * Allocate enough space to handle "subnex" maps at a time.
 	 */
+	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	if (!map)
+		goto out_unlock_ilock;
 
 	bmv->bmv_entries = 0;
 
-	if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
-		if (((iflags & BMV_IF_DELALLOC) == 0) ||
-		    whichfork == XFS_ATTR_FORK) {
-			error = 0;
-			goto unlock_and_return;
-		}
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+		error = 0;
+		goto out_free_map;
 	}
 
 	nexleft = nex;
@@ -6023,53 +6029,61 @@ xfs_getbmap(
 				bmapi_flags, NULL, 0, map, &nmap,
 				NULL, NULL);
 		if (error)
-			goto unlock_and_return;
+			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
 		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
-			out.bmv_oflags = 0;
+			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
-				out.bmv_oflags |= BMV_OF_PREALLOC;
+				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
 			else if (map[i].br_startblock == DELAYSTARTBLOCK)
-				out.bmv_oflags |= BMV_OF_DELALLOC;
-			out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
-			out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-			out.bmv_unused1 = out.bmv_unused2 = 0;
+				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+			out[cur_ext].bmv_offset =
+				XFS_FSB_TO_BB(mp, map[i].br_startoff);
+			out[cur_ext].bmv_length =
+				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			out[cur_ext].bmv_unused1 = 0;
+			out[cur_ext].bmv_unused2 = 0;
 			ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
 			       (map[i].br_startblock != DELAYSTARTBLOCK));
 			if (map[i].br_startblock == HOLESTARTBLOCK &&
 			    whichfork == XFS_ATTR_FORK) {
 				/* came to the end of attribute fork */
-				out.bmv_oflags |= BMV_OF_LAST;
-				goto unlock_and_return;
-			} else {
-				int full = 0;	/* user array is full */
-
-				if (!xfs_getbmapx_fix_eof_hole(ip, &out,
-							prealloced, bmvend,
-							map[i].br_startblock)) {
-					goto unlock_and_return;
-				}
-
-				/* format results & advance arg */
-				error = formatter(&arg, &out, &full);
-				if (error || full)
-					goto unlock_and_return;
-				nexleft--;
-				bmv->bmv_offset =
-					out.bmv_offset + out.bmv_length;
-				bmv->bmv_length = MAX((__int64_t)0,
-					(__int64_t)(bmvend - bmv->bmv_offset));
-				bmv->bmv_entries++;
+				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+				goto out_free_map;
 			}
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+					prealloced, bmvend,
+					map[i].br_startblock))
+				goto out_free_map;
+
+			nexleft--;
+			bmv->bmv_offset =
+				out[cur_ext].bmv_offset +
+				out[cur_ext].bmv_length;
+			bmv->bmv_length =
+				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+			bmv->bmv_entries++;
+			cur_ext++;
 		}
 	} while (nmap && nexleft && bmv->bmv_length);
 
-unlock_and_return:
+ out_free_map:
+	kmem_free(map);
+ out_unlock_ilock:
 	xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	kmem_free(map);
+	for (i = 0; i < cur_ext; i++) {
+		int full = 0;	/* user array is full */
+
+		/* format results & advance arg */
+		error = formatter(&arg, &out[i], &full);
+		if (error || full)
+			break;
+	}
 
 	return error;
 }
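The reworked xfs_getbmap() above stages extent records into a kmem_zalloc()ed out[] array while the inode locks are held, and only runs the formatter, which may touch user memory and fault, after both locks are dropped. A schematic of that collect-under-lock, format-after-unlock shape; lock(), gather() and format() are invented stand-ins:

#include <stdio.h>
#include <stdlib.h>

struct record { long offset, length; };

/* Hypothetical stand-ins for the lock and gather primitives. */
static void lock(void)   { printf("lock taken\n"); }
static void unlock(void) { printf("lock dropped\n"); }

static int gather(struct record *out, int max)
{
	int i;

	for (i = 0; i < max && i < 3; i++) {	/* pretend 3 extents exist */
		out[i].offset = i * 8;
		out[i].length = 8;
	}
	return i;
}

/* May sleep or fault (think copy_to_user) - must run unlocked. */
static int format(const struct record *r)
{
	return printf("extent @%ld len %ld\n", r->offset, r->length) < 0;
}

int main(void)
{
	int max = 16, n, i;
	struct record *out = calloc(max, sizeof(*out));

	if (!out)
		return 1;

	lock();
	n = gather(out, max);	/* stage results while locked */
	unlock();

	for (i = 0; i < n; i++)	/* format only after unlocking */
		if (format(&out[i]))
			break;

	free(out);
	return 0;
}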
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 478e587087fe..89b81eedce6a 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -69,15 +69,6 @@ xfs_inode_alloc(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
-	/*
-	 * initialise the VFS inode here to get failures
-	 * out of the way early.
-	 */
-	if (!inode_init_always(mp->m_super, VFS_I(ip))) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		return NULL;
-	}
-
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
 	ip->i_mount = mp;
@@ -113,6 +104,20 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
+	/*
+	 * Now initialise the VFS inode. We do this after the xfs_inode
+	 * initialisation as internal failures will result in ->destroy_inode
+	 * being called and that will pass down through the reclaim path and
+	 * free the XFS inode. This path requires the XFS inode to already be
+	 * initialised. Hence if this call fails, the xfs_inode has already
+	 * been freed and we should not reference it at all in the error
+	 * handling.
+	 */
+	if (!inode_init_always(mp->m_super, VFS_I(ip)))
+		return NULL;
+
+	/* prevent anyone from using this yet */
+	VFS_I(ip)->i_state = I_NEW|I_LOCK;
 
 	return ip;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e7ae08d1df48..123b20c8cbf2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1258,8 +1258,10 @@ xfs_file_last_byte(
 	 * necessary.
 	 */
 	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		error = xfs_bmap_last_offset(NULL, ip, &last_block,
 							XFS_DATA_FORK);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		if (error) {
 			last_block = 0;
 		}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 08ce72316bfe..5aaa2d7ec155 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb(
 }
 
 STATIC int
-xfs_flush_space(
-	xfs_inode_t	*ip,
-	int		*fsynced,
-	int		*ioflags)
-{
-	switch (*fsynced) {
-	case 0:
-		if (ip->i_delayed_blks) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inode(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			*fsynced = 1;
-		} else {
-			*ioflags |= BMAPI_SYNC;
-			*fsynced = 2;
-		}
-		return 0;
-	case 1:
-		*fsynced = 2;
-		*ioflags |= BMAPI_SYNC;
-		return 0;
-	case 2:
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_device(ip);
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		*fsynced = 3;
-		return 0;
-	}
-	return 1;
-}
-
-STATIC int
 xfs_cmn_err_fsblock_zero(
 	xfs_inode_t	*ip,
 	xfs_bmbt_irec_t	*imap)
@@ -538,15 +506,9 @@ error_out:
 }
 
 /*
- * If the caller is doing a write at the end of the file,
- * then extend the allocation out to the file system's write
- * iosize. We clean up any extra space left over when the
- * file is closed in xfs_inactive().
- *
- * For sync writes, we are flushing delayed allocate space to
- * try to make additional space available for allocation near
- * the filesystem full boundary - preallocation hurts in that
- * situation, of course.
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize. We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate(
 	int		n, error, imaps;
 
 	*prealloc = 0;
-	if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
+	if ((offset + count) <= ip->i_size)
 		return 0;
 
 	/*
@@ -611,7 +573,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, fsynced = 0;
+	int		prealloc, flushed = 0;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -627,12 +589,12 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-retry:
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
 				ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
 		return error;
 
+retry:
 	if (prealloc) {
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -659,15 +621,22 @@ retry:
 
 	/*
 	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-	 * then we must have run out of space - flush delalloc, and retry..
+	 * then we must have run out of space - flush all other inodes with
+	 * delalloc blocks and retry without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
 					ip, offset, count);
-		if (xfs_flush_space(ip, &fsynced, &ioflag))
+		if (flushed)
 			return XFS_ERROR(ENOSPC);
 
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_flush_inodes(ip);
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		flushed = 1;
 		error = 0;
+		prealloc = 0;
 		goto retry;
 	}
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index a1cc1322fc0f..fdcf7b82747f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -40,8 +40,7 @@ typedef enum {
 	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
 	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
 	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_SYNC = (1 << 7),		/* sync write to flush delalloc space */
-	BMAPI_TRYLOCK = (1 << 8),	/* non-blocking request */
+	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
 } bmapi_flags_t;
 
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f76c6d7cea21..3750f04ede0b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -562,9 +562,8 @@ xfs_log_mount(
 	}
 
 	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
-	if (!mp->m_log) {
-		cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
-		error = ENOMEM;
+	if (IS_ERR(mp->m_log)) {
+		error = -PTR_ERR(mp->m_log);
 		goto out;
 	}
 
@@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t *mp,
 	xfs_buf_t		*bp;
 	int			i;
 	int			iclogsize;
+	int			error = ENOMEM;
 
 	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
-	if (!log)
-		return NULL;
+	if (!log) {
+		xlog_warn("XFS: Log allocation failed: No memory!");
+		goto out;
+	}
 
 	log->l_mp	   = mp;
 	log->l_targ	   = log_target;
@@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t *mp,
 	log->l_grant_reserve_cycle = 1;
 	log->l_grant_write_cycle = 1;
 
+	error = EFSCORRUPTED;
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
 		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
-		ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
+		if (log->l_sectbb_log < 0 ||
+		    log->l_sectbb_log > mp->m_sectbb_log) {
+			xlog_warn("XFS: Log sector size (0x%x) out of range.",
+				  log->l_sectbb_log);
+			goto out_free_log;
+		}
+
 		/* for larger sector sizes, must have v2 or external log */
-		ASSERT(log->l_sectbb_log == 0 ||
-		       log->l_logBBstart == 0 ||
-		       xfs_sb_version_haslogv2(&mp->m_sb));
-		ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
+		if (log->l_sectbb_log != 0 &&
+		    (log->l_logBBstart != 0 &&
+		     !xfs_sb_version_haslogv2(&mp->m_sb))) {
+			xlog_warn("XFS: log sector size (0x%x) invalid "
+				  "for configuration.", log->l_sectbb_log);
+			goto out_free_log;
+		}
+		if (mp->m_sb.sb_logsectlog < BBSHIFT) {
+			xlog_warn("XFS: Log sector log (0x%x) too small.",
+				  mp->m_sb.sb_logsectlog);
+			goto out_free_log;
+		}
 	}
 	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
 
 	xlog_get_iclog_buffer_size(mp, log);
 
+	error = ENOMEM;
 	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
 	if (!bp)
 		goto out_free_log;
@@ -1313,7 +1331,8 @@ out_free_iclog:
 	xfs_buf_free(log->l_xbuf);
 out_free_log:
 	kmem_free(log);
-	return NULL;
+out:
+	return ERR_PTR(-error);
 }	/* xlog_alloc_log */
 
 
@@ -2541,18 +2560,19 @@ redo:
 			xlog_ins_ticketq(&log->l_reserve_headq, tic);
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: sleep 2");
+		spin_unlock(&log->l_grant_lock);
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		spin_lock(&log->l_grant_lock);
+
 		XFS_STATS_INC(xs_sleep_logspace);
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
-		if (XLOG_FORCED_SHUTDOWN(log)) {
-			spin_lock(&log->l_grant_lock);
+		spin_lock(&log->l_grant_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
-		}
 
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: wake 2");
-		xlog_grant_push_ail(log->l_mp, need_bytes);
-		spin_lock(&log->l_grant_lock);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2631,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t *log,
 	 * for more free space, otherwise try to get some space for
 	 * this transaction.
 	 */
-
+	need_bytes = tic->t_unit_res;
 	if ((ntic = log->l_write_headq)) {
 		free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
 					     log->l_grant_write_bytes);
@@ -2651,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t *log,
 
 			xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: sleep 1");
+			spin_unlock(&log->l_grant_lock);
+			xlog_grant_push_ail(log->l_mp, need_bytes);
+			spin_lock(&log->l_grant_lock);
+
 			XFS_STATS_INC(xs_sleep_logspace);
 			sv_wait(&tic->t_wait, PINOD|PLTWAIT,
 				&log->l_grant_lock, s);
 
 			/* If we're shutting down, this tic is already
 			 * off the queue */
-			if (XLOG_FORCED_SHUTDOWN(log)) {
-				spin_lock(&log->l_grant_lock);
+			spin_lock(&log->l_grant_lock);
+			if (XLOG_FORCED_SHUTDOWN(log))
 				goto error_return;
-			}
 
 			xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: wake 1");
-			xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
-			spin_lock(&log->l_grant_lock);
 		}
 	}
 
-	need_bytes = tic->t_unit_res;
-
 redo:
 	if (XLOG_FORCED_SHUTDOWN(log))
 		goto error_return;
@@ -2680,19 +2699,20 @@ redo:
 	if (free_bytes < need_bytes) {
 		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
 			xlog_ins_ticketq(&log->l_write_headq, tic);
+		spin_unlock(&log->l_grant_lock);
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		spin_lock(&log->l_grant_lock);
+
 		XFS_STATS_INC(xs_sleep_logspace);
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
 		/* If we're shutting down, this tic is already off the queue */
-		if (XLOG_FORCED_SHUTDOWN(log)) {
-			spin_lock(&log->l_grant_lock);
+		spin_lock(&log->l_grant_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
-		}
 
 		xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: wake 2");
-		xlog_grant_push_ail(log->l_mp, need_bytes);
-		spin_lock(&log->l_grant_lock);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_write_headq, tic);
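With the xfs_log.c changes above, xlog_alloc_log() reports why it failed by encoding a negative errno in the returned pointer, and xfs_log_mount() recovers it with IS_ERR()/PTR_ERR() instead of assuming ENOMEM. A freestanding sketch of that ERR_PTR convention, simplified from the kernel's include/linux/err.h; alloc_log() here is a hypothetical allocator:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified versions of the kernel's ERR_PTR helpers (err.h). */
#define MAX_ERRNO	4095

static void *ERR_PTR(long error)
{
	return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator: distinguishes bad geometry from no memory. */
static void *alloc_log(int sector_log)
{
	void *log;

	if (sector_log < 0)
		return ERR_PTR(-EINVAL);	/* corrupt geometry */
	log = malloc(64);
	if (!log)
		return ERR_PTR(-ENOMEM);	/* allocation failure */
	return log;
}

int main(void)
{
	void *log = alloc_log(-1);

	if (IS_ERR(log)) {	/* as xfs_log_mount() now checks */
		printf("alloc failed: errno %ld\n", -PTR_ERR(log));
		return 1;
	}
	free(log);
	return 0;
}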
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b101990df027..65a99725d0cc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -291,14 +291,17 @@ xfs_mount_validate_sb(
 	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
 	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
 	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
+	    sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
 	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
 	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+	    sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+	    sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
 	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
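The added superblock checks cross-validate each size field against its log2 twin: a field is accepted only when size == 1 << log, so a corrupted superblock cannot, say, claim a 512-byte sector alongside an sb_sectlog of 12. A small self-contained version of that consistency test (field names invented for the example):

#include <stdio.h>

/* Reject a (size, log) pair unless size is exactly 1 << log. */
static int size_matches_log(unsigned int size, unsigned int log)
{
	return log < 32 && size == (1U << log);
}

int main(void)
{
	struct { unsigned int size, log; } sb[] = {
		{ 512, 9 },	/* consistent   */
		{ 512, 12 },	/* inconsistent */
	};

	for (unsigned int i = 0; i < 2; i++)
		printf("size %u log %u -> %s\n", sb[i].size, sb[i].log,
		       size_matches_log(sb[i].size, sb[i].log) ?
		       "ok" : "corrupt");
	return 0;
}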
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7af44adffc8f..d6a64392f983 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -313,7 +313,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct task_struct	*m_sync_task;	/* generalised sync thread */
-	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
+	xfs_sync_work_t		m_sync_work;	/* work item for VFS_SYNC */
 	struct list_head	m_sync_list;	/* sync thread work item list */
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7394c7af5de5..19cf90a9c762 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1457,6 +1457,13 @@ xfs_create(
 	error = xfs_trans_reserve(tp, resblks, log_res, 0,
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
+		/* flush outstanding delalloc blocks and retry */
+		xfs_flush_inodes(dp);
+		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	}
+	if (error == ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
 		error = xfs_trans_reserve(tp, 0, log_res, 0,
 				XFS_TRANS_PERM_LOG_RES, log_count);