aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_dir.c8
-rw-r--r--fs/9p/vfs_file.c11
-rw-r--r--fs/9p/vfs_inode.c111
-rw-r--r--fs/9p/vfs_super.c55
-rw-r--r--fs/Makefile2
-rw-r--r--fs/afs/dir.c6
-rw-r--r--fs/afs/file.c64
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/autofs4/root.c22
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/block_dev.c330
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/inode.c11
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/buffer.c26
-rw-r--r--fs/ceph/addr.c11
-rw-r--r--fs/ceph/auth.c9
-rw-r--r--fs/ceph/auth.h2
-rw-r--r--fs/ceph/auth_none.c1
-rw-r--r--fs/ceph/auth_x.c19
-rw-r--r--fs/ceph/caps.c24
-rw-r--r--fs/ceph/ceph_fs.h62
-rw-r--r--fs/ceph/ceph_strings.c16
-rw-r--r--fs/ceph/debugfs.c13
-rw-r--r--fs/ceph/dir.c45
-rw-r--r--fs/ceph/export.c14
-rw-r--r--fs/ceph/file.c19
-rw-r--r--fs/ceph/inode.c97
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/mds_client.c385
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/messenger.c91
-rw-r--r--fs/ceph/messenger.h10
-rw-r--r--fs/ceph/mon_client.c257
-rw-r--r--fs/ceph/mon_client.h27
-rw-r--r--fs/ceph/msgpool.c180
-rw-r--r--fs/ceph/msgpool.h12
-rw-r--r--fs/ceph/msgr.h21
-rw-r--r--fs/ceph/osd_client.c98
-rw-r--r--fs/ceph/pagelist.c2
-rw-r--r--fs/ceph/rados.h23
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c128
-rw-r--r--fs/ceph/super.h30
-rw-r--r--fs/ceph/xattr.c35
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/coda/pioctl.c76
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/devpts/inode.c9
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c4
-rw-r--r--fs/ecryptfs/inode.c48
-rw-r--r--fs/ecryptfs/main.c166
-rw-r--r--fs/ecryptfs/mmap.c19
-rw-r--r--fs/ecryptfs/read_write.c13
-rw-r--r--fs/ecryptfs/super.c22
-rw-r--r--fs/exec.c7
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/inode.c41
-rw-r--r--fs/ext2/acl.c4
-rw-r--r--fs/ext2/balloc.c6
-rw-r--r--fs/ext2/ialloc.c21
-rw-r--r--fs/ext2/inode.c7
-rw-r--r--fs/ext2/super.c99
-rw-r--r--fs/ext2/xattr.c12
-rw-r--r--fs/ext2/xattr.h12
-rw-r--r--fs/ext2/xattr_security.c2
-rw-r--r--fs/ext2/xattr_trusted.c2
-rw-r--r--fs/ext2/xattr_user.c2
-rw-r--r--fs/ext3/acl.c4
-rw-r--r--fs/ext3/balloc.c6
-rw-r--r--fs/ext3/fsync.c23
-rw-r--r--fs/ext3/ialloc.c13
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext3/super.c77
-rw-r--r--fs/ext3/xattr.c10
-rw-r--r--fs/ext3/xattr.h12
-rw-r--r--fs/ext3/xattr_security.c2
-rw-r--r--fs/ext3/xattr_trusted.c2
-rw-r--r--fs/ext3/xattr_user.c2
-rw-r--r--fs/ext4/acl.c4
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/ext4/ialloc.c12
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/ext4/xattr_trusted.c2
-rw-r--r--fs/ext4/xattr_user.c2
-rw-r--r--fs/fat/cache.c13
-rw-r--r--fs/fat/dir.c28
-rw-r--r--fs/fat/fat.h16
-rw-r--r--fs/fat/file.c19
-rw-r--r--fs/fat/inode.c8
-rw-r--r--fs/fat/misc.c22
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/fs-writeback.c108
-rw-r--r--fs/generic_acl.c4
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/quota.c12
-rw-r--r--fs/gfs2/rgrp.c5
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/xattr.c6
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h3
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c12
-rw-r--r--fs/inode.c26
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c15
-rw-r--r--fs/jbd/commit.c8
-rw-r--r--fs/jbd/journal.c33
-rw-r--r--fs/jbd2/checkpoint.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jffs2/acl.c4
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/security.c2
-rw-r--r--fs/jffs2/xattr.c8
-rw-r--r--fs/jffs2/xattr.h8
-rw-r--r--fs/jffs2/xattr_trusted.c2
-rw-r--r--fs/jffs2/xattr_user.c2
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_inode.c12
-rw-r--r--fs/logfs/inode.c9
-rw-r--r--fs/minix/bitmap.c5
-rw-r--r--fs/minix/minix.h2
-rw-r--r--fs/minix/namei.c11
-rw-r--r--fs/namei.c5
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/ncpfs/file.c2
-rw-r--r--fs/ncpfs/ioctl.c27
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfsd/nfs4recover.c87
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--fs/nilfs2/inode.c11
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/notify/inotify/inotify.c88
-rw-r--r--fs/ntfs/file.c28
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/dlmglue.c3
-rw-r--r--fs/ocfs2/file.c21
-rw-r--r--fs/ocfs2/namei.c9
-rw-r--r--fs/ocfs2/quota.h12
-rw-r--r--fs/ocfs2/quota_global.c347
-rw-r--r--fs/ocfs2/quota_local.c133
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/ocfs2/xattr.c12
-rw-r--r--fs/ocfs2/xattr.h12
-rw-r--r--fs/omfs/inode.c4
-rw-r--r--fs/open.c166
-rw-r--r--fs/partitions/acorn.c68
-rw-r--r--fs/partitions/acorn.h10
-rw-r--r--fs/partitions/amiga.c13
-rw-r--r--fs/partitions/amiga.h2
-rw-r--r--fs/partitions/atari.c8
-rw-r--r--fs/partitions/atari.h2
-rw-r--r--fs/partitions/check.c84
-rw-r--r--fs/partitions/check.h12
-rw-r--r--fs/partitions/efi.c93
-rw-r--r--fs/partitions/efi.h2
-rw-r--r--fs/partitions/ibm.c21
-rw-r--r--fs/partitions/ibm.h2
-rw-r--r--fs/partitions/karma.c4
-rw-r--r--fs/partitions/karma.h2
-rw-r--r--fs/partitions/ldm.c107
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c13
-rw-r--r--fs/partitions/mac.h2
-rw-r--r--fs/partitions/msdos.c87
-rw-r--r--fs/partitions/msdos.h2
-rw-r--r--fs/partitions/osf.c4
-rw-r--r--fs/partitions/osf.h2
-rw-r--r--fs/partitions/sgi.c6
-rw-r--r--fs/partitions/sgi.h2
-rw-r--r--fs/partitions/sun.c6
-rw-r--r--fs/partitions/sun.h2
-rw-r--r--fs/partitions/sysv68.c6
-rw-r--r--fs/partitions/sysv68.h2
-rw-r--r--fs/partitions/ultrix.c4
-rw-r--r--fs/partitions/ultrix.h2
-rw-r--r--fs/pipe.c122
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/quota/dquot.c275
-rw-r--r--fs/quota/quota.c95
-rw-r--r--fs/quota/quota_tree.c50
-rw-r--r--fs/quota/quota_tree.h6
-rw-r--r--fs/quota/quota_v1.c4
-rw-r--r--fs/quota/quota_v2.c6
-rw-r--r--fs/ramfs/inode.c22
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/inode.c3
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/xattr.c16
-rw-r--r--fs/reiserfs/xattr_acl.c4
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/smbfs/dir.c2
-rw-r--r--fs/smbfs/file.c2
-rw-r--r--fs/smbfs/ioctl.c10
-rw-r--r--fs/smbfs/proto.h2
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c151
-rw-r--r--fs/statfs.c196
-rw-r--r--fs/super.c321
-rw-r--r--fs/sync.c88
-rw-r--r--fs/sysfs/bin.c26
-rw-r--r--fs/sysfs/dir.c114
-rw-r--r--fs/sysfs/file.c17
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysfs/inode.c6
-rw-r--r--fs/sysfs/mount.c95
-rw-r--r--fs/sysfs/symlink.c35
-rw-r--r--fs/sysfs/sysfs.h34
-rw-r--r--fs/sysv/ialloc.c11
-rw-r--r--fs/ubifs/dir.c9
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/file.c45
-rw-r--r--fs/udf/ialloc.c11
-rw-r--r--fs/udf/namei.c10
-rw-r--r--fs/udf/udfdecl.h3
-rw-r--r--fs/ufs/ialloc.c10
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/ufs/namei.c2
-rw-r--r--fs/ufs/symlink.c8
-rw-r--r--fs/ufs/truncate.c10
-rw-r--r--fs/ufs/ufs.h2
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h83
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c8
-rw-r--r--fs/xfs/quota/xfs_dquot.c6
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h24
-rw-r--r--fs/xfs/xfs_alloc.c357
-rw-r--r--fs/xfs/xfs_alloc.h7
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_buf_item.c166
-rw-r--r--fs/xfs/xfs_buf_item.h18
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_log.c120
-rw-r--r--fs/xfs/xfs_log.h14
-rw-r--r--fs/xfs/xfs_log_cil.c725
-rw-r--r--fs/xfs/xfs_log_priv.h118
-rw-r--r--fs/xfs/xfs_log_recover.c46
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_trans.c144
-rw-r--r--fs/xfs/xfs_trans.h44
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_item.c114
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_types.h2
269 files changed, 5546 insertions, 3607 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
40extern struct file_system_type v9fs_fs_type; 40extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 41extern const struct address_space_operations v9fs_addr_operations;
42extern const struct file_operations v9fs_file_operations; 42extern const struct file_operations v9fs_file_operations;
43extern const struct file_operations v9fs_file_operations_dotl;
43extern const struct file_operations v9fs_dir_operations; 44extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl;
44extern const struct dentry_operations v9fs_dentry_operations; 46extern const struct dentry_operations v9fs_dentry_operations;
45extern const struct dentry_operations v9fs_cached_dentry_operations; 47extern const struct dentry_operations v9fs_cached_dentry_operations;
46 48
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
203 .open = v9fs_file_open, 203 .open = v9fs_file_open,
204 .release = v9fs_dir_release, 204 .release = v9fs_dir_release,
205}; 205};
206
207const struct file_operations v9fs_dir_operations_dotl = {
208 .read = generic_read_dir,
209 .llseek = generic_file_llseek,
210 .readdir = v9fs_dir_readdir,
211 .open = v9fs_file_open,
212 .release = v9fs_dir_release,
213};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..25b300e1c9d7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -296,3 +296,14 @@ const struct file_operations v9fs_file_operations = {
296 .mmap = generic_file_readonly_mmap, 296 .mmap = generic_file_readonly_mmap,
297 .fsync = v9fs_file_fsync, 297 .fsync = v9fs_file_fsync,
298}; 298};
299
300const struct file_operations v9fs_file_operations_dotl = {
301 .llseek = generic_file_llseek,
302 .read = v9fs_file_read,
303 .write = v9fs_file_write,
304 .open = v9fs_file_open,
305 .release = v9fs_dir_release,
306 .lock = v9fs_file_lock,
307 .mmap = generic_file_readonly_mmap,
308 .fsync = v9fs_file_fsync,
309};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f2434fc9d2c4..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
44#include "cache.h" 44#include "cache.h"
45 45
46static const struct inode_operations v9fs_dir_inode_operations; 46static const struct inode_operations v9fs_dir_inode_operations;
47static const struct inode_operations v9fs_dir_inode_operations_ext; 47static const struct inode_operations v9fs_dir_inode_operations_dotu;
48static const struct inode_operations v9fs_dir_inode_operations_dotl;
48static const struct inode_operations v9fs_file_inode_operations; 49static const struct inode_operations v9fs_file_inode_operations;
50static const struct inode_operations v9fs_file_inode_operations_dotl;
49static const struct inode_operations v9fs_symlink_inode_operations; 51static const struct inode_operations v9fs_symlink_inode_operations;
52static const struct inode_operations v9fs_symlink_inode_operations_dotl;
50 53
51/** 54/**
52 * unixmode2p9mode - convert unix mode bits to plan 9 55 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -253,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
253 return ERR_PTR(-ENOMEM); 256 return ERR_PTR(-ENOMEM);
254 } 257 }
255 258
256 inode->i_mode = mode; 259 inode_init_owner(inode, NULL, mode);
257 inode->i_uid = current_fsuid();
258 inode->i_gid = current_fsgid();
259 inode->i_blocks = 0; 260 inode->i_blocks = 0;
260 inode->i_rdev = 0; 261 inode->i_rdev = 0;
261 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 262 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -275,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
275 init_special_inode(inode, inode->i_mode, inode->i_rdev); 276 init_special_inode(inode, inode->i_mode, inode->i_rdev);
276 break; 277 break;
277 case S_IFREG: 278 case S_IFREG:
278 inode->i_op = &v9fs_file_inode_operations; 279 if (v9fs_proto_dotl(v9ses)) {
279 inode->i_fop = &v9fs_file_operations; 280 inode->i_op = &v9fs_file_inode_operations_dotl;
281 inode->i_fop = &v9fs_file_operations_dotl;
282 } else {
283 inode->i_op = &v9fs_file_inode_operations;
284 inode->i_fop = &v9fs_file_operations;
285 }
286
280 break; 287 break;
288
281 case S_IFLNK: 289 case S_IFLNK:
282 if (!v9fs_proto_dotu(v9ses)) { 290 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
283 P9_DPRINTK(P9_DEBUG_ERROR, 291 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
284 "extended modes used w/o 9P2000.u\n"); 292 "legacy protocol.\n");
285 err = -EINVAL; 293 err = -EINVAL;
286 goto error; 294 goto error;
287 } 295 }
288 inode->i_op = &v9fs_symlink_inode_operations; 296
297 if (v9fs_proto_dotl(v9ses))
298 inode->i_op = &v9fs_symlink_inode_operations_dotl;
299 else
300 inode->i_op = &v9fs_symlink_inode_operations;
301
289 break; 302 break;
290 case S_IFDIR: 303 case S_IFDIR:
291 inc_nlink(inode); 304 inc_nlink(inode);
292 if (v9fs_proto_dotu(v9ses)) 305 if (v9fs_proto_dotl(v9ses))
293 inode->i_op = &v9fs_dir_inode_operations_ext; 306 inode->i_op = &v9fs_dir_inode_operations_dotl;
307 else if (v9fs_proto_dotu(v9ses))
308 inode->i_op = &v9fs_dir_inode_operations_dotu;
294 else 309 else
295 inode->i_op = &v9fs_dir_inode_operations; 310 inode->i_op = &v9fs_dir_inode_operations;
296 inode->i_fop = &v9fs_dir_operations; 311
312 if (v9fs_proto_dotl(v9ses))
313 inode->i_fop = &v9fs_dir_operations_dotl;
314 else
315 inode->i_fop = &v9fs_dir_operations;
316
297 break; 317 break;
298 default: 318 default:
299 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", 319 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -434,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
434{ 454{
435 int retval; 455 int retval;
436 struct inode *file_inode; 456 struct inode *file_inode;
437 struct v9fs_session_info *v9ses;
438 struct p9_fid *v9fid; 457 struct p9_fid *v9fid;
439 458
440 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 459 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
441 rmdir); 460 rmdir);
442 461
443 file_inode = file->d_inode; 462 file_inode = file->d_inode;
444 v9ses = v9fs_inode2v9ses(file_inode);
445 v9fid = v9fs_fid_clone(file); 463 v9fid = v9fs_fid_clone(file);
446 if (IS_ERR(v9fid)) 464 if (IS_ERR(v9fid))
447 return PTR_ERR(v9fid); 465 return PTR_ERR(v9fid);
@@ -484,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
484 ofid = NULL; 502 ofid = NULL;
485 fid = NULL; 503 fid = NULL;
486 name = (char *) dentry->d_name.name; 504 name = (char *) dentry->d_name.name;
487 dfid = v9fs_fid_clone(dentry->d_parent); 505 dfid = v9fs_fid_lookup(dentry->d_parent);
488 if (IS_ERR(dfid)) { 506 if (IS_ERR(dfid)) {
489 err = PTR_ERR(dfid); 507 err = PTR_ERR(dfid);
490 P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); 508 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
491 dfid = NULL; 509 return ERR_PTR(err);
492 goto error;
493 } 510 }
494 511
495 /* clone a fid to use for creation */ 512 /* clone a fid to use for creation */
@@ -497,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
497 if (IS_ERR(ofid)) { 514 if (IS_ERR(ofid)) {
498 err = PTR_ERR(ofid); 515 err = PTR_ERR(ofid);
499 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 516 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
500 ofid = NULL; 517 return ERR_PTR(err);
501 goto error;
502 } 518 }
503 519
504 err = p9_client_fcreate(ofid, name, perm, mode, extension); 520 err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -508,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
508 } 524 }
509 525
510 /* now walk from the parent so we can get unopened fid */ 526 /* now walk from the parent so we can get unopened fid */
511 fid = p9_client_walk(dfid, 1, &name, 0); 527 fid = p9_client_walk(dfid, 1, &name, 1);
512 if (IS_ERR(fid)) { 528 if (IS_ERR(fid)) {
513 err = PTR_ERR(fid); 529 err = PTR_ERR(fid);
514 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 530 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
515 fid = NULL; 531 fid = NULL;
516 goto error; 532 goto error;
517 } else 533 }
518 dfid = NULL;
519 534
520 /* instantiate inode and assign the unopened fid to the dentry */ 535 /* instantiate inode and assign the unopened fid to the dentry */
521 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 536 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -538,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
538 return ofid; 553 return ofid;
539 554
540error: 555error:
541 if (dfid)
542 p9_client_clunk(dfid);
543
544 if (ofid) 556 if (ofid)
545 p9_client_clunk(ofid); 557 p9_client_clunk(ofid);
546 558
@@ -675,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
675 if (IS_ERR(fid)) { 687 if (IS_ERR(fid)) {
676 result = PTR_ERR(fid); 688 result = PTR_ERR(fid);
677 if (result == -ENOENT) { 689 if (result == -ENOENT) {
678 d_add(dentry, NULL); 690 inode = NULL;
679 return NULL; 691 goto inst_out;
680 } 692 }
681 693
682 return ERR_PTR(result); 694 return ERR_PTR(result);
@@ -693,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
693 if (result < 0) 705 if (result < 0)
694 goto error; 706 goto error;
695 707
696 if ((fid->qid.version) && (v9ses->cache)) 708inst_out:
709 if (v9ses->cache)
697 dentry->d_op = &v9fs_cached_dentry_operations; 710 dentry->d_op = &v9fs_cached_dentry_operations;
698 else 711 else
699 dentry->d_op = &v9fs_dentry_operations; 712 dentry->d_op = &v9fs_dentry_operations;
@@ -772,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
772 goto clunk_olddir; 785 goto clunk_olddir;
773 } 786 }
774 787
788 if (v9fs_proto_dotl(v9ses)) {
789 retval = p9_client_rename(oldfid, newdirfid,
790 (char *) new_dentry->d_name.name);
791 if (retval != -ENOSYS)
792 goto clunk_newdir;
793 }
794
775 /* 9P can only handle file rename in the same directory */ 795 /* 9P can only handle file rename in the same directory */
776 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { 796 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
777 P9_DPRINTK(P9_DEBUG_ERROR, 797 P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1197,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1197 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); 1217 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
1198 else if (S_ISFIFO(mode)) 1218 else if (S_ISFIFO(mode))
1199 *name = 0; 1219 *name = 0;
1220 else if (S_ISSOCK(mode))
1221 *name = 0;
1200 else { 1222 else {
1201 __putname(name); 1223 __putname(name);
1202 return -EINVAL; 1224 return -EINVAL;
@@ -1208,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1208 return retval; 1230 return retval;
1209} 1231}
1210 1232
1211static const struct inode_operations v9fs_dir_inode_operations_ext = { 1233static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1234 .create = v9fs_vfs_create,
1235 .lookup = v9fs_vfs_lookup,
1236 .symlink = v9fs_vfs_symlink,
1237 .link = v9fs_vfs_link,
1238 .unlink = v9fs_vfs_unlink,
1239 .mkdir = v9fs_vfs_mkdir,
1240 .rmdir = v9fs_vfs_rmdir,
1241 .mknod = v9fs_vfs_mknod,
1242 .rename = v9fs_vfs_rename,
1243 .getattr = v9fs_vfs_getattr,
1244 .setattr = v9fs_vfs_setattr,
1245};
1246
1247static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1212 .create = v9fs_vfs_create, 1248 .create = v9fs_vfs_create,
1213 .lookup = v9fs_vfs_lookup, 1249 .lookup = v9fs_vfs_lookup,
1214 .symlink = v9fs_vfs_symlink, 1250 .symlink = v9fs_vfs_symlink,
@@ -1239,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
1239 .setattr = v9fs_vfs_setattr, 1275 .setattr = v9fs_vfs_setattr,
1240}; 1276};
1241 1277
1278static const struct inode_operations v9fs_file_inode_operations_dotl = {
1279 .getattr = v9fs_vfs_getattr,
1280 .setattr = v9fs_vfs_setattr,
1281};
1282
1242static const struct inode_operations v9fs_symlink_inode_operations = { 1283static const struct inode_operations v9fs_symlink_inode_operations = {
1243 .readlink = generic_readlink, 1284 .readlink = generic_readlink,
1244 .follow_link = v9fs_vfs_follow_link, 1285 .follow_link = v9fs_vfs_follow_link,
@@ -1246,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
1246 .getattr = v9fs_vfs_getattr, 1287 .getattr = v9fs_vfs_getattr,
1247 .setattr = v9fs_vfs_setattr, 1288 .setattr = v9fs_vfs_setattr,
1248}; 1289};
1290
1291static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
1292 .readlink = generic_readlink,
1293 .follow_link = v9fs_vfs_follow_link,
1294 .put_link = v9fs_vfs_put_link,
1295 .getattr = v9fs_vfs_getattr,
1296 .setattr = v9fs_vfs_setattr,
1297};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h>
41#include <net/9p/9p.h> 42#include <net/9p/9p.h>
42#include <net/9p/client.h> 43#include <net/9p/client.h>
43 44
@@ -45,7 +46,7 @@
45#include "v9fs_vfs.h" 46#include "v9fs_vfs.h"
46#include "fid.h" 47#include "fid.h"
47 48
48static const struct super_operations v9fs_super_ops; 49static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
49 50
50/** 51/**
51 * v9fs_set_super - set the superblock 52 * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
76 sb->s_blocksize_bits = fls(v9ses->maxdata - 1); 77 sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
77 sb->s_blocksize = 1 << sb->s_blocksize_bits; 78 sb->s_blocksize = 1 << sb->s_blocksize_bits;
78 sb->s_magic = V9FS_MAGIC; 79 sb->s_magic = V9FS_MAGIC;
79 sb->s_op = &v9fs_super_ops; 80 if (v9fs_proto_dotl(v9ses))
81 sb->s_op = &v9fs_super_ops_dotl;
82 else
83 sb->s_op = &v9fs_super_ops;
80 sb->s_bdi = &v9ses->bdi; 84 sb->s_bdi = &v9ses->bdi;
81 85
82 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 86 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
211 v9fs_session_begin_cancel(v9ses); 215 v9fs_session_begin_cancel(v9ses);
212} 216}
213 217
218static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
219{
220 struct v9fs_session_info *v9ses;
221 struct p9_fid *fid;
222 struct p9_rstatfs rs;
223 int res;
224
225 fid = v9fs_fid_lookup(dentry);
226 if (IS_ERR(fid)) {
227 res = PTR_ERR(fid);
228 goto done;
229 }
230
231 v9ses = v9fs_inode2v9ses(dentry->d_inode);
232 if (v9fs_proto_dotl(v9ses)) {
233 res = p9_client_statfs(fid, &rs);
234 if (res == 0) {
235 buf->f_type = rs.type;
236 buf->f_bsize = rs.bsize;
237 buf->f_blocks = rs.blocks;
238 buf->f_bfree = rs.bfree;
239 buf->f_bavail = rs.bavail;
240 buf->f_files = rs.files;
241 buf->f_ffree = rs.ffree;
242 buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
243 buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
244 buf->f_namelen = rs.namelen;
245 }
246 if (res != -ENOSYS)
247 goto done;
248 }
249 res = simple_statfs(dentry, buf);
250done:
251 return res;
252}
253
214static const struct super_operations v9fs_super_ops = { 254static const struct super_operations v9fs_super_ops = {
215#ifdef CONFIG_9P_FSCACHE 255#ifdef CONFIG_9P_FSCACHE
216 .alloc_inode = v9fs_alloc_inode, 256 .alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
222 .umount_begin = v9fs_umount_begin, 262 .umount_begin = v9fs_umount_begin,
223}; 263};
224 264
265static const struct super_operations v9fs_super_ops_dotl = {
266#ifdef CONFIG_9P_FSCACHE
267 .alloc_inode = v9fs_alloc_inode,
268 .destroy_inode = v9fs_destroy_inode,
269#endif
270 .statfs = v9fs_statfs,
271 .clear_inode = v9fs_clear_inode,
272 .show_options = generic_show_options,
273 .umount_begin = v9fs_umount_begin,
274};
275
225struct file_system_type v9fs_fs_type = { 276struct file_system_type v9fs_fs_type = {
226 .name = "9p", 277 .name = "9p",
227 .get_sb = v9fs_get_sb, 278 .get_sb = v9fs_get_sb,
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o drop_caches.o splice.o sync.o utimes.o \ 13 pnode.o drop_caches.o splice.o sync.o utimes.o \
14 stack.o fs_struct.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b57..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
189 struct key *key) 189 struct key *key)
190{ 190{
191 struct page *page; 191 struct page *page;
192 struct file file = {
193 .private_data = key,
194 };
195
196 _enter("{%lu},%lu", dir->i_ino, index); 192 _enter("{%lu},%lu", dir->i_ino, index);
197 193
198 page = read_mapping_page(dir->i_mapping, index, &file); 194 page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
199 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
200 kmap(page); 196 kmap(page);
201 if (!PageChecked(page)) 197 if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724d..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
121#endif 121#endif
122 122
123/* 123/*
124 * AFS read page from file, directory or symlink 124 * read page from file, directory or symlink, given a key to use
125 */ 125 */
126static int afs_readpage(struct file *file, struct page *page) 126int afs_page_filler(void *data, struct page *page)
127{ 127{
128 struct afs_vnode *vnode; 128 struct inode *inode = page->mapping->host;
129 struct inode *inode; 129 struct afs_vnode *vnode = AFS_FS_I(inode);
130 struct key *key; 130 struct key *key = data;
131 size_t len; 131 size_t len;
132 off_t offset; 132 off_t offset;
133 int ret; 133 int ret;
134 134
135 inode = page->mapping->host;
136
137 if (file) {
138 key = file->private_data;
139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
147
148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 135 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
149 136
150 vnode = AFS_FS_I(inode);
151
152 BUG_ON(!PageLocked(page)); 137 BUG_ON(!PageLocked(page));
153 138
154 ret = -ESTALE; 139 ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
214 unlock_page(page); 199 unlock_page(page);
215 } 200 }
216 201
217 if (!file)
218 key_put(key);
219 _leave(" = 0"); 202 _leave(" = 0");
220 return 0; 203 return 0;
221 204
222error: 205error:
223 SetPageError(page); 206 SetPageError(page);
224 unlock_page(page); 207 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
228 _leave(" = %d", ret); 208 _leave(" = %d", ret);
229 return ret; 209 return ret;
230} 210}
231 211
232/* 212/*
213 * read page from file, directory or symlink, given a file to nominate the key
214 * to be used
215 */
216static int afs_readpage(struct file *file, struct page *page)
217{
218 struct key *key;
219 int ret;
220
221 if (file) {
222 key = file->private_data;
223 ASSERT(key != NULL);
224 ret = afs_page_filler(key, page);
225 } else {
226 struct inode *inode = page->mapping->host;
227 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
228 if (IS_ERR(key)) {
229 ret = PTR_ERR(key);
230 } else {
231 ret = afs_page_filler(key, page);
232 key_put(key);
233 }
234 }
235 return ret;
236}
237
238/*
233 * read a set of pages 239 * read a set of pages
234 */ 240 */
235static int afs_readpages(struct file *file, struct address_space *mapping, 241static int afs_readpages(struct file *file, struct address_space *mapping,
236 struct list_head *pages, unsigned nr_pages) 242 struct list_head *pages, unsigned nr_pages)
237{ 243{
244 struct key *key = file->private_data;
238 struct afs_vnode *vnode; 245 struct afs_vnode *vnode;
239 int ret = 0; 246 int ret = 0;
240 247
241 _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); 248 _enter("{%d},{%lu},,%d",
249 key_serial(key), mapping->host->i_ino, nr_pages);
250
251 ASSERT(key != NULL);
242 252
243 vnode = AFS_FS_I(mapping->host); 253 vnode = AFS_FS_I(mapping->host);
244 if (vnode->flags & AFS_VNODE_DELETED) { 254 if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
279 } 289 }
280 290
281 /* load the missing pages from the network */ 291 /* load the missing pages from the network */
282 ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); 292 ret = read_cache_pages(mapping, pages, afs_page_filler, key);
283 293
284 _leave(" = %d [netting]", ret); 294 _leave(" = %d [netting]", ret);
285 return ret; 295 return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844f..807f284cc75e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations;
494 494
495extern int afs_open(struct inode *, struct file *); 495extern int afs_open(struct inode *, struct file *);
496extern int afs_release(struct inode *, struct file *); 496extern int afs_release(struct inode *, struct file *);
497extern int afs_page_filler(void *, struct page *);
497 498
498/* 499/*
499 * flock.c 500 * flock.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d6..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
49 */ 49 */
50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) 50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
51{ 51{
52 struct file file = {
53 .private_data = key,
54 };
55 struct page *page; 52 struct page *page;
56 size_t size; 53 size_t size;
57 char *buf; 54 char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
61 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 58 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 59
63 /* read the contents of the symlink into the pagecache */ 60 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 61 page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
62 afs_page_filler, key);
65 if (IS_ERR(page)) { 63 if (IS_ERR(page)) {
66 ret = PTR_ERR(page); 64 ret = PTR_ERR(page);
67 goto out; 65 goto out;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..9bd4b3876c99 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
205 * that it already _is_ on the dirty list. 205 * that it already _is_ on the dirty list.
206 */ 206 */
207 inode->i_state = I_DIRTY; 207 inode->i_state = I_DIRTY;
208 inode->i_mode = S_IRUSR | S_IWUSR; 208 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
209 inode->i_uid = current_fsuid(); 209 inode->i_uid = current_fsuid();
210 inode->i_gid = current_fsgid(); 210 inode->i_gid = current_fsgid();
211 inode->i_flags |= S_PRIVATE; 211 inode->i_flags |= S_PRIVATE;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/smp_lock.h>
21#include "autofs_i.h" 22#include "autofs_i.h"
22 23
23static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 24static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
24static int autofs4_dir_unlink(struct inode *,struct dentry *); 25static int autofs4_dir_unlink(struct inode *,struct dentry *);
25static int autofs4_dir_rmdir(struct inode *,struct dentry *); 26static int autofs4_dir_rmdir(struct inode *,struct dentry *);
26static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 27static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
27static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
28static int autofs4_dir_open(struct inode *inode, struct file *file); 29static int autofs4_dir_open(struct inode *inode, struct file *file);
29static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 30static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
30static void *autofs4_follow_link(struct dentry *, struct nameidata *); 31static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
38 .read = generic_read_dir, 39 .read = generic_read_dir,
39 .readdir = dcache_readdir, 40 .readdir = dcache_readdir,
40 .llseek = dcache_dir_lseek, 41 .llseek = dcache_dir_lseek,
41 .ioctl = autofs4_root_ioctl, 42 .unlocked_ioctl = autofs4_root_ioctl,
42}; 43};
43 44
44const struct file_operations autofs4_dir_operations = { 45const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
902 * ioctl()'s on the root directory is the chief method for the daemon to 903 * ioctl()'s on the root directory is the chief method for the daemon to
903 * generate kernel reactions 904 * generate kernel reactions
904 */ 905 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 906static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 907 unsigned int cmd, unsigned long arg)
907{ 908{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 909 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 910 void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 948 return -ENOSYS;
948 } 949 }
949} 950}
951
952static long autofs4_root_ioctl(struct file *filp,
953 unsigned int cmd, unsigned long arg)
954{
955 long ret;
956 struct inode *inode = filp->f_dentry->d_inode;
957
958 lock_kernel();
959 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
960 unlock_kernel();
961
962 return ret;
963}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..8f73841fc974 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
105 } 105 }
106 set_bit(ino, info->si_imap); 106 set_bit(ino, info->si_imap);
107 info->si_freei--; 107 info->si_freei--;
108 inode->i_uid = current_fsuid(); 108 inode_init_owner(inode, dir, mode);
109 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
110 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
111 inode->i_blocks = 0; 110 inode->i_blocks = 0;
112 inode->i_op = &bfs_file_inops; 111 inode->i_op = &bfs_file_inops;
113 inode->i_fop = &bfs_file_operations; 112 inode->i_fop = &bfs_file_operations;
114 inode->i_mapping->a_ops = &bfs_aops; 113 inode->i_mapping->a_ops = &bfs_aops;
115 inode->i_mode = mode;
116 inode->i_ino = ino; 114 inode->i_ino = ino;
117 BFS_I(inode)->i_dsk_ino = ino; 115 BFS_I(inode)->i_dsk_ino = ino;
118 BFS_I(inode)->i_sblock = 0; 116 BFS_I(inode)->i_sblock = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..26e5f5026620 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
245 sb = get_active_super(bdev); 245 sb = get_active_super(bdev);
246 if (!sb) 246 if (!sb)
247 goto out; 247 goto out;
248 if (sb->s_flags & MS_RDONLY) { 248 error = freeze_super(sb);
249 sb->s_frozen = SB_FREEZE_TRANS; 249 if (error) {
250 up_write(&sb->s_umount); 250 deactivate_super(sb);
251 bdev->bd_fsfreeze_count--;
251 mutex_unlock(&bdev->bd_fsfreeze_mutex); 252 mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 return sb; 253 return ERR_PTR(error);
253 }
254
255 sb->s_frozen = SB_FREEZE_WRITE;
256 smp_wmb();
257
258 sync_filesystem(sb);
259
260 sb->s_frozen = SB_FREEZE_TRANS;
261 smp_wmb();
262
263 sync_blockdev(sb->s_bdev);
264
265 if (sb->s_op->freeze_fs) {
266 error = sb->s_op->freeze_fs(sb);
267 if (error) {
268 printk(KERN_ERR
269 "VFS:Filesystem freeze failed\n");
270 sb->s_frozen = SB_UNFROZEN;
271 deactivate_locked_super(sb);
272 bdev->bd_fsfreeze_count--;
273 mutex_unlock(&bdev->bd_fsfreeze_mutex);
274 return ERR_PTR(error);
275 }
276 } 254 }
277 up_write(&sb->s_umount); 255 deactivate_super(sb);
278
279 out: 256 out:
280 sync_blockdev(bdev); 257 sync_blockdev(bdev);
281 mutex_unlock(&bdev->bd_fsfreeze_mutex); 258 mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
296 273
297 mutex_lock(&bdev->bd_fsfreeze_mutex); 274 mutex_lock(&bdev->bd_fsfreeze_mutex);
298 if (!bdev->bd_fsfreeze_count) 275 if (!bdev->bd_fsfreeze_count)
299 goto out_unlock; 276 goto out;
300 277
301 error = 0; 278 error = 0;
302 if (--bdev->bd_fsfreeze_count > 0) 279 if (--bdev->bd_fsfreeze_count > 0)
303 goto out_unlock; 280 goto out;
304 281
305 if (!sb) 282 if (!sb)
306 goto out_unlock; 283 goto out;
307
308 BUG_ON(sb->s_bdev != bdev);
309 down_write(&sb->s_umount);
310 if (sb->s_flags & MS_RDONLY)
311 goto out_unfrozen;
312
313 if (sb->s_op->unfreeze_fs) {
314 error = sb->s_op->unfreeze_fs(sb);
315 if (error) {
316 printk(KERN_ERR
317 "VFS:Filesystem thaw failed\n");
318 sb->s_frozen = SB_FREEZE_TRANS;
319 bdev->bd_fsfreeze_count++;
320 mutex_unlock(&bdev->bd_fsfreeze_mutex);
321 return error;
322 }
323 }
324
325out_unfrozen:
326 sb->s_frozen = SB_UNFROZEN;
327 smp_wmb();
328 wake_up(&sb->s_wait_unfrozen);
329 284
330 if (sb) 285 error = thaw_super(sb);
331 deactivate_locked_super(sb); 286 if (error) {
332out_unlock: 287 bdev->bd_fsfreeze_count++;
288 mutex_unlock(&bdev->bd_fsfreeze_mutex);
289 return error;
290 }
291out:
333 mutex_unlock(&bdev->bd_fsfreeze_mutex); 292 mutex_unlock(&bdev->bd_fsfreeze_mutex);
334 return 0; 293 return 0;
335} 294}
@@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 376 */
418 mutex_unlock(&bd_inode->i_mutex); 377 mutex_unlock(&bd_inode->i_mutex);
419 378
420 error = blkdev_issue_flush(bdev, NULL); 379 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 380 if (error == -EOPNOTSUPP)
422 error = 0; 381 error = 0;
423 382
@@ -668,41 +627,209 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 627 iput(bdev->bd_inode);
669} 628}
670 629
671int bd_claim(struct block_device *bdev, void *holder) 630/**
631 * bd_may_claim - test whether a block device can be claimed
632 * @bdev: block device of interest
633 * @whole: whole block device containing @bdev, may equal @bdev
634 * @holder: holder trying to claim @bdev
635 *
636 * Test whther @bdev can be claimed by @holder.
637 *
638 * CONTEXT:
639 * spin_lock(&bdev_lock).
640 *
641 * RETURNS:
642 * %true if @bdev can be claimed, %false otherwise.
643 */
644static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
645 void *holder)
672{ 646{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 647 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 648 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 649 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 650 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 651 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 652 return true; /* is a whole device which isn't held */
683 653
684 else if (bdev->bd_contains->bd_holder == bd_claim) 654 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 655 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 656 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 657 return false; /* is a partition of a held device */
688 else 658 else
689 res = 0; /* is a partition of an un-held device */ 659 return true; /* is a partition of an un-held device */
660}
661
662/**
663 * bd_prepare_to_claim - prepare to claim a block device
664 * @bdev: block device of interest
665 * @whole: the whole device containing @bdev, may equal @bdev
666 * @holder: holder trying to claim @bdev
667 *
668 * Prepare to claim @bdev. This function fails if @bdev is already
669 * claimed by another holder and waits if another claiming is in
670 * progress. This function doesn't actually claim. On successful
671 * return, the caller has ownership of bd_claiming and bd_holder[s].
672 *
673 * CONTEXT:
674 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
675 * it multiple times.
676 *
677 * RETURNS:
678 * 0 if @bdev can be claimed, -EBUSY otherwise.
679 */
680static int bd_prepare_to_claim(struct block_device *bdev,
681 struct block_device *whole, void *holder)
682{
683retry:
684 /* if someone else claimed, fail */
685 if (!bd_may_claim(bdev, whole, holder))
686 return -EBUSY;
687
688 /* if someone else is claiming, wait for it to finish */
689 if (whole->bd_claiming && whole->bd_claiming != holder) {
690 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
691 DEFINE_WAIT(wait);
692
693 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
694 spin_unlock(&bdev_lock);
695 schedule();
696 finish_wait(wq, &wait);
697 spin_lock(&bdev_lock);
698 goto retry;
699 }
700
701 /* yay, all mine */
702 return 0;
703}
690 704
691 /* now impose change */ 705/**
692 if (res==0) { 706 * bd_start_claiming - start claiming a block device
707 * @bdev: block device of interest
708 * @holder: holder trying to claim @bdev
709 *
710 * @bdev is about to be opened exclusively. Check @bdev can be opened
711 * exclusively and mark that an exclusive open is in progress. Each
712 * successful call to this function must be matched with a call to
713 * either bd_claim() or bd_abort_claiming(). If this function
714 * succeeds, the matching bd_claim() is guaranteed to succeed.
715 *
716 * CONTEXT:
717 * Might sleep.
718 *
719 * RETURNS:
720 * Pointer to the block device containing @bdev on success, ERR_PTR()
721 * value on failure.
722 */
723static struct block_device *bd_start_claiming(struct block_device *bdev,
724 void *holder)
725{
726 struct gendisk *disk;
727 struct block_device *whole;
728 int partno, err;
729
730 might_sleep();
731
732 /*
733 * @bdev might not have been initialized properly yet, look up
734 * and grab the outer block device the hard way.
735 */
736 disk = get_gendisk(bdev->bd_dev, &partno);
737 if (!disk)
738 return ERR_PTR(-ENXIO);
739
740 whole = bdget_disk(disk, 0);
741 put_disk(disk);
742 if (!whole)
743 return ERR_PTR(-ENOMEM);
744
745 /* prepare to claim, if successful, mark claiming in progress */
746 spin_lock(&bdev_lock);
747
748 err = bd_prepare_to_claim(bdev, whole, holder);
749 if (err == 0) {
750 whole->bd_claiming = holder;
751 spin_unlock(&bdev_lock);
752 return whole;
753 } else {
754 spin_unlock(&bdev_lock);
755 bdput(whole);
756 return ERR_PTR(err);
757 }
758}
759
760/* releases bdev_lock */
761static void __bd_abort_claiming(struct block_device *whole, void *holder)
762{
763 BUG_ON(whole->bd_claiming != holder);
764 whole->bd_claiming = NULL;
765 wake_up_bit(&whole->bd_claiming, 0);
766
767 spin_unlock(&bdev_lock);
768 bdput(whole);
769}
770
771/**
772 * bd_abort_claiming - abort claiming a block device
773 * @whole: whole block device returned by bd_start_claiming()
774 * @holder: holder trying to claim @bdev
775 *
776 * Abort a claiming block started by bd_start_claiming(). Note that
777 * @whole is not the block device to be claimed but the whole device
778 * returned by bd_start_claiming().
779 *
780 * CONTEXT:
781 * Grabs and releases bdev_lock.
782 */
783static void bd_abort_claiming(struct block_device *whole, void *holder)
784{
785 spin_lock(&bdev_lock);
786 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
787}
788
789/**
790 * bd_claim - claim a block device
791 * @bdev: block device to claim
792 * @holder: holder trying to claim @bdev
793 *
794 * Try to claim @bdev which must have been opened successfully. This
795 * function may be called with or without preceding
796 * blk_start_claiming(). In the former case, this function is always
797 * successful and terminates the claiming block.
798 *
799 * CONTEXT:
800 * Might sleep.
801 *
802 * RETURNS:
803 * 0 if successful, -EBUSY if @bdev is already claimed.
804 */
805int bd_claim(struct block_device *bdev, void *holder)
806{
807 struct block_device *whole = bdev->bd_contains;
808 int res;
809
810 might_sleep();
811
812 spin_lock(&bdev_lock);
813
814 res = bd_prepare_to_claim(bdev, whole, holder);
815 if (res == 0) {
693 /* note that for a whole device bd_holders 816 /* note that for a whole device bd_holders
694 * will be incremented twice, and bd_holder will 817 * will be incremented twice, and bd_holder will
695 * be set to bd_claim before being set to holder 818 * be set to bd_claim before being set to holder
696 */ 819 */
697 bdev->bd_contains->bd_holders ++; 820 whole->bd_holders++;
698 bdev->bd_contains->bd_holder = bd_claim; 821 whole->bd_holder = bd_claim;
699 bdev->bd_holders++; 822 bdev->bd_holders++;
700 bdev->bd_holder = holder; 823 bdev->bd_holder = holder;
701 } 824 }
702 spin_unlock(&bdev_lock); 825
826 if (whole->bd_claiming)
827 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
828 else
829 spin_unlock(&bdev_lock);
830
703 return res; 831 return res;
704} 832}
705
706EXPORT_SYMBOL(bd_claim); 833EXPORT_SYMBOL(bd_claim);
707 834
708void bd_release(struct block_device *bdev) 835void bd_release(struct block_device *bdev)
@@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1443
1317static int blkdev_open(struct inode * inode, struct file * filp) 1444static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1445{
1446 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1447 struct block_device *bdev;
1320 int res; 1448 int res;
1321 1449
@@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1466 if (bdev == NULL)
1339 return -ENOMEM; 1467 return -ENOMEM;
1340 1468
1469 if (filp->f_mode & FMODE_EXCL) {
1470 whole = bd_start_claiming(bdev, filp);
1471 if (IS_ERR(whole)) {
1472 bdput(bdev);
1473 return PTR_ERR(whole);
1474 }
1475 }
1476
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1477 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1478
1343 res = blkdev_get(bdev, filp->f_mode); 1479 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1480
1347 if (filp->f_mode & FMODE_EXCL) { 1481 if (whole) {
1348 res = bd_claim(bdev, filp); 1482 if (res == 0)
1349 if (res) 1483 BUG_ON(bd_claim(bdev, filp) != 0);
1350 goto out_blkdev_put; 1484 else
1485 bd_abort_claiming(whole, filp);
1351 } 1486 }
1352 1487
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1488 return res;
1358} 1489}
1359 1490
@@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1695 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1696struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1697{
1567 struct block_device *bdev; 1698 struct block_device *bdev, *whole;
1568 int error = 0; 1699 int error;
1569 1700
1570 bdev = lookup_bdev(path); 1701 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1702 if (IS_ERR(bdev))
1572 return bdev; 1703 return bdev;
1573 1704
1705 whole = bd_start_claiming(bdev, holder);
1706 if (IS_ERR(whole)) {
1707 bdput(bdev);
1708 return whole;
1709 }
1710
1574 error = blkdev_get(bdev, mode); 1711 error = blkdev_get(bdev, mode);
1575 if (error) 1712 if (error)
1576 return ERR_PTR(error); 1713 goto out_abort_claiming;
1714
1577 error = -EACCES; 1715 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1716 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1717 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1718
1719 BUG_ON(bd_claim(bdev, holder) != 0);
1584 return bdev; 1720 return bdev;
1585 1721
1586blkdev_put: 1722out_blkdev_put:
1587 blkdev_put(bdev, mode); 1723 blkdev_put(bdev, mode);
1724out_abort_claiming:
1725 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1726 return ERR_PTR(error);
1589} 1727}
1590 1728
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724ec..8d432cd9d580 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode)
282 return ret; 282 return ret;
283} 283}
284 284
285struct xattr_handler btrfs_xattr_acl_default_handler = { 285const struct xattr_handler btrfs_xattr_acl_default_handler = {
286 .prefix = POSIX_ACL_XATTR_DEFAULT, 286 .prefix = POSIX_ACL_XATTR_DEFAULT,
287 .flags = ACL_TYPE_DEFAULT, 287 .flags = ACL_TYPE_DEFAULT,
288 .get = btrfs_xattr_acl_get, 288 .get = btrfs_xattr_acl_get,
289 .set = btrfs_xattr_acl_set, 289 .set = btrfs_xattr_acl_set,
290}; 290};
291 291
292struct xattr_handler btrfs_xattr_acl_access_handler = { 292const struct xattr_handler btrfs_xattr_acl_access_handler = {
293 .prefix = POSIX_ACL_XATTR_ACCESS, 293 .prefix = POSIX_ACL_XATTR_ACCESS,
294 .flags = ACL_TYPE_ACCESS, 294 .flags = ACL_TYPE_ACCESS,
295 .get = btrfs_xattr_acl_get, 295 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1589 u64 start, u64 len) 1589 u64 start, u64 len)
1590{ 1590{
1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1592 DISCARD_FL_BARRIER); 1592 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1593} 1593}
1594 1594
1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..d601629b85d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4121 if (ret != 0) 4121 if (ret != 0)
4122 goto fail; 4122 goto fail;
4123 4123
4124 inode->i_uid = current_fsuid(); 4124 inode_init_owner(inode, dir, mode);
4125
4126 if (dir && (dir->i_mode & S_ISGID)) {
4127 inode->i_gid = dir->i_gid;
4128 if (S_ISDIR(mode))
4129 mode |= S_ISGID;
4130 } else
4131 inode->i_gid = current_fsgid();
4132
4133 inode->i_mode = mode;
4134 inode->i_ino = objectid; 4125 inode->i_ino = objectid;
4135 inode_set_bytes(inode, 0); 4126 inode_set_bytes(inode, 0);
4136 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4127 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..59acd3eb288a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -282,7 +282,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 282 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 283 * attributes are handled directly.
284 */ 284 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 285const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 286#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 287 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 288 &btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..e8aa7081d25c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 275 int rc = 0;
276 struct page **pages; 276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 277 loff_t offset;
279 u64 len; 278 u64 len;
280 279
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 296 if (rc < 0)
298 goto out; 297 goto out;
299 298
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 301 struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 310 }
314 311
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page); 313 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 315 inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 320 flush_dcache_page(page);
324 SetPageUptodate(page); 321 SetPageUptodate(page);
325 unlock_page(page); 322 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 324 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 325 rc = 0;
331 326
332out: 327out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 567 else
573 kfree(req->r_pages); 568 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..9f46de2ba7a7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 150
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 151 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 152 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 153 pr_err("error %d building auth method %s request\n", ret,
154 ac->ops->name);
154 return ret; 155 return ret;
155 } 156 }
156 dout(" built request %d bytes\n", ret); 157 dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
216 if (ac->protocol != protocol) { 217 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol); 218 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) { 219 if (ret) {
219 pr_err("error %d on auth protocol %d init\n", 220 pr_err("error %d on auth method %s init\n",
220 ret, protocol); 221 ret, ac->ops->name);
221 goto out; 222 goto out;
222 } 223 }
223 } 224 }
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 230 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 231 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 232 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 233 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 234 return ret;
234 } 235 }
235 return 0; 236 return 0;
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..4429a707c021 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..24407c119291 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 94}
95 95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .name = "none",
97 .reset = reset, 98 .reset = reset,
98 .destroy = destroy, 99 .destroy = destroy,
99 .is_authenticated = is_authenticated, 100 .is_authenticated = is_authenticated,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..7b206231566d 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 127 int ret;
128 char *dbuf; 128 char *dbuf;
129 char *ticket_buf; 129 char *ticket_buf;
130 u8 struct_v; 130 u8 reply_struct_v;
131 131
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 133 if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 139 goto out_dbuf;
140 140
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 142 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 143 if (reply_struct_v != 1)
144 goto bad; 144 goto bad;
145 num = ceph_decode_32(&p); 145 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 146 dout("%d tickets\n", num);
147 while (num--) { 147 while (num--) {
148 int type; 148 int type;
149 u8 struct_v; 149 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 150 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 151 void *dp, *dend;
152 int dlen; 152 int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 165 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 167
168 struct_v = ceph_decode_8(&p); 168 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 169 if (tkt_struct_v != 1)
170 goto bad; 170 goto bad;
171 171
172 th = get_ticket_handler(ac, type); 172 th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 186 dend = dbuf + dlen;
187 dp = dbuf; 187 dp = dbuf;
188 188
189 struct_v = ceph_decode_8(&dp); 189 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 190 if (tkt_struct_v != 1)
191 goto bad; 191 goto bad;
192 192
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 193 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 224 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 225 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 227 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 228 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 230 if (ret)
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
618 618
619 619
620static const struct ceph_auth_client_ops ceph_x_ops = { 620static const struct ceph_auth_client_ops ceph_x_ops = {
621 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 622 .is_authenticated = ceph_x_is_authenticated,
622 .build_request = ceph_x_build_request, 623 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 624 .handle_reply = ceph_x_handle_reply,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..0dd0b81e64f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 867{
868 struct ceph_mds_session *session = cap->session; 868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 870 struct ceph_mds_client *mdsc =
871 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 872 int removed = 0;
872 873
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 874 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 938 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 939 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 940
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 941 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 942 if (!msg)
942 return PTR_ERR(msg); 943 return -ENOMEM;
943 944
944 msg->hdr.tid = cpu_to_le64(flush_tid); 945 msg->hdr.tid = cpu_to_le64(flush_tid);
945 946
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1299 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1300void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1301{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1302 struct ceph_mds_client *mdsc =
1303 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1304 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1305 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1306 int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1338static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1339 struct ceph_mds_session *session)
1338{ 1340{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1341 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1342 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1343 int flushing;
1342 1344
@@ -1663,7 +1665,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1665static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1666 unsigned *flush_tid)
1665{ 1667{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1668 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1669 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1670 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1671 int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1718static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1719{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1720 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1721 int i, ret = 1;
1720 1722
1721 spin_lock(&inode->i_lock); 1723 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1724 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1725 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1726 ci->i_cap_flush_tid[i] <= tid) {
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1830 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1831 caps_are_flushed(inode, flush_tid));
1831 } else { 1832 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1833 struct ceph_mds_client *mdsc =
1834 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1835
1834 spin_lock(&inode->i_lock); 1836 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1837 if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2413 __releases(inode->i_lock)
2412{ 2414{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2415 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2416 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2417 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2418 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2419 int cleaned = 0;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..3b9eeed097b3 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
19 * Ceph release version 19 * Ceph release version
20 */ 20 */
21#define CEPH_VERSION_MAJOR 0 21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19 22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0 23#define CEPH_VERSION_PATCH 0
24 24
25#define _CEPH_STRINGIFY(x) #x 25#define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
36 * client-facing protocol. 36 * client-facing protocol.
37 */ 37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 39#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +53,18 @@
53/* 53/*
54 * feature bits 54 * feature bits
55 */ 55 */
56#define CEPH_FEATURE_SUPPORTED 0 56#define CEPH_FEATURE_UID 1
57#define CEPH_FEATURE_REQUIRED 0 57#define CEPH_FEATURE_NOSRCADDR 2
58#define CEPH_FEATURE_FLOCK 4
59
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
58 68
59 69
60/* 70/*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
91#define CEPH_AUTH_NONE 0x1 101#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 102#define CEPH_AUTH_CEPHX 0x2
93 103
104#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
105
94 106
95/********************************************* 107/*********************************************
96 * message layer 108 * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 140#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 141#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 142
143/* pool ops */
144#define CEPH_MSG_POOLOP_REPLY 48
145#define CEPH_MSG_POOLOP 49
146
147
131/* osd */ 148/* osd */
132#define CEPH_MSG_OSD_MAP 41 149#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 150#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 151#define CEPH_MSG_OSD_OPREPLY 43
135 152
153/* pool operations */
154enum {
155 POOL_OP_CREATE = 0x01,
156 POOL_OP_DELETE = 0x02,
157 POOL_OP_AUID_CHANGE = 0x03,
158 POOL_OP_CREATE_SNAP = 0x11,
159 POOL_OP_DELETE_SNAP = 0x12,
160 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
161 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
162};
163
136struct ceph_mon_request_header { 164struct ceph_mon_request_header {
137 __le64 have_version; 165 __le64 have_version;
138 __le16 session_mon; 166 __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 183 struct ceph_statfs st;
156} __attribute__ ((packed)); 184} __attribute__ ((packed));
157 185
186const char *ceph_pool_op_name(int op);
187
188struct ceph_mon_poolop {
189 struct ceph_mon_request_header monhdr;
190 struct ceph_fsid fsid;
191 __le32 pool;
192 __le32 op;
193 __le64 auid;
194 __le64 snapid;
195 __le32 name_len;
196} __attribute__ ((packed));
197
198struct ceph_mon_poolop_reply {
199 struct ceph_mon_request_header monhdr;
200 struct ceph_fsid fsid;
201 __le32 reply_code;
202 __le32 epoch;
203 char has_data;
204 char data[0];
205} __attribute__ ((packed));
206
207struct ceph_mon_unmanaged_snap {
208 __le64 snapid;
209} __attribute__ ((packed));
210
158struct ceph_osd_getmap { 211struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 212 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 213 struct ceph_fsid fsid;
@@ -308,6 +361,7 @@ union ceph_mds_request_args {
308 struct { 361 struct {
309 __le32 frag; /* which dir fragment */ 362 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 363 __le32 max_entries; /* how many dentries to grab */
364 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 365 } __attribute__ ((packed)) readdir;
312 struct { 366 struct {
313 __le32 mode; 367 __le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 44 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 45 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 46 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
47 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 48
49 case CEPH_OSD_OP_PULL: return "pull"; 49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 50 case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
174 } 174 }
175 return "???"; 175 return "???";
176} 176}
177
178const char *ceph_pool_op_name(int op)
179{
180 switch (op) {
181 case POOL_OP_CREATE: return "create";
182 case POOL_OP_DELETE: return "delete";
183 case POOL_OP_AUID_CHANGE: return "auid change";
184 case POOL_OP_CREATE_SNAP: return "create snap";
185 case POOL_OP_DELETE_SNAP: return "delete snap";
186 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
187 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
188 }
189 return "???";
190}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..4fd30900eff7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 128 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 129 di = ceph_dentry(dentry);
127 while (1) { 130 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 131 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
132 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 133 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 134 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 135 fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 233 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 234 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 235 const int max_entries = client->mount_args->max_readdir;
236 const int max_bytes = client->mount_args->max_readdir_bytes;
232 237
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 239 if (fi->at_end)
@@ -312,6 +317,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 317 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 318 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 319 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
320 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 321 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 322 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 323 if (err < 0) {
@@ -335,7 +341,7 @@ more:
335 if (req->r_reply_info.dir_end) { 341 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 342 kfree(fi->last_name);
337 fi->last_name = NULL; 343 fi->last_name = NULL;
338 fi->next_offset = 0; 344 fi->next_offset = 2;
339 } else { 345 } else {
340 rinfo = &req->r_reply_info; 346 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 347 err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 484struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 485 struct dentry *dentry, int err)
480{ 486{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 487 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 488 struct inode *parent = dentry->d_parent->d_inode;
483 489
484 /* .snap dir? */ 490 /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 574 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 575 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 576 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 577 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 578 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 579 d_add(dentry, NULL);
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 893
889 /* ensure target dentry is invalidated, despite 894 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 895 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 896 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 897 }
894 ceph_mdsc_put_request(req); 898 ceph_mdsc_put_request(req);
895 return err; 899 return err;
896} 900}
897 901
902/*
903 * Ensure a dentry lease will no longer revalidate.
904 */
905void ceph_invalidate_dentry_lease(struct dentry *dentry)
906{
907 spin_lock(&dentry->d_lock);
908 dentry->d_time = jiffies;
909 ceph_dentry(dentry)->lease_shared_gen = 0;
910 spin_unlock(&dentry->d_lock);
911}
898 912
899/* 913/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 914 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 986{
973 struct inode *dir = dentry->d_parent->d_inode; 987 struct inode *dir = dentry->d_parent->d_inode;
974 988
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 989 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 990 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
991 ceph_dentry(dentry)->offset);
977 992
978 /* always trust cached snapped dentries, snapdir dentry */ 993 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 994 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1065 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1066 int left;
1052 1067
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1068 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1069 return -EISDIR;
1055 1070
1056 if (!cf->dir_info) { 1071 if (!cf->dir_info) {
@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1167 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1168 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1169 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1170 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1171 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1172 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1173 mdsc->num_dentry++;
@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1180 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1181 struct ceph_mds_client *mdsc;
1167 1182
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1183 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1184 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1185 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1186 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1187 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1188 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1189 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1198 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1199 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1200 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1201 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1202 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1203 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1204 mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..17447644d675 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
94 94
95 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
96 if (!dentry) { 96 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 98 fh->ino, inode);
99 iput(inode); 99 iput(inode);
100 return ERR_PTR(-ENOMEM); 100 return dentry;
101 } 101 }
102 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
103 103
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
117{ 117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 119 struct inode *inode;
120 struct dentry *dentry; 120 struct dentry *dentry;
121 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 149 }
150 150
151 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
152 if (!dentry) { 152 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 154 cfh->ino, inode);
155 iput(inode); 155 iput(inode);
156 return ERR_PTR(-ENOMEM); 156 return dentry;
157 } 157 }
158 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
159 if (err < 0) { 159 if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
203 203
204 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
205 if (!dentry) { 205 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 207 cfh->ino, inode);
208 iput(inode); 208 iput(inode);
209 return ERR_PTR(-ENOMEM); 209 return dentry;
210 } 210 }
211 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
212 if (err < 0) { 212 if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..6512b6701b9e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -668,7 +668,7 @@ more:
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
809 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 814 int got = 0;
815 int ret, err; 815 int ret, err;
@@ -844,8 +844,7 @@ retry_snap:
844 if ((ret >= 0 || ret == -EIOCBQUEUED) && 844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry, 847 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
848 pos, pos + ret - 1, 1);
849 if (err < 0) 848 if (err < 0)
850 ret = err; 849 ret = err;
851 } 850 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..a81b8b662c7b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
622 } 623 }
623 624
624 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 628
628 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 630 case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 675 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
681 } 683 }
682 684
683 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
686 break; 688 break;
687 default: 689 default:
@@ -802,6 +804,37 @@ out_unlock:
802} 804}
803 805
804/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
805 * splice a dentry to an inode. 838 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
807 * 840 *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
814{ 847{
815 struct dentry *realdn; 848 struct dentry *realdn;
816 849
850 BUG_ON(dn->d_inode);
851
817 /* dn must be unhashed */ 852 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
819 d_drop(dn); 854 d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 870 dn = realdn;
836 } else { 871 } else {
837 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 875 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
844out: 879out:
845 return dn; 880 return dn;
846} 881}
847 882
848/* 883/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 886 * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 941
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 946 return 0;
945 } 947 }
946 948
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
1014 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1020
1018 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
1021 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1029 in = dn->d_inode;
1023 } 1030 }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1059 goto done; 1066 goto done;
1060 } 1067 }
1061 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1069 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1102 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1103 goto done; 1109 goto done;
1104 } 1110 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1112 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1434{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1436
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1440 igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1527 int issued;
1523 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1524 int mask = 0; 1529 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..885aa5710cfd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
40static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 41 struct list_head *head);
42 42
43const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
44 44
45 45
46/* 46/*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
667 667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 669 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
672 } 672 }
673 h = msg->front.iov_base; 673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 687 struct ceph_msg *msg;
688 int mstate; 688 int mstate;
689 int mds = session->s_mds; 689 int mds = session->s_mds;
690 int err = 0;
691 690
692 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 697
699 /* send connect message */ 698 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 700 if (!msg)
702 err = PTR_ERR(msg); 701 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 703 return 0;
709} 704}
710 705
@@ -804,12 +799,49 @@ out:
804} 799}
805 800
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 802 void *arg)
808{ 803{
809 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
810 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
813 return 0; 845 return 0;
814} 846}
815 847
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
825} 858}
826 859
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 918 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 919 if (!msg)
887 return PTR_ERR(msg); 920 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
889 return 0; 922 return 0;
890} 923}
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
932{ 965{
933 struct ceph_msg *msg; 966 struct ceph_msg *msg;
934 int err = 0;
935 967
936 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 970 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 972 if (!msg)
941 err = PTR_ERR(msg); 973 return -ENOMEM;
942 else 974 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 975 return 0;
944 return err;
945} 976}
946 977
947/* 978/*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1093 GFP_NOFS);
1063 if (!msg) 1094 if (!msg)
1064 goto out_unlocked; 1095 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1151 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1152 1183
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1185 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1165 } 1195 }
1166 spin_unlock(&session->s_cap_lock); 1196 spin_unlock(&session->s_cap_lock);
1167} 1197}
1168 1198
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1232 }
1233
1234 spin_unlock(&session->s_cap_lock);
1235}
1236
1169/* 1237/*
1170 * requests 1238 * requests
1171 */ 1239 */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1249 if (!req)
1182 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1183 1251
1252 mutex_init(&req->r_fill_mutex);
1184 req->r_started = jiffies; 1253 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1321 temp = temp->d_parent;
1253 if (temp == NULL) { 1322 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1256 } 1325 }
1257 } 1326 }
@@ -1267,7 +1336,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1268 1337
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1340 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
1278 break; 1347 break;
1279 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1349 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1350 }
1284 if (pos) 1351 if (pos)
1285 path[--pos] = '/'; 1352 path[--pos] = '/';
1286 temp = temp->d_parent; 1353 temp = temp->d_parent;
1287 if (temp == NULL) { 1354 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1356 kfree(path);
1290 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1291 } 1358 }
1292 } 1359 }
1293 if (pos != 0) { 1360 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
1303 1370
1304 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1372 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1375 return path;
1309} 1376}
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1428 1495
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1499 goto out_free2;
1500 }
1432 1501
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1503
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1517 } 1586 }
1518 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1523 } 1592 }
1524 req->r_request = msg; 1593 req->r_request = msg;
1525 1594
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1621 int mds = -1;
1553 int err = -EAGAIN; 1622 int err = -EAGAIN;
1554 1623
1555 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1556 goto out; 1625 goto out;
1557 1626
1558 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
1609 return err; 1678 return err;
1610 1679
1611finish: 1680finish:
1612 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1613 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1614 goto out; 1683 goto out;
1615} 1684}
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1699
1631/* 1700/*
1632 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1703 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1705{
1638 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1639 struct rb_node *p; 1707 struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1691 1759
1692 /* wait */ 1760 if (req->r_err) {
1693 if (!req->r_reply) { 1761 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1765 }
1710 1766
1711 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1712 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_interruptible_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_interruptible(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1714 1780
1715 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1782 if (req->r_got_result) {
1717 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1786
1719 if (req->r_locked_dir && 1787 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1723 1796
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1800 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1742 } 1802 }
1743 mutex_unlock(&mdsc->mutex);
1744 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1807 return err;
1747} 1808}
1748 1809
1749/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
1831/*
1750 * Handle mds reply. 1832 * Handle mds reply.
1751 * 1833 *
1752 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1798 goto out; 1880 goto out;
1799 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1800 1888
1801 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1802 1890
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1838 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1839 goto out; 1927 goto out;
1840 } 1928 }
1841 } 1929 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 1932 }
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 1955 }
1872 1956
1873 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 1960 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1879 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1880 1966
1881 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1882out_err: 1968out_err:
1883 if (err) { 1969 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1885 } else { 1978 } else {
1886 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1889 1982
1890 add_cap_releases(mdsc, req->r_session, -1); 1983 add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
@@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2077
1985 switch (op) { 2078 switch (op) {
1986 case CEPH_SESSION_OPEN: 2079 case CEPH_SESSION_OPEN:
2080 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2081 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2082 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2083 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2084 wake = 1;
@@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2092 break;
1998 2093
1999 case CEPH_SESSION_CLOSE: 2094 case CEPH_SESSION_CLOSE:
2095 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2096 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2097 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2098 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2099 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2100 kick_requests(mdsc, mds);
2004 break; 2101 break;
2005 2102
2006 case CEPH_SESSION_STALE: 2103 case CEPH_SESSION_STALE:
@@ -2132,54 +2229,44 @@ out:
2132 * 2229 *
2133 * called with mdsc->mutex held. 2230 * called with mdsc->mutex held.
2134 */ 2231 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2232static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2233 struct ceph_mds_session *session)
2136{ 2234{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2235 struct ceph_msg *reply;
2139 struct rb_node *p; 2236 struct rb_node *p;
2237 int mds = session->s_mds;
2140 int err = -ENOMEM; 2238 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2239 struct ceph_pagelist *pagelist;
2142 2240
2143 pr_info("reconnect to recovering mds%d\n", mds); 2241 pr_info("mds%d reconnect start\n", mds);
2144 2242
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2243 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2244 if (!pagelist)
2147 goto fail_nopagelist; 2245 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2246 ceph_pagelist_init(pagelist);
2149 2247
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2248 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2249 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2250 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159 2251
2160 if (session) { 2252 mutex_lock(&session->s_mutex);
2161 mutex_lock(&session->s_mutex); 2253 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2254 session->s_seq = 0;
2162 2255
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2256 ceph_con_open(&session->s_con,
2164 session->s_seq = 0; 2257 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2165 2258
2166 ceph_con_open(&session->s_con, 2259 /* replay unsafe requests */
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2260 replay_unsafe_requests(mdsc, session);
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2261
2176 down_read(&mdsc->snap_rwsem); 2262 down_read(&mdsc->snap_rwsem);
2177 2263
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2264 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2265 session_state_name(session->s_state));
2182 2266
2267 /* drop old cap expires; we're about to reestablish that state */
2268 discard_cap_releases(mdsc, session);
2269
2183 /* traverse this session's caps */ 2270 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2271 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2272 if (err)
@@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2295 goto fail;
2209 } 2296 }
2210 2297
2211send:
2212 reply->pagelist = pagelist; 2298 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2299 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2300 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2301 ceph_con_send(&session->s_con, reply);
2216 2302
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2303 mutex_unlock(&session->s_mutex);
2219 2304
2220 mutex_lock(&mdsc->mutex); 2305 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2306 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2307 mutex_unlock(&mdsc->mutex);
2223 2308
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2309 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2310 return;
2229 2311
2230fail: 2312fail:
2231 ceph_msg_put(reply); 2313 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2314 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2315 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2316fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2317 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2318 kfree(pagelist);
2238fail_nopagelist: 2319fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2320 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2321 return;
2242} 2322}
2243 2323
@@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2370 }
2291 2371
2292 /* kick any requests waiting on the recovering mds */ 2372 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2373 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2374 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2375 continue; /* nothing new with this mds */
2296 } 2376 }
@@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2379 * send reconnect?
2300 */ 2380 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2381 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2382 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2383 mutex_unlock(&mdsc->mutex);
2384 send_mds_reconnect(mdsc, s);
2385 mutex_lock(&mdsc->mutex);
2386 }
2304 2387
2305 /* 2388 /*
2306 * kick requests on any mds that has gone active. 2389 * kick request on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2390 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2391 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2392 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2393 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2394 oldstate != CEPH_MDS_STATE_STARTING)
2395 pr_info("mds%d recovery completed\n", s->s_mds);
2396 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2397 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2398 wake_up_session_caps(s, 1);
2320 } 2399 }
@@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2536 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2537 len += dnamelen;
2459 2538
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2539 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2540 if (!msg)
2462 return; 2541 return;
2463 lease = msg->front.iov_base; 2542 lease = msg->front.iov_base;
2464 lease->action = action; 2543 lease->action = action;
@@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work)
2603 else 2682 else
2604 ceph_con_keepalive(&s->s_con); 2683 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2684 add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s); 2685 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2686 s->s_state == CEPH_MDS_SESSION_HUNG)
2687 send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2688 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2689 ceph_put_mds_session(s);
2609 2690
@@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2701 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2702 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2703 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2704 if (mdsc->mdsmap == NULL)
2705 return -ENOMEM;
2706
2623 init_completion(&mdsc->safe_umount_waiters); 2707 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2708 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2709 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2729 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2730 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2731 INIT_LIST_HEAD(&mdsc->dentry_lru);
2732
2648 return 0; 2733 return 0;
2649} 2734}
2650 2735
@@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 2825{
2741 u64 want_tid, want_flush; 2826 u64 want_tid, want_flush;
2742 2827
2828 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2829 return;
2830
2743 dout("sync\n"); 2831 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 2832 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 2833 want_tid = mdsc->last_tid;
@@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3010static void peer_reset(struct ceph_connection *con)
2923{ 3011{
2924 struct ceph_mds_session *s = con->private; 3012 struct ceph_mds_session *s = con->private;
3013 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3014
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3015 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3016 send_mds_reconnect(mdsc, s);
2928} 3017}
2929 3018
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3019static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3120 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3121}
3033 3122
3034const static struct ceph_connection_operations mds_con_ops = { 3123static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3124 .get = con_get,
3036 .put = con_put, 3125 .put = con_put,
3037 .dispatch = dispatch, 3126 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..60b74839ebec 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 328 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 329 con->out_msg = NULL;
342 } 330 }
331 con->out_keepalive_pending = false;
343 con->in_seq = 0; 332 con->in_seq = 0;
344 con->in_seq_acked = 0; 333 con->in_seq_acked = 0;
345} 334}
@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 346 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 347 mutex_lock(&con->mutex);
359 reset_connection(con); 348 reset_connection(con);
349 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 350 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 351 mutex_unlock(&con->mutex);
362 queue_con(con); 352 queue_con(con);
@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 651 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 652 con->connect_seq, global_seq, proto);
663 653
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 654 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 655 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 656 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 657 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
1124 1114
1125static int process_connect(struct ceph_connection *con) 1115static int process_connect(struct ceph_connection *con)
1126{ 1116{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1117 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED; 1118 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features); 1119 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130 1120
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1121 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1223 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1224 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1225 con->connect_seq++;
1226 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1227 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1228 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1229 le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1393 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1394 if (skip) {
1404 /* skip this message */ 1395 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1396 dout("alloc_msg said skip message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len - 1397 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1398 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1399 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1400 con->in_seq++;
1410 return 0; 1401 return 0;
1411 } 1402 }
1412 if (IS_ERR(con->in_msg)) { 1403 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1404 con->error_msg =
1416 "error allocating memory for incoming message"; 1405 "error allocating memory for incoming message";
1417 return ret; 1406 return -ENOMEM;
1418 } 1407 }
1419 m = con->in_msg; 1408 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1409 m->front.iov_len = 0; /* haven't read it yet */
@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
1514 1503
1515 /* if first message, set peer_name */ 1504 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1505 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1506 con->peer_name = msg->hdr.src;
1518 1507
1519 con->in_seq++; 1508 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1509 mutex_unlock(&con->mutex);
1521 1510
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1511 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1512 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1513 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1514 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1515 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1516 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1535 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1536 atomic_read(&con->nref));
1548 1537
1549 mutex_lock(&con->mutex);
1550more: 1538more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1539 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1540
@@ -1639,7 +1627,6 @@ do_next:
1639done: 1627done:
1640 ret = 0; 1628 ret = 0;
1641out: 1629out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1630 dout("try_write done on %p\n", con);
1644 return ret; 1631 return ret;
1645} 1632}
@@ -1651,7 +1638,6 @@ out:
1651 */ 1638 */
1652static int try_read(struct ceph_connection *con) 1639static int try_read(struct ceph_connection *con)
1653{ 1640{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1641 int ret = -1;
1656 1642
1657 if (!con->sock) 1643 if (!con->sock)
@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1647 return 0;
1662 1648
1663 dout("try_read start on %p\n", con); 1649 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1650
1668more: 1651more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1652 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1741,6 @@ more:
1758done: 1741done:
1759 ret = 0; 1742 ret = 0;
1760out: 1743out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1744 dout("try_read done on %p\n", con);
1763 return ret; 1745 return ret;
1764 1746
@@ -1830,6 +1812,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1812 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1813 clear_bit(QUEUED, &con->state);
1832 1814
1815 mutex_lock(&con->mutex);
1816
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1817 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1818 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1819 con_close_socket(con);
@@ -1844,11 +1828,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1828 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1829 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1830 try_write(con) < 0) {
1831 mutex_unlock(&con->mutex);
1847 backoff = 1; 1832 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1833 ceph_fault(con); /* error/fault path */
1834 goto done_unlocked;
1849 } 1835 }
1850 1836
1851done: 1837done:
1838 mutex_unlock(&con->mutex);
1839
1840done_unlocked:
1852 clear_bit(BUSY, &con->state); 1841 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1842 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1843 if (test_bit(QUEUED, &con->state)) {
@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1936
1948 /* the zero page is needed if a request is "canceled" while the message 1937 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1938 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1939 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1940 if (!msgr->zero_page) {
1952 kfree(msgr); 1941 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1942 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1976 }
1988 1977
1989 /* set src+dst */ 1978 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1979 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1980
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1981 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1982
@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2070 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2071 * the new msg has a ref count of 1.
2085 */ 2072 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2073struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2074{
2089 struct ceph_msg *m; 2075 struct ceph_msg *m;
2090 2076
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2077 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2078 if (m == NULL)
2093 goto out; 2079 goto out;
2094 kref_init(&m->kref); 2080 kref_init(&m->kref);
@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2086 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2087 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2088 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2089 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2090 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2091 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2092 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2093 m->footer.middle_crc = 0;
@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2101 /* front */
2116 if (front_len) { 2102 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2103 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2104 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2105 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2106 m->front_is_vmalloc = true;
2121 } else { 2107 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2108 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2109 }
2124 if (m->front.iov_base == NULL) { 2110 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2111 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2121 m->middle = NULL;
2136 2122
2137 /* data */ 2123 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2124 m->nr_pages = 0;
2139 m->pages = pages; 2125 m->pages = NULL;
2140 m->pagelist = NULL; 2126 m->pagelist = NULL;
2141 2127
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2128 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2129 return m;
2145 2130
2146out2: 2131out2:
2147 ceph_msg_put(m); 2132 ceph_msg_put(m);
2148out: 2133out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2134 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2135 return NULL;
2151} 2136}
2152 2137
2153/* 2138/*
@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2175 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2176 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2177 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2178 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2179 return NULL;
2198 } 2180 }
2199 if (!msg) { 2181 if (!msg) {
2200 *skip = 0; 2182 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2183 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2184 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2185 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2186 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2187 return NULL;
2206 } 2188 }
2207 } 2189 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2190 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2191
2210 if (middle_len) { 2192 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2193 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2194 if (ret < 0) {
2214 ceph_msg_put(msg); 2195 ceph_msg_put(msg);
2215 return msg; 2196 return NULL;
2216 } 2197 }
2217 } 2198 }
2218 2199
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..00a9430b1ffc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 233extern void ceph_con_put(struct ceph_connection *con);
236 234
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 235extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 236extern void ceph_msg_kfree(struct ceph_msg *m);
241 237
242 238
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..f6510a476e7e 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -353,14 +351,14 @@ out:
353/* 351/*
354 * statfs 352 * statfs
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403}
404
405static void put_generic_request(struct ceph_mon_generic_request *req)
406{
407 kref_put(&req->kref, release_generic_request);
408}
409
410static void get_generic_request(struct ceph_mon_generic_request *req)
411{
412 kref_get(&req->kref);
413}
414
415static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
416 struct ceph_msg_header *hdr,
417 int *skip)
418{
419 struct ceph_mon_client *monc = con->private;
420 struct ceph_mon_generic_request *req;
421 u64 tid = le64_to_cpu(hdr->tid);
422 struct ceph_msg *m;
423
424 mutex_lock(&monc->mutex);
425 req = __lookup_generic_req(monc, tid);
426 if (!req) {
427 dout("get_generic_reply %lld dne\n", tid);
428 *skip = 1;
429 m = NULL;
430 } else {
431 dout("get_generic_reply %lld got %p\n", tid, req->reply);
432 m = ceph_msg_get(req->reply);
433 /*
434 * we don't need to track the connection reading into
435 * this reply because we only have one open connection
436 * at a time, ever.
437 */
438 }
439 mutex_unlock(&monc->mutex);
440 return m;
394} 441}
395 442
396static void handle_statfs_reply(struct ceph_mon_client *monc, 443static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 444 struct ceph_msg *msg)
398{ 445{
399 struct ceph_mon_statfs_request *req; 446 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 447 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 448 u64 tid = le64_to_cpu(msg->hdr.tid);
402 449
403 if (msg->front.iov_len != sizeof(*reply)) 450 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 451 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 452 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 453
408 mutex_lock(&monc->mutex); 454 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 455 req = __lookup_generic_req(monc, tid);
410 if (req) { 456 if (req) {
411 *req->buf = reply->st; 457 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 458 req->result = 0;
459 get_generic_request(req);
413 } 460 }
414 mutex_unlock(&monc->mutex); 461 mutex_unlock(&monc->mutex);
415 if (req) 462 if (req) {
416 complete(&req->completion); 463 complete(&req->completion);
464 put_generic_request(req);
465 }
417 return; 466 return;
418 467
419bad: 468bad:
420 pr_err("corrupt statfs reply, no tid\n"); 469 pr_err("corrupt generic reply, no tid\n");
421 ceph_msg_dump(msg); 470 ceph_msg_dump(msg);
422} 471}
423 472
424/* 473/*
425 * (re)send a statfs request 474 * Do a synchronous statfs().
426 */ 475 */
427static int send_statfs(struct ceph_mon_client *monc, 476int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 477{
430 struct ceph_msg *msg; 478 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 479 struct ceph_mon_statfs *h;
480 int err;
432 481
433 dout("send_statfs tid %llu\n", req->tid); 482 req = kzalloc(sizeof(*req), GFP_NOFS);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 483 if (!req)
435 if (IS_ERR(msg)) 484 return -ENOMEM;
436 return PTR_ERR(msg); 485
437 req->request = msg; 486 kref_init(&req->kref);
438 msg->hdr.tid = cpu_to_le64(req->tid); 487 req->buf = buf;
439 h = msg->front.iov_base; 488 init_completion(&req->completion);
489
490 err = -ENOMEM;
491 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
492 if (!req->request)
493 goto out;
494 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
495 if (!req->reply)
496 goto out;
497
498 /* fill out request */
499 h = req->request->front.iov_base;
440 h->monhdr.have_version = 0; 500 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 501 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 502 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 503 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463 504
464 /* register request */ 505 /* register request */
465 mutex_lock(&monc->mutex); 506 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid; 507 req->tid = ++monc->last_tid;
467 req.last_attempt = jiffies; 508 req->request->hdr.tid = cpu_to_le64(req->tid);
468 req.delay = BASE_DELAY_INTERVAL; 509 __insert_generic_request(monc, req);
469 __insert_statfs(monc, &req); 510 monc->num_generic_requests++;
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex); 511 mutex_unlock(&monc->mutex);
472 512
473 /* send request and wait */ 513 /* send request and wait */
474 err = send_statfs(monc, &req); 514 ceph_con_send(monc->con, ceph_msg_get(req->request));
475 if (!err) 515 err = wait_for_completion_interruptible(&req->completion);
476 err = wait_for_completion_interruptible(&req.completion);
477 516
478 mutex_lock(&monc->mutex); 517 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 518 rb_erase(&req->node, &monc->generic_request_tree);
480 monc->num_statfs_requests--; 519 monc->num_generic_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex); 520 mutex_unlock(&monc->mutex);
483 521
484 if (!err) 522 if (!err)
485 err = req.result; 523 err = req->result;
524
525out:
526 kref_put(&req->kref, release_generic_request);
486 return err; 527 return err;
487} 528}
488 529
489/* 530/*
490 * Resend pending statfs requests. 531 * Resend pending statfs requests.
491 */ 532 */
492static void __resend_statfs(struct ceph_mon_client *monc) 533static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 534{
494 struct ceph_mon_statfs_request *req; 535 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 536 struct rb_node *p;
496 537
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 538 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 539 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 540 ceph_con_revoke(monc->con, req->request);
541 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 542 }
501} 543}
502 544
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 628 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 629 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 630
589 /* msg pools */ 631 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 632 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 633 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 634 sizeof(struct ceph_mon_subscribe_ack),
635 GFP_NOFS);
636 if (!monc->m_subscribe_ack)
593 goto out_monmap; 637 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 638
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 639 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 640 if (!monc->m_subscribe)
597 goto out_pool1; 641 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 642
599 if (err < 0) 643 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 644 if (!monc->m_auth_reply)
601 645 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 646
647 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 648 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 649 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 650 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 651
610 monc->cur_mon = -1; 652 monc->cur_mon = -1;
611 monc->hunting = true; 653 monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 655 monc->sub_sent = 0;
614 656
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 657 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 658 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 659 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 660 monc->last_tid = 0;
619 661
620 monc->have_mdsmap = 0; 662 monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 664 monc->want_next_osdmap = 1;
623 return 0; 665 return 0;
624 666
625out_pool3: 667out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 668 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 669out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 670 ceph_msg_put(monc->m_subscribe);
629out_pool1: 671out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 672 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 673out_monmap:
632 kfree(monc->monmap); 674 kfree(monc->monmap);
633out: 675out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 693 ceph_auth_destroy(monc->auth);
652 694
653 ceph_msg_put(monc->m_auth); 695 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 696 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 697 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 698 ceph_msg_put(monc->m_subscribe_ack);
657 699
658 kfree(monc->monmap); 700 kfree(monc->monmap);
659} 701}
@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 723 monc->client->msgr->inst.name.num = monc->auth->global_id;
682 724
683 __send_subscribe(monc); 725 __send_subscribe(monc);
684 __resend_statfs(monc); 726 __resend_generic_request(monc);
685 } 727 }
686 mutex_unlock(&monc->mutex); 728 mutex_unlock(&monc->mutex);
687} 729}
@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 812
771 switch (type) { 813 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 814 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 815 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 816 break;
775 case CEPH_MSG_STATFS_REPLY: 817 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 818 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 819 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 820 m = ceph_msg_get(monc->m_auth_reply);
780 break; 821 break;
781 case CEPH_MSG_MON_MAP: 822 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 823 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 824 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 825 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 826 break;
786 } 827 }
787 828
@@ -826,7 +867,7 @@ out:
826 mutex_unlock(&monc->mutex); 867 mutex_unlock(&monc->mutex);
827} 868}
828 869
829const static struct ceph_connection_operations mon_con_ops = { 870static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 871 .get = ceph_con_get,
831 .put = ceph_con_put, 872 .put = ceph_con_put,
832 .dispatch = dispatch, 873 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
44 * which are bening done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
13 * conditions at unexpected times. We take use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..afa7bb3895c4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 715 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 716 * it from an updated osd map.
717 */ 717 */
718 while (!list_empty(&osdc->req_lru)) { 718 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 720 r_req_lru_item);
721 721
@@ -1078,6 +1078,7 @@ done:
1078 if (newmap) 1078 if (newmap)
1079 kick_requests(osdc, NULL); 1079 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1080 up_read(&osdc->map_sem);
1081 wake_up(&osdc->client->auth_wq);
1081 return; 1082 return;
1082 1083
1083bad: 1084bad:
@@ -1087,45 +1088,6 @@ bad:
1087 return; 1088 return;
1088} 1089}
1089 1090
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1091/*
1130 * Register request, send initial attempt. 1092 * Register request, send initial attempt.
1131 */ 1093 */
@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1214 if (!osdc->req_mempool)
1253 goto out; 1215 goto out;
1254 1216
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1217 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1218 "osd_op");
1256 if (err < 0) 1219 if (err < 0)
1257 goto out_mempool; 1220 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1221 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1222 OSD_OPREPLY_FRONT_LEN, 10, true,
1223 "osd_op_reply");
1260 if (err < 0) 1224 if (err < 0)
1261 goto out_msgpool; 1225 goto out_msgpool;
1262 return 0; 1226 return 0;
@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1266 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1267 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1268 false, 1);
1305 if (IS_ERR(req)) 1269 if (!req)
1306 return PTR_ERR(req); 1270 return -ENOMEM;
1307 1271
1308 /* it may be a short read due to an object boundary */ 1272 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1273 req->r_pages = pages;
@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1309 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1310 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1311 nofail, 1);
1348 if (IS_ERR(req)) 1312 if (!req)
1349 return PTR_ERR(req); 1313 return -ENOMEM;
1350 1314
1351 /* it may be a short write due to an object boundary */ 1315 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1316 req->r_pages = pages;
@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1394} 1358}
1395 1359
1396/* 1360/*
1397 * lookup and return message for incoming reply 1361 * lookup and return message for incoming reply. set up reply message
1362 * pages.
1398 */ 1363 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1364static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1365 struct ceph_msg_header *hdr,
@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1372 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1373 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1374 u64 tid;
1410 int err;
1411 1375
1412 tid = le64_to_cpu(hdr->tid); 1376 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1377 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1389 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1390 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1391 ceph_con_put(req->r_con_filling_msg);
1392 req->r_con_filling_msg = NULL;
1428 } 1393 }
1429 1394
1430 if (front > req->r_reply->front.iov_len) { 1395 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1396 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1397 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1398 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1399 if (!m)
1435 goto out; 1400 goto out;
1436 ceph_msg_put(req->r_reply); 1401 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1402 req->r_reply = m;
@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1404 m = ceph_msg_get(req->r_reply);
1440 1405
1441 if (data_len > 0) { 1406 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1407 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1408 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1409
1410 if (unlikely(req->r_num_pages < want)) {
1411 pr_warning("tid %lld reply %d > expected %d pages\n",
1412 tid, want, m->nr_pages);
1444 *skip = 1; 1413 *skip = 1;
1445 ceph_msg_put(m); 1414 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1415 m = NULL;
1416 goto out;
1447 } 1417 }
1418 m->pages = req->r_pages;
1419 m->nr_pages = req->r_num_pages;
1448 } 1420 }
1449 *skip = 0; 1421 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1422 req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1438
1467 switch (type) { 1439 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1440 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1441 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1442 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1443 return get_reply(con, hdr, skip);
1472 default: 1444 default:
@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1524 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1525}
1554 1526
1555const static struct ceph_connection_operations osd_con_ops = { 1527static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1528 .get = get_osd_con,
1557 .put = put_osd_con, 1529 .put = put_osd_con,
1558 .dispatch = dispatch, 1530 .dispatch = dispatch,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 22{
23 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 24 if (!page)
25 return -ENOMEM; 25 return -ENOMEM;
26 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -208,6 +208,7 @@ enum {
208 /* read */ 208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 212
212 /* write */ 213 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
308/* 325/*
309 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
310 * payload 327 * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
321 struct { 338 struct {
322 __le32 name_len; 339 __le32 name_len;
323 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
325 struct { 344 struct {
326 __u8 class_len; 345 __u8 class_len;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..7c663d9b9f81 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/statfs.h> 14#include <linux/statfs.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 16
20#include "decode.h" 17#include "decode.h"
21#include "super.h" 18#include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
107static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
108{ 105{
109 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
113 return 0; 110 return 0;
114} 111}
115 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
116 141
117/** 142/**
118 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 197 if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
162} 216}
163 217
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 218static int __init init_caches(void)
194{ 219{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
308 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 339 Opt_congestion_kb,
313 Opt_last_int, 340 Opt_last_int,
314 /* int args above */ 341 /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 373 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
394 424
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 528 args->max_readdir = intval;
499 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
500 case Opt_congestion_kb: 533 case Opt_congestion_kb:
501 args->congestion_kb = intval; 534 args->congestion_kb = intval;
502 break; 535 break;
@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 715/*
683 * true if we have the mon map (and have thus joined the cluster) 716 * true if we have the mon map (and have thus joined the cluster)
684 */ 717 */
685static int have_mon_map(struct ceph_client *client) 718static int have_mon_and_osd_map(struct ceph_client *client)
686{ 719{
687 return client->monc.monmap && client->monc.monmap->epoch; 720 return client->monc.monmap && client->monc.monmap->epoch &&
721 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 722}
689 723
690/* 724/*
@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 796 if (err < 0)
763 goto out; 797 goto out;
764 798
765 while (!have_mon_map(client)) { 799 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 800 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 801 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 802 goto out;
@@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 804 /* wait */
771 dout("mount waiting for mon_map\n"); 805 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 806 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 807 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 808 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 809 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 810 goto out;
777 if (client->auth_err < 0) { 811 if (client->auth_err < 0) {
@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 918/*
885 * construct our own bdi so we can control readahead, etc. 919 * construct our own bdi so we can control readahead, etc.
886 */ 920 */
921static atomic_long_t bdi_seq = ATOMIC_INIT(0);
922
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 923static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 924{
889 int err; 925 int err;
@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 929 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 930 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 931 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 932 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
933 atomic_long_inc_return(&bdi_seq));
897 if (!err) 934 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 935 sb->s_bdi = &client->backing_dev_info;
899 return err; 936 return err;
@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 969 goto out;
933 } 970 }
934 971
935 if (ceph_client(sb) != client) { 972 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 973 ceph_destroy_client(client);
937 client = ceph_client(sb); 974 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 975 dout("get_sb got existing client %p\n", client);
939 } else { 976 } else {
940 dout("get_sb using new client %p\n", client); 977 dout("get_sb using new client %p\n", client);
@@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
952 989
953out_splat: 990out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc); 991 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount); 992 deactivate_locked_super(sb);
956 deactivate_super(sb);
957 goto out_final; 993 goto out_final;
958 994
959out: 995out:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..3725c9ee9d08 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -52,24 +52,25 @@
52 52
53struct ceph_mount_args { 53struct ceph_mount_args {
54 int sb_flags; 54 int sb_flags;
55 int flags;
56 struct ceph_fsid fsid;
57 struct ceph_entity_addr my_addr;
55 int num_mon; 58 int num_mon;
56 struct ceph_entity_addr *mon_addr; 59 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 60 int mount_timeout;
59 int osd_idle_ttl; 61 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 62 int osd_timeout;
68 int osd_keepalive_timeout; 63 int osd_keepalive_timeout;
64 int wsize;
65 int rsize; /* max readahead */
66 int congestion_kb; /* max writeback in flight */
67 int caps_wanted_delay_min, caps_wanted_delay_max;
68 int cap_release_safety;
69 int max_readdir; /* max readdir result (entires) */
70 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 71 char *snapdir_name; /* default ".snap" */
70 char *name; 72 char *name;
71 char *secret; 73 char *secret;
72 int cap_release_safety;
73}; 74};
74 75
75/* 76/*
@@ -80,13 +81,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 81#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 82#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 83#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
84#define CEPH_MAX_READDIR_DEFAULT 1024
85#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 86
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 88#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 89
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 90#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 91#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 92/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 93 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 94 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +98,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 99#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 100
101#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 102
100/* mount state */ 103/* mount state */
101enum { 104enum {
@@ -160,12 +163,6 @@ struct ceph_client {
160#endif 163#endif
161}; 164};
162 165
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 166/*
170 * File i/o capability. This tracks shared state with the metadata 167 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 168 * server that allows us to cache or writeback attributes or to read
@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 868extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 869extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 870extern void ceph_dentry_lru_del(struct dentry *dn);
871extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 872
875/* 873/*
876 * our d_ops vary depending on whether the inode is live, 874 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 570
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 573 goto list_xattr;
579 } else { 574 } else {
580 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
624{ 619{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 636 return -ENOMEM;
642 err = -ENOMEM; 637 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 640 if (!pages[i]) {
646 nr_pages = i; 641 nr_pages = i;
647 goto out; 642 goto out;
@@ -779,7 +774,7 @@ out:
779 774
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 776{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc52..7196077b1688 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
218 host_file = cfi->cfi_container; 218 host_file = cfi->cfi_container;
219 219
220 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); 220 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 221 if ( !err && !datasync ) {
222 lock_kernel(); 222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
73 return mask; 73 return mask;
74} 74}
75 75
76static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 76static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
77 unsigned int cmd, unsigned long arg)
78{ 77{
79 unsigned int data; 78 unsigned int data;
80 79
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
344 .read = coda_psdev_read, 343 .read = coda_psdev_read,
345 .write = coda_psdev_write, 344 .write = coda_psdev_write,
346 .poll = coda_psdev_poll, 345 .poll = coda_psdev_poll,
347 .ioctl = coda_psdev_ioctl, 346 .unlocked_ioctl = coda_psdev_ioctl,
348 .open = coda_psdev_open, 347 .open = coda_psdev_open,
349 .release = coda_psdev_release, 348 .release = coda_psdev_release,
350}; 349};
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..d96047b4a633 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
536 */ 536 */
537static void prune_dcache(int count) 537static void prune_dcache(int count)
538{ 538{
539 struct super_block *sb; 539 struct super_block *sb, *n;
540 int w_count; 540 int w_count;
541 int unused = dentry_stat.nr_unused; 541 int unused = dentry_stat.nr_unused;
542 int prune_ratio; 542 int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
545 if (unused == 0 || count == 0) 545 if (unused == 0 || count == 0)
546 return; 546 return;
547 spin_lock(&dcache_lock); 547 spin_lock(&dcache_lock);
548restart:
549 if (count >= unused) 548 if (count >= unused)
550 prune_ratio = 1; 549 prune_ratio = 1;
551 else 550 else
552 prune_ratio = unused / count; 551 prune_ratio = unused / count;
553 spin_lock(&sb_lock); 552 spin_lock(&sb_lock);
554 list_for_each_entry(sb, &super_blocks, s_list) { 553 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances))
555 continue;
555 if (sb->s_nr_dentry_unused == 0) 556 if (sb->s_nr_dentry_unused == 0)
556 continue; 557 continue;
557 sb->s_count++; 558 sb->s_count++;
@@ -590,14 +591,10 @@ restart:
590 } 591 }
591 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
592 count -= pruned; 593 count -= pruned;
593 /* 594 __put_super(sb);
594 * restart only when sb is no longer on the list and 595 /* more work left to do? */
595 * we have more work to do. 596 if (count <= 0)
596 */ 597 break;
597 if (__put_super_and_need_restart(sb) && count > 0) {
598 spin_unlock(&sb_lock);
599 goto restart;
600 }
601 } 598 }
602 spin_unlock(&sb_lock); 599 spin_unlock(&sb_lock);
603 spin_unlock(&dcache_lock); 600 spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
1529 spin_lock(&dentry->d_lock); 1526 spin_lock(&dentry->d_lock);
1530 isdir = S_ISDIR(dentry->d_inode->i_mode); 1527 isdir = S_ISDIR(dentry->d_inode->i_mode);
1531 if (atomic_read(&dentry->d_count) == 1) { 1528 if (atomic_read(&dentry->d_count) == 1) {
1529 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1532 dentry_iput(dentry); 1530 dentry_iput(dentry);
1533 fsnotify_nameremove(dentry, isdir); 1531 fsnotify_nameremove(dentry, isdir);
1534 return; 1532 return;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c0..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
384 s->s_flags |= MS_ACTIVE; 384 s->s_flags |= MS_ACTIVE;
385 } 385 }
386 386
387 simple_set_mnt(mnt, s);
388
389 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); 387 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
390 388
391 error = mknod_ptmx(s); 389 error = mknod_ptmx(s);
392 if (error) 390 if (error)
393 goto out_dput; 391 goto out_undo_sget;
394 392
395 return 0; 393 simple_set_mnt(mnt, s);
396 394
397out_dput: 395 return 0;
398 dput(s->s_root); /* undo dget() in simple_set_mnt() */
399 396
400out_undo_sget: 397out_undo_sget:
401 deactivate_locked_super(s); 398 deactivate_locked_super(s);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
12/* A global variable is a bit ugly, but it keeps the code simple */ 12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 13int sysctl_drop_caches;
14 14
15static void drop_pagecache_sb(struct super_block *sb) 15static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 16{
17 struct inode *inode, *toput_inode = NULL; 17 struct inode *inode, *toput_inode = NULL;
18 18
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
33 iput(toput_inode); 33 iput(toput_inode);
34} 34}
35 35
36static void drop_pagecache(void)
37{
38 struct super_block *sb;
39
40 spin_lock(&sb_lock);
41restart:
42 list_for_each_entry(sb, &super_blocks, s_list) {
43 sb->s_count++;
44 spin_unlock(&sb_lock);
45 down_read(&sb->s_umount);
46 if (sb->s_root)
47 drop_pagecache_sb(sb);
48 up_read(&sb->s_umount);
49 spin_lock(&sb_lock);
50 if (__put_super_and_need_restart(sb))
51 goto restart;
52 }
53 spin_unlock(&sb_lock);
54}
55
56static void drop_slab(void) 36static void drop_slab(void)
57{ 37{
58 int nr_objects; 38 int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
68 proc_dointvec_minmax(table, write, buffer, length, ppos); 48 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 49 if (write) {
70 if (sysctl_drop_caches & 1) 50 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 51 iterate_supers(drop_pagecache_sb, NULL);
72 if (sysctl_drop_caches & 2) 52 if (sysctl_drop_caches & 2)
73 drop_slab(); 53 drop_slab();
74 } 54 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f00..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, 731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
732 struct page *page_for_lower, 732 struct page *page_for_lower,
733 size_t offset_in_page, size_t size); 733 size_t offset_in_page, size_t size);
734int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 734int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
735 size_t size);
736int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 735int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
737 struct inode *ecryptfs_inode); 736 struct inode *ecryptfs_inode);
738int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, 737int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
739 pgoff_t page_index, 738 pgoff_t page_index,
740 size_t offset_in_page, size_t size, 739 size_t offset_in_page, size_t size,
741 struct inode *ecryptfs_inode); 740 struct inode *ecryptfs_inode);
742struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); 741struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
743int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 742int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
744int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 743int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
745 struct user_namespace *user_ns); 744 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5ebf..3bdddbcc785f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280 ecryptfs_dentry_to_lower(dentry),
281 datasync);
282} 280}
283 281
284static int ecryptfs_fasync(int fd, struct file *file, int flag) 282static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affac..65dee2f336ae 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
142static int grow_file(struct dentry *ecryptfs_dentry) 142static int grow_file(struct dentry *ecryptfs_dentry)
143{ 143{
144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
145 struct file fake_file;
146 struct ecryptfs_file_info tmp_file_info;
147 char zero_virt[] = { 0x00 }; 145 char zero_virt[] = { 0x00 };
148 int rc = 0; 146 int rc = 0;
149 147
150 memset(&fake_file, 0, sizeof(fake_file)); 148 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
151 fake_file.f_path.dentry = ecryptfs_dentry;
152 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
153 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
154 ecryptfs_set_file_lower(
155 &fake_file,
156 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
157 rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
158 i_size_write(ecryptfs_inode, 0); 149 i_size_write(ecryptfs_inode, 0);
159 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 150 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
160 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= 151 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
784{ 775{
785 int rc = 0; 776 int rc = 0;
786 struct inode *inode = dentry->d_inode; 777 struct inode *inode = dentry->d_inode;
787 struct dentry *lower_dentry;
788 struct file fake_ecryptfs_file;
789 struct ecryptfs_crypt_stat *crypt_stat; 778 struct ecryptfs_crypt_stat *crypt_stat;
790 loff_t i_size = i_size_read(inode); 779 loff_t i_size = i_size_read(inode);
791 loff_t lower_size_before_truncate; 780 loff_t lower_size_before_truncate;
@@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
796 goto out; 785 goto out;
797 } 786 }
798 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 787 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
799 /* Set up a fake ecryptfs file, this is used to interface with
800 * the file in the underlying filesystem so that the
801 * truncation has an effect there as well. */
802 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
803 fake_ecryptfs_file.f_path.dentry = dentry;
804 /* Released at out_free: label */
805 ecryptfs_set_file_private(&fake_ecryptfs_file,
806 kmem_cache_alloc(ecryptfs_file_info_cache,
807 GFP_KERNEL));
808 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
809 rc = -ENOMEM;
810 goto out;
811 }
812 lower_dentry = ecryptfs_dentry_to_lower(dentry);
813 ecryptfs_set_file_lower(
814 &fake_ecryptfs_file,
815 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
816 /* Switch on growing or shrinking file */ 788 /* Switch on growing or shrinking file */
817 if (ia->ia_size > i_size) { 789 if (ia->ia_size > i_size) {
818 char zero[] = { 0x00 }; 790 char zero[] = { 0x00 };
@@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
822 * this triggers code that will fill in 0's throughout 794 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 795 * the intermediate portion of the previous end of the
824 * file and the new and of the file */ 796 * file and the new and of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 797 rc = ecryptfs_write(inode, zero,
826 (ia->ia_size - 1), 1); 798 (ia->ia_size - 1), 1);
827 } else { /* ia->ia_size < i_size_read(inode) */ 799 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down to the page 800 /* We're chopping off all the pages down to the page
@@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, ia->ia_size); 808 rc = vmtruncate(inode, ia->ia_size);
837 if (rc) 809 if (rc)
838 goto out_free; 810 goto out;
839 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
840 lower_ia->ia_valid |= ATTR_SIZE; 812 lower_ia->ia_valid |= ATTR_SIZE;
841 goto out_free; 813 goto out;
842 } 814 }
843 if (num_zeros) { 815 if (num_zeros) {
844 char *zeros_virt; 816 char *zeros_virt;
@@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
846 zeros_virt = kzalloc(num_zeros, GFP_KERNEL); 818 zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
847 if (!zeros_virt) { 819 if (!zeros_virt) {
848 rc = -ENOMEM; 820 rc = -ENOMEM;
849 goto out_free; 821 goto out;
850 } 822 }
851 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 823 rc = ecryptfs_write(inode, zeros_virt,
852 ia->ia_size, num_zeros); 824 ia->ia_size, num_zeros);
853 kfree(zeros_virt); 825 kfree(zeros_virt);
854 if (rc) { 826 if (rc) {
855 printk(KERN_ERR "Error attempting to zero out " 827 printk(KERN_ERR "Error attempting to zero out "
856 "the remainder of the end page on " 828 "the remainder of the end page on "
857 "reducing truncate; rc = [%d]\n", rc); 829 "reducing truncate; rc = [%d]\n", rc);
858 goto out_free; 830 goto out;
859 } 831 }
860 } 832 }
861 vmtruncate(inode, ia->ia_size); 833 vmtruncate(inode, ia->ia_size);
@@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
864 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
865 "ecryptfs_write_inode_size_to_metadata; " 837 "ecryptfs_write_inode_size_to_metadata; "
866 "rc = [%d]\n", rc); 838 "rc = [%d]\n", rc);
867 goto out_free; 839 goto out;
868 } 840 }
869 /* We are reducing the size of the ecryptfs file, and need to 841 /* We are reducing the size of the ecryptfs file, and need to
870 * know if we need to reduce the size of the lower file. */ 842 * know if we need to reduce the size of the lower file. */
@@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
878 } else 850 } else
879 lower_ia->ia_valid &= ~ATTR_SIZE; 851 lower_ia->ia_valid &= ~ATTR_SIZE;
880 } 852 }
881out_free:
882 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
883 kmem_cache_free(ecryptfs_file_info_cache,
884 ecryptfs_file_to_private(&fake_ecryptfs_file));
885out: 853out:
886 return rc; 854 return rc;
887} 855}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25e..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
281 * 281 *
282 * Returns zero on success; non-zero on error 282 * Returns zero on success; non-zero on error
283 */ 283 */
284static int ecryptfs_parse_options(struct super_block *sb, char *options) 284static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
285{ 285{
286 char *p; 286 char *p;
287 int rc = 0; 287 int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
293 int fn_cipher_key_bytes; 293 int fn_cipher_key_bytes;
294 int fn_cipher_key_bytes_set = 0; 294 int fn_cipher_key_bytes_set = 0;
295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
296 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 296 &sbi->mount_crypt_stat;
297 substring_t args[MAX_OPT_ARGS]; 297 substring_t args[MAX_OPT_ARGS];
298 int token; 298 int token;
299 char *sig_src; 299 char *sig_src;
@@ -483,68 +483,7 @@ out:
483} 483}
484 484
485struct kmem_cache *ecryptfs_sb_info_cache; 485struct kmem_cache *ecryptfs_sb_info_cache;
486 486static struct file_system_type ecryptfs_fs_type;
487/**
488 * ecryptfs_fill_super
489 * @sb: The ecryptfs super block
490 * @raw_data: The options passed to mount
491 * @silent: Not used but required by function prototype
492 *
493 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
494 *
495 * Returns zero on success; non-zero otherwise
496 */
497static int
498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
499{
500 struct ecryptfs_sb_info *esi;
501 int rc = 0;
502
503 /* Released in ecryptfs_put_super() */
504 ecryptfs_set_superblock_private(sb,
505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
506 GFP_KERNEL));
507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
510 rc = -ENOMEM;
511 goto out;
512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
519 sb->s_op = &ecryptfs_sops;
520 /* Released through deactivate_super(sb) from get_sb_nodev */
521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
522 .hash = 0,.name = "/",.len = 1});
523 if (!sb->s_root) {
524 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
525 rc = -ENOMEM;
526 goto out;
527 }
528 sb->s_root->d_op = &ecryptfs_dops;
529 sb->s_root->d_sb = sb;
530 sb->s_root->d_parent = sb->s_root;
531 /* Released in d_release when dput(sb->s_root) is called */
532 /* through deactivate_super(sb) from get_sb_nodev() */
533 ecryptfs_set_dentry_private(sb->s_root,
534 kmem_cache_zalloc(ecryptfs_dentry_info_cache,
535 GFP_KERNEL));
536 if (!ecryptfs_dentry_to_private(sb->s_root)) {
537 ecryptfs_printk(KERN_ERR,
538 "dentry_info_cache alloc failed\n");
539 rc = -ENOMEM;
540 goto out;
541 }
542 rc = 0;
543out:
544 /* Should be able to rely on deactivate_super called from
545 * get_sb_nodev */
546 return rc;
547}
548 487
549/** 488/**
550 * ecryptfs_read_super 489 * ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
565 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); 504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
566 goto out; 505 goto out;
567 } 506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
568 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); 514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
569 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; 515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
570 sb->s_blocksize = path.dentry->d_sb->s_blocksize; 516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
588 * @dev_name: The path to mount over 534 * @dev_name: The path to mount over
589 * @raw_data: The options passed into the kernel 535 * @raw_data: The options passed into the kernel
590 * 536 *
591 * The whole ecryptfs_get_sb process is broken into 4 functions: 537 * The whole ecryptfs_get_sb process is broken into 3 functions:
592 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any 538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
593 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
594 * with as much information as it can before needing
595 * the lower filesystem.
596 * ecryptfs_read_super(): this accesses the lower filesystem and uses 539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
597 * ecryptfs_interpose to perform most of the linking 540 * ecryptfs_interpose to perform most of the linking
598 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
601 const char *dev_name, void *raw_data, 544 const char *dev_name, void *raw_data,
602 struct vfsmount *mnt) 545 struct vfsmount *mnt)
603{ 546{
547 struct super_block *s;
548 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed";
604 int rc; 551 int rc;
605 struct super_block *sb;
606 552
607 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); 553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
608 if (rc < 0) { 554 if (!sbi) {
609 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); 555 rc = -ENOMEM;
610 goto out; 556 goto out;
611 } 557 }
612 sb = mnt->mnt_sb; 558
613 rc = ecryptfs_parse_options(sb, raw_data); 559 rc = ecryptfs_parse_options(sbi, raw_data);
614 if (rc) { 560 if (rc) {
615 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); 561 err = "Error parsing options";
616 goto out_abort; 562 goto out;
563 }
564
565 s = sget(fs_type, NULL, set_anon_super, NULL);
566 if (IS_ERR(s)) {
567 rc = PTR_ERR(s);
568 goto out;
617 } 569 }
618 rc = ecryptfs_read_super(sb, dev_name); 570
571 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
619 if (rc) { 573 if (rc) {
620 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); 574 deactivate_locked_super(s);
621 goto out_abort; 575 goto out;
622 } 576 }
623 goto out; 577
624out_abort: 578 ecryptfs_set_superblock_private(s, sbi);
625 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ 579 s->s_bdi = &sbi->bdi;
626 deactivate_locked_super(sb); 580
581 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL;
583 s->s_op = &ecryptfs_sops;
584
585 rc = -ENOMEM;
586 s->s_root = d_alloc(NULL, &(const struct qstr) {
587 .hash = 0,.name = "/",.len = 1});
588 if (!s->s_root) {
589 deactivate_locked_super(s);
590 goto out;
591 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) {
598 deactivate_locked_super(s);
599 goto out;
600 }
601 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info);
603 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612
627out: 613out:
614 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
628 return rc; 619 return rc;
629} 620}
630 621
@@ -633,11 +624,16 @@ out:
633 * @sb: The ecryptfs super block 624 * @sb: The ecryptfs super block
634 * 625 *
635 * Used to bring the superblock down and free the private data. 626 * Used to bring the superblock down and free the private data.
636 * Private data is free'd in ecryptfs_put_super()
637 */ 627 */
638static void ecryptfs_kill_block_super(struct super_block *sb) 628static void ecryptfs_kill_block_super(struct super_block *sb)
639{ 629{
640 generic_shutdown_super(sb); 630 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
631 kill_anon_super(sb);
632 if (!sb_info)
633 return;
634 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
635 bdi_destroy(&sb_info->bdi);
636 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
641} 637}
642 638
643static struct file_system_type ecryptfs_fs_type = { 639static struct file_system_type ecryptfs_fs_type = {
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68c..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
44 * Returns locked and up-to-date page (if ok), with increased 44 * Returns locked and up-to-date page (if ok), with increased
45 * refcnt. 45 * refcnt.
46 */ 46 */
47struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) 47struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
48{ 48{
49 struct dentry *dentry; 49 struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
50 struct inode *inode;
51 struct address_space *mapping;
52 struct page *page;
53
54 dentry = file->f_path.dentry;
55 inode = dentry->d_inode;
56 mapping = inode->i_mapping;
57 page = read_mapping_page(mapping, index, (void *)file);
58 if (!IS_ERR(page)) 50 if (!IS_ERR(page))
59 lock_page(page); 51 lock_page(page);
60 return page; 52 return page;
@@ -198,7 +190,7 @@ out:
198static int ecryptfs_readpage(struct file *file, struct page *page) 190static int ecryptfs_readpage(struct file *file, struct page *page)
199{ 191{
200 struct ecryptfs_crypt_stat *crypt_stat = 192 struct ecryptfs_crypt_stat *crypt_stat =
201 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
202 int rc = 0; 194 int rc = 0;
203 195
204 if (!crypt_stat 196 if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
300 292
301 if (!PageUptodate(page)) { 293 if (!PageUptodate(page)) {
302 struct ecryptfs_crypt_stat *crypt_stat = 294 struct ecryptfs_crypt_stat *crypt_stat =
303 &ecryptfs_inode_to_private( 295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
304 file->f_path.dentry->d_inode)->crypt_stat;
305 296
306 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 297 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
307 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 298 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
487 unsigned to = from + copied; 478 unsigned to = from + copied;
488 struct inode *ecryptfs_inode = mapping->host; 479 struct inode *ecryptfs_inode = mapping->host;
489 struct ecryptfs_crypt_stat *crypt_stat = 480 struct ecryptfs_crypt_stat *crypt_stat =
490 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 481 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
491 int rc; 482 int rc;
492 483
493 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { 484 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
93 93
94/** 94/**
95 * ecryptfs_write 95 * ecryptfs_write
96 * @ecryptfs_file: The eCryptfs file into which to write 96 * @ecryptfs_inode: The eCryptfs file into which to write
97 * @data: Virtual address where data to write is located 97 * @data: Virtual address where data to write is located
98 * @offset: Offset in the eCryptfs file at which to begin writing the 98 * @offset: Offset in the eCryptfs file at which to begin writing the
99 * data from @data 99 * data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
109 * 109 *
110 * Returns zero on success; non-zero otherwise 110 * Returns zero on success; non-zero otherwise
111 */ 111 */
112int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 112int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
113 size_t size) 113 size_t size)
114{ 114{
115 struct page *ecryptfs_page; 115 struct page *ecryptfs_page;
116 struct ecryptfs_crypt_stat *crypt_stat; 116 struct ecryptfs_crypt_stat *crypt_stat;
117 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
118 char *ecryptfs_page_virt; 117 char *ecryptfs_page_virt;
119 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); 118 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
120 loff_t data_offset = 0; 119 loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
145 if (num_bytes > total_remaining_zeros) 144 if (num_bytes > total_remaining_zeros)
146 num_bytes = total_remaining_zeros; 145 num_bytes = total_remaining_zeros;
147 } 146 }
148 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 147 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
149 ecryptfs_page_idx); 148 ecryptfs_page_idx);
150 if (IS_ERR(ecryptfs_page)) { 149 if (IS_ERR(ecryptfs_page)) {
151 rc = PTR_ERR(ecryptfs_page); 150 rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
302int ecryptfs_read(char *data, loff_t offset, size_t size, 301int ecryptfs_read(char *data, loff_t offset, size_t size,
303 struct file *ecryptfs_file) 302 struct file *ecryptfs_file)
304{ 303{
304 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
305 struct page *ecryptfs_page; 305 struct page *ecryptfs_page;
306 char *ecryptfs_page_virt; 306 char *ecryptfs_page_virt;
307 loff_t ecryptfs_file_size = 307 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
308 i_size_read(ecryptfs_file->f_dentry->d_inode);
309 loff_t data_offset = 0; 308 loff_t data_offset = 0;
310 loff_t pos; 309 loff_t pos;
311 int rc = 0; 310 int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
327 326
328 if (num_bytes > total_remaining_bytes) 327 if (num_bytes > total_remaining_bytes)
329 num_bytes = total_remaining_bytes; 328 num_bytes = total_remaining_bytes;
330 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 329 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
331 ecryptfs_page_idx); 330 ecryptfs_page_idx);
332 if (IS_ERR(ecryptfs_page)) { 331 if (IS_ERR(ecryptfs_page)) {
333 rc = PTR_ERR(ecryptfs_page); 332 rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d231..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
109} 109}
110 110
111/** 111/**
112 * ecryptfs_put_super
113 * @sb: Pointer to the ecryptfs super block
114 *
115 * Final actions when unmounting a file system.
116 * This will handle deallocation and release of our private data.
117 */
118static void ecryptfs_put_super(struct super_block *sb)
119{
120 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
121
122 lock_kernel();
123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
127 ecryptfs_set_superblock_private(sb, NULL);
128
129 unlock_kernel();
130}
131
132/**
133 * ecryptfs_statfs 112 * ecryptfs_statfs
134 * @sb: The ecryptfs super block 113 * @sb: The ecryptfs super block
135 * @buf: The struct kstatfs to fill in with stats 114 * @buf: The struct kstatfs to fill in with stats
@@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
203 .alloc_inode = ecryptfs_alloc_inode, 182 .alloc_inode = ecryptfs_alloc_inode,
204 .destroy_inode = ecryptfs_destroy_inode, 183 .destroy_inode = ecryptfs_destroy_inode,
205 .drop_inode = generic_delete_inode, 184 .drop_inode = generic_delete_inode,
206 .put_super = ecryptfs_put_super,
207 .statfs = ecryptfs_statfs, 185 .statfs = ecryptfs_statfs,
208 .remount_fs = NULL, 186 .remount_fs = NULL,
209 .clear_inode = ecryptfs_clear_inode, 187 .clear_inode = ecryptfs_clear_inode,
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..9badbc0bfb1d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
242 * use STACK_TOP because that can depend on attributes which aren't 242 * use STACK_TOP because that can depend on attributes which aren't
243 * configured yet. 243 * configured yet.
244 */ 244 */
245 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
245 vma->vm_end = STACK_TOP_MAX; 246 vma->vm_end = STACK_TOP_MAX;
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 247 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 248 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 250 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 251 err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
616 else if (executable_stack == EXSTACK_DISABLE_X) 617 else if (executable_stack == EXSTACK_DISABLE_X)
617 vm_flags &= ~VM_EXEC; 618 vm_flags &= ~VM_EXEC;
618 vm_flags |= mm->def_flags; 619 vm_flags |= mm->def_flags;
620 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
619 621
620 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621 vm_flags); 623 vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 632 goto out_unlock;
631 } 633 }
632 634
635 /* mprotect_fixup is overkill to remove the temporary stack flags */
636 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
637
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 638 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 639 stack_size = vma->vm_end - vma->vm_start;
635 /* 640 /*
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93e..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
755 return ret; 755 return ret;
756} 756}
757 757
758static int exofs_releasepage(struct page *page, gfp_t gfp)
759{
760 EXOFS_DBGMSG("page 0x%lx\n", page->index);
761 WARN_ON(1);
762 return try_to_free_buffers(page);
763}
764
765static void exofs_invalidatepage(struct page *page, unsigned long offset)
766{
767 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
768 WARN_ON(1);
769
770 block_invalidatepage(page, offset);
771}
772
758const struct address_space_operations exofs_aops = { 773const struct address_space_operations exofs_aops = {
759 .readpage = exofs_readpage, 774 .readpage = exofs_readpage,
760 .readpages = exofs_readpages, 775 .readpages = exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
762 .writepages = exofs_writepages, 777 .writepages = exofs_writepages,
763 .write_begin = exofs_write_begin_export, 778 .write_begin = exofs_write_begin_export,
764 .write_end = exofs_write_end, 779 .write_end = exofs_write_end,
780 .releasepage = exofs_releasepage,
781 .set_page_dirty = __set_page_dirty_nobuffers,
782 .invalidatepage = exofs_invalidatepage,
783
784 /* Not implemented Yet */
785 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
786 .direct_IO = NULL, /* TODO: Should be trivial to do */
787
788 /* With these NULL has special meaning or default is not exported */
789 .sync_page = NULL,
790 .get_xip_mem = NULL,
791 .migratepage = NULL,
792 .launder_page = NULL,
793 .is_partially_uptodate = NULL,
794 .error_remove_page = NULL,
765}; 795};
766 796
767/****************************************************************************** 797/******************************************************************************
@@ -1123,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1123 sbi = sb->s_fs_info; 1153 sbi = sb->s_fs_info;
1124 1154
1125 sb->s_dirt = 1; 1155 sb->s_dirt = 1;
1126 inode->i_uid = current->cred->fsuid; 1156 inode_init_owner(inode, dir, mode);
1127 if (dir->i_mode & S_ISGID) {
1128 inode->i_gid = dir->i_gid;
1129 if (S_ISDIR(mode))
1130 mode |= S_ISGID;
1131 } else {
1132 inode->i_gid = current->cred->fsgid;
1133 }
1134 inode->i_mode = mode;
1135
1136 inode->i_ino = sbi->s_nextid++; 1157 inode->i_ino = sbi->s_nextid++;
1137 inode->i_blkbits = EXOFS_BLKSHIFT; 1158 inode->i_blkbits = EXOFS_BLKSHIFT;
1138 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1159 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..ca7e2a0ed98a 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
420 return error; 420 return error;
421} 421}
422 422
423struct xattr_handler ext2_xattr_acl_access_handler = { 423const struct xattr_handler ext2_xattr_acl_access_handler = {
424 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS, 425 .flags = ACL_TYPE_ACCESS,
426 .list = ext2_xattr_list_acl_access, 426 .list = ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
428 .set = ext2_xattr_set_acl, 428 .set = ext2_xattr_set_acl,
429}; 429};
430 430
431struct xattr_handler ext2_xattr_acl_default_handler = { 431const struct xattr_handler ext2_xattr_acl_default_handler = {
432 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
434 .list = ext2_xattr_list_acl_default, 434 .list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 3cf038c055d7..e8766a396776 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1332,6 +1332,12 @@ retry_alloc:
1332 1332
1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1334 /* 1334 /*
1335 * skip this group (and avoid loading bitmap) if there
1336 * are no free blocks
1337 */
1338 if (!free_blocks)
1339 continue;
1340 /*
1335 * skip this group if the number of 1341 * skip this group if the number of
1336 * free blocks is less than half of the reservation 1342 * free blocks is less than half of the reservation
1337 * window size. 1343 * window size.
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
106 struct super_block * sb = inode->i_sb; 106 struct super_block * sb = inode->i_sb;
107 int is_directory; 107 int is_directory;
108 unsigned long ino; 108 unsigned long ino;
109 struct buffer_head *bitmap_bh = NULL; 109 struct buffer_head *bitmap_bh;
110 unsigned long block_group; 110 unsigned long block_group;
111 unsigned long bit; 111 unsigned long bit;
112 struct ext2_super_block * es; 112 struct ext2_super_block * es;
@@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode)
135 ino > le32_to_cpu(es->s_inodes_count)) { 135 ino > le32_to_cpu(es->s_inodes_count)) {
136 ext2_error (sb, "ext2_free_inode", 136 ext2_error (sb, "ext2_free_inode",
137 "reserved or nonexistent inode %lu", ino); 137 "reserved or nonexistent inode %lu", ino);
138 goto error_return; 138 return;
139 } 139 }
140 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); 140 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
141 bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); 141 bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
142 brelse(bitmap_bh);
143 bitmap_bh = read_inode_bitmap(sb, block_group); 142 bitmap_bh = read_inode_bitmap(sb, block_group);
144 if (!bitmap_bh) 143 if (!bitmap_bh)
145 goto error_return; 144 return;
146 145
147 /* Ok, now we can actually update the inode bitmaps.. */ 146 /* Ok, now we can actually update the inode bitmaps.. */
148 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), 147 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode)
154 mark_buffer_dirty(bitmap_bh); 153 mark_buffer_dirty(bitmap_bh);
155 if (sb->s_flags & MS_SYNCHRONOUS) 154 if (sb->s_flags & MS_SYNCHRONOUS)
156 sync_dirty_buffer(bitmap_bh); 155 sync_dirty_buffer(bitmap_bh);
157error_return: 156
158 brelse(bitmap_bh); 157 brelse(bitmap_bh);
159} 158}
160 159
@@ -550,16 +549,12 @@ got:
550 549
551 sb->s_dirt = 1; 550 sb->s_dirt = 1;
552 mark_buffer_dirty(bh2); 551 mark_buffer_dirty(bh2);
553 inode->i_uid = current_fsuid(); 552 if (test_opt(sb, GRPID)) {
554 if (test_opt (sb, GRPID)) 553 inode->i_mode = mode;
555 inode->i_gid = dir->i_gid; 554 inode->i_uid = current_fsuid();
556 else if (dir->i_mode & S_ISGID) {
557 inode->i_gid = dir->i_gid; 555 inode->i_gid = dir->i_gid;
558 if (S_ISDIR(mode))
559 mode |= S_ISGID;
560 } else 556 } else
561 inode->i_gid = current_fsgid(); 557 inode_init_owner(inode, dir, mode);
562 inode->i_mode = mode;
563 558
564 inode->i_ino = ino; 559 inode->i_ino = ino;
565 inode->i_blocks = 0; 560 inode->i_blocks = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..527c46d9bc1f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
22 * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25#include <linux/smp_lock.h>
26#include <linux/time.h> 25#include <linux/time.h>
27#include <linux/highuid.h> 26#include <linux/highuid.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
@@ -1406,11 +1405,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
1406 /* If this is the first large file 1405 /* If this is the first large file
1407 * created, add a flag to the superblock. 1406 * created, add a flag to the superblock.
1408 */ 1407 */
1409 lock_kernel(); 1408 spin_lock(&EXT2_SB(sb)->s_lock);
1410 ext2_update_dynamic_rev(sb); 1409 ext2_update_dynamic_rev(sb);
1411 EXT2_SET_RO_COMPAT_FEATURE(sb, 1410 EXT2_SET_RO_COMPAT_FEATURE(sb,
1412 EXT2_FEATURE_RO_COMPAT_LARGE_FILE); 1411 EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
1413 unlock_kernel(); 1412 spin_unlock(&EXT2_SB(sb)->s_lock);
1414 ext2_write_super(sb); 1413 ext2_write_super(sb);
1415 } 1414 }
1416 } 1415 }
@@ -1467,7 +1466,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1467 if (error) 1466 if (error)
1468 return error; 1467 return error;
1469 1468
1470 if (iattr->ia_valid & ATTR_SIZE) 1469 if (is_quota_modification(inode, iattr))
1471 dquot_initialize(inode); 1470 dquot_initialize(inode);
1472 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1471 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1473 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1472 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..71e9eb1fa696 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
26#include <linux/random.h> 26#include <linux/random.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/exportfs.h> 28#include <linux/exportfs.h>
29#include <linux/smp_lock.h>
30#include <linux/vfs.h> 29#include <linux/vfs.h>
31#include <linux/seq_file.h> 30#include <linux/seq_file.h>
32#include <linux/mount.h> 31#include <linux/mount.h>
@@ -39,7 +38,7 @@
39#include "xip.h" 38#include "xip.h"
40 39
41static void ext2_sync_super(struct super_block *sb, 40static void ext2_sync_super(struct super_block *sb,
42 struct ext2_super_block *es); 41 struct ext2_super_block *es, int wait);
43static int ext2_remount (struct super_block * sb, int * flags, char * data); 42static int ext2_remount (struct super_block * sb, int * flags, char * data);
44static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
45static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
52 struct ext2_super_block *es = sbi->s_es; 51 struct ext2_super_block *es = sbi->s_es;
53 52
54 if (!(sb->s_flags & MS_RDONLY)) { 53 if (!(sb->s_flags & MS_RDONLY)) {
54 spin_lock(&sbi->s_lock);
55 sbi->s_mount_state |= EXT2_ERROR_FS; 55 sbi->s_mount_state |= EXT2_ERROR_FS;
56 es->s_state |= cpu_to_le16(EXT2_ERROR_FS); 56 es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
57 ext2_sync_super(sb, es); 57 spin_unlock(&sbi->s_lock);
58 ext2_sync_super(sb, es, 1);
58 } 59 }
59 60
60 va_start(args, fmt); 61 va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
84 va_end(args); 85 va_end(args);
85} 86}
86 87
88/*
89 * This must be called with sbi->s_lock held.
90 */
87void ext2_update_dynamic_rev(struct super_block *sb) 91void ext2_update_dynamic_rev(struct super_block *sb)
88{ 92{
89 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 93 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,8 +119,6 @@ static void ext2_put_super (struct super_block * sb)
115 int i; 119 int i;
116 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
117 121
118 lock_kernel();
119
120 if (sb->s_dirt) 122 if (sb->s_dirt)
121 ext2_write_super(sb); 123 ext2_write_super(sb);
122 124
@@ -124,8 +126,10 @@ static void ext2_put_super (struct super_block * sb)
124 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
125 struct ext2_super_block *es = sbi->s_es; 127 struct ext2_super_block *es = sbi->s_es;
126 128
129 spin_lock(&sbi->s_lock);
127 es->s_state = cpu_to_le16(sbi->s_mount_state); 130 es->s_state = cpu_to_le16(sbi->s_mount_state);
128 ext2_sync_super(sb, es); 131 spin_unlock(&sbi->s_lock);
132 ext2_sync_super(sb, es, 1);
129 } 133 }
130 db_count = sbi->s_gdb_count; 134 db_count = sbi->s_gdb_count;
131 for (i = 0; i < db_count; i++) 135 for (i = 0; i < db_count; i++)
@@ -140,8 +144,6 @@ static void ext2_put_super (struct super_block * sb)
140 sb->s_fs_info = NULL; 144 sb->s_fs_info = NULL;
141 kfree(sbi->s_blockgroup_lock); 145 kfree(sbi->s_blockgroup_lock);
142 kfree(sbi); 146 kfree(sbi);
143
144 unlock_kernel();
145} 147}
146 148
147static struct kmem_cache * ext2_inode_cachep; 149static struct kmem_cache * ext2_inode_cachep;
@@ -209,6 +211,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
209 struct ext2_super_block *es = sbi->s_es; 211 struct ext2_super_block *es = sbi->s_es;
210 unsigned long def_mount_opts; 212 unsigned long def_mount_opts;
211 213
214 spin_lock(&sbi->s_lock);
212 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 215 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
213 216
214 if (sbi->s_sb_block != 1) 217 if (sbi->s_sb_block != 1)
@@ -281,6 +284,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
281 if (!test_opt(sb, RESERVATION)) 284 if (!test_opt(sb, RESERVATION))
282 seq_puts(seq, ",noreservation"); 285 seq_puts(seq, ",noreservation");
283 286
287 spin_unlock(&sbi->s_lock);
284 return 0; 288 return 0;
285} 289}
286 290
@@ -606,7 +610,6 @@ static int ext2_setup_super (struct super_block * sb,
606 if (!le16_to_cpu(es->s_max_mnt_count)) 610 if (!le16_to_cpu(es->s_max_mnt_count))
607 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); 611 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
608 le16_add_cpu(&es->s_mnt_count, 1); 612 le16_add_cpu(&es->s_mnt_count, 1);
609 ext2_write_super(sb);
610 if (test_opt (sb, DEBUG)) 613 if (test_opt (sb, DEBUG))
611 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " 614 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
612 "bpg=%lu, ipg=%lu, mo=%04lx]", 615 "bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +770,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
767 sb->s_fs_info = sbi; 770 sb->s_fs_info = sbi;
768 sbi->s_sb_block = sb_block; 771 sbi->s_sb_block = sb_block;
769 772
773 spin_lock_init(&sbi->s_lock);
774
770 /* 775 /*
771 * See what the current blocksize for the device is, and 776 * See what the current blocksize for the device is, and
772 * use that as the blocksize. Otherwise (or if the blocksize 777 * use that as the blocksize. Otherwise (or if the blocksize
@@ -1079,7 +1084,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1079 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) 1084 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1080 ext2_msg(sb, KERN_WARNING, 1085 ext2_msg(sb, KERN_WARNING,
1081 "warning: mounting ext3 filesystem as ext2"); 1086 "warning: mounting ext3 filesystem as ext2");
1082 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); 1087 if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
1088 sb->s_flags |= MS_RDONLY;
1089 ext2_write_super(sb);
1083 return 0; 1090 return 0;
1084 1091
1085cantfind_ext2: 1092cantfind_ext2:
@@ -1120,30 +1127,26 @@ static void ext2_clear_super_error(struct super_block *sb)
1120 * be remapped. Nothing we can do but to retry the 1127 * be remapped. Nothing we can do but to retry the
1121 * write and hope for the best. 1128 * write and hope for the best.
1122 */ 1129 */
1123 printk(KERN_ERR "EXT2-fs: %s previous I/O error to " 1130 ext2_msg(sb, KERN_ERR,
1124 "superblock detected", sb->s_id); 1131 "previous I/O error to superblock detected\n");
1125 clear_buffer_write_io_error(sbh); 1132 clear_buffer_write_io_error(sbh);
1126 set_buffer_uptodate(sbh); 1133 set_buffer_uptodate(sbh);
1127 } 1134 }
1128} 1135}
1129 1136
1130static void ext2_commit_super (struct super_block * sb, 1137static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
1131 struct ext2_super_block * es) 1138 int wait)
1132{
1133 ext2_clear_super_error(sb);
1134 es->s_wtime = cpu_to_le32(get_seconds());
1135 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1136 sb->s_dirt = 0;
1137}
1138
1139static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1140{ 1139{
1141 ext2_clear_super_error(sb); 1140 ext2_clear_super_error(sb);
1141 spin_lock(&EXT2_SB(sb)->s_lock);
1142 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); 1142 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
1143 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); 1143 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
1144 es->s_wtime = cpu_to_le32(get_seconds()); 1144 es->s_wtime = cpu_to_le32(get_seconds());
1145 /* unlock before we do IO */
1146 spin_unlock(&EXT2_SB(sb)->s_lock);
1145 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 1147 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1146 sync_dirty_buffer(EXT2_SB(sb)->s_sbh); 1148 if (wait)
1149 sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
1147 sb->s_dirt = 0; 1150 sb->s_dirt = 0;
1148} 1151}
1149 1152
@@ -1157,43 +1160,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1157 * may have been checked while mounted and e2fsck may have 1160 * may have been checked while mounted and e2fsck may have
1158 * set s_state to EXT2_VALID_FS after some corrections. 1161 * set s_state to EXT2_VALID_FS after some corrections.
1159 */ 1162 */
1160
1161static int ext2_sync_fs(struct super_block *sb, int wait) 1163static int ext2_sync_fs(struct super_block *sb, int wait)
1162{ 1164{
1165 struct ext2_sb_info *sbi = EXT2_SB(sb);
1163 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 1166 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
1164 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1165
1166 lock_kernel();
1167 if (buffer_write_io_error(sbh)) {
1168 /*
1169 * Oh, dear. A previous attempt to write the
1170 * superblock failed. This could happen because the
1171 * USB device was yanked out. Or it could happen to
1172 * be a transient write error and maybe the block will
1173 * be remapped. Nothing we can do but to retry the
1174 * write and hope for the best.
1175 */
1176 ext2_msg(sb, KERN_ERR,
1177 "previous I/O error to superblock detected\n");
1178 clear_buffer_write_io_error(sbh);
1179 set_buffer_uptodate(sbh);
1180 }
1181 1167
1168 spin_lock(&sbi->s_lock);
1182 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { 1169 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
1183 ext2_debug("setting valid to 0\n"); 1170 ext2_debug("setting valid to 0\n");
1184 es->s_state &= cpu_to_le16(~EXT2_VALID_FS); 1171 es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
1185 es->s_free_blocks_count =
1186 cpu_to_le32(ext2_count_free_blocks(sb));
1187 es->s_free_inodes_count =
1188 cpu_to_le32(ext2_count_free_inodes(sb));
1189 es->s_mtime = cpu_to_le32(get_seconds());
1190 ext2_sync_super(sb, es);
1191 } else {
1192 ext2_commit_super(sb, es);
1193 } 1172 }
1194 sb->s_dirt = 0; 1173 spin_unlock(&sbi->s_lock);
1195 unlock_kernel(); 1174 ext2_sync_super(sb, es, wait);
1196
1197 return 0; 1175 return 0;
1198} 1176}
1199 1177
@@ -1215,7 +1193,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1215 unsigned long old_sb_flags; 1193 unsigned long old_sb_flags;
1216 int err; 1194 int err;
1217 1195
1218 lock_kernel(); 1196 spin_lock(&sbi->s_lock);
1219 1197
1220 /* Store the old options */ 1198 /* Store the old options */
1221 old_sb_flags = sb->s_flags; 1199 old_sb_flags = sb->s_flags;
@@ -1254,13 +1232,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; 1232 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1255 } 1233 }
1256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1234 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1257 unlock_kernel(); 1235 spin_unlock(&sbi->s_lock);
1258 return 0; 1236 return 0;
1259 } 1237 }
1260 if (*flags & MS_RDONLY) { 1238 if (*flags & MS_RDONLY) {
1261 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || 1239 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
1262 !(sbi->s_mount_state & EXT2_VALID_FS)) { 1240 !(sbi->s_mount_state & EXT2_VALID_FS)) {
1263 unlock_kernel(); 1241 spin_unlock(&sbi->s_lock);
1264 return 0; 1242 return 0;
1265 } 1243 }
1266 /* 1244 /*
@@ -1269,6 +1247,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 */ 1247 */
1270 es->s_state = cpu_to_le16(sbi->s_mount_state); 1248 es->s_state = cpu_to_le16(sbi->s_mount_state);
1271 es->s_mtime = cpu_to_le32(get_seconds()); 1249 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock);
1251 ext2_sync_super(sb, es, 1);
1272 } else { 1252 } else {
1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1274 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1254 ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1268,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1288 sbi->s_mount_state = le16_to_cpu(es->s_state); 1268 sbi->s_mount_state = le16_to_cpu(es->s_state);
1289 if (!ext2_setup_super (sb, es, 0)) 1269 if (!ext2_setup_super (sb, es, 0))
1290 sb->s_flags &= ~MS_RDONLY; 1270 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock);
1272 ext2_write_super(sb);
1291 } 1273 }
1292 ext2_sync_super(sb, es);
1293 unlock_kernel();
1294 return 0; 1274 return 0;
1295restore_opts: 1275restore_opts:
1296 sbi->s_mount_opt = old_opts.s_mount_opt; 1276 sbi->s_mount_opt = old_opts.s_mount_opt;
1297 sbi->s_resuid = old_opts.s_resuid; 1277 sbi->s_resuid = old_opts.s_resuid;
1298 sbi->s_resgid = old_opts.s_resgid; 1278 sbi->s_resgid = old_opts.s_resgid;
1299 sb->s_flags = old_sb_flags; 1279 sb->s_flags = old_sb_flags;
1300 unlock_kernel(); 1280 spin_unlock(&sbi->s_lock);
1301 return err; 1281 return err;
1302} 1282}
1303 1283
@@ -1308,6 +1288,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1308 struct ext2_super_block *es = sbi->s_es; 1288 struct ext2_super_block *es = sbi->s_es;
1309 u64 fsid; 1289 u64 fsid;
1310 1290
1291 spin_lock(&sbi->s_lock);
1292
1311 if (test_opt (sb, MINIX_DF)) 1293 if (test_opt (sb, MINIX_DF))
1312 sbi->s_overhead_last = 0; 1294 sbi->s_overhead_last = 0;
1313 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { 1295 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1344,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1362 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1344 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
1363 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 1345 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1364 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 1346 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1347 spin_unlock(&sbi->s_lock);
1365 return 0; 1348 return 0;
1366} 1349}
1367 1350
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
101 101
102static struct mb_cache *ext2_xattr_cache; 102static struct mb_cache *ext2_xattr_cache;
103 103
104static struct xattr_handler *ext2_xattr_handler_map[] = { 104static const struct xattr_handler *ext2_xattr_handler_map[] = {
105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
106#ifdef CONFIG_EXT2_FS_POSIX_ACL 106#ifdef CONFIG_EXT2_FS_POSIX_ACL
107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, 107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
113#endif 113#endif
114}; 114};
115 115
116struct xattr_handler *ext2_xattr_handlers[] = { 116const struct xattr_handler *ext2_xattr_handlers[] = {
117 &ext2_xattr_user_handler, 117 &ext2_xattr_user_handler,
118 &ext2_xattr_trusted_handler, 118 &ext2_xattr_trusted_handler,
119#ifdef CONFIG_EXT2_FS_POSIX_ACL 119#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
126 NULL 126 NULL
127}; 127};
128 128
129static inline struct xattr_handler * 129static inline const struct xattr_handler *
130ext2_xattr_handler(int name_index) 130ext2_xattr_handler(int name_index)
131{ 131{
132 struct xattr_handler *handler = NULL; 132 const struct xattr_handler *handler = NULL;
133 133
134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) 134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
135 handler = ext2_xattr_handler_map[name_index]; 135 handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
298 /* list the attribute names */ 298 /* list the attribute names */
299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); 299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
300 entry = EXT2_XATTR_NEXT(entry)) { 300 entry = EXT2_XATTR_NEXT(entry)) {
301 struct xattr_handler *handler = 301 const struct xattr_handler *handler =
302 ext2_xattr_handler(entry->e_name_index); 302 ext2_xattr_handler(entry->e_name_index);
303 303
304 if (handler) { 304 if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
345 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) 345 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
346 return; 346 return;
347 347
348 spin_lock(&EXT2_SB(sb)->s_lock);
348 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); 349 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
350 spin_unlock(&EXT2_SB(sb)->s_lock);
349 sb->s_dirt = 1; 351 sb->s_dirt = 1;
350 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 352 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
351} 353}
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
55 55
56# ifdef CONFIG_EXT2_FS_XATTR 56# ifdef CONFIG_EXT2_FS_XATTR
57 57
58extern struct xattr_handler ext2_xattr_user_handler; 58extern const struct xattr_handler ext2_xattr_user_handler;
59extern struct xattr_handler ext2_xattr_trusted_handler; 59extern const struct xattr_handler ext2_xattr_trusted_handler;
60extern struct xattr_handler ext2_xattr_acl_access_handler; 60extern const struct xattr_handler ext2_xattr_acl_access_handler;
61extern struct xattr_handler ext2_xattr_acl_default_handler; 61extern const struct xattr_handler ext2_xattr_acl_default_handler;
62extern struct xattr_handler ext2_xattr_security_handler; 62extern const struct xattr_handler ext2_xattr_security_handler;
63 63
64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); 64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
65 65
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
72extern int init_ext2_xattr(void); 72extern int init_ext2_xattr(void);
73extern void exit_ext2_xattr(void); 73extern void exit_ext2_xattr(void);
74 74
75extern struct xattr_handler *ext2_xattr_handlers[]; 75extern const struct xattr_handler *ext2_xattr_handlers[];
76 76
77# else /* CONFIG_EXT2_FS_XATTR */ 77# else /* CONFIG_EXT2_FS_XATTR */
78 78
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6d..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
67 return err; 67 return err;
68} 68}
69 69
70struct xattr_handler ext2_xattr_security_handler = { 70const struct xattr_handler ext2_xattr_security_handler = {
71 .prefix = XATTR_SECURITY_PREFIX, 71 .prefix = XATTR_SECURITY_PREFIX,
72 .list = ext2_xattr_security_list, 72 .list = ext2_xattr_security_list,
73 .get = ext2_xattr_security_get, 73 .get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
50 value, size, flags); 50 value, size, flags);
51} 51}
52 52
53struct xattr_handler ext2_xattr_trusted_handler = { 53const struct xattr_handler ext2_xattr_trusted_handler = {
54 .prefix = XATTR_TRUSTED_PREFIX, 54 .prefix = XATTR_TRUSTED_PREFIX,
55 .list = ext2_xattr_trusted_list, 55 .list = ext2_xattr_trusted_list,
56 .get = ext2_xattr_trusted_get, 56 .get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext2_xattr_user_handler = { 57const struct xattr_handler ext2_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext2_xattr_user_list, 59 .list = ext2_xattr_user_list,
60 .get = ext2_xattr_user_get, 60 .get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..01552abbca3c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
456 return error; 456 return error;
457} 457}
458 458
459struct xattr_handler ext3_xattr_acl_access_handler = { 459const struct xattr_handler ext3_xattr_acl_access_handler = {
460 .prefix = POSIX_ACL_XATTR_ACCESS, 460 .prefix = POSIX_ACL_XATTR_ACCESS,
461 .flags = ACL_TYPE_ACCESS, 461 .flags = ACL_TYPE_ACCESS,
462 .list = ext3_xattr_list_acl_access, 462 .list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
464 .set = ext3_xattr_set_acl, 464 .set = ext3_xattr_set_acl,
465}; 465};
466 466
467struct xattr_handler ext3_xattr_acl_default_handler = { 467const struct xattr_handler ext3_xattr_acl_default_handler = {
468 .prefix = POSIX_ACL_XATTR_DEFAULT, 468 .prefix = POSIX_ACL_XATTR_DEFAULT,
469 .flags = ACL_TYPE_DEFAULT, 469 .flags = ACL_TYPE_DEFAULT,
470 .list = ext3_xattr_list_acl_default, 470 .list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a177122a1b25..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1584,6 +1584,12 @@ retry_alloc:
1584 goto io_error; 1584 goto io_error;
1585 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1585 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1586 /* 1586 /*
1587 * skip this group (and avoid loading bitmap) if there
1588 * are no free blocks
1589 */
1590 if (!free_blocks)
1591 continue;
1592 /*
1587 * skip this group if the number of 1593 * skip this group if the number of
1588 * free blocks is less than half of the reservation 1594 * free blocks is less than half of the reservation
1589 * window size. 1595 * window size.
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..fcf7487734b6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,7 +48,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = dentry->d_inode;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret = 0; 51 int ret, needs_barrier = 0;
52 tid_t commit_tid; 52 tid_t commit_tid;
53 53
54 if (inode->i_sb->s_flags & MS_RDONLY) 54 if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
70 * (they were dirtied by commit). But that's OK - the blocks are 70 * (they were dirtied by commit). But that's OK - the blocks are
71 * safe in-journal, which is all fsync() needs to ensure. 71 * safe in-journal, which is all fsync() needs to ensure.
72 */ 72 */
73 if (ext3_should_journal_data(inode)) { 73 if (ext3_should_journal_data(inode))
74 ret = ext3_force_commit(inode->i_sb); 74 return ext3_force_commit(inode->i_sb);
75 goto out;
76 }
77 75
78 if (datasync) 76 if (datasync)
79 commit_tid = atomic_read(&ei->i_datasync_tid); 77 commit_tid = atomic_read(&ei->i_datasync_tid);
80 else 78 else
81 commit_tid = atomic_read(&ei->i_sync_tid); 79 commit_tid = atomic_read(&ei->i_sync_tid);
82 80
83 if (log_start_commit(journal, commit_tid)) { 81 if (test_opt(inode->i_sb, BARRIER) &&
84 log_wait_commit(journal, commit_tid); 82 !journal_trans_will_send_data_barrier(journal, commit_tid))
85 goto out; 83 needs_barrier = 1;
86 } 84 log_start_commit(journal, commit_tid);
85 ret = log_wait_commit(journal, commit_tid);
87 86
88 /* 87 /*
89 * In case we didn't commit a transaction, we have to flush 88 * In case we didn't commit a transaction, we have to flush
90 * disk caches manually so that data really is on persistent 89 * disk caches manually so that data really is on persistent
91 * storage 90 * storage
92 */ 91 */
93 if (test_opt(inode->i_sb, BARRIER)) 92 if (needs_barrier)
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95out: 94 BLKDEV_IFL_WAIT);
96 return ret; 95 return ret;
97} 96}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff6..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
538 if (S_ISDIR(mode)) 538 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 539 percpu_counter_inc(&sbi->s_dirs_counter);
540 540
541 inode->i_uid = current_fsuid(); 541
542 if (test_opt (sb, GRPID)) 542 if (test_opt(sb, GRPID)) {
543 inode->i_gid = dir->i_gid; 543 inode->i_mode = mode;
544 else if (dir->i_mode & S_ISGID) { 544 inode->i_uid = current_fsuid();
545 inode->i_gid = dir->i_gid; 545 inode->i_gid = dir->i_gid;
546 if (S_ISDIR(mode))
547 mode |= S_ISGID;
548 } else 546 } else
549 inode->i_gid = current_fsgid(); 547 inode_init_owner(inode, dir, mode);
550 inode->i_mode = mode;
551 548
552 inode->i_ino = ino; 549 inode->i_ino = ino;
553 /* This is the optimal IO size (for stat), not the fs block size */ 550 /* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ea33bdf0a300..735f0190ec2a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3151,7 +3151,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3151 if (error) 3151 if (error)
3152 return error; 3152 return error;
3153 3153
3154 if (ia_valid & ATTR_SIZE) 3154 if (is_quota_modification(inode, attr))
3155 dquot_initialize(inode); 3155 dquot_initialize(inode);
3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..0fc1293d0e96 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -653,8 +653,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
653 seq_printf(seq, ",commit=%u", 653 seq_printf(seq, ",commit=%u",
654 (unsigned) (sbi->s_commit_interval / HZ)); 654 (unsigned) (sbi->s_commit_interval / HZ));
655 } 655 }
656 if (test_opt(sb, BARRIER)) 656
657 seq_puts(seq, ",barrier=1"); 657 /*
658 * Always display barrier state so it's clear what the status is.
659 */
660 seq_puts(seq, ",barrier=");
661 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
658 if (test_opt(sb, NOBH)) 662 if (test_opt(sb, NOBH))
659 seq_puts(seq, ",nobh"); 663 seq_puts(seq, ",nobh");
660 664
@@ -810,8 +814,8 @@ enum {
810 Opt_data_err_abort, Opt_data_err_ignore, 814 Opt_data_err_abort, Opt_data_err_ignore,
811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 815 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 816 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, 817 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
814 Opt_usrquota, Opt_grpquota 818 Opt_resize, Opt_usrquota, Opt_grpquota
815}; 819};
816 820
817static const match_table_t tokens = { 821static const match_table_t tokens = {
@@ -865,6 +869,8 @@ static const match_table_t tokens = {
865 {Opt_quota, "quota"}, 869 {Opt_quota, "quota"},
866 {Opt_usrquota, "usrquota"}, 870 {Opt_usrquota, "usrquota"},
867 {Opt_barrier, "barrier=%u"}, 871 {Opt_barrier, "barrier=%u"},
872 {Opt_barrier, "barrier"},
873 {Opt_nobarrier, "nobarrier"},
868 {Opt_resize, "resize"}, 874 {Opt_resize, "resize"},
869 {Opt_err, NULL}, 875 {Opt_err, NULL},
870}; 876};
@@ -967,7 +973,11 @@ static int parse_options (char *options, struct super_block *sb,
967 int token; 973 int token;
968 if (!*p) 974 if (!*p)
969 continue; 975 continue;
970 976 /*
977 * Initialize args struct so we know whether arg was
978 * found; some options take optional arguments.
979 */
980 args[0].to = args[0].from = 0;
971 token = match_token(p, tokens, args); 981 token = match_token(p, tokens, args);
972 switch (token) { 982 switch (token) {
973 case Opt_bsd_df: 983 case Opt_bsd_df:
@@ -1215,9 +1225,15 @@ set_qf_format:
1215 case Opt_abort: 1225 case Opt_abort:
1216 set_opt(sbi->s_mount_opt, ABORT); 1226 set_opt(sbi->s_mount_opt, ABORT);
1217 break; 1227 break;
1228 case Opt_nobarrier:
1229 clear_opt(sbi->s_mount_opt, BARRIER);
1230 break;
1218 case Opt_barrier: 1231 case Opt_barrier:
1219 if (match_int(&args[0], &option)) 1232 if (args[0].from) {
1220 return 0; 1233 if (match_int(&args[0], &option))
1234 return 0;
1235 } else
1236 option = 1; /* No argument, default to 1 */
1221 if (option) 1237 if (option)
1222 set_opt(sbi->s_mount_opt, BARRIER); 1238 set_opt(sbi->s_mount_opt, BARRIER);
1223 else 1239 else
@@ -1890,21 +1906,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1906 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1891 spin_lock_init(&sbi->s_next_gen_lock); 1907 spin_lock_init(&sbi->s_next_gen_lock);
1892 1908
1893 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1894 ext3_count_free_blocks(sb));
1895 if (!err) {
1896 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1897 ext3_count_free_inodes(sb));
1898 }
1899 if (!err) {
1900 err = percpu_counter_init(&sbi->s_dirs_counter,
1901 ext3_count_dirs(sb));
1902 }
1903 if (err) {
1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1905 goto failed_mount3;
1906 }
1907
1908 /* per fileystem reservation list head & lock */ 1909 /* per fileystem reservation list head & lock */
1909 spin_lock_init(&sbi->s_rsv_window_lock); 1910 spin_lock_init(&sbi->s_rsv_window_lock);
1910 sbi->s_rsv_window_root = RB_ROOT; 1911 sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1946,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1945 if (!test_opt(sb, NOLOAD) && 1946 if (!test_opt(sb, NOLOAD) &&
1946 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { 1947 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1947 if (ext3_load_journal(sb, es, journal_devnum)) 1948 if (ext3_load_journal(sb, es, journal_devnum))
1948 goto failed_mount3; 1949 goto failed_mount2;
1949 } else if (journal_inum) { 1950 } else if (journal_inum) {
1950 if (ext3_create_journal(sb, es, journal_inum)) 1951 if (ext3_create_journal(sb, es, journal_inum))
1951 goto failed_mount3; 1952 goto failed_mount2;
1952 } else { 1953 } else {
1953 if (!silent) 1954 if (!silent)
1954 ext3_msg(sb, KERN_ERR, 1955 ext3_msg(sb, KERN_ERR,
1955 "error: no journal found. " 1956 "error: no journal found. "
1956 "mounting ext3 over ext2?"); 1957 "mounting ext3 over ext2?");
1958 goto failed_mount2;
1959 }
1960 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1961 ext3_count_free_blocks(sb));
1962 if (!err) {
1963 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1964 ext3_count_free_inodes(sb));
1965 }
1966 if (!err) {
1967 err = percpu_counter_init(&sbi->s_dirs_counter,
1968 ext3_count_dirs(sb));
1969 }
1970 if (err) {
1971 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1957 goto failed_mount3; 1972 goto failed_mount3;
1958 } 1973 }
1959 1974
@@ -1978,7 +1993,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1978 ext3_msg(sb, KERN_ERR, 1993 ext3_msg(sb, KERN_ERR,
1979 "error: journal does not support " 1994 "error: journal does not support "
1980 "requested data journaling mode"); 1995 "requested data journaling mode");
1981 goto failed_mount4; 1996 goto failed_mount3;
1982 } 1997 }
1983 default: 1998 default:
1984 break; 1999 break;
@@ -2001,19 +2016,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 if (IS_ERR(root)) { 2016 if (IS_ERR(root)) {
2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed"); 2017 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2003 ret = PTR_ERR(root); 2018 ret = PTR_ERR(root);
2004 goto failed_mount4; 2019 goto failed_mount3;
2005 } 2020 }
2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2021 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2007 iput(root); 2022 iput(root);
2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); 2023 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2009 goto failed_mount4; 2024 goto failed_mount3;
2010 } 2025 }
2011 sb->s_root = d_alloc_root(root); 2026 sb->s_root = d_alloc_root(root);
2012 if (!sb->s_root) { 2027 if (!sb->s_root) {
2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); 2028 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2014 iput(root); 2029 iput(root);
2015 ret = -ENOMEM; 2030 ret = -ENOMEM;
2016 goto failed_mount4; 2031 goto failed_mount3;
2017 } 2032 }
2018 2033
2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2034 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2054,11 @@ cantfind_ext3:
2039 sb->s_id); 2054 sb->s_id);
2040 goto failed_mount; 2055 goto failed_mount;
2041 2056
2042failed_mount4:
2043 journal_destroy(sbi->s_journal);
2044failed_mount3: 2057failed_mount3:
2045 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2058 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2046 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2059 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2047 percpu_counter_destroy(&sbi->s_dirs_counter); 2060 percpu_counter_destroy(&sbi->s_dirs_counter);
2061 journal_destroy(sbi->s_journal);
2048failed_mount2: 2062failed_mount2:
2049 for (i = 0; i < db_count; i++) 2063 for (i = 0; i < db_count; i++)
2050 brelse(sbi->s_group_desc[i]); 2064 brelse(sbi->s_group_desc[i]);
@@ -2278,6 +2292,9 @@ static int ext3_load_journal(struct super_block *sb,
2278 return -EINVAL; 2292 return -EINVAL;
2279 } 2293 }
2280 2294
2295 if (!(journal->j_flags & JFS_BARRIER))
2296 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2297
2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2298 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2282 err = journal_update_format(journal); 2299 err = journal_update_format(journal);
2283 if (err) { 2300 if (err) {
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
104 104
105static struct mb_cache *ext3_xattr_cache; 105static struct mb_cache *ext3_xattr_cache;
106 106
107static struct xattr_handler *ext3_xattr_handler_map[] = { 107static const struct xattr_handler *ext3_xattr_handler_map[] = {
108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, 108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
109#ifdef CONFIG_EXT3_FS_POSIX_ACL 109#ifdef CONFIG_EXT3_FS_POSIX_ACL
110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, 110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
116#endif 116#endif
117}; 117};
118 118
119struct xattr_handler *ext3_xattr_handlers[] = { 119const struct xattr_handler *ext3_xattr_handlers[] = {
120 &ext3_xattr_user_handler, 120 &ext3_xattr_user_handler,
121 &ext3_xattr_trusted_handler, 121 &ext3_xattr_trusted_handler,
122#ifdef CONFIG_EXT3_FS_POSIX_ACL 122#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
129 NULL 129 NULL
130}; 130};
131 131
132static inline struct xattr_handler * 132static inline const struct xattr_handler *
133ext3_xattr_handler(int name_index) 133ext3_xattr_handler(int name_index)
134{ 134{
135 struct xattr_handler *handler = NULL; 135 const struct xattr_handler *handler = NULL;
136 136
137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) 137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
138 handler = ext3_xattr_handler_map[name_index]; 138 handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
338 size_t rest = buffer_size; 338 size_t rest = buffer_size;
339 339
340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { 340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
341 struct xattr_handler *handler = 341 const struct xattr_handler *handler =
342 ext3_xattr_handler(entry->e_name_index); 342 ext3_xattr_handler(entry->e_name_index);
343 343
344 if (handler) { 344 if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
58 58
59# ifdef CONFIG_EXT3_FS_XATTR 59# ifdef CONFIG_EXT3_FS_XATTR
60 60
61extern struct xattr_handler ext3_xattr_user_handler; 61extern const struct xattr_handler ext3_xattr_user_handler;
62extern struct xattr_handler ext3_xattr_trusted_handler; 62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern struct xattr_handler ext3_xattr_acl_access_handler; 63extern const struct xattr_handler ext3_xattr_acl_access_handler;
64extern struct xattr_handler ext3_xattr_acl_default_handler; 64extern const struct xattr_handler ext3_xattr_acl_default_handler;
65extern struct xattr_handler ext3_xattr_security_handler; 65extern const struct xattr_handler ext3_xattr_security_handler;
66 66
67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); 67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
68 68
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
76extern int init_ext3_xattr(void); 76extern int init_ext3_xattr(void);
77extern void exit_ext3_xattr(void); 77extern void exit_ext3_xattr(void);
78 78
79extern struct xattr_handler *ext3_xattr_handlers[]; 79extern const struct xattr_handler *ext3_xattr_handlers[];
80 80
81# else /* CONFIG_EXT3_FS_XATTR */ 81# else /* CONFIG_EXT3_FS_XATTR */
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476dff..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext3_xattr_security_handler = { 72const struct xattr_handler ext3_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext3_xattr_security_list, 74 .list = ext3_xattr_security_list,
75 .get = ext3_xattr_security_get, 75 .get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
51 value, size, flags); 51 value, size, flags);
52} 52}
53 53
54struct xattr_handler ext3_xattr_trusted_handler = { 54const struct xattr_handler ext3_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext3_xattr_trusted_list, 56 .list = ext3_xattr_trusted_list,
57 .get = ext3_xattr_trusted_get, 57 .get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext3_xattr_user_handler = { 57const struct xattr_handler ext3_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext3_xattr_user_list, 59 .list = ext3_xattr_user_list,
60 .get = ext3_xattr_user_get, 60 .get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
454 return error; 454 return error;
455} 455}
456 456
457struct xattr_handler ext4_xattr_acl_access_handler = { 457const struct xattr_handler ext4_xattr_acl_access_handler = {
458 .prefix = POSIX_ACL_XATTR_ACCESS, 458 .prefix = POSIX_ACL_XATTR_ACCESS,
459 .flags = ACL_TYPE_ACCESS, 459 .flags = ACL_TYPE_ACCESS,
460 .list = ext4_xattr_list_acl_access, 460 .list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
462 .set = ext4_xattr_set_acl, 462 .set = ext4_xattr_set_acl,
463}; 463};
464 464
465struct xattr_handler ext4_xattr_acl_default_handler = { 465const struct xattr_handler ext4_xattr_acl_default_handler = {
466 .prefix = POSIX_ACL_XATTR_DEFAULT, 466 .prefix = POSIX_ACL_XATTR_DEFAULT,
467 .flags = ACL_TYPE_DEFAULT, 467 .flags = ACL_TYPE_DEFAULT,
468 .list = ext4_xattr_list_acl_default, 468 .list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 100 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 101 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 102 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT);
104 jbd2_log_wait_commit(journal, commit_tid); 105 jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 106 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT);
107 return ret; 109 return ret;
108} 110}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd6..1a0e183a2f04 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -979,16 +979,12 @@ got:
979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
980 } 980 }
981 981
982 inode->i_uid = current_fsuid(); 982 if (test_opt(sb, GRPID)) {
983 if (test_opt(sb, GRPID)) 983 inode->i_mode = mode;
984 inode->i_uid = current_fsuid();
984 inode->i_gid = dir->i_gid; 985 inode->i_gid = dir->i_gid;
985 else if (dir->i_mode & S_ISGID) {
986 inode->i_gid = dir->i_gid;
987 if (S_ISDIR(mode))
988 mode |= S_ISGID;
989 } else 986 } else
990 inode->i_gid = current_fsgid(); 987 inode_init_owner(inode, dir, mode);
991 inode->i_mode = mode;
992 988
993 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 989 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
994 /* This is the optimal IO size (for stat), not the fs block size */ 990 /* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d605412844..3e0f6af9d08d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5425,7 +5425,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5425 if (error) 5425 if (error)
5426 return error; 5426 return error;
5427 5427
5428 if (ia_valid & ATTR_SIZE) 5428 if (is_quota_modification(inode, attr))
5429 dquot_initialize(inode); 5429 dquot_initialize(inode);
5430 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5430 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5431 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5431 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..2de0e9515089 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static const struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
109#endif 109#endif
110}; 110};
111 111
112struct xattr_handler *ext4_xattr_handlers[] = { 112const struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static inline struct xattr_handler * 125static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 126ext4_xattr_handler(int name_index)
127{ 127{
128 struct xattr_handler *handler = NULL; 128 const struct xattr_handler *handler = NULL;
129 129
130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) 130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
131 handler = ext4_xattr_handler_map[name_index]; 131 handler = ext4_xattr_handler_map[name_index];
@@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
332 size_t rest = buffer_size; 332 size_t rest = buffer_size;
333 333
334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
335 struct xattr_handler *handler = 335 const struct xattr_handler *handler =
336 ext4_xattr_handler(entry->e_name_index); 336 ext4_xattr_handler(entry->e_name_index);
337 337
338 if (handler) { 338 if (handler) {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
65 65
66# ifdef CONFIG_EXT4_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern const struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern const struct xattr_handler ext4_xattr_trusted_handler;
70extern struct xattr_handler ext4_xattr_acl_access_handler; 70extern const struct xattr_handler ext4_xattr_acl_access_handler;
71extern struct xattr_handler ext4_xattr_acl_default_handler; 71extern const struct xattr_handler ext4_xattr_acl_default_handler;
72extern struct xattr_handler ext4_xattr_security_handler; 72extern const struct xattr_handler ext4_xattr_security_handler;
73 73
74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
75 75
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
86extern int init_ext4_xattr(void); 86extern int init_ext4_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df07..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext4_xattr_security_handler = { 72const struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list, 74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get, 75 .get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
51 name, value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54const struct xattr_handler ext4_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext4_xattr_trusted_list, 56 .list = ext4_xattr_trusted_list,
57 .get = ext4_xattr_trusted_get, 57 .get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext4_xattr_user_handler = { 57const struct xattr_handler ext4_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext4_xattr_user_list, 59 .list = ext4_xattr_user_list,
60 .get = ext4_xattr_user_get, 60 .get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
242 while (*fclus < cluster) { 242 while (*fclus < cluster) {
243 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
244 if (*fclus > limit) { 244 if (*fclus > limit) {
245 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
246 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
247 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
248 nr = -EIO; 249 nr = -EIO;
249 goto out; 250 goto out;
250 } 251 }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
253 if (nr < 0) 254 if (nr < 0)
254 goto out; 255 goto out;
255 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
256 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
257 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
258 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
259 nr = -EIO; 260 nr = -EIO;
260 goto out; 261 goto out;
261 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..53dba57b49a1 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,8 +301,8 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 326 struct inode *i2);
324/* fat/misc.c */ 327/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 328extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 335extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..a14c2f6a489e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..ed33904926ee 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1250 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1251 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1252 sbi->dir_ops = fs_dir_inode_ops;
1253 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1254 DEFAULT_RATELIMIT_BURST);
1253 1255
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1256 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1257 if (error)
@@ -1497,10 +1499,8 @@ out_fail:
1497 iput(fat_inode); 1499 iput(fat_inode);
1498 if (root_inode) 1500 if (root_inode)
1499 iput(root_inode); 1501 iput(root_inode);
1500 if (sbi->nls_io) 1502 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1503 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1504 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1505 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1506 sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0a140741b39e..f74d270ba155 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 413 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 414 err = fcntl_dirnotify(fd, filp, arg);
414 break; 415 break;
416 case F_SETPIPE_SZ:
417 case F_GETPIPE_SZ:
418 err = pipe_fcntl(filp, cmd, arg);
419 break;
415 default: 420 default:
416 break; 421 break;
417 } 422 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..ea8592b90696 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,9 +42,10 @@ struct wb_writeback_args {
42 long nr_pages; 42 long nr_pages;
43 struct super_block *sb; 43 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
45 int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 int for_background:1; 47 unsigned int for_background:1;
48 unsigned int sb_pinned:1;
48}; 49};
49 50
50/* 51/*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
192} 193}
193 194
194static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 195static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
195 struct wb_writeback_args *args) 196 struct wb_writeback_args *args,
197 int wait)
196{ 198{
197 struct bdi_work *work; 199 struct bdi_work *work;
198 200
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
204 if (work) { 206 if (work) {
205 bdi_work_init(work, args); 207 bdi_work_init(work, args);
206 bdi_queue_work(bdi, work); 208 bdi_queue_work(bdi, work);
209 if (wait)
210 bdi_wait_on_work_clear(work);
207 } else { 211 } else {
208 struct bdi_writeback *wb = &bdi->wb; 212 struct bdi_writeback *wb = &bdi->wb;
209 213
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
230 .sync_mode = WB_SYNC_ALL, 234 .sync_mode = WB_SYNC_ALL,
231 .nr_pages = LONG_MAX, 235 .nr_pages = LONG_MAX,
232 .range_cyclic = 0, 236 .range_cyclic = 0,
237 /*
238 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
239 * lets make it explicitly clear.
240 */
241 .sb_pinned = 1,
233 }; 242 };
234 struct bdi_work work; 243 struct bdi_work work;
235 244
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
245 * @bdi: the backing device to write from 254 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block 255 * @sb: write inodes from this super_block
247 * @nr_pages: the number of pages to write 256 * @nr_pages: the number of pages to write
257 * @sb_locked: caller already holds sb umount sem.
248 * 258 *
249 * Description: 259 * Description:
250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 260 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
251 * started when this function returns, we make no guarentees on 261 * started when this function returns, we make no guarentees on
252 * completion. Caller need not hold sb s_umount semaphore. 262 * completion. Caller specifies whether sb umount sem is held already or not.
253 * 263 *
254 */ 264 */
255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 265void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
256 long nr_pages) 266 long nr_pages, int sb_locked)
257{ 267{
258 struct wb_writeback_args args = { 268 struct wb_writeback_args args = {
259 .sb = sb, 269 .sb = sb,
260 .sync_mode = WB_SYNC_NONE, 270 .sync_mode = WB_SYNC_NONE,
261 .nr_pages = nr_pages, 271 .nr_pages = nr_pages,
262 .range_cyclic = 1, 272 .range_cyclic = 1,
273 .sb_pinned = sb_locked,
263 }; 274 };
264 275
265 /* 276 /*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
271 args.for_background = 1; 282 args.for_background = 1;
272 } 283 }
273 284
274 bdi_alloc_queue_work(bdi, &args); 285 bdi_alloc_queue_work(bdi, &args, sb_locked);
275} 286}
276 287
277/* 288/*
@@ -398,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
398 wait_queue_head_t *wqh; 409 wait_queue_head_t *wqh;
399 410
400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 411 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
401 do { 412 while (inode->i_state & I_SYNC) {
402 spin_unlock(&inode_lock); 413 spin_unlock(&inode_lock);
403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 414 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
404 spin_lock(&inode_lock); 415 spin_lock(&inode_lock);
405 } while (inode->i_state & I_SYNC); 416 }
406} 417}
407 418
408/* 419/*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
452 463
453 BUG_ON(inode->i_state & I_SYNC); 464 BUG_ON(inode->i_state & I_SYNC);
454 465
455 /* Set I_SYNC, reset I_DIRTY */ 466 /* Set I_SYNC, reset I_DIRTY_PAGES */
456 dirty = inode->i_state & I_DIRTY;
457 inode->i_state |= I_SYNC; 467 inode->i_state |= I_SYNC;
458 inode->i_state &= ~I_DIRTY; 468 inode->i_state &= ~I_DIRTY_PAGES;
459
460 spin_unlock(&inode_lock); 469 spin_unlock(&inode_lock);
461 470
462 ret = do_writepages(mapping, wbc); 471 ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
472 ret = err; 481 ret = err;
473 } 482 }
474 483
484 /*
485 * Some filesystems may redirty the inode during the writeback
486 * due to delalloc, clear dirty metadata flags right before
487 * write_inode()
488 */
489 spin_lock(&inode_lock);
490 dirty = inode->i_state & I_DIRTY;
491 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
492 spin_unlock(&inode_lock);
475 /* Don't write the inode if only I_DIRTY_PAGES was set */ 493 /* Don't write the inode if only I_DIRTY_PAGES was set */
476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 494 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc); 495 int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
577 /* 595 /*
578 * Caller must already hold the ref for this 596 * Caller must already hold the ref for this
579 */ 597 */
580 if (wbc->sync_mode == WB_SYNC_ALL) { 598 if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
581 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 599 WARN_ON(!rwsem_is_locked(&sb->s_umount));
582 return SB_NOT_PINNED; 600 return SB_NOT_PINNED;
583 } 601 }
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
751 .for_kupdate = args->for_kupdate, 769 .for_kupdate = args->for_kupdate,
752 .for_background = args->for_background, 770 .for_background = args->for_background,
753 .range_cyclic = args->range_cyclic, 771 .range_cyclic = args->range_cyclic,
772 .sb_pinned = args->sb_pinned,
754 }; 773 };
755 unsigned long oldest_jif; 774 unsigned long oldest_jif;
756 long wrote = 0; 775 long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
852 unsigned long expired; 871 unsigned long expired;
853 long nr_pages; 872 long nr_pages;
854 873
874 /*
875 * When set to zero, disable periodic writeback
876 */
877 if (!dirty_writeback_interval)
878 return 0;
879
855 expired = wb->last_old_flush + 880 expired = wb->last_old_flush +
856 msecs_to_jiffies(dirty_writeback_interval * 10); 881 msecs_to_jiffies(dirty_writeback_interval * 10);
857 if (time_before(jiffies, expired)) 882 if (time_before(jiffies, expired))
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
887 912
888 while ((work = get_next_work_item(bdi, wb)) != NULL) { 913 while ((work = get_next_work_item(bdi, wb)) != NULL) {
889 struct wb_writeback_args args = work->args; 914 struct wb_writeback_args args = work->args;
915 int post_clear;
890 916
891 /* 917 /*
892 * Override sync mode, in case we must wait for completion 918 * Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
894 if (force_wait) 920 if (force_wait)
895 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
896 922
923 post_clear = WB_SYNC_ALL || args.sb_pinned;
924
897 /* 925 /*
898 * If this isn't a data integrity operation, just notify 926 * If this isn't a data integrity operation, just notify
899 * that we have seen this work and we are now starting it. 927 * that we have seen this work and we are now starting it.
900 */ 928 */
901 if (args.sync_mode == WB_SYNC_NONE) 929 if (!post_clear)
902 wb_clear_pending(wb, work); 930 wb_clear_pending(wb, work);
903 931
904 wrote += wb_writeback(wb, &args); 932 wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
907 * This is a data integrity writeback, so only do the 935 * This is a data integrity writeback, so only do the
908 * notification when we have completed the work. 936 * notification when we have completed the work.
909 */ 937 */
910 if (args.sync_mode == WB_SYNC_ALL) 938 if (post_clear)
911 wb_clear_pending(wb, work); 939 wb_clear_pending(wb, work);
912 } 940 }
913 941
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
947 break; 975 break;
948 } 976 }
949 977
950 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 978 if (dirty_writeback_interval) {
951 schedule_timeout_interruptible(wait_jiffies); 979 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
980 schedule_timeout_interruptible(wait_jiffies);
981 } else {
982 set_current_state(TASK_INTERRUPTIBLE);
983 if (list_empty_careful(&wb->bdi->work_list) &&
984 !kthread_should_stop())
985 schedule();
986 __set_current_state(TASK_RUNNING);
987 }
988
952 try_to_freeze(); 989 try_to_freeze();
953 } 990 }
954 991
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
974 if (!bdi_has_dirty_io(bdi)) 1011 if (!bdi_has_dirty_io(bdi))
975 continue; 1012 continue;
976 1013
977 bdi_alloc_queue_work(bdi, &args); 1014 bdi_alloc_queue_work(bdi, &args, 0);
978 } 1015 }
979 1016
980 rcu_read_unlock(); 1017 rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
1183 iput(old_inode); 1220 iput(old_inode);
1184} 1221}
1185 1222
1223static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1224{
1225 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1226 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1227 long nr_to_write;
1228
1229 nr_to_write = nr_dirty + nr_unstable +
1230 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1231
1232 bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
1233}
1234
1186/** 1235/**
1187 * writeback_inodes_sb - writeback dirty inodes from given super_block 1236 * writeback_inodes_sb - writeback dirty inodes from given super_block
1188 * @sb: the superblock 1237 * @sb: the superblock
@@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
1194 */ 1243 */
1195void writeback_inodes_sb(struct super_block *sb) 1244void writeback_inodes_sb(struct super_block *sb)
1196{ 1245{
1197 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1246 __writeback_inodes_sb(sb, 0);
1198 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1199 long nr_to_write;
1200
1201 nr_to_write = nr_dirty + nr_unstable +
1202 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1203
1204 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1205} 1247}
1206EXPORT_SYMBOL(writeback_inodes_sb); 1248EXPORT_SYMBOL(writeback_inodes_sb);
1207 1249
1208/** 1250/**
1251 * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
1252 * @sb: the superblock
1253 *
1254 * Like writeback_inodes_sb(), except the caller already holds the
1255 * sb umount sem.
1256 */
1257void writeback_inodes_sb_locked(struct super_block *sb)
1258{
1259 __writeback_inodes_sb(sb, 1);
1260}
1261
1262/**
1209 * writeback_inodes_sb_if_idle - start writeback if none underway 1263 * writeback_inodes_sb_if_idle - start writeback if none underway
1210 * @sb: the superblock 1264 * @sb: the superblock
1211 * 1265 *
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df5457656..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
201 return -EAGAIN; 201 return -EAGAIN;
202} 202}
203 203
204struct xattr_handler generic_acl_access_handler = { 204const struct xattr_handler generic_acl_access_handler = {
205 .prefix = POSIX_ACL_XATTR_ACCESS, 205 .prefix = POSIX_ACL_XATTR_ACCESS,
206 .flags = ACL_TYPE_ACCESS, 206 .flags = ACL_TYPE_ACCESS,
207 .list = generic_acl_list, 207 .list = generic_acl_list,
@@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
209 .set = generic_acl_set, 209 .set = generic_acl_set,
210}; 210};
211 211
212struct xattr_handler generic_acl_default_handler = { 212const struct xattr_handler generic_acl_default_handler = {
213 .prefix = POSIX_ACL_XATTR_DEFAULT, 213 .prefix = POSIX_ACL_XATTR_DEFAULT,
214 .flags = ACL_TYPE_DEFAULT, 214 .flags = ACL_TYPE_DEFAULT,
215 .list = generic_acl_list, 215 .list = generic_acl_list,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ca991d776592..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -339,7 +339,7 @@ out:
339 return error; 339 return error;
340} 340}
341 341
342struct xattr_handler gfs2_xattr_system_handler = { 342const struct xattr_handler gfs2_xattr_system_handler = {
343 .prefix = XATTR_SYSTEM_PREFIX, 343 .prefix = XATTR_SYSTEM_PREFIX,
344 .flags = GFS2_EATYPE_SYS, 344 .flags = GFS2_EATYPE_SYS,
345 .get = gfs2_xattr_system_get, 345 .get = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
23 23
24#endif /* __ACL_DOT_H__ */ 24#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index d5f4661287f9..49667d68769e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1476,8 +1476,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1476 return 0; 1476 return 0;
1477} 1477}
1478 1478
1479static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, 1479static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1480 struct fs_disk_quota *fdq) 1480 struct fs_disk_quota *fdq)
1481{ 1481{
1482 struct gfs2_sbd *sdp = sb->s_fs_info; 1482 struct gfs2_sbd *sdp = sb->s_fs_info;
1483 struct gfs2_quota_lvb *qlvb; 1483 struct gfs2_quota_lvb *qlvb;
@@ -1521,8 +1521,8 @@ out:
1521/* GFS2 only supports a subset of the XFS fields */ 1521/* GFS2 only supports a subset of the XFS fields */
1522#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1522#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1523 1523
1524static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, 1524static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1525 struct fs_disk_quota *fdq) 1525 struct fs_disk_quota *fdq)
1526{ 1526{
1527 struct gfs2_sbd *sdp = sb->s_fs_info; 1527 struct gfs2_sbd *sdp = sb->s_fs_info;
1528 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1528 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1629,7 +1629,7 @@ out_put:
1629const struct quotactl_ops gfs2_quotactl_ops = { 1629const struct quotactl_ops gfs2_quotactl_ops = {
1630 .quota_sync = gfs2_quota_sync, 1630 .quota_sync = gfs2_quota_sync,
1631 .get_xstate = gfs2_quota_get_xstate, 1631 .get_xstate = gfs2_quota_get_xstate,
1632 .get_xquota = gfs2_xquota_get, 1632 .get_dqblk = gfs2_get_dqblk,
1633 .set_xquota = gfs2_xquota_set, 1633 .set_dqblk = gfs2_set_dqblk,
1634}; 1634};
1635 1635
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6daf4c65a3c8..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[]; 57extern const struct xattr_handler *gfs2_xattr_handlers[];
58 58
59#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
60 60
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
1535 return error; 1535 return error;
1536} 1536}
1537 1537
1538static struct xattr_handler gfs2_xattr_user_handler = { 1538static const struct xattr_handler gfs2_xattr_user_handler = {
1539 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1540 .flags = GFS2_EATYPE_USR, 1540 .flags = GFS2_EATYPE_USR,
1541 .get = gfs2_xattr_get, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set, 1542 .set = gfs2_xattr_set,
1543}; 1543};
1544 1544
1545static struct xattr_handler gfs2_xattr_security_handler = { 1545static const struct xattr_handler gfs2_xattr_security_handler = {
1546 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1547 .flags = GFS2_EATYPE_SECURITY, 1547 .flags = GFS2_EATYPE_SECURITY,
1548 .get = gfs2_xattr_get, 1548 .get = gfs2_xattr_get,
1549 .set = gfs2_xattr_set, 1549 .set = gfs2_xattr_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552const struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler, 1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler, 1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler, 1555 &gfs2_xattr_system_handler,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
diff --git a/fs/inode.c b/fs/inode.c
index 258ec22bb298..2bee20ae3d65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
286 */ 286 */
287void __iget(struct inode *inode) 287void __iget(struct inode *inode)
288{ 288{
289 if (atomic_read(&inode->i_count)) { 289 if (atomic_inc_return(&inode->i_count) != 1)
290 atomic_inc(&inode->i_count);
291 return; 290 return;
292 } 291
293 atomic_inc(&inode->i_count);
294 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 292 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
295 list_move(&inode->i_list, &inode_in_use); 293 list_move(&inode->i_list, &inode_in_use);
296 inodes_stat.nr_unused--; 294 inodes_stat.nr_unused--;
@@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1608 inode->i_ino); 1606 inode->i_ino);
1609} 1607}
1610EXPORT_SYMBOL(init_special_inode); 1608EXPORT_SYMBOL(init_special_inode);
1609
1610/**
1611 * Init uid,gid,mode for new inode according to posix standards
1612 * @inode: New inode
1613 * @dir: Directory inode
1614 * @mode: mode of the new inode
1615 */
1616void inode_init_owner(struct inode *inode, const struct inode *dir,
1617 mode_t mode)
1618{
1619 inode->i_uid = current_fsuid();
1620 if (dir && dir->i_mode & S_ISGID) {
1621 inode->i_gid = dir->i_gid;
1622 if (S_ISDIR(mode))
1623 mode |= S_ISGID;
1624 } else
1625 inode->i_gid = current_fsgid();
1626 inode->i_mode = mode;
1627}
1628EXPORT_SYMBOL(inode_init_owner);
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 87 * super.c
88 */ 88 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 89extern int do_remount_sb(struct super_block *, int, void *, int);
90extern void __put_super(struct super_block *sb);
91extern void put_super(struct super_block *sb);
90 92
91/* 93/*
92 * open.c 94 * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da939..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
525 if (sb->s_op->freeze_fs == NULL) 525 if (sb->s_op->freeze_fs == NULL)
526 return -EOPNOTSUPP; 526 return -EOPNOTSUPP;
527 527
528 /* If a blockdevice-backed filesystem isn't specified, return. */
529 if (sb->s_bdev == NULL)
530 return -EINVAL;
531
532 /* Freeze */ 528 /* Freeze */
533 sb = freeze_bdev(sb->s_bdev); 529 return freeze_super(sb);
534 if (IS_ERR(sb))
535 return PTR_ERR(sb);
536 return 0;
537} 530}
538 531
539static int ioctl_fsthaw(struct file *filp) 532static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
543 if (!capable(CAP_SYS_ADMIN)) 536 if (!capable(CAP_SYS_ADMIN))
544 return -EPERM; 537 return -EPERM;
545 538
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 /* Thaw */ 539 /* Thaw */
551 return thaw_bdev(sb->s_bdev, sb); 540 return thaw_super(sb);
552} 541}
553 542
554/* 543/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ecb44c94ba8d..28a9ddaa0c49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -786,6 +786,12 @@ wait_for_iobuf:
786 786
787 jbd_debug(3, "JBD: commit phase 6\n"); 787 jbd_debug(3, "JBD: commit phase 6\n");
788 788
789 /* All metadata is written, now write commit record and do cleanup */
790 spin_lock(&journal->j_state_lock);
791 J_ASSERT(commit_transaction->t_state == T_COMMIT);
792 commit_transaction->t_state = T_COMMIT_RECORD;
793 spin_unlock(&journal->j_state_lock);
794
789 if (journal_write_commit_record(journal, commit_transaction)) 795 if (journal_write_commit_record(journal, commit_transaction))
790 err = -EIO; 796 err = -EIO;
791 797
@@ -923,7 +929,7 @@ restart_loop:
923 929
924 jbd_debug(3, "JBD: commit phase 8\n"); 930 jbd_debug(3, "JBD: commit phase 8\n");
925 931
926 J_ASSERT(commit_transaction->t_state == T_COMMIT); 932 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
927 933
928 commit_transaction->t_state = T_FINISHED; 934 commit_transaction->t_state = T_FINISHED;
929 J_ASSERT(commit_transaction == journal->j_committing_transaction); 935 J_ASSERT(commit_transaction == journal->j_committing_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..93d1e47647bd 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent barrier request
569 * connected with a transaction commit. If 0 is returned, transaction
570 * may or may not have sent the barrier. Used to avoid sending barrier
571 * twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..a33aab6b5e68 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 419 return rc;
420} 420}
421 421
422struct xattr_handler jffs2_acl_access_xattr_handler = { 422const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 424 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
@@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 427 .set = jffs2_acl_setxattr,
428}; 428};
429 429
430struct xattr_handler jffs2_acl_default_xattr_handler = { 430const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 432 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..a2d58c96f1b4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_trusted_xattr_handler = { 50const struct xattr_handler jffs2_trusted_xattr_handler = {
51 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
52 .list = jffs2_trusted_listxattr, 52 .list = jffs2_trusted_listxattr,
53 .set = jffs2_trusted_setxattr, 53 .set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_user_xattr_handler = { 50const struct xattr_handler jffs2_user_xattr_handler = {
51 .prefix = XATTR_USER_PREFIX, 51 .prefix = XATTR_USER_PREFIX,
52 .list = jffs2_user_listxattr, 52 .list = jffs2_user_listxattr,
53 .set = jffs2_user_setxattr, 53 .set = jffs2_user_setxattr,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..85d9ec659225 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -98,7 +98,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
98 if (rc) 98 if (rc)
99 return rc; 99 return rc;
100 100
101 if (iattr->ia_valid & ATTR_SIZE) 101 if (is_quota_modification(inode, iattr))
102 dquot_initialize(inode); 102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
98 goto fail_unlock; 98 goto fail_unlock;
99 } 99 }
100 100
101 inode->i_uid = current_fsuid(); 101 inode_init_owner(inode, parent, mode);
102 if (parent->i_mode & S_ISGID) {
103 inode->i_gid = parent->i_gid;
104 if (S_ISDIR(mode))
105 mode |= S_ISGID;
106 } else
107 inode->i_gid = current_fsgid();
108
109 /* 102 /*
110 * New inodes need to save sane values on disk when 103 * New inodes need to save sane values on disk when
111 * uid & gid mount options are used 104 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
121 if (rc) 114 if (rc)
122 goto fail_drop; 115 goto fail_drop;
123 116
124 inode->i_mode = mode;
125 /* inherit flags from parent */ 117 /* inherit flags from parent */
126 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; 118 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
127 119
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
134 if (S_ISLNK(mode)) 126 if (S_ISLNK(mode))
135 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); 127 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
136 } 128 }
137 jfs_inode->mode2 |= mode; 129 jfs_inode->mode2 |= inode->i_mode;
138 130
139 inode->i_blocks = 0; 131 inode->i_blocks = 0;
140 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 132 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 755a92e8daa7..f602e230e162 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
358 inode->i_mode = mode; 358 inode->i_mode = mode;
359 logfs_set_ino_generation(sb, inode); 359 logfs_set_ino_generation(sb, inode);
360 360
361 inode->i_uid = current_fsuid(); 361 inode_init_owner(inode, dir, mode);
362 inode->i_gid = current_fsgid();
363 if (dir->i_mode & S_ISGID) {
364 inode->i_gid = dir->i_gid;
365 if (S_ISDIR(mode))
366 inode->i_mode |= S_ISGID;
367 }
368
369 logfs_inode_setops(inode); 362 logfs_inode_setops(inode);
370 insert_inode_hash(inode); 363 insert_inode_hash(inode);
371 364
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
221 clear_inode(inode); /* clear in-memory copy */ 221 clear_inode(inode); /* clear in-memory copy */
222} 222}
223 223
224struct inode * minix_new_inode(const struct inode * dir, int * error) 224struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
225{ 225{
226 struct super_block *sb = dir->i_sb; 226 struct super_block *sb = dir->i_sb;
227 struct minix_sb_info *sbi = minix_sb(sb); 227 struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
263 iput(inode); 263 iput(inode);
264 return NULL; 264 return NULL;
265 } 265 }
266 inode->i_uid = current_fsuid(); 266 inode_init_owner(inode, dir, mode);
267 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
268 inode->i_ino = j; 267 inode->i_ino = j;
269 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 268 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
270 inode->i_blocks = 0; 269 inode->i_blocks = 0;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
46extern struct inode *minix_iget(struct super_block *, unsigned long); 46extern struct inode *minix_iget(struct super_block *, unsigned long);
47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); 47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode * dir, int * error); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
46 if (!old_valid_dev(rdev)) 46 if (!old_valid_dev(rdev))
47 return -EINVAL; 47 return -EINVAL;
48 48
49 inode = minix_new_inode(dir, &error); 49 inode = minix_new_inode(dir, mode, &error);
50 50
51 if (inode) { 51 if (inode) {
52 inode->i_mode = mode;
53 minix_set_inode(inode, rdev); 52 minix_set_inode(inode, rdev);
54 mark_inode_dirty(inode); 53 mark_inode_dirty(inode);
55 error = add_nondir(dentry, inode); 54 error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
73 if (i > dir->i_sb->s_blocksize) 72 if (i > dir->i_sb->s_blocksize)
74 goto out; 73 goto out;
75 74
76 inode = minix_new_inode(dir, &err); 75 inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
77 if (!inode) 76 if (!inode)
78 goto out; 77 goto out;
79 78
80 inode->i_mode = S_IFLNK | 0777;
81 minix_set_inode(inode, 0); 79 minix_set_inode(inode, 0);
82 err = page_symlink(inode, symname, i); 80 err = page_symlink(inode, symname, i);
83 if (err) 81 if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
117 115
118 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
119 117
120 inode = minix_new_inode(dir, &err); 118 inode = minix_new_inode(dir, mode, &err);
121 if (!inode) 119 if (!inode)
122 goto out_dir; 120 goto out_dir;
123 121
124 inode->i_mode = S_IFDIR | mode;
125 if (dir->i_mode & S_ISGID)
126 inode->i_mode |= S_ISGID;
127 minix_set_inode(inode, 0); 122 minix_set_inode(inode, 0);
128 123
129 inode_inc_link_count(inode); 124 inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc3..48e1f60520ea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 523static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{ 524{
525 dput(nd->path.dentry); 525 dput(nd->path.dentry);
526 if (nd->path.mnt != path->mnt) 526 if (nd->path.mnt != path->mnt) {
527 mntput(nd->path.mnt); 527 mntput(nd->path.mnt);
528 nd->path.mnt = path->mnt; 528 nd->path.mnt = path->mnt;
529 }
529 nd->path.dentry = path->dentry; 530 nd->path.dentry = path->dentry;
530} 531}
531 532
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..92dde6f8d893 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -51,7 +51,7 @@ const struct file_operations ncp_dir_operations =
51{ 51{
52 .read = generic_read_dir, 52 .read = generic_read_dir,
53 .readdir = ncp_readdir, 53 .readdir = ncp_readdir,
54 .ioctl = ncp_ioctl, 54 .unlocked_ioctl = ncp_ioctl,
55#ifdef CONFIG_COMPAT 55#ifdef CONFIG_COMPAT
56 .compat_ioctl = ncp_compat_ioctl, 56 .compat_ioctl = ncp_compat_ioctl,
57#endif 57#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..b93870892892 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
295 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
296 .read = ncp_file_read, 296 .read = ncp_file_read,
297 .write = ncp_file_write, 297 .write = ncp_file_write,
298 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
299#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
300 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
301#endif 301#endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
20#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/smp_lock.h>
23 24
24#include <linux/ncp_fs.h> 25#include <linux/ncp_fs.h>
25 26
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 262}
262#endif /* CONFIG_NCPFS_NLS */ 263#endif /* CONFIG_NCPFS_NLS */
263 264
264static int __ncp_ioctl(struct inode *inode, struct file *filp, 265static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
265 unsigned int cmd, unsigned long arg)
266{ 266{
267 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 268 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 269 int result;
269 struct ncp_ioctl_request request; 270 struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
841 } 842 }
842} 843}
843 844
844int ncp_ioctl(struct inode *inode, struct file *filp, 845long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845 unsigned int cmd, unsigned long arg)
846{ 846{
847 int ret; 847 long ret;
848 848
849 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) { 850 if (ncp_ioctl_need_write(cmd)) {
850 /* 851 /*
851 * inside the ioctl(), any failures which 852 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
853 * -EACCESS, so it seems consistent to keep 854 * -EACCESS, so it seems consistent to keep
854 * that here. 855 * that here.
855 */ 856 */
856 if (mnt_want_write(filp->f_path.mnt)) 857 if (mnt_want_write(filp->f_path.mnt)) {
857 return -EACCES; 858 ret = -EACCES;
859 goto out;
860 }
858 } 861 }
859 ret = __ncp_ioctl(inode, filp, cmd, arg); 862 ret = __ncp_ioctl(filp, cmd, arg);
860 if (ncp_ioctl_need_write(cmd)) 863 if (ncp_ioctl_need_write(cmd))
861 mnt_drop_write(filp->f_path.mnt); 864 mnt_drop_write(filp->f_path.mnt);
865
866out:
867 unlock_kernel();
862 return ret; 868 return ret;
863} 869}
864 870
865#ifdef CONFIG_COMPAT 871#ifdef CONFIG_COMPAT
866long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 872long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
867{ 873{
868 struct inode *inode = file->f_path.dentry->d_inode; 874 long ret;
869 int ret;
870 875
871 lock_kernel(); 876 lock_kernel();
872 arg = (unsigned long) compat_ptr(arg); 877 arg = (unsigned long) compat_ptr(arg);
873 ret = ncp_ioctl(inode, file, cmd, arg); 878 ret = ncp_ioctl(file, cmd, arg);
874 unlock_kernel(); 879 unlock_kernel();
875 return ret; 880 return ret;
876} 881}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
1060 goto out_nomem; 1060 goto out_nomem;
1061 rc = strict_strtoul(string, 10, &option); 1061 rc = strict_strtoul(string, 10, &option);
1062 kfree(string); 1062 kfree(string);
1063 if (rc != 0 || option > USHORT_MAX) 1063 if (rc != 0 || option > USHRT_MAX)
1064 goto out_invalid_value; 1064 goto out_invalid_value;
1065 mnt->nfs_server.port = option; 1065 mnt->nfs_server.port = option;
1066 break; 1066 break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
1181 goto out_nomem; 1181 goto out_nomem;
1182 rc = strict_strtoul(string, 10, &option); 1182 rc = strict_strtoul(string, 10, &option);
1183 kfree(string); 1183 kfree(string);
1184 if (rc != 0 || option > USHORT_MAX) 1184 if (rc != 0 || option > USHRT_MAX)
1185 goto out_invalid_value; 1185 goto out_invalid_value;
1186 mnt->mount_server.port = option; 1186 mnt->mount_server.port = option;
1187 break; 1187 break;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4b..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
44#define NFSDDBG_FACILITY NFSDDBG_PROC 44#define NFSDDBG_FACILITY NFSDDBG_PROC
45 45
46/* Globals */ 46/* Globals */
47static struct path rec_dir; 47static struct file *rec_file;
48static int rec_dir_init = 0;
49 48
50static int 49static int
51nfs4_save_creds(const struct cred **original_creds) 50nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
117 return status; 116 return status;
118} 117}
119 118
120static void
121nfsd4_sync_rec_dir(void)
122{
123 vfs_fsync(NULL, rec_dir.dentry, 0);
124}
125
126int 119int
127nfsd4_create_clid_dir(struct nfs4_client *clp) 120nfsd4_create_clid_dir(struct nfs4_client *clp)
128{ 121{
129 const struct cred *original_cred; 122 const struct cred *original_cred;
130 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
131 struct dentry *dentry; 124 struct dentry *dir, *dentry;
132 int status; 125 int status;
133 126
134 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
135 128
136 if (!rec_dir_init || clp->cl_firststate) 129 if (!rec_file || clp->cl_firststate)
137 return 0; 130 return 0;
138 131
139 status = nfs4_save_creds(&original_cred); 132 status = nfs4_save_creds(&original_cred);
140 if (status < 0) 133 if (status < 0)
141 return status; 134 return status;
142 135
136 dir = rec_file->f_path.dentry;
143 /* lock the parent */ 137 /* lock the parent */
144 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 138 mutex_lock(&dir->d_inode->i_mutex);
145 139
146 dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); 140 dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
147 if (IS_ERR(dentry)) { 141 if (IS_ERR(dentry)) {
148 status = PTR_ERR(dentry); 142 status = PTR_ERR(dentry);
149 goto out_unlock; 143 goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
153 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); 147 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
154 goto out_put; 148 goto out_put;
155 } 149 }
156 status = mnt_want_write(rec_dir.mnt); 150 status = mnt_want_write(rec_file->f_path.mnt);
157 if (status) 151 if (status)
158 goto out_put; 152 goto out_put;
159 status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); 153 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
160 mnt_drop_write(rec_dir.mnt); 154 mnt_drop_write(rec_file->f_path.mnt);
161out_put: 155out_put:
162 dput(dentry); 156 dput(dentry);
163out_unlock: 157out_unlock:
164 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 158 mutex_unlock(&dir->d_inode->i_mutex);
165 if (status == 0) { 159 if (status == 0) {
166 clp->cl_firststate = 1; 160 clp->cl_firststate = 1;
167 nfsd4_sync_rec_dir(); 161 vfs_fsync(rec_file, 0);
168 } 162 }
169 nfs4_reset_creds(original_cred); 163 nfs4_reset_creds(original_cred);
170 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); 164 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
206 struct dentry *dentry; 200 struct dentry *dentry;
207 int status; 201 int status;
208 202
209 if (!rec_dir_init) 203 if (!rec_file)
210 return 0; 204 return 0;
211 205
212 status = nfs4_save_creds(&original_cred); 206 status = nfs4_save_creds(&original_cred);
213 if (status < 0) 207 if (status < 0)
214 return status; 208 return status;
215 209
216 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 210 filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
217 current_cred()); 211 current_cred());
218 status = PTR_ERR(filp); 212 status = PTR_ERR(filp);
219 if (IS_ERR(filp)) 213 if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
250static int 244static int
251nfsd4_unlink_clid_dir(char *name, int namlen) 245nfsd4_unlink_clid_dir(char *name, int namlen)
252{ 246{
253 struct dentry *dentry; 247 struct dentry *dir, *dentry;
254 int status; 248 int status;
255 249
256 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 250 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
257 251
258 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 252 dir = rec_file->f_path.dentry;
259 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 253 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
254 dentry = lookup_one_len(name, dir, namlen);
260 if (IS_ERR(dentry)) { 255 if (IS_ERR(dentry)) {
261 status = PTR_ERR(dentry); 256 status = PTR_ERR(dentry);
262 goto out_unlock; 257 goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
264 status = -ENOENT; 259 status = -ENOENT;
265 if (!dentry->d_inode) 260 if (!dentry->d_inode)
266 goto out; 261 goto out;
267 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); 262 status = vfs_rmdir(dir->d_inode, dentry);
268out: 263out:
269 dput(dentry); 264 dput(dentry);
270out_unlock: 265out_unlock:
271 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 266 mutex_unlock(&dir->d_inode->i_mutex);
272 return status; 267 return status;
273} 268}
274 269
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
278 const struct cred *original_cred; 273 const struct cred *original_cred;
279 int status; 274 int status;
280 275
281 if (!rec_dir_init || !clp->cl_firststate) 276 if (!rec_file || !clp->cl_firststate)
282 return; 277 return;
283 278
284 status = mnt_want_write(rec_dir.mnt); 279 status = mnt_want_write(rec_file->f_path.mnt);
285 if (status) 280 if (status)
286 goto out; 281 goto out;
287 clp->cl_firststate = 0; 282 clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
293 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 288 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
294 nfs4_reset_creds(original_cred); 289 nfs4_reset_creds(original_cred);
295 if (status == 0) 290 if (status == 0)
296 nfsd4_sync_rec_dir(); 291 vfs_fsync(rec_file, 0);
297 mnt_drop_write(rec_dir.mnt); 292 mnt_drop_write(rec_file->f_path.mnt);
298out: 293out:
299 if (status) 294 if (status)
300 printk("NFSD: Failed to remove expired client state directory" 295 printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
323nfsd4_recdir_purge_old(void) { 318nfsd4_recdir_purge_old(void) {
324 int status; 319 int status;
325 320
326 if (!rec_dir_init) 321 if (!rec_file)
327 return; 322 return;
328 status = mnt_want_write(rec_dir.mnt); 323 status = mnt_want_write(rec_file->f_path.mnt);
329 if (status) 324 if (status)
330 goto out; 325 goto out;
331 status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); 326 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
332 if (status == 0) 327 if (status == 0)
333 nfsd4_sync_rec_dir(); 328 vfs_fsync(rec_file, 0);
334 mnt_drop_write(rec_dir.mnt); 329 mnt_drop_write(rec_file->f_path.mnt);
335out: 330out:
336 if (status) 331 if (status)
337 printk("nfsd4: failed to purge old clients from recovery" 332 printk("nfsd4: failed to purge old clients from recovery"
338 " directory %s\n", rec_dir.dentry->d_name.name); 333 " directory %s\n", rec_file->f_path.dentry->d_name.name);
339} 334}
340 335
341static int 336static int
@@ -355,10 +350,13 @@ int
355nfsd4_recdir_load(void) { 350nfsd4_recdir_load(void) {
356 int status; 351 int status;
357 352
358 status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); 353 if (!rec_file)
354 return 0;
355
356 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
359 if (status) 357 if (status)
360 printk("nfsd4: failed loading clients from recovery" 358 printk("nfsd4: failed loading clients from recovery"
361 " directory %s\n", rec_dir.dentry->d_name.name); 359 " directory %s\n", rec_file->f_path.dentry->d_name.name);
362 return status; 360 return status;
363} 361}
364 362
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
375 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 373 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
376 rec_dirname); 374 rec_dirname);
377 375
378 BUG_ON(rec_dir_init); 376 BUG_ON(rec_file);
379 377
380 status = nfs4_save_creds(&original_cred); 378 status = nfs4_save_creds(&original_cred);
381 if (status < 0) { 379 if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
385 return; 383 return;
386 } 384 }
387 385
388 status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 386 rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
389 &rec_dir); 387 if (IS_ERR(rec_file)) {
390 if (status)
391 printk("NFSD: unable to find recovery directory %s\n", 388 printk("NFSD: unable to find recovery directory %s\n",
392 rec_dirname); 389 rec_dirname);
390 rec_file = NULL;
391 }
393 392
394 if (!status)
395 rec_dir_init = 1;
396 nfs4_reset_creds(original_cred); 393 nfs4_reset_creds(original_cred);
397} 394}
398 395
399void 396void
400nfsd4_shutdown_recdir(void) 397nfsd4_shutdown_recdir(void)
401{ 398{
402 if (!rec_dir_init) 399 if (!rec_file)
403 return; 400 return;
404 rec_dir_init = 0; 401 fput(rec_file);
405 path_put(&rec_dir); 402 rec_file = NULL;
406} 403}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
998 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 998 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
999 return -EINVAL; 999 return -EINVAL;
1000 1000
1001 if (port < 1 || port > USHORT_MAX) 1001 if (port < 1 || port > USHRT_MAX)
1002 return -EINVAL; 1002 return -EINVAL;
1003 1003
1004 err = nfsd_create_serv(); 1004 err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1041 return -EINVAL; 1041 return -EINVAL;
1042 1042
1043 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1043 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1044 return -EINVAL; 1044 return -EINVAL;
1045 1045
1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23c06f77f4ca..ebbf3b6b2457 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file)
999 999
1000 if (inode->i_state & I_DIRTY) { 1000 if (inode->i_state & I_DIRTY) {
1001 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1001 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1002 err = vfs_fsync(file, file->f_path.dentry, 0); 1002 err = vfs_fsync(file, 0);
1003 } 1003 }
1004 last_ino = inode->i_ino; 1004 last_ino = inode->i_ino;
1005 last_dev = inode->i_sb->s_dev; 1005 last_dev = inode->i_sb->s_dev;
@@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1175 if (err) 1175 if (err)
1176 goto out; 1176 goto out;
1177 if (EX_ISSYNC(fhp->fh_export)) { 1177 if (EX_ISSYNC(fhp->fh_export)) {
1178 int err2 = vfs_fsync_range(file, file->f_path.dentry, 1178 int err2 = vfs_fsync_range(file, offset, end, 0);
1179 offset, end, 0);
1180 1179
1181 if (err2 != -EINVAL) 1180 if (err2 != -EINVAL)
1182 err = nfserrno(err2); 1181 err = nfserrno(err2);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5e226d4b41d3..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
281 281
282 atomic_inc(&sbi->s_inodes_count); 282 atomic_inc(&sbi->s_inodes_count);
283 283 inode_init_owner(inode, dir, mode);
284 inode->i_uid = current_fsuid();
285 if (dir->i_mode & S_ISGID) {
286 inode->i_gid = dir->i_gid;
287 if (S_ISDIR(mode))
288 mode |= S_ISGID;
289 } else
290 inode->i_gid = current_fsgid();
291
292 inode->i_mode = mode;
293 inode->i_ino = ino; 284 inode->i_ino = ino;
294 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 285 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
295 286
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a756168a21c2..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
674 start * sects_per_block, 674 start * sects_per_block,
675 nblocks * sects_per_block, 675 nblocks * sects_per_block,
676 GFP_NOFS, 676 GFP_NOFS,
677 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
678 if (ret < 0) 678 if (ret < 0)
679 return ret; 679 return ret;
680 nblocks = 0; 680 nblocks = 0;
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
684 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
685 start * sects_per_block, 685 start * sects_per_block,
686 nblocks * sects_per_block, 686 nblocks * sects_per_block,
687 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
688 return ret; 688 return ret;
689} 689}
690 690
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
110int pin_inotify_watch(struct inotify_watch *watch) 110int pin_inotify_watch(struct inotify_watch *watch)
111{ 111{
112 struct super_block *sb = watch->inode->i_sb; 112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock); 113 if (atomic_inc_not_zero(&sb->s_active)) {
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count); 114 atomic_inc(&watch->count);
118 return 1; 115 return 1;
119 } 116 }
120 spin_unlock(&sb_lock);
121 return 0; 117 return 0;
122} 118}
123 119
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy 511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to 512 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab 513 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut 514 * ->s_umount, which will wait until the superblock is shut down and the
519 * down and the watch in question is pining for fjords. That's fine, but 515 * watch in question is pining for fjords.
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_umount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 * 516 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire 517 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with. 518 * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
556 * Called with ih->mutex held, drops it. Possible return values: 526 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died 527 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super() 528 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */ 529 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) 530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{ 531{
565 struct super_block *sb = watch->inode->i_sb; 532 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567 533
568 spin_lock(&sb_lock); 534 if (atomic_inc_not_zero(&sb->s_active)) {
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch); 535 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex); 536 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */ 537 return 1; /* the best outcome */
575 } 538 }
539 spin_lock(&sb_lock);
576 sb->s_count++; 540 sb->s_count++;
577 spin_unlock(&sb_lock); 541 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ 542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount); 543 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) { 544 /* fs is already shut down; the watch is dead */
581 /* fs is already shut down; the watch is dead */ 545 drop_super(sb);
582 drop_super(sb); 546 return 0;
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597} 547}
598 548
599static void unpin_and_kill(struct inotify_watch *watch, int how) 549static void unpin_and_kill(struct inotify_watch *watch)
600{ 550{
601 struct super_block *sb = watch->inode->i_sb; 551 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch); 552 put_inotify_watch(watch);
603 switch (how) { 553 deactivate_super(sb);
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610} 554}
611 555
612/** 556/**
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
628 struct list_head *watches; 572 struct list_head *watches;
629 struct super_block *sb; 573 struct super_block *sb;
630 struct inode *inode; 574 struct inode *inode;
631 int how;
632 575
633 mutex_lock(&ih->mutex); 576 mutex_lock(&ih->mutex);
634 watches = &ih->watches; 577 watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
638 } 581 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list); 582 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb; 583 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch); 584 if (!pin_to_kill(ih, watch))
642 if (!how)
643 continue; 585 continue;
644 586
645 inode = watch->inode; 587 inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
654 596
655 mutex_unlock(&ih->mutex); 597 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex); 598 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how); 599 unpin_and_kill(watch);
658 } 600 }
659 601
660 /* free this handle: the put matching the get in inotify_init() */ 602 /* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
857 struct inotify_watch *watch; 799 struct inotify_watch *watch;
858 struct super_block *sb; 800 struct super_block *sb;
859 struct inode *inode; 801 struct inode *inode;
860 int how;
861 802
862 mutex_lock(&ih->mutex); 803 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd); 804 watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
866 return -EINVAL; 807 return -EINVAL;
867 } 808 }
868 sb = watch->inode->i_sb; 809 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch); 810 if (!pin_to_kill(ih, watch))
870 if (!how)
871 return 0; 811 return 0;
872 812
873 inode = watch->inode; 813 inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
881 821
882 mutex_unlock(&ih->mutex); 822 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex); 823 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how); 824 unpin_and_kill(watch);
885 825
886 return 0; 826 return 0;
887} 827}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..a1924a0d2ab0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
100 * 100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
111 * held by the caller. 108 * held by the caller.
112 */ 109 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to lru list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcdc..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -489,7 +489,7 @@ cleanup:
489 return ret; 489 return ret;
490} 490}
491 491
492struct xattr_handler ocfs2_xattr_acl_access_handler = { 492const struct xattr_handler ocfs2_xattr_acl_access_handler = {
493 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS, 494 .flags = ACL_TYPE_ACCESS,
495 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
@@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
497 .set = ocfs2_xattr_set_acl, 497 .set = ocfs2_xattr_set_acl,
498}; 498};
499 499
500struct xattr_handler ocfs2_xattr_acl_default_handler = { 500const struct xattr_handler ocfs2_xattr_acl_default_handler = {
501 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT, 502 .flags = ACL_TYPE_DEFAULT,
503 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..39eb16ac5f98 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f74f1400eccd..97e54b9e654b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -933,9 +933,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
933 struct ocfs2_super *osb = OCFS2_SB(sb); 933 struct ocfs2_super *osb = OCFS2_SB(sb);
934 struct buffer_head *bh = NULL; 934 struct buffer_head *bh = NULL;
935 handle_t *handle = NULL; 935 handle_t *handle = NULL;
936 int qtype;
937 struct dquot *transfer_from[MAXQUOTAS] = { };
938 struct dquot *transfer_to[MAXQUOTAS] = { }; 936 struct dquot *transfer_to[MAXQUOTAS] = { };
937 int qtype;
939 938
940 mlog_entry("(0x%p, '%.*s')\n", dentry, 939 mlog_entry("(0x%p, '%.*s')\n", dentry,
941 dentry->d_name.len, dentry->d_name.name); 940 dentry->d_name.len, dentry->d_name.name);
@@ -966,10 +965,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
966 if (status) 965 if (status)
967 return status; 966 return status;
968 967
968 if (is_quota_modification(inode, attr))
969 dquot_initialize(inode);
969 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 970 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
970 if (size_change) { 971 if (size_change) {
971 dquot_initialize(inode);
972
973 status = ocfs2_rw_lock(inode, 1); 972 status = ocfs2_rw_lock(inode, 1);
974 if (status < 0) { 973 if (status < 0) {
975 mlog_errno(status); 974 mlog_errno(status);
@@ -1019,9 +1018,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1019 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1018 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1020 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1019 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1021 USRQUOTA); 1020 USRQUOTA);
1022 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1021 if (!transfer_to[USRQUOTA]) {
1023 USRQUOTA);
1024 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1025 status = -ESRCH; 1022 status = -ESRCH;
1026 goto bail_unlock; 1023 goto bail_unlock;
1027 } 1024 }
@@ -1031,9 +1028,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1031 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1028 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1032 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1029 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1033 GRPQUOTA); 1030 GRPQUOTA);
1034 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1031 if (!transfer_to[GRPQUOTA]) {
1035 GRPQUOTA);
1036 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1037 status = -ESRCH; 1032 status = -ESRCH;
1038 goto bail_unlock; 1033 goto bail_unlock;
1039 } 1034 }
@@ -1045,7 +1040,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1045 mlog_errno(status); 1040 mlog_errno(status);
1046 goto bail_unlock; 1041 goto bail_unlock;
1047 } 1042 }
1048 status = dquot_transfer(inode, attr); 1043 status = __dquot_transfer(inode, transfer_to);
1049 if (status < 0) 1044 if (status < 0)
1050 goto bail_commit; 1045 goto bail_commit;
1051 } else { 1046 } else {
@@ -1085,10 +1080,8 @@ bail:
1085 brelse(bh); 1080 brelse(bh);
1086 1081
1087 /* Release quota pointers in case we acquired them */ 1082 /* Release quota pointers in case we acquired them */
1088 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1083 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1089 dqput(transfer_to[qtype]); 1084 dqput(transfer_to[qtype]);
1090 dqput(transfer_from[qtype]);
1091 }
1092 1085
1093 if (!status && attr->ia_valid & ATTR_MODE) { 1086 if (!status && attr->ia_valid & ATTR_MODE) {
1094 status = ocfs2_acl_chmod(inode); 1087 status = ocfs2_acl_chmod(inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index db5dd3ed4df4..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Number of block with global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 04ae76d8c6ab..2bb35fe00511 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -25,8 +25,44 @@
25#include "dlmglue.h" 25#include "dlmglue.h"
26#include "uptodate.h" 26#include "uptodate.h"
27#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
28#include "quota.h" 29#include "quota.h"
29 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of quota structure (either to local or global file) is protected
35 * by dqio_mutex or dquot->dq_lock.
36 * - any modification of global quota file holds inode cluster lock, i_mutex,
37 * and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for local quota file is protected by
40 * its ip_alloc_sem
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
30static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
31 67
32static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
@@ -91,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
91 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
92}; 128};
93 129
94static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
95 struct buffer_head *bh)
96{ 131{
97 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
98 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -110,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
110 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
111} 146}
112 147
113int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
114 struct buffer_head **bh) 149 struct buffer_head **bhp)
115{ 150{
116 int rc = 0; 151 int rc;
117 struct buffer_head *tmp = *bh; 152
118 153 *bhp = NULL;
119 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
120 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
121 "Quota file %llu is probably corrupted! Requested "
122 "to read block %Lu but file has size only %Lu\n",
123 (unsigned long long)OCFS2_I(inode)->ip_blkno,
124 (unsigned long long)v_block,
125 (unsigned long long)i_size_read(inode));
126 return -EIO;
127 }
128 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
129 ocfs2_validate_quota_block);
130 if (rc) 156 if (rc)
131 mlog_errno(rc); 157 mlog_errno(rc);
132
133 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
134 if (!rc && !*bh)
135 *bh = tmp;
136
137 return rc; 158 return rc;
138} 159}
139 160
140static int ocfs2_get_quota_block(struct inode *inode, int block,
141 struct buffer_head **bh)
142{
143 u64 pblock, pcount;
144 int err;
145
146 down_read(&OCFS2_I(inode)->ip_alloc_sem);
147 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
148 up_read(&OCFS2_I(inode)->ip_alloc_sem);
149 if (err) {
150 mlog_errno(err);
151 return err;
152 }
153 *bh = sb_getblk(inode->i_sb, pblock);
154 if (!*bh) {
155 err = -EIO;
156 mlog_errno(err);
157 }
158 return err;
159}
160
161/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
162 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
163 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -172,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
172 int err = 0; 172 int err = 0;
173 struct buffer_head *bh; 173 struct buffer_head *bh;
174 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
175 176
176 if (off > i_size) 177 if (off > i_size)
177 return 0; 178 return 0;
@@ -180,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
180 toread = len; 181 toread = len;
181 while (toread > 0) { 182 while (toread > 0) {
182 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
183 bh = NULL; 195 bh = NULL;
184 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
185 if (err) { 197 if (err) {
186 mlog_errno(err); 198 mlog_errno(err);
187 return err; 199 return err;
@@ -209,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
209 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
210 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
211 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
212 225
213 if (!handle) { 226 if (!handle) {
214 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -221,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
221 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
222 } 235 }
223 236
224 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
225 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
226 loff_t rounded_end = 238 loff_t rounded_end =
227 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
228 240
229 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
230 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
231 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
232 rounded_end); 244 rounded_end);
@@ -234,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 goto out; 246 goto out;
235 new = 1; 247 new = 1;
236 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
237 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
238 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
239 !new) { 256 !new) {
240 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
241 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
242 } else { 259 } else {
243 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
244 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
245 } 264 }
246 if (err) { 265 if (err) {
@@ -265,13 +284,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
265 brelse(bh); 284 brelse(bh);
266out: 285out:
267 if (err) { 286 if (err) {
268 mutex_unlock(&gqinode->i_mutex);
269 mlog_errno(err); 287 mlog_errno(err);
270 return err; 288 return err;
271 } 289 }
272 gqinode->i_version++; 290 gqinode->i_version++;
273 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
274 mutex_unlock(&gqinode->i_mutex);
275 return len; 292 return len;
276} 293}
277 294
@@ -289,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
289 else 306 else
290 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
291 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
292 return 0; 315 return 0;
293} 316}
294 317
295void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
296{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
297 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
298 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
299 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
@@ -311,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
311 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
312 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
313 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
314 int status; 344 int status;
315 345
316 mlog_entry_void(); 346 mlog_entry_void();
@@ -337,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
337 mlog_errno(status); 367 mlog_errno(status);
338 goto out_err; 368 goto out_err;
339 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
340 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
341 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
342 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
343 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
344 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
345 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -366,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
366out_err: 406out_err:
367 mlog_exit(status); 407 mlog_exit(status);
368 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
369} 413}
370 414
371/* Write information to global quota file. Expects exlusive lock on quota 415/* Write information to global quota file. Expects exlusive lock on quota
@@ -424,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
424 468
425static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
426{ 470{
427 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
428 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
429 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
430}
431
432/* Read in information from global quota file and acquire a reference to it.
433 * dquot_acquire() has already started the transaction and locked quota file */
434int ocfs2_global_read_dquot(struct dquot *dquot)
435{
436 int err, err2, ex = 0;
437 struct super_block *sb = dquot->dq_sb;
438 int type = dquot->dq_type;
439 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
440 struct ocfs2_super *osb = OCFS2_SB(sb);
441 struct inode *gqinode = info->dqi_gqinode;
442 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
443 handle_t *handle = NULL;
444
445 err = ocfs2_qinfo_lock(info, 0);
446 if (err < 0)
447 goto out;
448 err = qtree_read_dquot(&info->dqi_gi, dquot);
449 if (err < 0)
450 goto out_qlock;
451 OCFS2_DQUOT(dquot)->dq_use_count++;
452 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
453 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
454 ocfs2_qinfo_unlock(info, 0);
455
456 if (!dquot->dq_off) { /* No real quota entry? */
457 ex = 1;
458 /*
459 * Add blocks to quota file before we start a transaction since
460 * locking allocators ranks above a transaction start
461 */
462 WARN_ON(journal_current_handle());
463 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
464 err = ocfs2_extend_no_holes(gqinode,
465 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
466 gqinode->i_size);
467 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
468 if (err < 0)
469 goto out;
470 }
471
472 handle = ocfs2_start_trans(osb,
473 ocfs2_calc_global_qinit_credits(sb, type));
474 if (IS_ERR(handle)) {
475 err = PTR_ERR(handle);
476 goto out;
477 }
478 err = ocfs2_qinfo_lock(info, ex);
479 if (err < 0)
480 goto out_trans;
481 err = qtree_write_dquot(&info->dqi_gi, dquot);
482 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
483 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
484 if (!err)
485 err = err2;
486 }
487out_qlock:
488 if (ex)
489 ocfs2_qinfo_unlock(info, 1);
490 else
491 ocfs2_qinfo_unlock(info, 0);
492out_trans:
493 if (handle)
494 ocfs2_commit_trans(osb, handle);
495out:
496 if (err < 0)
497 mlog_errno(err);
498 return err;
499} 475}
500 476
501/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
@@ -636,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
636 } 612 }
637 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
638 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
639 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
640 if (status < 0) 615 if (status < 0)
641 mlog_errno(status); 616 mlog_errno(status);
642 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
643 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
644 status = dquot_commit(dquot);
645 if (status < 0) 619 if (status < 0)
646 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
647 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
648out_ilock: 623out_ilock:
649 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -682,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
682 mlog_errno(status); 657 mlog_errno(status);
683 goto out; 658 goto out;
684 } 659 }
685 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
686 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
687out: 664out:
688 mlog_exit(status); 665 mlog_exit(status);
@@ -713,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
713 690
714 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
715 692
693 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
716 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
717 if (status < 0) 698 if (status < 0)
718 goto out; 699 goto out;
@@ -723,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
723 mlog_errno(status); 704 mlog_errno(status);
724 goto out_ilock; 705 goto out_ilock;
725 } 706 }
726 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
727 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
728out_ilock: 723out_ilock:
729 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
730out: 725out:
726 mutex_unlock(&dquot->dq_lock);
731 mlog_exit(status); 727 mlog_exit(status);
732 return status; 728 return status;
733} 729}
734 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
735static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
736{ 737{
737 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
738 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
739 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
740 747
741 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
742 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
743 * and instantiate possibly new dquot structure */ 750 /*
744 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
745 if (status < 0) 755 if (status < 0)
746 goto out; 756 goto out;
747 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
748 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
749out: 812out:
813 mutex_unlock(&dquot->dq_lock);
750 mlog_exit(status); 814 mlog_exit(status);
751 return status; 815 return status;
752} 816}
@@ -768,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
768 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
769 833
770 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
771 dquot_mark_dquot_dirty(dquot);
772 835
773 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
774 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -791,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
791 mlog_errno(status); 854 mlog_errno(status);
792 goto out_ilock; 855 goto out_ilock;
793 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
794 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
795 if (status < 0) { 859 if (status < 0) {
796 mlog_errno(status); 860 mlog_errno(status);
797 goto out_trans; 861 goto out_dlock;
798 } 862 }
799 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
800 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
801out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
802 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
803out_ilock: 868out_ilock:
804 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -850,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
850} 915}
851 916
852const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
853 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
854 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
855 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
856 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 884b641f199e..8bd70d4d184d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -22,6 +22,7 @@
22#include "dlmglue.h" 22#include "dlmglue.h"
23#include "quota.h" 23#include "quota.h"
24#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
25 26
26/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
27static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -129,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
129 return 0; 130 return 0;
130} 131}
131 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
165
132/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
133static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
134{ 168{
@@ -671,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
671 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
672 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
673 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
674 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
675 709
676 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
677 if (status < 0) 711 if (status < 0)
@@ -697,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
697 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
698 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
699 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
700 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
701 735
702 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
703 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -759,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
759{ 793{
760 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
761 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
762 ->dqi_ibh; 796 ->dqi_libh;
763 int status; 797 int status;
764 798
765 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -782,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
782 int mark_clean = 1, len; 816 int mark_clean = 1, len;
783 int status; 817 int status;
784 818
785 /* At this point we know there are no more dquots and thus
786 * even if there's some sync in the pdflush queue, it won't
787 * find any dquots and return without doing anything */
788 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
789 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
790 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
791 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -820,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
820 /* Mark local file as clean */ 850 /* Mark local file as clean */
821 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
822 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
823 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
824 olq_update_info, 854 olq_update_info,
825 info); 855 info);
826 if (status < 0) { 856 if (status < 0) {
@@ -830,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
830 860
831out: 861out:
832 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
833 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
834 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
835 kfree(oinfo); 865 kfree(oinfo);
836 return 0; 866 return 0;
@@ -858,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
858} 888}
859 889
860/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
861static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
862{ 892{
863 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
864 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
865 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
866 int status; 897 int status;
867 898
868 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
869 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
870 &bh);
871 if (status) { 901 if (status) {
872 mlog_errno(status); 902 mlog_errno(status);
873 goto out; 903 goto out;
874 } 904 }
875 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
876 olq_set_dquot, od);
877 if (status < 0) { 906 if (status < 0) {
878 mlog_errno(status); 907 mlog_errno(status);
879 goto out; 908 goto out;
@@ -973,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
973 } 1002 }
974 1003
975 /* Initialize chunk header */ 1004 /* Initialize chunk header */
976 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
977 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
978 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
979 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
980 if (status < 0) { 1007 if (status < 0) {
981 mlog_errno(status); 1008 mlog_errno(status);
982 goto out_trans; 1009 goto out_trans;
@@ -1004,10 +1031,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1004 ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1005 1032
1006 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1007 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1008 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1009 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1010 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1011 if (status < 0) { 1036 if (status < 0) {
1012 mlog_errno(status); 1037 mlog_errno(status);
1013 goto out_trans; 1038 goto out_trans;
@@ -1104,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1104 } 1129 }
1105 1130
1106 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1107 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1108 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1109 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1110 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1111 if (status < 0) { 1134 if (status < 0) {
1112 mlog_errno(status); 1135 mlog_errno(status);
1113 goto out; 1136 goto out;
@@ -1188,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1188} 1211}
1189 1212
1190/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1191static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1192{ 1215{
1193 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1194 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1197,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1197 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1198 int offset; 1221 int offset;
1199 int status; 1222 int status;
1223 u64 pcount;
1200 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1201 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1202 if (!chunk) { 1227 if (!chunk) {
1203 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1204 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1205 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1206 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1207 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1208 } 1236 }
1209 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1210 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1211 1244
1212 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1213 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1224,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1224 goto out; 1257 goto out;
1225 } 1258 }
1226out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1227 return status; 1261 return status;
1228} 1262}
1229 1263
1230/* Create entry in local file for dquot, load data from the global file */ 1264/*
1231static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1232{ 1266 * already started a transaction and written all changes to global quota file
1233 int status; 1267 */
1234 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1235 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1236
1237 status = ocfs2_global_read_dquot(dquot);
1238 if (status < 0) {
1239 mlog_errno(status);
1240 goto out_err;
1241 }
1242
1243 /* Now create entry in the local quota file */
1244 status = ocfs2_create_local_dquot(dquot);
1245 if (status < 0) {
1246 mlog_errno(status);
1247 goto out_err;
1248 }
1249 mlog_exit(0);
1250 return 0;
1251out_err:
1252 mlog_exit(status);
1253 return status;
1254}
1255
1256/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1257 * already started a transaction and obtained exclusive lock for global
1258 * quota file. */
1259static int ocfs2_local_release_dquot(struct dquot *dquot)
1260{ 1269{
1261 int status; 1270 int status;
1262 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1264,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1264 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1265 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1266 int offset; 1275 int offset;
1267 handle_t *handle = journal_current_handle();
1268
1269 BUG_ON(!handle);
1270 /* First write all local changes to global file */
1271 status = ocfs2_global_release_dquot(dquot);
1272 if (status < 0) {
1273 mlog_errno(status);
1274 goto out;
1275 }
1276 1276
1277 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1278 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1305,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1305 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1306 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1307 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1308 .read_dqblk = ocfs2_local_read_dquot,
1309 .commit_dqblk = ocfs2_local_write_dquot,
1310 .release_dqblk = ocfs2_local_release_dquot,
1311}; 1308};
1312 1309
1313struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1c2c39f6f0b6..2c26ce251cb3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -938,12 +938,16 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
938 int type; 938 int type;
939 struct inode *inode; 939 struct inode *inode;
940 struct super_block *sb = osb->sb; 940 struct super_block *sb = osb->sb;
941 struct ocfs2_mem_dqinfo *oinfo;
941 942
942 /* We mostly ignore errors in this function because there's not much 943 /* We mostly ignore errors in this function because there's not much
943 * we can do when we see them */ 944 * we can do when we see them */
944 for (type = 0; type < MAXQUOTAS; type++) { 945 for (type = 0; type < MAXQUOTAS; type++) {
945 if (!sb_has_quota_loaded(sb, type)) 946 if (!sb_has_quota_loaded(sb, type))
946 continue; 947 continue;
948 /* Cancel periodic syncing before we grab dqonoff_mutex */
949 oinfo = sb_dqinfo(sb, type)->dqi_priv;
950 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
947 inode = igrab(sb->s_dquot.files[type]); 951 inode = igrab(sb->s_dquot.files[type]);
948 /* Turn off quotas. This will remove all dquot structures from 952 /* Turn off quotas. This will remove all dquot structures from
949 * memory and so they will be automatically synced to global 953 * memory and so they will be automatically synced to global
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 98ee6c44102d..e97b34842cfe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -97,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
97 .xv.xr_list.l_count = cpu_to_le16(1), 97 .xv.xr_list.l_count = cpu_to_le16(1),
98}; 98};
99 99
100struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
101 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
102 &ocfs2_xattr_acl_access_handler, 102 &ocfs2_xattr_acl_access_handler,
103 &ocfs2_xattr_acl_default_handler, 103 &ocfs2_xattr_acl_default_handler,
@@ -106,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
106 NULL 106 NULL
107}; 107};
108 108
109static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler, 112 = &ocfs2_xattr_acl_access_handler,
@@ -540,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
540 540
541static inline const char *ocfs2_xattr_prefix(int name_index) 541static inline const char *ocfs2_xattr_prefix(int name_index)
542{ 542{
543 struct xattr_handler *handler = NULL; 543 const struct xattr_handler *handler = NULL;
544 544
545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX) 545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
546 handler = ocfs2_xattr_handler_map[name_index]; 546 handler = ocfs2_xattr_handler_map[name_index];
@@ -7213,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
7213 xattr_ac, data_ac); 7213 xattr_ac, data_ac);
7214} 7214}
7215 7215
7216struct xattr_handler ocfs2_xattr_security_handler = { 7216const struct xattr_handler ocfs2_xattr_security_handler = {
7217 .prefix = XATTR_SECURITY_PREFIX, 7217 .prefix = XATTR_SECURITY_PREFIX,
7218 .list = ocfs2_xattr_security_list, 7218 .list = ocfs2_xattr_security_list,
7219 .get = ocfs2_xattr_security_get, 7219 .get = ocfs2_xattr_security_get,
@@ -7257,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7257 name, value, size, flags); 7257 name, value, size, flags);
7258} 7258}
7259 7259
7260struct xattr_handler ocfs2_xattr_trusted_handler = { 7260const struct xattr_handler ocfs2_xattr_trusted_handler = {
7261 .prefix = XATTR_TRUSTED_PREFIX, 7261 .prefix = XATTR_TRUSTED_PREFIX,
7262 .list = ocfs2_xattr_trusted_list, 7262 .list = ocfs2_xattr_trusted_list,
7263 .get = ocfs2_xattr_trusted_get, 7263 .get = ocfs2_xattr_trusted_get,
@@ -7313,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7313 name, value, size, flags); 7313 name, value, size, flags);
7314} 7314}
7315 7315
7316struct xattr_handler ocfs2_xattr_user_handler = { 7316const struct xattr_handler ocfs2_xattr_user_handler = {
7317 .prefix = XATTR_USER_PREFIX, 7317 .prefix = XATTR_USER_PREFIX,
7318 .list = ocfs2_xattr_user_list, 7318 .list = ocfs2_xattr_user_list,
7319 .get = ocfs2_xattr_user_get, 7319 .get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
37 size_t value_len; 37 size_t value_len;
38}; 38};
39 39
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern const struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern const struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern const struct xattr_handler ocfs2_xattr_security_handler;
43extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
44extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
45extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern const struct xattr_handler *ocfs2_xattr_handlers[];
46 46
47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, 48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index b44bb835e8ea..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
37 goto fail; 37 goto fail;
38 38
39 inode->i_ino = new_block; 39 inode->i_ino = new_block;
40 inode->i_mode = mode; 40 inode_init_owner(inode, NULL, mode);
41 inode->i_uid = current_fsuid();
42 inode->i_gid = current_fsgid();
43 inode->i_mapping->a_ops = &omfs_aops; 41 inode->i_mapping->a_ops = &omfs_aops;
44 42
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718e..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
17#include <linux/securebits.h> 17#include <linux/securebits.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/vfs.h>
21#include <linux/fcntl.h> 20#include <linux/fcntl.h>
22#include <linux/slab.h> 21#include <linux/slab.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
@@ -33,171 +32,6 @@
33 32
34#include "internal.h" 33#include "internal.h"
35 34
36int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
37{
38 int retval = -ENODEV;
39
40 if (dentry) {
41 retval = -ENOSYS;
42 if (dentry->d_sb->s_op->statfs) {
43 memset(buf, 0, sizeof(*buf));
44 retval = security_sb_statfs(dentry);
45 if (retval)
46 return retval;
47 retval = dentry->d_sb->s_op->statfs(dentry, buf);
48 if (retval == 0 && buf->f_frsize == 0)
49 buf->f_frsize = buf->f_bsize;
50 }
51 }
52 return retval;
53}
54
55EXPORT_SYMBOL(vfs_statfs);
56
57static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
58{
59 struct kstatfs st;
60 int retval;
61
62 retval = vfs_statfs(dentry, &st);
63 if (retval)
64 return retval;
65
66 if (sizeof(*buf) == sizeof(st))
67 memcpy(buf, &st, sizeof(st));
68 else {
69 if (sizeof buf->f_blocks == 4) {
70 if ((st.f_blocks | st.f_bfree | st.f_bavail |
71 st.f_bsize | st.f_frsize) &
72 0xffffffff00000000ULL)
73 return -EOVERFLOW;
74 /*
75 * f_files and f_ffree may be -1; it's okay to stuff
76 * that into 32 bits
77 */
78 if (st.f_files != -1 &&
79 (st.f_files & 0xffffffff00000000ULL))
80 return -EOVERFLOW;
81 if (st.f_ffree != -1 &&
82 (st.f_ffree & 0xffffffff00000000ULL))
83 return -EOVERFLOW;
84 }
85
86 buf->f_type = st.f_type;
87 buf->f_bsize = st.f_bsize;
88 buf->f_blocks = st.f_blocks;
89 buf->f_bfree = st.f_bfree;
90 buf->f_bavail = st.f_bavail;
91 buf->f_files = st.f_files;
92 buf->f_ffree = st.f_ffree;
93 buf->f_fsid = st.f_fsid;
94 buf->f_namelen = st.f_namelen;
95 buf->f_frsize = st.f_frsize;
96 memset(buf->f_spare, 0, sizeof(buf->f_spare));
97 }
98 return 0;
99}
100
101static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
102{
103 struct kstatfs st;
104 int retval;
105
106 retval = vfs_statfs(dentry, &st);
107 if (retval)
108 return retval;
109
110 if (sizeof(*buf) == sizeof(st))
111 memcpy(buf, &st, sizeof(st));
112 else {
113 buf->f_type = st.f_type;
114 buf->f_bsize = st.f_bsize;
115 buf->f_blocks = st.f_blocks;
116 buf->f_bfree = st.f_bfree;
117 buf->f_bavail = st.f_bavail;
118 buf->f_files = st.f_files;
119 buf->f_ffree = st.f_ffree;
120 buf->f_fsid = st.f_fsid;
121 buf->f_namelen = st.f_namelen;
122 buf->f_frsize = st.f_frsize;
123 memset(buf->f_spare, 0, sizeof(buf->f_spare));
124 }
125 return 0;
126}
127
128SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
129{
130 struct path path;
131 int error;
132
133 error = user_path(pathname, &path);
134 if (!error) {
135 struct statfs tmp;
136 error = vfs_statfs_native(path.dentry, &tmp);
137 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
138 error = -EFAULT;
139 path_put(&path);
140 }
141 return error;
142}
143
144SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
145{
146 struct path path;
147 long error;
148
149 if (sz != sizeof(*buf))
150 return -EINVAL;
151 error = user_path(pathname, &path);
152 if (!error) {
153 struct statfs64 tmp;
154 error = vfs_statfs64(path.dentry, &tmp);
155 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
156 error = -EFAULT;
157 path_put(&path);
158 }
159 return error;
160}
161
162SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
163{
164 struct file * file;
165 struct statfs tmp;
166 int error;
167
168 error = -EBADF;
169 file = fget(fd);
170 if (!file)
171 goto out;
172 error = vfs_statfs_native(file->f_path.dentry, &tmp);
173 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
174 error = -EFAULT;
175 fput(file);
176out:
177 return error;
178}
179
180SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
181{
182 struct file * file;
183 struct statfs64 tmp;
184 int error;
185
186 if (sz != sizeof(*buf))
187 return -EINVAL;
188
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = vfs_statfs64(file->f_path.dentry, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error;
199}
200
201int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, 35int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
202 struct file *filp) 36 struct file *filp)
203{ 37{
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
70 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS) 72 defined(CONFIG_ACORN_PARTITION_ADFS)
73static int 73static int riscix_partition(struct parsed_partitions *state,
74riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74 unsigned long first_sect, int slot,
75 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long nr_sects)
76{ 76{
77 Sector sect; 77 Sector sect;
78 struct riscix_record *rr; 78 struct riscix_record *rr;
79 79
80 rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect); 80 rr = read_part_sector(state, first_sect, &sect);
81 if (!rr) 81 if (!rr)
82 return -1; 82 return -1;
83 83
@@ -123,9 +123,9 @@ struct linux_part {
123 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS) 125 defined(CONFIG_ACORN_PARTITION_ADFS)
126static int 126static int linux_partition(struct parsed_partitions *state,
127linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127 unsigned long first_sect, int slot,
128 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long nr_sects)
129{ 129{
130 Sector sect; 130 Sector sect;
131 struct linux_part *linuxp; 131 struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
135 135
136 put_partition(state, slot++, first_sect, size); 136 put_partition(state, slot++, first_sect, size);
137 137
138 linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect); 138 linuxp = read_part_sector(state, first_sect, &sect);
139 if (!linuxp) 139 if (!linuxp)
140 return -1; 140 return -1;
141 141
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
157#endif 157#endif
158 158
159#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
160int 160int adfspart_check_CUMANA(struct parsed_partitions *state)
161adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
162{ 161{
163 unsigned long first_sector = 0; 162 unsigned long first_sector = 0;
164 unsigned int start_blk = 0; 163 unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
185 struct adfs_discrecord *dr; 184 struct adfs_discrecord *dr;
186 unsigned int nr_sects; 185 unsigned int nr_sects;
187 186
188 data = read_dev_sector(bdev, start_blk * 2 + 6, &sect); 187 data = read_part_sector(state, start_blk * 2 + 6, &sect);
189 if (!data) 188 if (!data)
190 return -1; 189 return -1;
191 190
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
217#ifdef CONFIG_ACORN_PARTITION_RISCIX 216#ifdef CONFIG_ACORN_PARTITION_RISCIX
218 case PARTITION_RISCIX_SCSI: 217 case PARTITION_RISCIX_SCSI:
219 /* RISCiX - we don't know how to find the next one. */ 218 /* RISCiX - we don't know how to find the next one. */
220 slot = riscix_partition(state, bdev, first_sector, 219 slot = riscix_partition(state, first_sector, slot,
221 slot, nr_sects); 220 nr_sects);
222 break; 221 break;
223#endif 222#endif
224 223
225 case PARTITION_LINUX: 224 case PARTITION_LINUX:
226 slot = linux_partition(state, bdev, first_sector, 225 slot = linux_partition(state, first_sector, slot,
227 slot, nr_sects); 226 nr_sects);
228 break; 227 break;
229 } 228 }
230 put_dev_sector(sect); 229 put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
249 * hda1 = ADFS partition on first drive. 248 * hda1 = ADFS partition on first drive.
250 * hda2 = non-ADFS partition. 249 * hda2 = non-ADFS partition.
251 */ 250 */
252int 251int adfspart_check_ADFS(struct parsed_partitions *state)
253adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
254{ 252{
255 unsigned long start_sect, nr_sects, sectscyl, heads; 253 unsigned long start_sect, nr_sects, sectscyl, heads;
256 Sector sect; 254 Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
259 unsigned char id; 257 unsigned char id;
260 int slot = 1; 258 int slot = 1;
261 259
262 data = read_dev_sector(bdev, 6, &sect); 260 data = read_part_sector(state, 6, &sect);
263 if (!data) 261 if (!data)
264 return -1; 262 return -1;
265 263
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
278 /* 276 /*
279 * Work out start of non-adfs partition. 277 * Work out start of non-adfs partition.
280 */ 278 */
281 nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; 279 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
282 280
283 if (start_sect) { 281 if (start_sect) {
284 switch (id) { 282 switch (id) {
285#ifdef CONFIG_ACORN_PARTITION_RISCIX 283#ifdef CONFIG_ACORN_PARTITION_RISCIX
286 case PARTITION_RISCIX_SCSI: 284 case PARTITION_RISCIX_SCSI:
287 case PARTITION_RISCIX_MFM: 285 case PARTITION_RISCIX_MFM:
288 slot = riscix_partition(state, bdev, start_sect, 286 slot = riscix_partition(state, start_sect, slot,
289 slot, nr_sects); 287 nr_sects);
290 break; 288 break;
291#endif 289#endif
292 290
293 case PARTITION_LINUX: 291 case PARTITION_LINUX:
294 slot = linux_partition(state, bdev, start_sect, 292 slot = linux_partition(state, start_sect, slot,
295 slot, nr_sects); 293 nr_sects);
296 break; 294 break;
297 } 295 }
298 } 296 }
@@ -308,10 +306,11 @@ struct ics_part {
308 __le32 size; 306 __le32 size;
309}; 307};
310 308
311static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) 309static int adfspart_check_ICSLinux(struct parsed_partitions *state,
310 unsigned long block)
312{ 311{
313 Sector sect; 312 Sector sect;
314 unsigned char *data = read_dev_sector(bdev, block, &sect); 313 unsigned char *data = read_part_sector(state, block, &sect);
315 int result = 0; 314 int result = 0;
316 315
317 if (data) { 316 if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
349 * hda2 = ADFS partition 1 on first drive. 348 * hda2 = ADFS partition 1 on first drive.
350 * ..etc.. 349 * ..etc..
351 */ 350 */
352int 351int adfspart_check_ICS(struct parsed_partitions *state)
353adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
354{ 352{
355 const unsigned char *data; 353 const unsigned char *data;
356 const struct ics_part *p; 354 const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
360 /* 358 /*
361 * Try ICS style partitions - sector 0 contains partition info. 359 * Try ICS style partitions - sector 0 contains partition info.
362 */ 360 */
363 data = read_dev_sector(bdev, 0, &sect); 361 data = read_part_sector(state, 0, &sect);
364 if (!data) 362 if (!data)
365 return -1; 363 return -1;
366 364
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
392 * partition is. We must not make this visible 390 * partition is. We must not make this visible
393 * to the filesystem. 391 * to the filesystem.
394 */ 392 */
395 if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { 393 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
396 start += 1; 394 start += 1;
397 size -= 1; 395 size -= 1;
398 } 396 }
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
446 * hda2 = ADFS partition 1 on first drive. 444 * hda2 = ADFS partition 1 on first drive.
447 * ..etc.. 445 * ..etc..
448 */ 446 */
449int 447int adfspart_check_POWERTEC(struct parsed_partitions *state)
450adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
451{ 448{
452 Sector sect; 449 Sector sect;
453 const unsigned char *data; 450 const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
455 int slot = 1; 452 int slot = 1;
456 int i; 453 int i;
457 454
458 data = read_dev_sector(bdev, 0, &sect); 455 data = read_part_sector(state, 0, &sect);
459 if (!data) 456 if (!data)
460 return -1; 457 return -1;
461 458
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
508 * 1. The individual ADFS boot block entries that are placed on the disk. 505 * 1. The individual ADFS boot block entries that are placed on the disk.
509 * 2. The start address of the next entry. 506 * 2. The start address of the next entry.
510 */ 507 */
511int 508int adfspart_check_EESOX(struct parsed_partitions *state)
512adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
513{ 509{
514 Sector sect; 510 Sector sect;
515 const unsigned char *data; 511 const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
518 sector_t start = 0; 514 sector_t start = 0;
519 int i, slot = 1; 515 int i, slot = 1;
520 516
521 data = read_dev_sector(bdev, 7, &sect); 517 data = read_part_sector(state, 7, &sect);
522 if (!data) 518 if (!data)
523 return -1; 519 return -1;
524 520
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
545 if (i != 0) { 541 if (i != 0) {
546 sector_t size; 542 sector_t size;
547 543
548 size = get_capacity(bdev->bd_disk); 544 size = get_capacity(state->bdev->bd_disk);
549 put_partition(state, slot++, start, size - start); 545 put_partition(state, slot++, start, size - start);
550 printk("\n"); 546 printk("\n");
551 } 547 }
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
7 * format, and everyone stick to it? 7 * format, and everyone stick to it?
8 */ 8 */
9 9
10int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); 10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); 11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); 12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); 13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); 14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
23 return sum; 23 return sum;
24} 24}
25 25
26int 26int amiga_partition(struct parsed_partitions *state)
27amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
28{ 27{
29 Sector sect; 28 Sector sect;
30 unsigned char *data; 29 unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
38 for (blk = 0; ; blk++, put_dev_sector(sect)) { 37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
39 if (blk == RDB_ALLOCATION_LIMIT) 38 if (blk == RDB_ALLOCATION_LIMIT)
40 goto rdb_done; 39 goto rdb_done;
41 data = read_dev_sector(bdev, blk, &sect); 40 data = read_part_sector(state, blk, &sect);
42 if (!data) { 41 if (!data) {
43 if (warn_no_part) 42 if (warn_no_part)
44 printk("Dev %s: unable to read RDB block %d\n", 43 printk("Dev %s: unable to read RDB block %d\n",
45 bdevname(bdev, b), blk); 44 bdevname(state->bdev, b), blk);
46 res = -1; 45 res = -1;
47 goto rdb_done; 46 goto rdb_done;
48 } 47 }
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
64 } 63 }
65 64
66 printk("Dev %s: RDB in block %d has bad checksum\n", 65 printk("Dev %s: RDB in block %d has bad checksum\n",
67 bdevname(bdev, b), blk); 66 bdevname(state->bdev, b), blk);
68 } 67 }
69 68
70 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
75 put_dev_sector(sect); 74 put_dev_sector(sect);
76 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
77 blk *= blksize; /* Read in terms partition table understands */ 76 blk *= blksize; /* Read in terms partition table understands */
78 data = read_dev_sector(bdev, blk, &sect); 77 data = read_part_sector(state, blk, &sect);
79 if (!data) { 78 if (!data) {
80 if (warn_no_part) 79 if (warn_no_part)
81 printk("Dev %s: unable to read partition block %d\n", 80 printk("Dev %s: unable to read partition block %d\n",
82 bdevname(bdev, b), blk); 81 bdevname(state->bdev, b), blk);
83 res = -1; 82 res = -1;
84 goto rdb_done; 83 goto rdb_done;
85 } 84 }
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
2 * fs/partitions/amiga.h 2 * fs/partitions/amiga.h
3 */ 3 */
4 4
5int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); 5int amiga_partition(struct parsed_partitions *state);
6 6
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
30 memcmp (s, "RAW", 3) == 0 ; 30 memcmp (s, "RAW", 3) == 0 ;
31} 31}
32 32
33int atari_partition(struct parsed_partitions *state, struct block_device *bdev) 33int atari_partition(struct parsed_partitions *state)
34{ 34{
35 Sector sect; 35 Sector sect;
36 struct rootsector *rs; 36 struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ 42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif 43#endif
44 44
45 rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect); 45 rs = read_part_sector(state, 0, &sect);
46 if (!rs) 46 if (!rs)
47 return -1; 47 return -1;
48 48
49 /* Verify this is an Atari rootsector: */ 49 /* Verify this is an Atari rootsector: */
50 hd_size = bdev->bd_inode->i_size >> 9; 50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) && 51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) && 52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) && 53 !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
84 printk(" XGM<"); 84 printk(" XGM<");
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) { 88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect); 89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect); 90 put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
31 u16 checksum; /* checksum for bootable disks */ 31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__)); 32} __attribute__((__packed__));
33 33
34int atari_partition(struct parsed_partitions *state, struct block_device *bdev); 34int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
45 45
46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ 46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
47 47
48static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { 48static int (*check_part[])(struct parsed_partitions *) = {
49 /* 49 /*
50 * Probe partition formats with tables at disk address 0 50 * Probe partition formats with tables at disk address 0
51 * that also have an ADFS boot block at 0xdc0. 51 * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
161 struct parsed_partitions *state; 161 struct parsed_partitions *state;
162 int i, res, err; 162 int i, res, err;
163 163
164 state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
165 if (!state) 165 if (!state)
166 return NULL; 166 return NULL;
167 167
168 state->bdev = bdev;
168 disk_name(hd, 0, state->name); 169 disk_name(hd, 0, state->name);
169 printk(KERN_INFO " %s:", state->name); 170 printk(KERN_INFO " %s:", state->name);
170 if (isdigit(state->name[strlen(state->name)-1])) 171 if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
174 i = res = err = 0; 175 i = res = err = 0;
175 while (!res && check_part[i]) { 176 while (!res && check_part[i]) {
176 memset(&state->parts, 0, sizeof(state->parts)); 177 memset(&state->parts, 0, sizeof(state->parts));
177 res = check_part[i++](state, bdev); 178 res = check_part[i++](state);
178 if (res < 0) { 179 if (res < 0) {
179 /* We have hit an I/O error which we don't report now. 180 /* We have hit an I/O error which we don't report now.
180 * But record it, and let the others do their job. 181 * But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
186 } 187 }
187 if (res > 0) 188 if (res > 0)
188 return state; 189 return state;
190 if (state->access_beyond_eod)
191 err = -ENOSPC;
189 if (err) 192 if (err)
190 /* The partition is unrecognized. So report I/O errors if there were any */ 193 /* The partition is unrecognized. So report I/O errors if there were any */
191 res = err; 194 res = err;
@@ -538,12 +541,33 @@ exit:
538 disk_part_iter_exit(&piter); 541 disk_part_iter_exit(&piter);
539} 542}
540 543
544static bool disk_unlock_native_capacity(struct gendisk *disk)
545{
546 const struct block_device_operations *bdops = disk->fops;
547
548 if (bdops->unlock_native_capacity &&
549 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
550 printk(KERN_CONT "enabling native capacity\n");
551 bdops->unlock_native_capacity(disk);
552 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
553 return true;
554 } else {
555 printk(KERN_CONT "truncated\n");
556 return false;
557 }
558}
559
541int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 560int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
542{ 561{
562 struct parsed_partitions *state = NULL;
543 struct disk_part_iter piter; 563 struct disk_part_iter piter;
544 struct hd_struct *part; 564 struct hd_struct *part;
545 struct parsed_partitions *state;
546 int p, highest, res; 565 int p, highest, res;
566rescan:
567 if (state && !IS_ERR(state)) {
568 kfree(state);
569 state = NULL;
570 }
547 571
548 if (bdev->bd_part_count) 572 if (bdev->bd_part_count)
549 return -EBUSY; 573 return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
562 bdev->bd_invalidated = 0; 586 bdev->bd_invalidated = 0;
563 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 587 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
564 return 0; 588 return 0;
565 if (IS_ERR(state)) /* I/O error reading the partition table */ 589 if (IS_ERR(state)) {
590 /*
591 * I/O error reading the partition table. If any
592 * partition code tried to read beyond EOD, retry
593 * after unlocking native capacity.
594 */
595 if (PTR_ERR(state) == -ENOSPC) {
596 printk(KERN_WARNING "%s: partition table beyond EOD, ",
597 disk->disk_name);
598 if (disk_unlock_native_capacity(disk))
599 goto rescan;
600 }
566 return -EIO; 601 return -EIO;
602 }
603 /*
604 * If any partition code tried to read beyond EOD, try
605 * unlocking native capacity even if partition table is
606 * sucessfully read as we could be missing some partitions.
607 */
608 if (state->access_beyond_eod) {
609 printk(KERN_WARNING
610 "%s: partition table partially beyond EOD, ",
611 disk->disk_name);
612 if (disk_unlock_native_capacity(disk))
613 goto rescan;
614 }
567 615
568 /* tell userspace that the media / partition table may have changed */ 616 /* tell userspace that the media / partition table may have changed */
569 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); 617 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
581 /* add partitions */ 629 /* add partitions */
582 for (p = 1; p < state->limit; p++) { 630 for (p = 1; p < state->limit; p++) {
583 sector_t size, from; 631 sector_t size, from;
584try_scan: 632
585 size = state->parts[p].size; 633 size = state->parts[p].size;
586 if (!size) 634 if (!size)
587 continue; 635 continue;
@@ -589,30 +637,21 @@ try_scan:
589 from = state->parts[p].from; 637 from = state->parts[p].from;
590 if (from >= get_capacity(disk)) { 638 if (from >= get_capacity(disk)) {
591 printk(KERN_WARNING 639 printk(KERN_WARNING
592 "%s: p%d ignored, start %llu is behind the end of the disk\n", 640 "%s: p%d start %llu is beyond EOD, ",
593 disk->disk_name, p, (unsigned long long) from); 641 disk->disk_name, p, (unsigned long long) from);
642 if (disk_unlock_native_capacity(disk))
643 goto rescan;
594 continue; 644 continue;
595 } 645 }
596 646
597 if (from + size > get_capacity(disk)) { 647 if (from + size > get_capacity(disk)) {
598 const struct block_device_operations *bdops = disk->fops;
599 unsigned long long capacity;
600
601 printk(KERN_WARNING 648 printk(KERN_WARNING
602 "%s: p%d size %llu exceeds device capacity, ", 649 "%s: p%d size %llu extends beyond EOD, ",
603 disk->disk_name, p, (unsigned long long) size); 650 disk->disk_name, p, (unsigned long long) size);
604 651
605 if (bdops->set_capacity && 652 if (disk_unlock_native_capacity(disk)) {
606 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { 653 /* free state and restart */
607 printk(KERN_CONT "enabling native capacity\n"); 654 goto rescan;
608 capacity = bdops->set_capacity(disk, ~0ULL);
609 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
610 if (capacity > get_capacity(disk)) {
611 set_capacity(disk, capacity);
612 check_disk_size_change(disk, bdev);
613 bdev->bd_invalidated = 0;
614 }
615 goto try_scan;
616 } else { 655 } else {
617 /* 656 /*
618 * we can not ignore partitions of broken tables 657 * we can not ignore partitions of broken tables
@@ -620,7 +659,6 @@ try_scan:
620 * we limit them to the end of the disk to avoid 659 * we limit them to the end of the disk to avoid
621 * creating invalid block devices 660 * creating invalid block devices
622 */ 661 */
623 printk(KERN_CONT "limited to end of disk\n");
624 size = get_capacity(disk) - from; 662 size = get_capacity(disk) - from;
625 } 663 }
626 } 664 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
6 * description. 6 * description.
7 */ 7 */
8struct parsed_partitions { 8struct parsed_partitions {
9 struct block_device *bdev;
9 char name[BDEVNAME_SIZE]; 10 char name[BDEVNAME_SIZE];
10 struct { 11 struct {
11 sector_t from; 12 sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
14 } parts[DISK_MAX_PARTS]; 15 } parts[DISK_MAX_PARTS];
15 int next; 16 int next;
16 int limit; 17 int limit;
18 bool access_beyond_eod;
17}; 19};
18 20
21static inline void *read_part_sector(struct parsed_partitions *state,
22 sector_t n, Sector *p)
23{
24 if (n >= get_capacity(state->bdev->bd_disk)) {
25 state->access_beyond_eod = true;
26 return NULL;
27 }
28 return read_dev_sector(state->bdev, n, p);
29}
30
19static inline void 31static inline void
20put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
21{ 33{
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
140 * the part[0] entry for this disk, and is the number of 140 * the part[0] entry for this disk, and is the number of
141 * physical sectors available on the disk. 141 * physical sectors available on the disk.
142 */ 142 */
143static u64 143static u64 last_lba(struct block_device *bdev)
144last_lba(struct block_device *bdev)
145{ 144{
146 if (!bdev || !bdev->bd_inode) 145 if (!bdev || !bdev->bd_inode)
147 return 0; 146 return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
181 180
182/** 181/**
183 * read_lba(): Read bytes from disk, starting at given LBA 182 * read_lba(): Read bytes from disk, starting at given LBA
184 * @bdev 183 * @state
185 * @lba 184 * @lba
186 * @buffer 185 * @buffer
187 * @size_t 186 * @size_t
188 * 187 *
189 * Description: Reads @count bytes from @bdev into @buffer. 188 * Description: Reads @count bytes from @state->bdev into @buffer.
190 * Returns number of bytes read on success, 0 on error. 189 * Returns number of bytes read on success, 0 on error.
191 */ 190 */
192static size_t 191static size_t read_lba(struct parsed_partitions *state,
193read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 192 u64 lba, u8 *buffer, size_t count)
194{ 193{
195 size_t totalreadcount = 0; 194 size_t totalreadcount = 0;
195 struct block_device *bdev = state->bdev;
196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512); 196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
197 197
198 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!buffer || lba > last_lba(bdev))
199 return 0; 199 return 0;
200 200
201 while (count) { 201 while (count) {
202 int copied = 512; 202 int copied = 512;
203 Sector sect; 203 Sector sect;
204 unsigned char *data = read_dev_sector(bdev, n++, &sect); 204 unsigned char *data = read_part_sector(state, n++, &sect);
205 if (!data) 205 if (!data)
206 break; 206 break;
207 if (copied > count) 207 if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
217 217
218/** 218/**
219 * alloc_read_gpt_entries(): reads partition entries from disk 219 * alloc_read_gpt_entries(): reads partition entries from disk
220 * @bdev 220 * @state
221 * @gpt - GPT header 221 * @gpt - GPT header
222 * 222 *
223 * Description: Returns ptes on success, NULL on error. 223 * Description: Returns ptes on success, NULL on error.
224 * Allocates space for PTEs based on information found in @gpt. 224 * Allocates space for PTEs based on information found in @gpt.
225 * Notes: remember to free pte when you're done! 225 * Notes: remember to free pte when you're done!
226 */ 226 */
227static gpt_entry * 227static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
228alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) 228 gpt_header *gpt)
229{ 229{
230 size_t count; 230 size_t count;
231 gpt_entry *pte; 231 gpt_entry *pte;
232 if (!bdev || !gpt) 232
233 if (!gpt)
233 return NULL; 234 return NULL;
234 235
235 count = le32_to_cpu(gpt->num_partition_entries) * 236 count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
240 if (!pte) 241 if (!pte)
241 return NULL; 242 return NULL;
242 243
243 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), 244 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
244 (u8 *) pte, 245 (u8 *) pte,
245 count) < count) { 246 count) < count) {
246 kfree(pte); 247 kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
252 253
253/** 254/**
254 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk 255 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
255 * @bdev 256 * @state
256 * @lba is the Logical Block Address of the partition table 257 * @lba is the Logical Block Address of the partition table
257 * 258 *
258 * Description: returns GPT header on success, NULL on error. Allocates 259 * Description: returns GPT header on success, NULL on error. Allocates
259 * and fills a GPT header starting at @ from @bdev. 260 * and fills a GPT header starting at @ from @state->bdev.
260 * Note: remember to free gpt when finished with it. 261 * Note: remember to free gpt when finished with it.
261 */ 262 */
262static gpt_header * 263static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
263alloc_read_gpt_header(struct block_device *bdev, u64 lba) 264 u64 lba)
264{ 265{
265 gpt_header *gpt; 266 gpt_header *gpt;
266 unsigned ssz = bdev_logical_block_size(bdev); 267 unsigned ssz = bdev_logical_block_size(state->bdev);
267
268 if (!bdev)
269 return NULL;
270 268
271 gpt = kzalloc(ssz, GFP_KERNEL); 269 gpt = kzalloc(ssz, GFP_KERNEL);
272 if (!gpt) 270 if (!gpt)
273 return NULL; 271 return NULL;
274 272
275 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { 273 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
276 kfree(gpt); 274 kfree(gpt);
277 gpt=NULL; 275 gpt=NULL;
278 return NULL; 276 return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
283 281
284/** 282/**
285 * is_gpt_valid() - tests one GPT header and PTEs for validity 283 * is_gpt_valid() - tests one GPT header and PTEs for validity
286 * @bdev 284 * @state
287 * @lba is the logical block address of the GPT header to test 285 * @lba is the logical block address of the GPT header to test
288 * @gpt is a GPT header ptr, filled on return. 286 * @gpt is a GPT header ptr, filled on return.
289 * @ptes is a PTEs ptr, filled on return. 287 * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
291 * Description: returns 1 if valid, 0 on error. 289 * Description: returns 1 if valid, 0 on error.
292 * If valid, returns pointers to newly allocated GPT header and PTEs. 290 * If valid, returns pointers to newly allocated GPT header and PTEs.
293 */ 291 */
294static int 292static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
295is_gpt_valid(struct block_device *bdev, u64 lba, 293 gpt_header **gpt, gpt_entry **ptes)
296 gpt_header **gpt, gpt_entry **ptes)
297{ 294{
298 u32 crc, origcrc; 295 u32 crc, origcrc;
299 u64 lastlba; 296 u64 lastlba;
300 297
301 if (!bdev || !gpt || !ptes) 298 if (!ptes)
302 return 0; 299 return 0;
303 if (!(*gpt = alloc_read_gpt_header(bdev, lba))) 300 if (!(*gpt = alloc_read_gpt_header(state, lba)))
304 return 0; 301 return 0;
305 302
306 /* Check the GUID Partition Table signature */ 303 /* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
336 /* Check the first_usable_lba and last_usable_lba are 333 /* Check the first_usable_lba and last_usable_lba are
337 * within the disk. 334 * within the disk.
338 */ 335 */
339 lastlba = last_lba(bdev); 336 lastlba = last_lba(state->bdev);
340 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { 337 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
341 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", 338 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
342 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), 339 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
350 goto fail; 347 goto fail;
351 } 348 }
352 349
353 if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) 350 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
354 goto fail; 351 goto fail;
355 352
356 /* Check the GUID Partition Entry Array CRC */ 353 /* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
495 492
496/** 493/**
497 * find_valid_gpt() - Search disk for valid GPT headers and PTEs 494 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
498 * @bdev 495 * @state
499 * @gpt is a GPT header ptr, filled on return. 496 * @gpt is a GPT header ptr, filled on return.
500 * @ptes is a PTEs ptr, filled on return. 497 * @ptes is a PTEs ptr, filled on return.
501 * Description: Returns 1 if valid, 0 on error. 498 * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
508 * This protects against devices which misreport their size, and forces 505 * This protects against devices which misreport their size, and forces
509 * the user to decide to use the Alternate GPT. 506 * the user to decide to use the Alternate GPT.
510 */ 507 */
511static int 508static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
512find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) 509 gpt_entry **ptes)
513{ 510{
514 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; 511 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
515 gpt_header *pgpt = NULL, *agpt = NULL; 512 gpt_header *pgpt = NULL, *agpt = NULL;
516 gpt_entry *pptes = NULL, *aptes = NULL; 513 gpt_entry *pptes = NULL, *aptes = NULL;
517 legacy_mbr *legacymbr; 514 legacy_mbr *legacymbr;
518 u64 lastlba; 515 u64 lastlba;
519 if (!bdev || !gpt || !ptes) 516
517 if (!ptes)
520 return 0; 518 return 0;
521 519
522 lastlba = last_lba(bdev); 520 lastlba = last_lba(state->bdev);
523 if (!force_gpt) { 521 if (!force_gpt) {
524 /* This will be added to the EFI Spec. per Intel after v1.02. */ 522 /* This will be added to the EFI Spec. per Intel after v1.02. */
525 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); 523 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
526 if (legacymbr) { 524 if (legacymbr) {
527 read_lba(bdev, 0, (u8 *) legacymbr, 525 read_lba(state, 0, (u8 *) legacymbr,
528 sizeof (*legacymbr)); 526 sizeof (*legacymbr));
529 good_pmbr = is_pmbr_valid(legacymbr); 527 good_pmbr = is_pmbr_valid(legacymbr);
530 kfree(legacymbr); 528 kfree(legacymbr);
531 } 529 }
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
533 goto fail; 531 goto fail;
534 } 532 }
535 533
536 good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, 534 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
537 &pgpt, &pptes); 535 &pgpt, &pptes);
538 if (good_pgpt) 536 if (good_pgpt)
539 good_agpt = is_gpt_valid(bdev, 537 good_agpt = is_gpt_valid(state,
540 le64_to_cpu(pgpt->alternate_lba), 538 le64_to_cpu(pgpt->alternate_lba),
541 &agpt, &aptes); 539 &agpt, &aptes);
542 if (!good_agpt && force_gpt) 540 if (!good_agpt && force_gpt)
543 good_agpt = is_gpt_valid(bdev, lastlba, 541 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
544 &agpt, &aptes);
545 542
546 /* The obviously unsuccessful case */ 543 /* The obviously unsuccessful case */
547 if (!good_pgpt && !good_agpt) 544 if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
583} 580}
584 581
585/** 582/**
586 * efi_partition(struct parsed_partitions *state, struct block_device *bdev) 583 * efi_partition(struct parsed_partitions *state)
587 * @state 584 * @state
588 * @bdev
589 * 585 *
590 * Description: called from check.c, if the disk contains GPT 586 * Description: called from check.c, if the disk contains GPT
591 * partitions, sets up partition entries in the kernel. 587 * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
602 * 1 if successful 598 * 1 if successful
603 * 599 *
604 */ 600 */
605int 601int efi_partition(struct parsed_partitions *state)
606efi_partition(struct parsed_partitions *state, struct block_device *bdev)
607{ 602{
608 gpt_header *gpt = NULL; 603 gpt_header *gpt = NULL;
609 gpt_entry *ptes = NULL; 604 gpt_entry *ptes = NULL;
610 u32 i; 605 u32 i;
611 unsigned ssz = bdev_logical_block_size(bdev) / 512; 606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
612 607
613 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
614 kfree(gpt); 609 kfree(gpt);
615 kfree(ptes); 610 kfree(ptes);
616 return 0; 611 return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
623 u64 size = le64_to_cpu(ptes[i].ending_lba) - 618 u64 size = le64_to_cpu(ptes[i].ending_lba) -
624 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 619 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
625 620
626 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 621 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
627 continue; 622 continue;
628 623
629 put_partition(state, i+1, start * ssz, size * ssz); 624 put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
631 /* If this is a RAID volume, tell md */ 626 /* If this is a RAID volume, tell md */
632 if (!efi_guidcmp(ptes[i].partition_type_guid, 627 if (!efi_guidcmp(ptes[i].partition_type_guid,
633 PARTITION_LINUX_RAID_GUID)) 628 PARTITION_LINUX_RAID_GUID))
634 state->parts[i+1].flags = 1; 629 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635 } 630 }
636 kfree(ptes); 631 kfree(ptes);
637 kfree(gpt); 632 kfree(gpt);
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
110} __attribute__ ((packed)) legacy_mbr; 110} __attribute__ ((packed)) legacy_mbr;
111 111
112/* Functions */ 112/* Functions */
113extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); 113extern int efi_partition(struct parsed_partitions *state);
114 114
115#endif 115#endif
116 116
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
58 58
59/* 59/*
60 */ 60 */
61int 61int ibm_partition(struct parsed_partitions *state)
62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
63{ 62{
63 struct block_device *bdev = state->bdev;
64 int blocksize, res; 64 int blocksize, res;
65 loff_t i_size, offset, size, fmt_size; 65 loff_t i_size, offset, size, fmt_size;
66 dasd_information2_t *info; 66 dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
100 /* 100 /*
101 * Get volume label, extract name and type. 101 * Get volume label, extract name and type.
102 */ 102 */
103 data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect); 103 data = read_part_sector(state, info->label_block*(blocksize/512),
104 &sect);
104 if (data == NULL) 105 if (data == NULL)
105 goto out_readerr; 106 goto out_readerr;
106 107
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
193 */ 194 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 195 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 196 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 197 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 198 &sect);
198 while (data != NULL) { 199 while (data != NULL) {
199 struct vtoc_format1_label f1; 200 struct vtoc_format1_label f1;
200 201
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 209 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 210 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 211 blk++;
211 data = read_dev_sector(bdev, blk * 212 data = read_part_sector(state,
212 (blocksize/512), 213 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 214 continue;
215 } 215 }
216 216
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 230 size * (blocksize >> 9));
231 counter++; 231 counter++;
232 blk++; 232 blk++;
233 data = read_dev_sector(bdev, 233 data = read_part_sector(state,
234 blk * (blocksize/512), 234 blk * (blocksize/512), &sect);
235 &sect);
236 } 235 }
237 236
238 if (!data) 237 if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
247 */ 245 */
248static void 246static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 248 int max_partitions)
251 int max_partitions)
252{ 249{
253 Sector sect; 250 Sector sect;
254 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
255 struct bsd_partition *p; 252 struct bsd_partition *p;
256 253
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 255 if (!l)
259 return; 256 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
291} 288}
292#endif 289#endif
293 290
294static void 291static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 293{
298#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 296#endif
302} 297}
303 298
304static void 299static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 301{
308#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 304#endif
312} 305}
313 306
314static void 307static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 309{
318#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
321#endif 313#endif
322} 314}
323 315
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
327 */ 319 */
328static void 320static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 322{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 324 Sector sect;
334 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
335 struct unixware_slice *p; 326 struct unixware_slice *p;
336 327
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 329 if (!l)
339 return; 330 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 358 */
368static void 359static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 361{
372#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 363 Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 365 struct partition *p;
376 int i; 366 int i;
377 367
378 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
379 if (!data) 369 if (!data)
380 return; 370 return;
381 371
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 394
405static struct { 395static struct {
406 unsigned char id; 396 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 398} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
417 {0, NULL}, 406 {0, NULL},
418}; 407};
419 408
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
421{ 410{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 412 Sector sect;
424 unsigned char *data; 413 unsigned char *data;
425 struct partition *p; 414 struct partition *p;
426 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
427 int slot; 416 int slot;
428 417
429 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
430 if (!data) 419 if (!data)
431 return -1; 420 return -1;
432 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 423 return 0;
435 } 424 }
436 425
437 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 427 put_dev_sector(sect);
439 printk( " [AIX]"); 428 printk( " [AIX]");
440 return 0; 429 return 0;
@@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
503 put_partition(state, slot, start, n); 492 put_partition(state, slot, start, n);
504 493
505 printk(" <"); 494 printk(" <");
506 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
507 printk(" >"); 496 printk(" >");
508 continue; 497 continue;
509 } 498 }
510 put_partition(state, slot, start, size); 499 put_partition(state, slot, start, size);
511 if (SYS_IND(p) == LINUX_RAID_PARTITION) 500 if (SYS_IND(p) == LINUX_RAID_PARTITION)
512 state->parts[slot].flags = 1; 501 state->parts[slot].flags = ADDPART_FLAG_RAID;
513 if (SYS_IND(p) == DM6_PARTITION) 502 if (SYS_IND(p) == DM6_PARTITION)
514 printk("[DM]"); 503 printk("[DM]");
515 if (SYS_IND(p) == EZD_PARTITION) 504 if (SYS_IND(p) == EZD_PARTITION)
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 521
533 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
534 continue; 523 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
537 } 526 }
538 put_dev_sector(sect); 527 put_dev_sector(sect);
539 return 1; 528 return 1;
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..d79872eba09a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,18 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages
30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
32
33/*
26 * We use a start+len construction, which provides full use of the 34 * We use a start+len construction, which provides full use of the
27 * allocated memory. 35 * allocated memory.
28 * -- Florian Coosmann (FGC) 36 * -- Florian Coosmann (FGC)
@@ -390,7 +398,7 @@ redo:
390 if (!buf->len) { 398 if (!buf->len) {
391 buf->ops = NULL; 399 buf->ops = NULL;
392 ops->release(pipe, buf); 400 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 401 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 402 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 403 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 404 do_wakeup = 1;
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 480 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 481 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 482 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 483 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 484 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 485 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 486 int offset = buf->offset + buf->len;
@@ -518,8 +526,8 @@ redo1:
518 break; 526 break;
519 } 527 }
520 bufs = pipe->nrbufs; 528 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 529 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 530 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 531 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 532 struct page *page = pipe->tmp_page;
525 char *src; 533 char *src;
@@ -580,7 +588,7 @@ redo2:
580 if (!total_len) 588 if (!total_len)
581 break; 589 break;
582 } 590 }
583 if (bufs < PIPE_BUFFERS) 591 if (bufs < pipe->buffers)
584 continue; 592 continue;
585 if (filp->f_flags & O_NONBLOCK) { 593 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 594 if (!ret)
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 648 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 649 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 650 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 651 buf = (buf+1) & (pipe->buffers - 1);
644 } 652 }
645 mutex_unlock(&inode->i_mutex); 653 mutex_unlock(&inode->i_mutex);
646 654
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 679 }
672 680
673 if (filp->f_mode & FMODE_WRITE) { 681 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 682 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 683 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 684 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 685 * behave exactly like pipes for poll().
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 885
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 886 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 887 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 888 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 889 if (pipe->bufs) {
882 pipe->inode = inode; 890 init_waitqueue_head(&pipe->wait);
891 pipe->r_counter = pipe->w_counter = 1;
892 pipe->inode = inode;
893 pipe->buffers = PIPE_DEF_BUFFERS;
894 return pipe;
895 }
896 kfree(pipe);
883 } 897 }
884 898
885 return pipe; 899 return NULL;
886} 900}
887 901
888void __free_pipe_info(struct pipe_inode_info *pipe) 902void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 903{
890 int i; 904 int i;
891 905
892 for (i = 0; i < PIPE_BUFFERS; i++) { 906 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 907 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 908 if (buf->ops)
895 buf->ops->release(pipe, buf); 909 buf->ops->release(pipe, buf);
896 } 910 }
897 if (pipe->tmp_page) 911 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 912 __free_page(pipe->tmp_page);
913 kfree(pipe->bufs);
899 kfree(pipe); 914 kfree(pipe);
900} 915}
901 916
@@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1109}
1095 1110
1096/* 1111/*
1112 * Allocate a new array of pipe buffers and copy the info over. Returns the
1113 * pipe size if successful, or return -ERROR on error.
1114 */
1115static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1116{
1117 struct pipe_buffer *bufs;
1118
1119 /*
1120 * Must be a power-of-2 currently
1121 */
1122 if (!is_power_of_2(arg))
1123 return -EINVAL;
1124
1125 /*
1126 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1127 * expect a lot of shrink+grow operations, just free and allocate
1128 * again like we would do for growing. If the pipe currently
1129 * contains more buffers than arg, then return busy.
1130 */
1131 if (arg < pipe->nrbufs)
1132 return -EBUSY;
1133
1134 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
1135 if (unlikely(!bufs))
1136 return -ENOMEM;
1137
1138 /*
1139 * The pipe array wraps around, so just start the new one at zero
1140 * and adjust the indexes.
1141 */
1142 if (pipe->nrbufs) {
1143 const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
1144 const unsigned int head = pipe->nrbufs - tail;
1145
1146 if (head)
1147 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1148 if (tail)
1149 memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
1150 }
1151
1152 pipe->curbuf = 0;
1153 kfree(pipe->bufs);
1154 pipe->bufs = bufs;
1155 pipe->buffers = arg;
1156 return arg;
1157}
1158
1159long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1160{
1161 struct pipe_inode_info *pipe;
1162 long ret;
1163
1164 pipe = file->f_path.dentry->d_inode->i_pipe;
1165 if (!pipe)
1166 return -EBADF;
1167
1168 mutex_lock(&pipe->inode->i_mutex);
1169
1170 switch (cmd) {
1171 case F_SETPIPE_SZ:
1172 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
1173 return -EINVAL;
1174 /*
1175 * The pipe needs to be at least 2 pages large to
1176 * guarantee POSIX behaviour.
1177 */
1178 if (arg < 2)
1179 return -EINVAL;
1180 ret = pipe_set_size(pipe, arg);
1181 break;
1182 case F_GETPIPE_SZ:
1183 ret = pipe->buffers;
1184 break;
1185 default:
1186 ret = -EINVAL;
1187 break;
1188 }
1189
1190 mutex_unlock(&pipe->inode->i_mutex);
1191 return ret;
1192}
1193
1194/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1195 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1196 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1197 * any operations on the root directory. However, we need a non-trivial
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
634 return err; 634 return err;
635} 635}
636 636
637#ifdef CONFIG_HUGETLB_PAGE
637static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
638{ 639{
639 u64 pme = 0; 640 u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
664 665
665 return err; 666 return err;
666} 667}
668#endif /* HUGETLB_PAGE */
667 669
668/* 670/*
669 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
733 735
734 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
735 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
736 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
737 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
738 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
739 743
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 788b5802a7ce..655a4c52b8c3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -82,7 +82,7 @@
82 82
83/* 83/*
84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas 84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
85 * and quota formats, dqstats structure containing statistics about the lists 85 * and quota formats.
86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and 86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -132,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
135static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
137#endif
136static struct quota_format_type *quota_formats; /* List of registered formats */ 138static struct quota_format_type *quota_formats; /* List of registered formats */
137static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; 139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
138 140
@@ -226,6 +228,10 @@ static struct hlist_head *dquot_hash;
226 228
227struct dqstats dqstats; 229struct dqstats dqstats;
228EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231#ifdef CONFIG_SMP
232struct dqstats *dqstats_pcpu;
233EXPORT_SYMBOL(dqstats_pcpu);
234#endif
229 235
230static qsize_t inode_get_rsv_space(struct inode *inode); 236static qsize_t inode_get_rsv_space(struct inode *inode);
231static void __dquot_initialize(struct inode *inode, int type); 237static void __dquot_initialize(struct inode *inode, int type);
@@ -273,7 +279,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
273static inline void put_dquot_last(struct dquot *dquot) 279static inline void put_dquot_last(struct dquot *dquot)
274{ 280{
275 list_add_tail(&dquot->dq_free, &free_dquots); 281 list_add_tail(&dquot->dq_free, &free_dquots);
276 dqstats.free_dquots++; 282 dqstats_inc(DQST_FREE_DQUOTS);
277} 283}
278 284
279static inline void remove_free_dquot(struct dquot *dquot) 285static inline void remove_free_dquot(struct dquot *dquot)
@@ -281,7 +287,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
281 if (list_empty(&dquot->dq_free)) 287 if (list_empty(&dquot->dq_free))
282 return; 288 return;
283 list_del_init(&dquot->dq_free); 289 list_del_init(&dquot->dq_free);
284 dqstats.free_dquots--; 290 dqstats_dec(DQST_FREE_DQUOTS);
285} 291}
286 292
287static inline void put_inuse(struct dquot *dquot) 293static inline void put_inuse(struct dquot *dquot)
@@ -289,12 +295,12 @@ static inline void put_inuse(struct dquot *dquot)
289 /* We add to the back of inuse list so we don't have to restart 295 /* We add to the back of inuse list so we don't have to restart
290 * when traversing this list and we block */ 296 * when traversing this list and we block */
291 list_add_tail(&dquot->dq_inuse, &inuse_list); 297 list_add_tail(&dquot->dq_inuse, &inuse_list);
292 dqstats.allocated_dquots++; 298 dqstats_inc(DQST_ALLOC_DQUOTS);
293} 299}
294 300
295static inline void remove_inuse(struct dquot *dquot) 301static inline void remove_inuse(struct dquot *dquot)
296{ 302{
297 dqstats.allocated_dquots--; 303 dqstats_dec(DQST_ALLOC_DQUOTS);
298 list_del(&dquot->dq_inuse); 304 list_del(&dquot->dq_inuse);
299} 305}
300/* 306/*
@@ -317,14 +323,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
317 return dquot->dq_sb->dq_op->mark_dirty(dquot); 323 return dquot->dq_sb->dq_op->mark_dirty(dquot);
318} 324}
319 325
326/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */
320int dquot_mark_dquot_dirty(struct dquot *dquot) 327int dquot_mark_dquot_dirty(struct dquot *dquot)
321{ 328{
329 int ret = 1;
330
331 /* If quota is dirty already, we don't have to acquire dq_list_lock */
332 if (test_bit(DQ_MOD_B, &dquot->dq_flags))
333 return 1;
334
322 spin_lock(&dq_list_lock); 335 spin_lock(&dq_list_lock);
323 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) 336 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
324 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> 337 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
325 info[dquot->dq_type].dqi_dirty_list); 338 info[dquot->dq_type].dqi_dirty_list);
339 ret = 0;
340 }
326 spin_unlock(&dq_list_lock); 341 spin_unlock(&dq_list_lock);
327 return 0; 342 return ret;
328} 343}
329EXPORT_SYMBOL(dquot_mark_dquot_dirty); 344EXPORT_SYMBOL(dquot_mark_dquot_dirty);
330 345
@@ -550,8 +565,8 @@ int dquot_scan_active(struct super_block *sb,
550 continue; 565 continue;
551 /* Now we have active dquot so we can just increase use count */ 566 /* Now we have active dquot so we can just increase use count */
552 atomic_inc(&dquot->dq_count); 567 atomic_inc(&dquot->dq_count);
553 dqstats.lookups++;
554 spin_unlock(&dq_list_lock); 568 spin_unlock(&dq_list_lock);
569 dqstats_inc(DQST_LOOKUPS);
555 dqput(old_dquot); 570 dqput(old_dquot);
556 old_dquot = dquot; 571 old_dquot = dquot;
557 ret = fn(dquot, priv); 572 ret = fn(dquot, priv);
@@ -596,8 +611,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
596 * holding reference so we can safely just increase 611 * holding reference so we can safely just increase
597 * use count */ 612 * use count */
598 atomic_inc(&dquot->dq_count); 613 atomic_inc(&dquot->dq_count);
599 dqstats.lookups++;
600 spin_unlock(&dq_list_lock); 614 spin_unlock(&dq_list_lock);
615 dqstats_inc(DQST_LOOKUPS);
601 sb->dq_op->write_dquot(dquot); 616 sb->dq_op->write_dquot(dquot);
602 dqput(dquot); 617 dqput(dquot);
603 spin_lock(&dq_list_lock); 618 spin_lock(&dq_list_lock);
@@ -609,9 +624,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
609 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) 624 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
610 && info_dirty(&dqopt->info[cnt])) 625 && info_dirty(&dqopt->info[cnt]))
611 sb->dq_op->write_info(sb, cnt); 626 sb->dq_op->write_info(sb, cnt);
612 spin_lock(&dq_list_lock); 627 dqstats_inc(DQST_SYNCS);
613 dqstats.syncs++;
614 spin_unlock(&dq_list_lock);
615 mutex_unlock(&dqopt->dqonoff_mutex); 628 mutex_unlock(&dqopt->dqonoff_mutex);
616 629
617 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) 630 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -663,6 +676,22 @@ static void prune_dqcache(int count)
663 } 676 }
664} 677}
665 678
679static int dqstats_read(unsigned int type)
680{
681 int count = 0;
682#ifdef CONFIG_SMP
683 int cpu;
684 for_each_possible_cpu(cpu)
685 count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
686 /* Statistics reading is racy, but absolute accuracy isn't required */
687 if (count < 0)
688 count = 0;
689#else
690 count = dqstats.stat[type];
691#endif
692 return count;
693}
694
666/* 695/*
667 * This is called from kswapd when we think we need some 696 * This is called from kswapd when we think we need some
668 * more memory 697 * more memory
@@ -675,7 +704,7 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
675 prune_dqcache(nr); 704 prune_dqcache(nr);
676 spin_unlock(&dq_list_lock); 705 spin_unlock(&dq_list_lock);
677 } 706 }
678 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 707 return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
679} 708}
680 709
681static struct shrinker dqcache_shrinker = { 710static struct shrinker dqcache_shrinker = {
@@ -703,10 +732,7 @@ void dqput(struct dquot *dquot)
703 BUG(); 732 BUG();
704 } 733 }
705#endif 734#endif
706 735 dqstats_inc(DQST_DROPS);
707 spin_lock(&dq_list_lock);
708 dqstats.drops++;
709 spin_unlock(&dq_list_lock);
710we_slept: 736we_slept:
711 spin_lock(&dq_list_lock); 737 spin_lock(&dq_list_lock);
712 if (atomic_read(&dquot->dq_count) > 1) { 738 if (atomic_read(&dquot->dq_count) > 1) {
@@ -823,15 +849,15 @@ we_slept:
823 put_inuse(dquot); 849 put_inuse(dquot);
824 /* hash it first so it can be found */ 850 /* hash it first so it can be found */
825 insert_dquot_hash(dquot); 851 insert_dquot_hash(dquot);
826 dqstats.lookups++;
827 spin_unlock(&dq_list_lock); 852 spin_unlock(&dq_list_lock);
853 dqstats_inc(DQST_LOOKUPS);
828 } else { 854 } else {
829 if (!atomic_read(&dquot->dq_count)) 855 if (!atomic_read(&dquot->dq_count))
830 remove_free_dquot(dquot); 856 remove_free_dquot(dquot);
831 atomic_inc(&dquot->dq_count); 857 atomic_inc(&dquot->dq_count);
832 dqstats.cache_hits++;
833 dqstats.lookups++;
834 spin_unlock(&dq_list_lock); 858 spin_unlock(&dq_list_lock);
859 dqstats_inc(DQST_CACHE_HITS);
860 dqstats_inc(DQST_LOOKUPS);
835 } 861 }
836 /* Wait for dq_lock - after this we know that either dquot_release() is 862 /* Wait for dq_lock - after this we know that either dquot_release() is
837 * already finished or it will be canceled due to dq_count > 1 test */ 863 * already finished or it will be canceled due to dq_count > 1 test */
@@ -1677,16 +1703,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1677 1703
1678/* 1704/*
1679 * Transfer the number of inode and blocks from one diskquota to an other. 1705 * Transfer the number of inode and blocks from one diskquota to an other.
1706 * On success, dquot references in transfer_to are consumed and references
1707 * to original dquots that need to be released are placed there. On failure,
1708 * references are kept untouched.
1680 * 1709 *
1681 * This operation can block, but only after everything is updated 1710 * This operation can block, but only after everything is updated
1682 * A transaction must be started when entering this function. 1711 * A transaction must be started when entering this function.
1712 *
1683 */ 1713 */
1684static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1714int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1685{ 1715{
1686 qsize_t space, cur_space; 1716 qsize_t space, cur_space;
1687 qsize_t rsv_space = 0; 1717 qsize_t rsv_space = 0;
1688 struct dquot *transfer_from[MAXQUOTAS]; 1718 struct dquot *transfer_from[MAXQUOTAS] = {};
1689 struct dquot *transfer_to[MAXQUOTAS];
1690 int cnt, ret = 0; 1719 int cnt, ret = 0;
1691 char warntype_to[MAXQUOTAS]; 1720 char warntype_to[MAXQUOTAS];
1692 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1721 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1696,19 +1725,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1696 if (IS_NOQUOTA(inode)) 1725 if (IS_NOQUOTA(inode))
1697 return 0; 1726 return 0;
1698 /* Initialize the arrays */ 1727 /* Initialize the arrays */
1699 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1700 transfer_from[cnt] = NULL;
1701 transfer_to[cnt] = NULL;
1702 warntype_to[cnt] = QUOTA_NL_NOWARN; 1729 warntype_to[cnt] = QUOTA_NL_NOWARN;
1703 }
1704 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1705 if (mask & (1 << cnt))
1706 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1707 }
1708 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1730 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1709 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1731 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1710 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1732 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1711 goto put_all; 1733 return 0;
1712 } 1734 }
1713 spin_lock(&dq_data_lock); 1735 spin_lock(&dq_data_lock);
1714 cur_space = inode_get_bytes(inode); 1736 cur_space = inode_get_bytes(inode);
@@ -1760,47 +1782,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1760 1782
1761 mark_all_dquot_dirty(transfer_from); 1783 mark_all_dquot_dirty(transfer_from);
1762 mark_all_dquot_dirty(transfer_to); 1784 mark_all_dquot_dirty(transfer_to);
1763 /* The reference we got is transferred to the inode */ 1785 /* Pass back references to put */
1764 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1786 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1765 transfer_to[cnt] = NULL; 1787 transfer_to[cnt] = transfer_from[cnt];
1766warn_put_all: 1788warn:
1767 flush_warnings(transfer_to, warntype_to); 1789 flush_warnings(transfer_to, warntype_to);
1768 flush_warnings(transfer_from, warntype_from_inodes); 1790 flush_warnings(transfer_from, warntype_from_inodes);
1769 flush_warnings(transfer_from, warntype_from_space); 1791 flush_warnings(transfer_from, warntype_from_space);
1770put_all:
1771 dqput_all(transfer_from);
1772 dqput_all(transfer_to);
1773 return ret; 1792 return ret;
1774over_quota: 1793over_quota:
1775 spin_unlock(&dq_data_lock); 1794 spin_unlock(&dq_data_lock);
1776 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1795 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1777 /* Clear dquot pointers we don't want to dqput() */ 1796 goto warn;
1778 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1779 transfer_from[cnt] = NULL;
1780 goto warn_put_all;
1781} 1797}
1798EXPORT_SYMBOL(__dquot_transfer);
1782 1799
1783/* Wrapper for transferring ownership of an inode for uid/gid only 1800/* Wrapper for transferring ownership of an inode for uid/gid only
1784 * Called from FSXXX_setattr() 1801 * Called from FSXXX_setattr()
1785 */ 1802 */
1786int dquot_transfer(struct inode *inode, struct iattr *iattr) 1803int dquot_transfer(struct inode *inode, struct iattr *iattr)
1787{ 1804{
1788 qid_t chid[MAXQUOTAS]; 1805 struct dquot *transfer_to[MAXQUOTAS] = {};
1789 unsigned long mask = 0; 1806 struct super_block *sb = inode->i_sb;
1807 int ret;
1790 1808
1791 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1809 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
1792 mask |= 1 << USRQUOTA; 1810 return 0;
1793 chid[USRQUOTA] = iattr->ia_uid; 1811
1794 } 1812 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1795 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1813 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1796 mask |= 1 << GRPQUOTA; 1814 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1797 chid[GRPQUOTA] = iattr->ia_gid; 1815 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA);
1798 } 1816
1799 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1817 ret = __dquot_transfer(inode, transfer_to);
1800 dquot_initialize(inode); 1818 dqput_all(transfer_to);
1801 return __dquot_transfer(inode, chid, mask); 1819 return ret;
1802 }
1803 return 0;
1804} 1820}
1805EXPORT_SYMBOL(dquot_transfer); 1821EXPORT_SYMBOL(dquot_transfer);
1806 1822
@@ -2275,25 +2291,30 @@ static inline qsize_t stoqb(qsize_t space)
2275} 2291}
2276 2292
2277/* Generic routine for getting common part of quota structure */ 2293/* Generic routine for getting common part of quota structure */
2278static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2294static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2279{ 2295{
2280 struct mem_dqblk *dm = &dquot->dq_dqb; 2296 struct mem_dqblk *dm = &dquot->dq_dqb;
2281 2297
2298 memset(di, 0, sizeof(*di));
2299 di->d_version = FS_DQUOT_VERSION;
2300 di->d_flags = dquot->dq_type == USRQUOTA ?
2301 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
2302 di->d_id = dquot->dq_id;
2303
2282 spin_lock(&dq_data_lock); 2304 spin_lock(&dq_data_lock);
2283 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2305 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2284 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2306 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2285 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2307 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2286 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2308 di->d_ino_softlimit = dm->dqb_isoftlimit;
2287 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2309 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2288 di->dqb_curinodes = dm->dqb_curinodes; 2310 di->d_icount = dm->dqb_curinodes;
2289 di->dqb_btime = dm->dqb_btime; 2311 di->d_btimer = dm->dqb_btime;
2290 di->dqb_itime = dm->dqb_itime; 2312 di->d_itimer = dm->dqb_itime;
2291 di->dqb_valid = QIF_ALL;
2292 spin_unlock(&dq_data_lock); 2313 spin_unlock(&dq_data_lock);
2293} 2314}
2294 2315
2295int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2316int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2296 struct if_dqblk *di) 2317 struct fs_disk_quota *di)
2297{ 2318{
2298 struct dquot *dquot; 2319 struct dquot *dquot;
2299 2320
@@ -2307,51 +2328,70 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2307} 2328}
2308EXPORT_SYMBOL(vfs_get_dqblk); 2329EXPORT_SYMBOL(vfs_get_dqblk);
2309 2330
2331#define VFS_FS_DQ_MASK \
2332 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2333 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2334 FS_DQ_BTIMER | FS_DQ_ITIMER)
2335
2310/* Generic routine for setting common part of quota structure */ 2336/* Generic routine for setting common part of quota structure */
2311static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2337static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2312{ 2338{
2313 struct mem_dqblk *dm = &dquot->dq_dqb; 2339 struct mem_dqblk *dm = &dquot->dq_dqb;
2314 int check_blim = 0, check_ilim = 0; 2340 int check_blim = 0, check_ilim = 0;
2315 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2341 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2316 2342
2317 if ((di->dqb_valid & QIF_BLIMITS && 2343 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2318 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2344 return -EINVAL;
2319 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2345
2320 (di->dqb_valid & QIF_ILIMITS && 2346 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2321 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2347 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2322 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2348 ((di->d_fieldmask & FS_DQ_BHARD) &&
2349 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2350 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2351 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2352 ((di->d_fieldmask & FS_DQ_IHARD) &&
2353 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2323 return -ERANGE; 2354 return -ERANGE;
2324 2355
2325 spin_lock(&dq_data_lock); 2356 spin_lock(&dq_data_lock);
2326 if (di->dqb_valid & QIF_SPACE) { 2357 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2327 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2358 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2328 check_blim = 1; 2359 check_blim = 1;
2329 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2360 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2330 } 2361 }
2331 if (di->dqb_valid & QIF_BLIMITS) { 2362
2332 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2363 if (di->d_fieldmask & FS_DQ_BSOFT)
2333 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2364 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2365 if (di->d_fieldmask & FS_DQ_BHARD)
2366 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2367 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2334 check_blim = 1; 2368 check_blim = 1;
2335 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2369 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2336 } 2370 }
2337 if (di->dqb_valid & QIF_INODES) { 2371
2338 dm->dqb_curinodes = di->dqb_curinodes; 2372 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2373 dm->dqb_curinodes = di->d_icount;
2339 check_ilim = 1; 2374 check_ilim = 1;
2340 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2375 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2341 } 2376 }
2342 if (di->dqb_valid & QIF_ILIMITS) { 2377
2343 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2378 if (di->d_fieldmask & FS_DQ_ISOFT)
2344 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2379 dm->dqb_isoftlimit = di->d_ino_softlimit;
2380 if (di->d_fieldmask & FS_DQ_IHARD)
2381 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2382 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2345 check_ilim = 1; 2383 check_ilim = 1;
2346 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2384 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2347 } 2385 }
2348 if (di->dqb_valid & QIF_BTIME) { 2386
2349 dm->dqb_btime = di->dqb_btime; 2387 if (di->d_fieldmask & FS_DQ_BTIMER) {
2388 dm->dqb_btime = di->d_btimer;
2350 check_blim = 1; 2389 check_blim = 1;
2351 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2390 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2352 } 2391 }
2353 if (di->dqb_valid & QIF_ITIME) { 2392
2354 dm->dqb_itime = di->dqb_itime; 2393 if (di->d_fieldmask & FS_DQ_ITIMER) {
2394 dm->dqb_itime = di->d_itimer;
2355 check_ilim = 1; 2395 check_ilim = 1;
2356 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2396 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2357 } 2397 }
@@ -2361,7 +2401,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2361 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2401 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2362 dm->dqb_btime = 0; 2402 dm->dqb_btime = 0;
2363 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2403 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2364 } else if (!(di->dqb_valid & QIF_BTIME)) 2404 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2365 /* Set grace only if user hasn't provided his own... */ 2405 /* Set grace only if user hasn't provided his own... */
2366 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2406 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2367 } 2407 }
@@ -2370,7 +2410,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2370 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2410 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2371 dm->dqb_itime = 0; 2411 dm->dqb_itime = 0;
2372 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2412 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2373 } else if (!(di->dqb_valid & QIF_ITIME)) 2413 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2374 /* Set grace only if user hasn't provided his own... */ 2414 /* Set grace only if user hasn't provided his own... */
2375 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2415 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2376 } 2416 }
@@ -2386,7 +2426,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2386} 2426}
2387 2427
2388int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2428int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2389 struct if_dqblk *di) 2429 struct fs_disk_quota *di)
2390{ 2430{
2391 struct dquot *dquot; 2431 struct dquot *dquot;
2392 int rc; 2432 int rc;
@@ -2465,62 +2505,74 @@ const struct quotactl_ops vfs_quotactl_ops = {
2465 .set_dqblk = vfs_set_dqblk 2505 .set_dqblk = vfs_set_dqblk
2466}; 2506};
2467 2507
2508
2509static int do_proc_dqstats(struct ctl_table *table, int write,
2510 void __user *buffer, size_t *lenp, loff_t *ppos)
2511{
2512#ifdef CONFIG_SMP
2513 /* Update global table */
2514 unsigned int type = (int *)table->data - dqstats.stat;
2515 dqstats.stat[type] = dqstats_read(type);
2516#endif
2517 return proc_dointvec(table, write, buffer, lenp, ppos);
2518}
2519
2468static ctl_table fs_dqstats_table[] = { 2520static ctl_table fs_dqstats_table[] = {
2469 { 2521 {
2470 .procname = "lookups", 2522 .procname = "lookups",
2471 .data = &dqstats.lookups, 2523 .data = &dqstats.stat[DQST_LOOKUPS],
2472 .maxlen = sizeof(int), 2524 .maxlen = sizeof(int),
2473 .mode = 0444, 2525 .mode = 0444,
2474 .proc_handler = proc_dointvec, 2526 .proc_handler = do_proc_dqstats,
2475 }, 2527 },
2476 { 2528 {
2477 .procname = "drops", 2529 .procname = "drops",
2478 .data = &dqstats.drops, 2530 .data = &dqstats.stat[DQST_DROPS],
2479 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2480 .mode = 0444, 2532 .mode = 0444,
2481 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2482 }, 2534 },
2483 { 2535 {
2484 .procname = "reads", 2536 .procname = "reads",
2485 .data = &dqstats.reads, 2537 .data = &dqstats.stat[DQST_READS],
2486 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2487 .mode = 0444, 2539 .mode = 0444,
2488 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2489 }, 2541 },
2490 { 2542 {
2491 .procname = "writes", 2543 .procname = "writes",
2492 .data = &dqstats.writes, 2544 .data = &dqstats.stat[DQST_WRITES],
2493 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2494 .mode = 0444, 2546 .mode = 0444,
2495 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2496 }, 2548 },
2497 { 2549 {
2498 .procname = "cache_hits", 2550 .procname = "cache_hits",
2499 .data = &dqstats.cache_hits, 2551 .data = &dqstats.stat[DQST_CACHE_HITS],
2500 .maxlen = sizeof(int), 2552 .maxlen = sizeof(int),
2501 .mode = 0444, 2553 .mode = 0444,
2502 .proc_handler = proc_dointvec, 2554 .proc_handler = do_proc_dqstats,
2503 }, 2555 },
2504 { 2556 {
2505 .procname = "allocated_dquots", 2557 .procname = "allocated_dquots",
2506 .data = &dqstats.allocated_dquots, 2558 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2507 .maxlen = sizeof(int), 2559 .maxlen = sizeof(int),
2508 .mode = 0444, 2560 .mode = 0444,
2509 .proc_handler = proc_dointvec, 2561 .proc_handler = do_proc_dqstats,
2510 }, 2562 },
2511 { 2563 {
2512 .procname = "free_dquots", 2564 .procname = "free_dquots",
2513 .data = &dqstats.free_dquots, 2565 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2514 .maxlen = sizeof(int), 2566 .maxlen = sizeof(int),
2515 .mode = 0444, 2567 .mode = 0444,
2516 .proc_handler = proc_dointvec, 2568 .proc_handler = do_proc_dqstats,
2517 }, 2569 },
2518 { 2570 {
2519 .procname = "syncs", 2571 .procname = "syncs",
2520 .data = &dqstats.syncs, 2572 .data = &dqstats.stat[DQST_SYNCS],
2521 .maxlen = sizeof(int), 2573 .maxlen = sizeof(int),
2522 .mode = 0444, 2574 .mode = 0444,
2523 .proc_handler = proc_dointvec, 2575 .proc_handler = do_proc_dqstats,
2524 }, 2576 },
2525#ifdef CONFIG_PRINT_QUOTA_WARNING 2577#ifdef CONFIG_PRINT_QUOTA_WARNING
2526 { 2578 {
@@ -2572,6 +2624,13 @@ static int __init dquot_init(void)
2572 if (!dquot_hash) 2624 if (!dquot_hash)
2573 panic("Cannot create dquot hash table"); 2625 panic("Cannot create dquot hash table");
2574 2626
2627#ifdef CONFIG_SMP
2628 dqstats_pcpu = alloc_percpu(struct dqstats);
2629 if (!dqstats_pcpu)
2630 panic("Cannot create dquot stats table");
2631#endif
2632 memset(&dqstats, 0, sizeof(struct dqstats));
2633
2575 /* Find power-of-two hlist_heads which can fit into allocation */ 2634 /* Find power-of-two hlist_heads which can fit into allocation */
2576 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2635 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2577 dq_hash_bits = 0; 2636 dq_hash_bits = 0;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..ce3dfd066f59 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
79 65
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..24f03407eeb5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 q_warn(KERN_WARNING "VFS: dquota write failed on "
69 "dev %s\n", sb->s_id);
70 if (ret >= 0)
71 ret = -EIO;
72 }
73 return ret;
66} 74}
67 75
68/* Remove empty block from list and return it */ 76/* Remove empty block from list and return it */
@@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 160 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 161 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 162 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 163 q_warn(KERN_ERR
156 "VFS: Can't write block (%u) with free entries.\n", 164 "VFS: Can't write block (%u) with free entries.\n",
157 blk); 165 blk);
158 return 0; 166 return 0;
@@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 252 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 253 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 254 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 255 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
248 "remove block (%u) from entry free list.\n", 256 "remove block (%u) from entry free list.\n",
249 blk); 257 blk);
250 goto out_buf; 258 goto out_buf;
@@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
268#endif 276#endif
269 *err = write_blk(info, blk, buf); 277 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 278 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 279 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
272 "data block %u.\n", blk); 280 "data block %u.\n", blk);
273 goto out_buf; 281 goto out_buf;
274 } 282 }
@@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 311 } else {
304 ret = read_blk(info, *treeblk, buf); 312 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 313 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 314 q_warn(KERN_ERR "VFS: Can't read tree quota block "
307 "%u.\n", *treeblk); 315 "%u.\n", *treeblk);
308 goto out_buf; 316 goto out_buf;
309 } 317 }
@@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 373 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 374 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 375 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 376 q_warn(KERN_ERR "VFS: Error %zd occurred while "
369 "creating quota.\n", ret); 377 "creating quota.\n", ret);
370 kfree(ddquot); 378 kfree(ddquot);
371 return ret; 379 return ret;
@@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 385 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 386 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 387 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 388 q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
381 sb->s_id); 389 sb->s_id);
382 if (ret >= 0) 390 if (ret >= 0)
383 ret = -ENOSPC; 391 ret = -ENOSPC;
384 } else { 392 } else {
385 ret = 0; 393 ret = 0;
386 } 394 }
387 dqstats.writes++; 395 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 396 kfree(ddquot);
389 397
390 return ret; 398 return ret;
@@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 410 if (!buf)
403 return -ENOMEM; 411 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 412 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 413 q_warn(KERN_ERR "VFS: Quota structure has offset to other "
406 "block (%u) than it should (%u).\n", blk, 414 "block (%u) than it should (%u).\n", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 415 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 416 goto out_buf;
409 } 417 }
410 ret = read_blk(info, blk, buf); 418 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 419 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 420 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
413 goto out_buf; 421 goto out_buf;
414 } 422 }
415 dh = (struct qt_disk_dqdbheader *)buf; 423 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 427 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 428 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 429 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 430 q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
423 "to free list.\n", blk); 431 "to free list.\n", blk);
424 goto out_buf; 432 goto out_buf;
425 } 433 }
@@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 440 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 441 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 442 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 443 q_warn(KERN_ERR "VFS: Can't insert quota data "
436 "block (%u) to free entry list.\n", blk); 444 "block (%u) to free entry list.\n", blk);
437 goto out_buf; 445 goto out_buf;
438 } 446 }
439 } else { 447 } else {
440 ret = write_blk(info, blk, buf); 448 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 449 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 450 q_warn(KERN_ERR "VFS: Can't write quota data "
443 "block %u\n", blk); 451 "block %u\n", blk);
444 goto out_buf; 452 goto out_buf;
445 } 453 }
@@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 472 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 473 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 474 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 475 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
468 goto out_buf; 476 goto out_buf;
469 } 477 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 478 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 496 } else {
489 ret = write_blk(info, *blk, buf); 497 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 498 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 499 q_warn(KERN_ERR "VFS: Can't write quota tree "
492 "block %u.\n", *blk); 500 "block %u.\n", *blk);
493 } 501 }
494 } 502 }
@@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 529 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 530 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 531 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 532 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
525 goto out_buf; 533 goto out_buf;
526 } 534 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 535 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 539 ddquot += info->dqi_entry_size;
532 } 540 }
533 if (i == qtree_dqstr_in_blk(info)) { 541 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 542 q_warn(KERN_ERR "VFS: Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 543 "but not present.\n", dquot->dq_id);
536 ret = -EIO; 544 ret = -EIO;
537 goto out_buf; 545 goto out_buf;
@@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 564 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 565 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 566 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 567 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
560 goto out_buf; 568 goto out_buf;
561 } 569 }
562 ret = 0; 570 ret = 0;
@@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 607 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 608 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 609 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 610 q_warn(KERN_ERR "VFS: Can't read quota "
603 "structure for id %u.\n", dquot->dq_id); 611 "structure for id %u.\n", dquot->dq_id);
604 dquot->dq_off = 0; 612 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 613 set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 625 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 626 if (ret >= 0)
619 ret = -EIO; 627 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 628 q_warn(KERN_ERR "VFS: Error while reading quota "
621 "structure for id %u.\n", dquot->dq_id); 629 "structure for id %u.\n", dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 630 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 642 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 643 kfree(ddquot);
636out: 644out:
637 dqstats.reads++; 645 dqstats_inc(DQST_READS);
638 return ret; 646 return ret;
639} 647}
640EXPORT_SYMBOL(qtree_read_dquot); 648EXPORT_SYMBOL(qtree_read_dquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..ccc3e71fb1d8 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,4 +22,10 @@ struct qt_disk_dqdbheader {
22 22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ 23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24 24
25#define q_warn(fmt, args...) \
26do { \
27 if (printk_ratelimit()) \
28 printk(fmt, ## args); \
29} while(0)
30
25#endif /* _LINUX_QUOTAIO_TREE_H */ 31#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..4af344c5852a 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 104 ret = 0;
105 105
106out: 106out:
107 dqstats.writes++; 107 dqstats_inc(DQST_WRITES);
108 108
109 return ret; 109 return ret;
110} 110}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..135206af1458 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 q_warn(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n", 67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size); 68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 69 return 0;
@@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 109 q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
110 sb->s_id); 110 sb->s_id);
111 return -1; 111 return -1;
112 } 112 }
@@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 169 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 170 q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
171 sb->s_id); 171 sb->s_id);
172 return -1; 172 return -1;
173 } 173 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c94853473ca9..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
53}; 53};
54 54
55struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
56{ 57{
57 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
58 59
59 if (inode) { 60 if (inode) {
60 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
61 inode->i_uid = current_fsuid();
62 inode->i_gid = current_fsgid();
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
65 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
95static int 94static int
96ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
97{ 96{
98 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
99 int error = -ENOSPC; 98 int error = -ENOSPC;
100 99
101 if (inode) { 100 if (inode) {
102 if (dir->i_mode & S_ISGID) {
103 inode->i_gid = dir->i_gid;
104 if (S_ISDIR(mode))
105 inode->i_mode |= S_ISGID;
106 }
107 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
108 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
109 error = 0; 103 error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
130 struct inode *inode; 124 struct inode *inode;
131 int error = -ENOSPC; 125 int error = -ENOSPC;
132 126
133 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
134 if (inode) { 128 if (inode) {
135 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
136 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
137 if (!error) { 131 if (!error) {
138 if (dir->i_mode & S_ISGID)
139 inode->i_gid = dir->i_gid;
140 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
141 dget(dentry); 133 dget(dentry);
142 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -214,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
214 return 0; 206 return 0;
215} 207}
216 208
217static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
218{ 210{
219 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
220 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -241,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
241 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
242 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
243 235
244 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
245 if (!inode) { 237 if (!inode) {
246 err = -ENOMEM; 238 err = -ENOMEM;
247 goto fail; 239 goto fail;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
151 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 152 if (barrier_done < 0)
152 return barrier_done; 153 return barrier_done;
153 return (err < 0) ? -EIO : 0; 154 return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index dc2c65e04853..0f22fdaf54ac 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3076,9 +3076,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3077 3077
3078 depth = reiserfs_write_lock_once(inode->i_sb); 3078 depth = reiserfs_write_lock_once(inode->i_sb);
3079 if (attr->ia_valid & ATTR_SIZE) { 3079 if (is_quota_modification(inode, attr))
3080 dquot_initialize(inode); 3080 dquot_initialize(inode);
3081 3081
3082 if (attr->ia_valid & ATTR_SIZE) {
3082 /* version 2 items will be caught by the s_maxbytes check 3083 /* version 2 items will be caught by the s_maxbytes check
3083 ** done for us in vmtruncate 3084 ** done for us in vmtruncate
3084 */ 3085 */
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ffc..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
561*/ 561*/
562static int new_inode_init(struct inode *inode, struct inode *dir, int mode) 562static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
563{ 563{
564
565 /* the quota init calls have to know who to charge the quota to, so
566 ** we have to set uid and gid here
567 */
568 inode->i_uid = current_fsuid();
569 inode->i_mode = mode;
570 /* Make inode invalid - just in case we are going to drop it before 564 /* Make inode invalid - just in case we are going to drop it before
571 * the initialization happens */ 565 * the initialization happens */
572 INODE_PKEY(inode)->k_objectid = 0; 566 INODE_PKEY(inode)->k_objectid = 0;
573 567 /* the quota init calls have to know who to charge the quota to, so
574 if (dir->i_mode & S_ISGID) { 568 ** we have to set uid and gid here
575 inode->i_gid = dir->i_gid; 569 */
576 if (S_ISDIR(mode)) 570 inode_init_owner(inode, dir, mode);
577 inode->i_mode |= S_ISGID;
578 } else {
579 inode->i_gid = current_fsgid();
580 }
581 dquot_initialize(inode); 571 dquot_initialize(inode);
582 return 0; 572 return 0;
583} 573}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636dc..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
723 (handler) = *(handlers)++) 723 (handler) = *(handlers)++)
724 724
725/* This is the implementation for the xattr plugin infrastructure */ 725/* This is the implementation for the xattr plugin infrastructure */
726static inline struct xattr_handler * 726static inline const struct xattr_handler *
727find_xattr_handler_prefix(struct xattr_handler **handlers, 727find_xattr_handler_prefix(const struct xattr_handler **handlers,
728 const char *name) 728 const char *name)
729{ 729{
730 struct xattr_handler *xah; 730 const struct xattr_handler *xah;
731 731
732 if (!handlers) 732 if (!handlers)
733 return NULL; 733 return NULL;
@@ -748,7 +748,7 @@ ssize_t
748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
749 size_t size) 749 size_t size)
750{ 750{
751 struct xattr_handler *handler; 751 const struct xattr_handler *handler;
752 752
753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
754 754
@@ -767,7 +767,7 @@ int
767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
768 size_t size, int flags) 768 size_t size, int flags)
769{ 769{
770 struct xattr_handler *handler; 770 const struct xattr_handler *handler;
771 771
772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
773 773
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
784 */ 784 */
785int reiserfs_removexattr(struct dentry *dentry, const char *name) 785int reiserfs_removexattr(struct dentry *dentry, const char *name)
786{ 786{
787 struct xattr_handler *handler; 787 const struct xattr_handler *handler;
788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
789 789
790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
807 size_t size; 807 size_t size;
808 if (name[0] != '.' || 808 if (name[0] != '.' ||
809 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 809 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
810 struct xattr_handler *handler; 810 const struct xattr_handler *handler;
811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
812 name); 812 name);
813 if (!handler) /* Unsupported xattr name */ 813 if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
920#endif 920#endif
921 921
922/* Actual operations that are exported to VFS-land */ 922/* Actual operations that are exported to VFS-land */
923struct xattr_handler *reiserfs_xattr_handlers[] = { 923const struct xattr_handler *reiserfs_xattr_handlers[] = {
924#ifdef CONFIG_REISERFS_FS_XATTR 924#ifdef CONFIG_REISERFS_FS_XATTR
925 &reiserfs_xattr_user_handler, 925 &reiserfs_xattr_user_handler,
926 &reiserfs_xattr_trusted_handler, 926 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a9..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
500 return size; 500 return size;
501} 501}
502 502
503struct xattr_handler reiserfs_posix_acl_access_handler = { 503const struct xattr_handler reiserfs_posix_acl_access_handler = {
504 .prefix = POSIX_ACL_XATTR_ACCESS, 504 .prefix = POSIX_ACL_XATTR_ACCESS,
505 .flags = ACL_TYPE_ACCESS, 505 .flags = ACL_TYPE_ACCESS,
506 .get = posix_acl_get, 506 .get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
520 return size; 520 return size;
521} 521}
522 522
523struct xattr_handler reiserfs_posix_acl_default_handler = { 523const struct xattr_handler reiserfs_posix_acl_default_handler = {
524 .prefix = POSIX_ACL_XATTR_DEFAULT, 524 .prefix = POSIX_ACL_XATTR_DEFAULT,
525 .flags = ACL_TYPE_DEFAULT, 525 .flags = ACL_TYPE_DEFAULT,
526 .get = posix_acl_get, 526 .get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c041..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
111 sec->value = NULL; 111 sec->value = NULL;
112} 112}
113 113
114struct xattr_handler reiserfs_xattr_security_handler = { 114const struct xattr_handler reiserfs_xattr_security_handler = {
115 .prefix = XATTR_SECURITY_PREFIX, 115 .prefix = XATTR_SECURITY_PREFIX,
116 .get = security_get, 116 .get = security_get,
117 .set = security_set, 117 .set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
48 return len; 48 return len;
49} 49}
50 50
51struct xattr_handler reiserfs_xattr_trusted_handler = { 51const struct xattr_handler reiserfs_xattr_trusted_handler = {
52 .prefix = XATTR_TRUSTED_PREFIX, 52 .prefix = XATTR_TRUSTED_PREFIX,
53 .get = trusted_get, 53 .get = trusted_get,
54 .set = trusted_set, 54 .set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
44 return len; 44 return len;
45} 45}
46 46
47struct xattr_handler reiserfs_xattr_user_handler = { 47const struct xattr_handler reiserfs_xattr_user_handler = {
48 .prefix = XATTR_USER_PREFIX, 48 .prefix = XATTR_USER_PREFIX,
49 .get = user_get, 49 .get = user_get,
50 .set = user_set, 50 .set = user_set,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..6c978428892d 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -39,7 +39,7 @@ const struct file_operations smb_dir_operations =
39{ 39{
40 .read = generic_read_dir, 40 .read = generic_read_dir,
41 .readdir = smb_readdir, 41 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 42 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 43 .open = smb_dir_open,
44}; 44};
45 45
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..84ecf0e43f91 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -437,7 +437,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 437 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 438 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 439 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 440 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 441 .mmap = smb_file_mmap,
442 .open = smb_file_open, 442 .open = smb_file_open,
443 .release = smb_file_release, 443 .release = smb_file_release,
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
272{ 302{
273 struct address_space *mapping = in->f_mapping; 303 struct address_space *mapping = in->f_mapping;
274 unsigned int loff, nr_pages, req_pages; 304 unsigned int loff, nr_pages, req_pages;
275 struct page *pages[PIPE_BUFFERS]; 305 struct page *pages[PIPE_DEF_BUFFERS];
276 struct partial_page partial[PIPE_BUFFERS]; 306 struct partial_page partial[PIPE_DEF_BUFFERS];
277 struct page *page; 307 struct page *page;
278 pgoff_t index, end_index; 308 pgoff_t index, end_index;
279 loff_t isize; 309 loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
286 .spd_release = spd_release_page, 316 .spd_release = spd_release_page,
287 }; 317 };
288 318
319 if (splice_grow_spd(pipe, &spd))
320 return -ENOMEM;
321
289 index = *ppos >> PAGE_CACHE_SHIFT; 322 index = *ppos >> PAGE_CACHE_SHIFT;
290 loff = *ppos & ~PAGE_CACHE_MASK; 323 loff = *ppos & ~PAGE_CACHE_MASK;
291 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 324 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
292 nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); 325 nr_pages = min(req_pages, pipe->buffers);
293 326
294 /* 327 /*
295 * Lookup the (hopefully) full range of pages we need. 328 * Lookup the (hopefully) full range of pages we need.
296 */ 329 */
297 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 330 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
298 index += spd.nr_pages; 331 index += spd.nr_pages;
299 332
300 /* 333 /*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
335 unlock_page(page); 368 unlock_page(page);
336 } 369 }
337 370
338 pages[spd.nr_pages++] = page; 371 spd.pages[spd.nr_pages++] = page;
339 index++; 372 index++;
340 } 373 }
341 374
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
356 * this_len is the max we'll use from this page 389 * this_len is the max we'll use from this page
357 */ 390 */
358 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 391 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
359 page = pages[page_nr]; 392 page = spd.pages[page_nr];
360 393
361 if (PageReadahead(page)) 394 if (PageReadahead(page))
362 page_cache_async_readahead(mapping, &in->f_ra, in, 395 page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
393 error = -ENOMEM; 426 error = -ENOMEM;
394 break; 427 break;
395 } 428 }
396 page_cache_release(pages[page_nr]); 429 page_cache_release(spd.pages[page_nr]);
397 pages[page_nr] = page; 430 spd.pages[page_nr] = page;
398 } 431 }
399 /* 432 /*
400 * page was already under io and is now done, great 433 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
451 len = this_len; 484 len = this_len;
452 } 485 }
453 486
454 partial[page_nr].offset = loff; 487 spd.partial[page_nr].offset = loff;
455 partial[page_nr].len = this_len; 488 spd.partial[page_nr].len = this_len;
456 len -= this_len; 489 len -= this_len;
457 loff = 0; 490 loff = 0;
458 spd.nr_pages++; 491 spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
464 * we got, 'nr_pages' is how many pages are in the map. 497 * we got, 'nr_pages' is how many pages are in the map.
465 */ 498 */
466 while (page_nr < nr_pages) 499 while (page_nr < nr_pages)
467 page_cache_release(pages[page_nr++]); 500 page_cache_release(spd.pages[page_nr++]);
468 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 501 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
469 502
470 if (spd.nr_pages) 503 if (spd.nr_pages)
471 return splice_to_pipe(pipe, &spd); 504 error = splice_to_pipe(pipe, &spd);
472 505
506 splice_shrink_spd(pipe, &spd);
473 return error; 507 return error;
474} 508}
475 509
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
560 unsigned int nr_pages; 594 unsigned int nr_pages;
561 unsigned int nr_freed; 595 unsigned int nr_freed;
562 size_t offset; 596 size_t offset;
563 struct page *pages[PIPE_BUFFERS]; 597 struct page *pages[PIPE_DEF_BUFFERS];
564 struct partial_page partial[PIPE_BUFFERS]; 598 struct partial_page partial[PIPE_DEF_BUFFERS];
565 struct iovec vec[PIPE_BUFFERS]; 599 struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
566 pgoff_t index; 600 pgoff_t index;
567 ssize_t res; 601 ssize_t res;
568 size_t this_len; 602 size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
576 .spd_release = spd_release_page, 610 .spd_release = spd_release_page,
577 }; 611 };
578 612
613 if (splice_grow_spd(pipe, &spd))
614 return -ENOMEM;
615
616 res = -ENOMEM;
617 vec = __vec;
618 if (pipe->buffers > PIPE_DEF_BUFFERS) {
619 vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
620 if (!vec)
621 goto shrink_ret;
622 }
623
579 index = *ppos >> PAGE_CACHE_SHIFT; 624 index = *ppos >> PAGE_CACHE_SHIFT;
580 offset = *ppos & ~PAGE_CACHE_MASK; 625 offset = *ppos & ~PAGE_CACHE_MASK;
581 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 626 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
582 627
583 for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { 628 for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
584 struct page *page; 629 struct page *page;
585 630
586 page = alloc_page(GFP_USER); 631 page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
591 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); 636 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
592 vec[i].iov_base = (void __user *) page_address(page); 637 vec[i].iov_base = (void __user *) page_address(page);
593 vec[i].iov_len = this_len; 638 vec[i].iov_len = this_len;
594 pages[i] = page; 639 spd.pages[i] = page;
595 spd.nr_pages++; 640 spd.nr_pages++;
596 len -= this_len; 641 len -= this_len;
597 offset = 0; 642 offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
610 nr_freed = 0; 655 nr_freed = 0;
611 for (i = 0; i < spd.nr_pages; i++) { 656 for (i = 0; i < spd.nr_pages; i++) {
612 this_len = min_t(size_t, vec[i].iov_len, res); 657 this_len = min_t(size_t, vec[i].iov_len, res);
613 partial[i].offset = 0; 658 spd.partial[i].offset = 0;
614 partial[i].len = this_len; 659 spd.partial[i].len = this_len;
615 if (!this_len) { 660 if (!this_len) {
616 __free_page(pages[i]); 661 __free_page(spd.pages[i]);
617 pages[i] = NULL; 662 spd.pages[i] = NULL;
618 nr_freed++; 663 nr_freed++;
619 } 664 }
620 res -= this_len; 665 res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
625 if (res > 0) 670 if (res > 0)
626 *ppos += res; 671 *ppos += res;
627 672
673shrink_ret:
674 if (vec != __vec)
675 kfree(vec);
676 splice_shrink_spd(pipe, &spd);
628 return res; 677 return res;
629 678
630err: 679err:
631 for (i = 0; i < spd.nr_pages; i++) 680 for (i = 0; i < spd.nr_pages; i++)
632 __free_page(pages[i]); 681 __free_page(spd.pages[i]);
633 682
634 return error; 683 res = error;
684 goto shrink_ret;
635} 685}
636EXPORT_SYMBOL(default_file_splice_read); 686EXPORT_SYMBOL(default_file_splice_read);
637 687
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
784 if (!buf->len) { 834 if (!buf->len) {
785 buf->ops = NULL; 835 buf->ops = NULL;
786 ops->release(pipe, buf); 836 ops->release(pipe, buf);
787 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 837 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
788 pipe->nrbufs--; 838 pipe->nrbufs--;
789 if (pipe->inode) 839 if (pipe->inode)
790 sd->need_wakeup = true; 840 sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
1211 * If we did an incomplete transfer we must release 1261 * If we did an incomplete transfer we must release
1212 * the pipe buffers in question: 1262 * the pipe buffers in question:
1213 */ 1263 */
1214 for (i = 0; i < PIPE_BUFFERS; i++) { 1264 for (i = 0; i < pipe->buffers; i++) {
1215 struct pipe_buffer *buf = pipe->bufs + i; 1265 struct pipe_buffer *buf = pipe->bufs + i;
1216 1266
1217 if (buf->ops) { 1267 if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1371 */ 1421 */
1372static int get_iovec_page_array(const struct iovec __user *iov, 1422static int get_iovec_page_array(const struct iovec __user *iov,
1373 unsigned int nr_vecs, struct page **pages, 1423 unsigned int nr_vecs, struct page **pages,
1374 struct partial_page *partial, int aligned) 1424 struct partial_page *partial, int aligned,
1425 unsigned int pipe_buffers)
1375{ 1426{
1376 int buffers = 0, error = 0; 1427 int buffers = 0, error = 0;
1377 1428
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1414 break; 1465 break;
1415 1466
1416 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1467 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1417 if (npages > PIPE_BUFFERS - buffers) 1468 if (npages > pipe_buffers - buffers)
1418 npages = PIPE_BUFFERS - buffers; 1469 npages = pipe_buffers - buffers;
1419 1470
1420 error = get_user_pages_fast((unsigned long)base, npages, 1471 error = get_user_pages_fast((unsigned long)base, npages,
1421 0, &pages[buffers]); 1472 0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1450 * or if we mapped the max number of pages that we have 1501 * or if we mapped the max number of pages that we have
1451 * room for. 1502 * room for.
1452 */ 1503 */
1453 if (error < npages || buffers == PIPE_BUFFERS) 1504 if (error < npages || buffers == pipe_buffers)
1454 break; 1505 break;
1455 1506
1456 nr_vecs--; 1507 nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1593 unsigned long nr_segs, unsigned int flags) 1644 unsigned long nr_segs, unsigned int flags)
1594{ 1645{
1595 struct pipe_inode_info *pipe; 1646 struct pipe_inode_info *pipe;
1596 struct page *pages[PIPE_BUFFERS]; 1647 struct page *pages[PIPE_DEF_BUFFERS];
1597 struct partial_page partial[PIPE_BUFFERS]; 1648 struct partial_page partial[PIPE_DEF_BUFFERS];
1598 struct splice_pipe_desc spd = { 1649 struct splice_pipe_desc spd = {
1599 .pages = pages, 1650 .pages = pages,
1600 .partial = partial, 1651 .partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1602 .ops = &user_page_pipe_buf_ops, 1653 .ops = &user_page_pipe_buf_ops,
1603 .spd_release = spd_release_page, 1654 .spd_release = spd_release_page,
1604 }; 1655 };
1656 long ret;
1605 1657
1606 pipe = pipe_info(file->f_path.dentry->d_inode); 1658 pipe = pipe_info(file->f_path.dentry->d_inode);
1607 if (!pipe) 1659 if (!pipe)
1608 return -EBADF; 1660 return -EBADF;
1609 1661
1610 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1662 if (splice_grow_spd(pipe, &spd))
1611 flags & SPLICE_F_GIFT); 1663 return -ENOMEM;
1664
1665 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1666 spd.partial, flags & SPLICE_F_GIFT,
1667 pipe->buffers);
1612 if (spd.nr_pages <= 0) 1668 if (spd.nr_pages <= 0)
1613 return spd.nr_pages; 1669 ret = spd.nr_pages;
1670 else
1671 ret = splice_to_pipe(pipe, &spd);
1614 1672
1615 return splice_to_pipe(pipe, &spd); 1673 splice_shrink_spd(pipe, &spd);
1674 return ret;
1616} 1675}
1617 1676
1618/* 1677/*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1738 * Check ->nrbufs without the inode lock first. This function 1797 * Check ->nrbufs without the inode lock first. This function
1739 * is speculative anyways, so missing one is ok. 1798 * is speculative anyways, so missing one is ok.
1740 */ 1799 */
1741 if (pipe->nrbufs < PIPE_BUFFERS) 1800 if (pipe->nrbufs < pipe->buffers)
1742 return 0; 1801 return 0;
1743 1802
1744 ret = 0; 1803 ret = 0;
1745 pipe_lock(pipe); 1804 pipe_lock(pipe);
1746 1805
1747 while (pipe->nrbufs >= PIPE_BUFFERS) { 1806 while (pipe->nrbufs >= pipe->buffers) {
1748 if (!pipe->readers) { 1807 if (!pipe->readers) {
1749 send_sig(SIGPIPE, current, 0); 1808 send_sig(SIGPIPE, current, 0);
1750 ret = -EPIPE; 1809 ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
1810 * Cannot make any progress, because either the input 1869 * Cannot make any progress, because either the input
1811 * pipe is empty or the output pipe is full. 1870 * pipe is empty or the output pipe is full.
1812 */ 1871 */
1813 if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { 1872 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1814 /* Already processed some buffers, break */ 1873 /* Already processed some buffers, break */
1815 if (ret) 1874 if (ret)
1816 break; 1875 break;
@@ -1831,7 +1890,7 @@ retry:
1831 } 1890 }
1832 1891
1833 ibuf = ipipe->bufs + ipipe->curbuf; 1892 ibuf = ipipe->bufs + ipipe->curbuf;
1834 nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; 1893 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1835 obuf = opipe->bufs + nbuf; 1894 obuf = opipe->bufs + nbuf;
1836 1895
1837 if (len >= ibuf->len) { 1896 if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
1841 *obuf = *ibuf; 1900 *obuf = *ibuf;
1842 ibuf->ops = NULL; 1901 ibuf->ops = NULL;
1843 opipe->nrbufs++; 1902 opipe->nrbufs++;
1844 ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; 1903 ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1845 ipipe->nrbufs--; 1904 ipipe->nrbufs--;
1846 input_wakeup = true; 1905 input_wakeup = true;
1847 } else { 1906 } else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1914 * If we have iterated all input buffers or ran out of 1973 * If we have iterated all input buffers or ran out of
1915 * output room, break. 1974 * output room, break.
1916 */ 1975 */
1917 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1976 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1918 break; 1977 break;
1919 1978
1920 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1979 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1921 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1980 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1922 1981
1923 /* 1982 /*
1924 * Get a reference to this pipe buffer, 1983 * Get a reference to this pipe buffer,
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
1#include <linux/syscalls.h>
2#include <linux/module.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/statfs.h>
7#include <linux/security.h>
8#include <linux/uaccess.h>
9
10int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
11{
12 int retval = -ENODEV;
13
14 if (dentry) {
15 retval = -ENOSYS;
16 if (dentry->d_sb->s_op->statfs) {
17 memset(buf, 0, sizeof(*buf));
18 retval = security_sb_statfs(dentry);
19 if (retval)
20 return retval;
21 retval = dentry->d_sb->s_op->statfs(dentry, buf);
22 if (retval == 0 && buf->f_frsize == 0)
23 buf->f_frsize = buf->f_bsize;
24 }
25 }
26 return retval;
27}
28
29EXPORT_SYMBOL(vfs_statfs);
30
31static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
32{
33 struct kstatfs st;
34 int retval;
35
36 retval = vfs_statfs(dentry, &st);
37 if (retval)
38 return retval;
39
40 if (sizeof(*buf) == sizeof(st))
41 memcpy(buf, &st, sizeof(st));
42 else {
43 if (sizeof buf->f_blocks == 4) {
44 if ((st.f_blocks | st.f_bfree | st.f_bavail |
45 st.f_bsize | st.f_frsize) &
46 0xffffffff00000000ULL)
47 return -EOVERFLOW;
48 /*
49 * f_files and f_ffree may be -1; it's okay to stuff
50 * that into 32 bits
51 */
52 if (st.f_files != -1 &&
53 (st.f_files & 0xffffffff00000000ULL))
54 return -EOVERFLOW;
55 if (st.f_ffree != -1 &&
56 (st.f_ffree & 0xffffffff00000000ULL))
57 return -EOVERFLOW;
58 }
59
60 buf->f_type = st.f_type;
61 buf->f_bsize = st.f_bsize;
62 buf->f_blocks = st.f_blocks;
63 buf->f_bfree = st.f_bfree;
64 buf->f_bavail = st.f_bavail;
65 buf->f_files = st.f_files;
66 buf->f_ffree = st.f_ffree;
67 buf->f_fsid = st.f_fsid;
68 buf->f_namelen = st.f_namelen;
69 buf->f_frsize = st.f_frsize;
70 memset(buf->f_spare, 0, sizeof(buf->f_spare));
71 }
72 return 0;
73}
74
75static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
76{
77 struct kstatfs st;
78 int retval;
79
80 retval = vfs_statfs(dentry, &st);
81 if (retval)
82 return retval;
83
84 if (sizeof(*buf) == sizeof(st))
85 memcpy(buf, &st, sizeof(st));
86 else {
87 buf->f_type = st.f_type;
88 buf->f_bsize = st.f_bsize;
89 buf->f_blocks = st.f_blocks;
90 buf->f_bfree = st.f_bfree;
91 buf->f_bavail = st.f_bavail;
92 buf->f_files = st.f_files;
93 buf->f_ffree = st.f_ffree;
94 buf->f_fsid = st.f_fsid;
95 buf->f_namelen = st.f_namelen;
96 buf->f_frsize = st.f_frsize;
97 memset(buf->f_spare, 0, sizeof(buf->f_spare));
98 }
99 return 0;
100}
101
102SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
103{
104 struct path path;
105 int error;
106
107 error = user_path(pathname, &path);
108 if (!error) {
109 struct statfs tmp;
110 error = vfs_statfs_native(path.dentry, &tmp);
111 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
112 error = -EFAULT;
113 path_put(&path);
114 }
115 return error;
116}
117
118SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
119{
120 struct path path;
121 long error;
122
123 if (sz != sizeof(*buf))
124 return -EINVAL;
125 error = user_path(pathname, &path);
126 if (!error) {
127 struct statfs64 tmp;
128 error = vfs_statfs64(path.dentry, &tmp);
129 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
130 error = -EFAULT;
131 path_put(&path);
132 }
133 return error;
134}
135
136SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
137{
138 struct file *file;
139 struct statfs tmp;
140 int error;
141
142 error = -EBADF;
143 file = fget(fd);
144 if (!file)
145 goto out;
146 error = vfs_statfs_native(file->f_path.dentry, &tmp);
147 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
148 error = -EFAULT;
149 fput(file);
150out:
151 return error;
152}
153
154SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
155{
156 struct file *file;
157 struct statfs64 tmp;
158 int error;
159
160 if (sz != sizeof(*buf))
161 return -EINVAL;
162
163 error = -EBADF;
164 file = fget(fd);
165 if (!file)
166 goto out;
167 error = vfs_statfs64(file->f_path.dentry, &tmp);
168 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
169 error = -EFAULT;
170 fput(file);
171out:
172 return error;
173}
174
175SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
176{
177 struct super_block *s;
178 struct ustat tmp;
179 struct kstatfs sbuf;
180 int err;
181
182 s = user_get_super(new_decode_dev(dev));
183 if (!s)
184 return -EINVAL;
185
186 err = vfs_statfs(s->s_root, &sbuf);
187 drop_super(s);
188 if (err)
189 return err;
190
191 memset(&tmp,0,sizeof(struct ustat));
192 tmp.f_tfree = sbuf.f_bfree;
193 tmp.f_tinode = sbuf.f_ffree;
194
195 return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
196}
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee35..69688b15f1fa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,15 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/smp_lock.h>
27#include <linux/acct.h> 25#include <linux/acct.h>
28#include <linux/blkdev.h> 26#include <linux/blkdev.h>
29#include <linux/quotaops.h> 27#include <linux/quotaops.h>
30#include <linux/namei.h>
31#include <linux/mount.h> 28#include <linux/mount.h>
32#include <linux/security.h> 29#include <linux/security.h>
33#include <linux/syscalls.h>
34#include <linux/vfs.h>
35#include <linux/writeback.h> /* for the emergency remount stuff */ 30#include <linux/writeback.h> /* for the emergency remount stuff */
36#include <linux/idr.h> 31#include <linux/idr.h>
37#include <linux/kobject.h>
38#include <linux/mutex.h> 32#include <linux/mutex.h>
39#include <linux/file.h>
40#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
41#include <asm/uaccess.h>
42#include "internal.h" 34#include "internal.h"
43 35
44 36
@@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type)
93 * subclass. 85 * subclass.
94 */ 86 */
95 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); 87 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
96 s->s_count = S_BIAS; 88 s->s_count = 1;
97 atomic_set(&s->s_active, 1); 89 atomic_set(&s->s_active, 1);
98 mutex_init(&s->s_vfs_rename_mutex); 90 mutex_init(&s->s_vfs_rename_mutex);
91 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
99 mutex_init(&s->s_dquot.dqio_mutex); 92 mutex_init(&s->s_dquot.dqio_mutex);
100 mutex_init(&s->s_dquot.dqonoff_mutex); 93 mutex_init(&s->s_dquot.dqonoff_mutex);
101 init_rwsem(&s->s_dquot.dqptr_sem); 94 init_rwsem(&s->s_dquot.dqptr_sem);
@@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s)
127/* Superblock refcounting */ 120/* Superblock refcounting */
128 121
129/* 122/*
130 * Drop a superblock's refcount. Returns non-zero if the superblock was 123 * Drop a superblock's refcount. The caller must hold sb_lock.
131 * destroyed. The caller must hold sb_lock.
132 */ 124 */
133static int __put_super(struct super_block *sb) 125void __put_super(struct super_block *sb)
134{ 126{
135 int ret = 0;
136
137 if (!--sb->s_count) { 127 if (!--sb->s_count) {
128 list_del_init(&sb->s_list);
138 destroy_super(sb); 129 destroy_super(sb);
139 ret = 1;
140 } 130 }
141 return ret;
142}
143
144/*
145 * Drop a superblock's refcount.
146 * Returns non-zero if the superblock is about to be destroyed and
147 * at least is already removed from super_blocks list, so if we are
148 * making a loop through super blocks then we need to restart.
149 * The caller must hold sb_lock.
150 */
151int __put_super_and_need_restart(struct super_block *sb)
152{
153 /* check for race with generic_shutdown_super() */
154 if (list_empty(&sb->s_list)) {
155 /* super block is removed, need to restart... */
156 __put_super(sb);
157 return 1;
158 }
159 /* can't be the last, since s_list is still in use */
160 sb->s_count--;
161 BUG_ON(sb->s_count == 0);
162 return 0;
163} 131}
164 132
165/** 133/**
@@ -178,57 +146,48 @@ void put_super(struct super_block *sb)
178 146
179 147
180/** 148/**
181 * deactivate_super - drop an active reference to superblock 149 * deactivate_locked_super - drop an active reference to superblock
182 * @s: superblock to deactivate 150 * @s: superblock to deactivate
183 * 151 *
184 * Drops an active reference to superblock, acquiring a temprory one if 152 * Drops an active reference to superblock, converting it into a temprory
185 * there is no active references left. In that case we lock superblock, 153 * one if there is no other active references left. In that case we
186 * tell fs driver to shut it down and drop the temporary reference we 154 * tell fs driver to shut it down and drop the temporary reference we
187 * had just acquired. 155 * had just acquired.
156 *
157 * Caller holds exclusive lock on superblock; that lock is released.
188 */ 158 */
189void deactivate_super(struct super_block *s) 159void deactivate_locked_super(struct super_block *s)
190{ 160{
191 struct file_system_type *fs = s->s_type; 161 struct file_system_type *fs = s->s_type;
192 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 162 if (atomic_dec_and_test(&s->s_active)) {
193 s->s_count -= S_BIAS-1;
194 spin_unlock(&sb_lock);
195 vfs_dq_off(s, 0); 163 vfs_dq_off(s, 0);
196 down_write(&s->s_umount);
197 fs->kill_sb(s); 164 fs->kill_sb(s);
198 put_filesystem(fs); 165 put_filesystem(fs);
199 put_super(s); 166 put_super(s);
167 } else {
168 up_write(&s->s_umount);
200 } 169 }
201} 170}
202 171
203EXPORT_SYMBOL(deactivate_super); 172EXPORT_SYMBOL(deactivate_locked_super);
204 173
205/** 174/**
206 * deactivate_locked_super - drop an active reference to superblock 175 * deactivate_super - drop an active reference to superblock
207 * @s: superblock to deactivate 176 * @s: superblock to deactivate
208 * 177 *
209 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that 178 * Variant of deactivate_locked_super(), except that superblock is *not*
210 * it does not unlock it until it's all over. As the result, it's safe to 179 * locked by caller. If we are going to drop the final active reference,
211 * use to dispose of new superblock on ->get_sb() failure exits - nobody 180 * lock will be acquired prior to that.
212 * will see the sucker until it's all over. Equivalent using up_write +
213 * deactivate_super is safe for that purpose only if superblock is either
214 * safe to use or has NULL ->s_root when we unlock.
215 */ 181 */
216void deactivate_locked_super(struct super_block *s) 182void deactivate_super(struct super_block *s)
217{ 183{
218 struct file_system_type *fs = s->s_type; 184 if (!atomic_add_unless(&s->s_active, -1, 1)) {
219 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 185 down_write(&s->s_umount);
220 s->s_count -= S_BIAS-1; 186 deactivate_locked_super(s);
221 spin_unlock(&sb_lock);
222 vfs_dq_off(s, 0);
223 fs->kill_sb(s);
224 put_filesystem(fs);
225 put_super(s);
226 } else {
227 up_write(&s->s_umount);
228 } 187 }
229} 188}
230 189
231EXPORT_SYMBOL(deactivate_locked_super); 190EXPORT_SYMBOL(deactivate_super);
232 191
233/** 192/**
234 * grab_super - acquire an active reference 193 * grab_super - acquire an active reference
@@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
243 */ 202 */
244static int grab_super(struct super_block *s) __releases(sb_lock) 203static int grab_super(struct super_block *s) __releases(sb_lock)
245{ 204{
205 if (atomic_inc_not_zero(&s->s_active)) {
206 spin_unlock(&sb_lock);
207 return 1;
208 }
209 /* it's going away */
246 s->s_count++; 210 s->s_count++;
247 spin_unlock(&sb_lock); 211 spin_unlock(&sb_lock);
212 /* wait for it to die */
248 down_write(&s->s_umount); 213 down_write(&s->s_umount);
249 if (s->s_root) {
250 spin_lock(&sb_lock);
251 if (s->s_count > S_BIAS) {
252 atomic_inc(&s->s_active);
253 s->s_count--;
254 spin_unlock(&sb_lock);
255 return 1;
256 }
257 spin_unlock(&sb_lock);
258 }
259 up_write(&s->s_umount); 214 up_write(&s->s_umount);
260 put_super(s); 215 put_super(s);
261 yield();
262 return 0; 216 return 0;
263} 217}
264 218
@@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb)
321 } 275 }
322 spin_lock(&sb_lock); 276 spin_lock(&sb_lock);
323 /* should be initialized for __put_super_and_need_restart() */ 277 /* should be initialized for __put_super_and_need_restart() */
324 list_del_init(&sb->s_list); 278 list_del_init(&sb->s_instances);
325 list_del(&sb->s_instances);
326 spin_unlock(&sb_lock); 279 spin_unlock(&sb_lock);
327 up_write(&sb->s_umount); 280 up_write(&sb->s_umount);
328} 281}
@@ -357,6 +310,7 @@ retry:
357 up_write(&s->s_umount); 310 up_write(&s->s_umount);
358 destroy_super(s); 311 destroy_super(s);
359 } 312 }
313 down_write(&old->s_umount);
360 return old; 314 return old;
361 } 315 }
362 } 316 }
@@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super);
408 */ 362 */
409void sync_supers(void) 363void sync_supers(void)
410{ 364{
411 struct super_block *sb; 365 struct super_block *sb, *n;
412 366
413 spin_lock(&sb_lock); 367 spin_lock(&sb_lock);
414restart: 368 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
415 list_for_each_entry(sb, &super_blocks, s_list) { 369 if (list_empty(&sb->s_instances))
370 continue;
416 if (sb->s_op->write_super && sb->s_dirt) { 371 if (sb->s_op->write_super && sb->s_dirt) {
417 sb->s_count++; 372 sb->s_count++;
418 spin_unlock(&sb_lock); 373 spin_unlock(&sb_lock);
@@ -423,14 +378,43 @@ restart:
423 up_read(&sb->s_umount); 378 up_read(&sb->s_umount);
424 379
425 spin_lock(&sb_lock); 380 spin_lock(&sb_lock);
426 if (__put_super_and_need_restart(sb)) 381 __put_super(sb);
427 goto restart;
428 } 382 }
429 } 383 }
430 spin_unlock(&sb_lock); 384 spin_unlock(&sb_lock);
431} 385}
432 386
433/** 387/**
388 * iterate_supers - call function for all active superblocks
389 * @f: function to call
390 * @arg: argument to pass to it
391 *
392 * Scans the superblock list and calls given function, passing it
393 * locked superblock and given argument.
394 */
395void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
396{
397 struct super_block *sb, *n;
398
399 spin_lock(&sb_lock);
400 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
401 if (list_empty(&sb->s_instances))
402 continue;
403 sb->s_count++;
404 spin_unlock(&sb_lock);
405
406 down_read(&sb->s_umount);
407 if (sb->s_root)
408 f(sb, arg);
409 up_read(&sb->s_umount);
410
411 spin_lock(&sb_lock);
412 __put_super(sb);
413 }
414 spin_unlock(&sb_lock);
415}
416
417/**
434 * get_super - get the superblock of a device 418 * get_super - get the superblock of a device
435 * @bdev: device to get the superblock for 419 * @bdev: device to get the superblock for
436 * 420 *
@@ -438,7 +422,7 @@ restart:
438 * mounted on the device given. %NULL is returned if no match is found. 422 * mounted on the device given. %NULL is returned if no match is found.
439 */ 423 */
440 424
441struct super_block * get_super(struct block_device *bdev) 425struct super_block *get_super(struct block_device *bdev)
442{ 426{
443 struct super_block *sb; 427 struct super_block *sb;
444 428
@@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev)
448 spin_lock(&sb_lock); 432 spin_lock(&sb_lock);
449rescan: 433rescan:
450 list_for_each_entry(sb, &super_blocks, s_list) { 434 list_for_each_entry(sb, &super_blocks, s_list) {
435 if (list_empty(&sb->s_instances))
436 continue;
451 if (sb->s_bdev == bdev) { 437 if (sb->s_bdev == bdev) {
452 sb->s_count++; 438 sb->s_count++;
453 spin_unlock(&sb_lock); 439 spin_unlock(&sb_lock);
454 down_read(&sb->s_umount); 440 down_read(&sb->s_umount);
441 /* still alive? */
455 if (sb->s_root) 442 if (sb->s_root)
456 return sb; 443 return sb;
457 up_read(&sb->s_umount); 444 up_read(&sb->s_umount);
458 /* restart only when sb is no longer on the list */ 445 /* nope, got unmounted */
459 spin_lock(&sb_lock); 446 spin_lock(&sb_lock);
460 if (__put_super_and_need_restart(sb)) 447 __put_super(sb);
461 goto rescan; 448 goto rescan;
462 } 449 }
463 } 450 }
464 spin_unlock(&sb_lock); 451 spin_unlock(&sb_lock);
@@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super);
473 * 460 *
474 * Scans the superblock list and finds the superblock of the file system 461 * Scans the superblock list and finds the superblock of the file system
475 * mounted on the device given. Returns the superblock with an active 462 * mounted on the device given. Returns the superblock with an active
476 * reference and s_umount held exclusively or %NULL if none was found. 463 * reference or %NULL if none was found.
477 */ 464 */
478struct super_block *get_active_super(struct block_device *bdev) 465struct super_block *get_active_super(struct block_device *bdev)
479{ 466{
@@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev)
482 if (!bdev) 469 if (!bdev)
483 return NULL; 470 return NULL;
484 471
472restart:
485 spin_lock(&sb_lock); 473 spin_lock(&sb_lock);
486 list_for_each_entry(sb, &super_blocks, s_list) { 474 list_for_each_entry(sb, &super_blocks, s_list) {
487 if (sb->s_bdev != bdev) 475 if (list_empty(&sb->s_instances))
488 continue; 476 continue;
489 477 if (sb->s_bdev == bdev) {
490 sb->s_count++; 478 if (grab_super(sb)) /* drops sb_lock */
491 spin_unlock(&sb_lock);
492 down_write(&sb->s_umount);
493 if (sb->s_root) {
494 spin_lock(&sb_lock);
495 if (sb->s_count > S_BIAS) {
496 atomic_inc(&sb->s_active);
497 sb->s_count--;
498 spin_unlock(&sb_lock);
499 return sb; 479 return sb;
500 } 480 else
501 spin_unlock(&sb_lock); 481 goto restart;
502 } 482 }
503 up_write(&sb->s_umount);
504 put_super(sb);
505 yield();
506 spin_lock(&sb_lock);
507 } 483 }
508 spin_unlock(&sb_lock); 484 spin_unlock(&sb_lock);
509 return NULL; 485 return NULL;
510} 486}
511 487
512struct super_block * user_get_super(dev_t dev) 488struct super_block *user_get_super(dev_t dev)
513{ 489{
514 struct super_block *sb; 490 struct super_block *sb;
515 491
516 spin_lock(&sb_lock); 492 spin_lock(&sb_lock);
517rescan: 493rescan:
518 list_for_each_entry(sb, &super_blocks, s_list) { 494 list_for_each_entry(sb, &super_blocks, s_list) {
495 if (list_empty(&sb->s_instances))
496 continue;
519 if (sb->s_dev == dev) { 497 if (sb->s_dev == dev) {
520 sb->s_count++; 498 sb->s_count++;
521 spin_unlock(&sb_lock); 499 spin_unlock(&sb_lock);
522 down_read(&sb->s_umount); 500 down_read(&sb->s_umount);
501 /* still alive? */
523 if (sb->s_root) 502 if (sb->s_root)
524 return sb; 503 return sb;
525 up_read(&sb->s_umount); 504 up_read(&sb->s_umount);
526 /* restart only when sb is no longer on the list */ 505 /* nope, got unmounted */
527 spin_lock(&sb_lock); 506 spin_lock(&sb_lock);
528 if (__put_super_and_need_restart(sb)) 507 __put_super(sb);
529 goto rescan; 508 goto rescan;
530 } 509 }
531 } 510 }
532 spin_unlock(&sb_lock); 511 spin_unlock(&sb_lock);
533 return NULL; 512 return NULL;
534} 513}
535 514
536SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
537{
538 struct super_block *s;
539 struct ustat tmp;
540 struct kstatfs sbuf;
541 int err = -EINVAL;
542
543 s = user_get_super(new_decode_dev(dev));
544 if (s == NULL)
545 goto out;
546 err = vfs_statfs(s->s_root, &sbuf);
547 drop_super(s);
548 if (err)
549 goto out;
550
551 memset(&tmp,0,sizeof(struct ustat));
552 tmp.f_tfree = sbuf.f_bfree;
553 tmp.f_tinode = sbuf.f_ffree;
554
555 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
556out:
557 return err;
558}
559
560/** 515/**
561 * do_remount_sb - asks filesystem to change mount options. 516 * do_remount_sb - asks filesystem to change mount options.
562 * @sb: superblock in question 517 * @sb: superblock in question
@@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
622 577
623static void do_emergency_remount(struct work_struct *work) 578static void do_emergency_remount(struct work_struct *work)
624{ 579{
625 struct super_block *sb; 580 struct super_block *sb, *n;
626 581
627 spin_lock(&sb_lock); 582 spin_lock(&sb_lock);
628 list_for_each_entry(sb, &super_blocks, s_list) { 583 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
584 if (list_empty(&sb->s_instances))
585 continue;
629 sb->s_count++; 586 sb->s_count++;
630 spin_unlock(&sb_lock); 587 spin_unlock(&sb_lock);
631 down_write(&sb->s_umount); 588 down_write(&sb->s_umount);
632 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 589 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
633 /* 590 /*
634 * ->remount_fs needs lock_kernel().
635 *
636 * What lock protects sb->s_flags?? 591 * What lock protects sb->s_flags??
637 */ 592 */
638 do_remount_sb(sb, MS_RDONLY, NULL, 1); 593 do_remount_sb(sb, MS_RDONLY, NULL, 1);
639 } 594 }
640 up_write(&sb->s_umount); 595 up_write(&sb->s_umount);
641 put_super(sb);
642 spin_lock(&sb_lock); 596 spin_lock(&sb_lock);
597 __put_super(sb);
643 } 598 }
644 spin_unlock(&sb_lock); 599 spin_unlock(&sb_lock);
645 kfree(work); 600 kfree(work);
@@ -990,6 +945,96 @@ out:
990 945
991EXPORT_SYMBOL_GPL(vfs_kern_mount); 946EXPORT_SYMBOL_GPL(vfs_kern_mount);
992 947
948/**
949 * freeze_super -- lock the filesystem and force it into a consistent state
950 * @super: the super to lock
951 *
952 * Syncs the super to make sure the filesystem is consistent and calls the fs's
953 * freeze_fs. Subsequent calls to this without first thawing the fs will return
954 * -EBUSY.
955 */
956int freeze_super(struct super_block *sb)
957{
958 int ret;
959
960 atomic_inc(&sb->s_active);
961 down_write(&sb->s_umount);
962 if (sb->s_frozen) {
963 deactivate_locked_super(sb);
964 return -EBUSY;
965 }
966
967 if (sb->s_flags & MS_RDONLY) {
968 sb->s_frozen = SB_FREEZE_TRANS;
969 smp_wmb();
970 up_write(&sb->s_umount);
971 return 0;
972 }
973
974 sb->s_frozen = SB_FREEZE_WRITE;
975 smp_wmb();
976
977 sync_filesystem(sb);
978
979 sb->s_frozen = SB_FREEZE_TRANS;
980 smp_wmb();
981
982 sync_blockdev(sb->s_bdev);
983 if (sb->s_op->freeze_fs) {
984 ret = sb->s_op->freeze_fs(sb);
985 if (ret) {
986 printk(KERN_ERR
987 "VFS:Filesystem freeze failed\n");
988 sb->s_frozen = SB_UNFROZEN;
989 deactivate_locked_super(sb);
990 return ret;
991 }
992 }
993 up_write(&sb->s_umount);
994 return 0;
995}
996EXPORT_SYMBOL(freeze_super);
997
998/**
999 * thaw_super -- unlock filesystem
1000 * @sb: the super to thaw
1001 *
1002 * Unlocks the filesystem and marks it writeable again after freeze_super().
1003 */
1004int thaw_super(struct super_block *sb)
1005{
1006 int error;
1007
1008 down_write(&sb->s_umount);
1009 if (sb->s_frozen == SB_UNFROZEN) {
1010 up_write(&sb->s_umount);
1011 return -EINVAL;
1012 }
1013
1014 if (sb->s_flags & MS_RDONLY)
1015 goto out;
1016
1017 if (sb->s_op->unfreeze_fs) {
1018 error = sb->s_op->unfreeze_fs(sb);
1019 if (error) {
1020 printk(KERN_ERR
1021 "VFS:Filesystem thaw failed\n");
1022 sb->s_frozen = SB_FREEZE_TRANS;
1023 up_write(&sb->s_umount);
1024 return error;
1025 }
1026 }
1027
1028out:
1029 sb->s_frozen = SB_UNFROZEN;
1030 smp_wmb();
1031 wake_up(&sb->s_wait_unfrozen);
1032 deactivate_locked_super(sb);
1033
1034 return 0;
1035}
1036EXPORT_SYMBOL(thaw_super);
1037
993static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 1038static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
994{ 1039{
995 int err; 1040 int err;
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..e8cbd415e50a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
42 if (wait) 42 if (wait)
43 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
44 else 44 else
45 writeback_inodes_sb(sb); 45 writeback_inodes_sb_locked(sb);
46 46
47 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
48 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
77} 77}
78EXPORT_SYMBOL_GPL(sync_filesystem); 78EXPORT_SYMBOL_GPL(sync_filesystem);
79 79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83 __sync_filesystem(sb, *(int *)arg);
84}
80/* 85/*
81 * Sync all the data for all the filesystems (called by sys_sync() and 86 * Sync all the data for all the filesystems (called by sys_sync() and
82 * emergency sync) 87 * emergency sync)
83 *
84 * This operation is careful to avoid the livelock which could easily happen
85 * if two or more filesystems are being continuously dirtied. s_need_sync
86 * is used only here. We set it against all filesystems and then clear it as
87 * we sync them. So redirtied filesystems are skipped.
88 *
89 * But if process A is currently running sync_filesystems and then process B
90 * calls sync_filesystems as well, process B will set all the s_need_sync
91 * flags again, which will cause process A to resync everything. Fix that with
92 * a local mutex.
93 */ 88 */
94static void sync_filesystems(int wait) 89static void sync_filesystems(int wait)
95{ 90{
96 struct super_block *sb; 91 iterate_supers(sync_one_sb, &wait);
97 static DEFINE_MUTEX(mutex);
98
99 mutex_lock(&mutex); /* Could be down_interruptible */
100 spin_lock(&sb_lock);
101 list_for_each_entry(sb, &super_blocks, s_list)
102 sb->s_need_sync = 1;
103
104restart:
105 list_for_each_entry(sb, &super_blocks, s_list) {
106 if (!sb->s_need_sync)
107 continue;
108 sb->s_need_sync = 0;
109 sb->s_count++;
110 spin_unlock(&sb_lock);
111
112 down_read(&sb->s_umount);
113 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
114 __sync_filesystem(sb, wait);
115 up_read(&sb->s_umount);
116
117 /* restart only when sb is no longer on the list */
118 spin_lock(&sb_lock);
119 if (__put_super_and_need_restart(sb))
120 goto restart;
121 }
122 spin_unlock(&sb_lock);
123 mutex_unlock(&mutex);
124} 92}
125 93
126/* 94/*
@@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync);
190/** 158/**
191 * vfs_fsync_range - helper to sync a range of data & metadata to disk 159 * vfs_fsync_range - helper to sync a range of data & metadata to disk
192 * @file: file to sync 160 * @file: file to sync
193 * @dentry: dentry of @file
194 * @start: offset in bytes of the beginning of data range to sync 161 * @start: offset in bytes of the beginning of data range to sync
195 * @end: offset in bytes of the end of data range (inclusive) 162 * @end: offset in bytes of the end of data range (inclusive)
196 * @datasync: perform only datasync 163 * @datasync: perform only datasync
@@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync);
198 * Write back data in range @start..@end and metadata for @file to disk. If 165 * Write back data in range @start..@end and metadata for @file to disk. If
199 * @datasync is set only metadata needed to access modified file data is 166 * @datasync is set only metadata needed to access modified file data is
200 * written. 167 * written.
201 *
202 * In case this function is called from nfsd @file may be %NULL and
203 * only @dentry is set. This can only happen when the filesystem
204 * implements the export_operations API.
205 */ 168 */
206int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, 169int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
207 loff_t end, int datasync)
208{ 170{
209 const struct file_operations *fop; 171 struct address_space *mapping = file->f_mapping;
210 struct address_space *mapping;
211 int err, ret; 172 int err, ret;
212 173
213 /* 174 if (!file->f_op || !file->f_op->fsync) {
214 * Get mapping and operations from the file in case we have
215 * as file, or get the default values for them in case we
216 * don't have a struct file available. Damn nfsd..
217 */
218 if (file) {
219 mapping = file->f_mapping;
220 fop = file->f_op;
221 } else {
222 mapping = dentry->d_inode->i_mapping;
223 fop = dentry->d_inode->i_fop;
224 }
225
226 if (!fop || !fop->fsync) {
227 ret = -EINVAL; 175 ret = -EINVAL;
228 goto out; 176 goto out;
229 } 177 }
@@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
235 * livelocks in fsync_buffers_list(). 183 * livelocks in fsync_buffers_list().
236 */ 184 */
237 mutex_lock(&mapping->host->i_mutex); 185 mutex_lock(&mapping->host->i_mutex);
238 err = fop->fsync(file, dentry, datasync); 186 err = file->f_op->fsync(file, file->f_path.dentry, datasync);
239 if (!ret) 187 if (!ret)
240 ret = err; 188 ret = err;
241 mutex_unlock(&mapping->host->i_mutex); 189 mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
248/** 196/**
249 * vfs_fsync - perform a fsync or fdatasync on a file 197 * vfs_fsync - perform a fsync or fdatasync on a file
250 * @file: file to sync 198 * @file: file to sync
251 * @dentry: dentry of @file
252 * @datasync: only perform a fdatasync operation 199 * @datasync: only perform a fdatasync operation
253 * 200 *
254 * Write back data and metadata for @file to disk. If @datasync is 201 * Write back data and metadata for @file to disk. If @datasync is
255 * set only metadata needed to access modified file data is written. 202 * set only metadata needed to access modified file data is written.
256 *
257 * In case this function is called from nfsd @file may be %NULL and
258 * only @dentry is set. This can only happen when the filesystem
259 * implements the export_operations API.
260 */ 203 */
261int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 204int vfs_fsync(struct file *file, int datasync)
262{ 205{
263 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); 206 return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
264} 207}
265EXPORT_SYMBOL(vfs_fsync); 208EXPORT_SYMBOL(vfs_fsync);
266 209
@@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync)
271 214
272 file = fget(fd); 215 file = fget(fd);
273 if (file) { 216 if (file) {
274 ret = vfs_fsync(file, file->f_path.dentry, datasync); 217 ret = vfs_fsync(file, datasync);
275 fput(file); 218 fput(file);
276 } 219 }
277 return ret; 220 return ret;
@@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
299{ 242{
300 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) 243 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
301 return 0; 244 return 0;
302 return vfs_fsync_range(file, file->f_path.dentry, pos, 245 return vfs_fsync_range(file, pos, pos + count - 1,
303 pos + count - 1,
304 (file->f_flags & __O_SYNC) ? 0 : 1); 246 (file->f_flags & __O_SYNC) ? 0 : 1);
305} 247}
306EXPORT_SYMBOL(generic_write_sync); 248EXPORT_SYMBOL(generic_write_sync);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
46}; 46};
47 47
48static int 48static int
49fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 49fill_read(struct file *file, char *buffer, loff_t off, size_t count)
50{ 50{
51 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 51 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
54 int rc; 54 int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(file, kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
@@ -70,8 +70,7 @@ static ssize_t
70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) 70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
71{ 71{
72 struct bin_buffer *bb = file->private_data; 72 struct bin_buffer *bb = file->private_data;
73 struct dentry *dentry = file->f_path.dentry; 73 int size = file->f_path.dentry->d_inode->i_size;
74 int size = dentry->d_inode->i_size;
75 loff_t offs = *off; 74 loff_t offs = *off;
76 int count = min_t(size_t, bytes, PAGE_SIZE); 75 int count = min_t(size_t, bytes, PAGE_SIZE);
77 char *temp; 76 char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
92 91
93 mutex_lock(&bb->mutex); 92 mutex_lock(&bb->mutex);
94 93
95 count = fill_read(dentry, bb->buffer, offs, count); 94 count = fill_read(file, bb->buffer, offs, count);
96 if (count < 0) { 95 if (count < 0) {
97 mutex_unlock(&bb->mutex); 96 mutex_unlock(&bb->mutex);
98 goto out_free; 97 goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
117} 116}
118 117
119static int 118static int
120flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 119flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
121{ 120{
122 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 121 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
123 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 122 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
124 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 123 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
125 int rc; 124 int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
130 129
131 rc = -EIO; 130 rc = -EIO;
132 if (attr->write) 131 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 132 rc = attr->write(file, kobj, attr, buffer, offset, count);
134 133
135 sysfs_put_active(attr_sd); 134 sysfs_put_active(attr_sd);
136 135
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
141 size_t bytes, loff_t *off) 140 size_t bytes, loff_t *off)
142{ 141{
143 struct bin_buffer *bb = file->private_data; 142 struct bin_buffer *bb = file->private_data;
144 struct dentry *dentry = file->f_path.dentry; 143 int size = file->f_path.dentry->d_inode->i_size;
145 int size = dentry->d_inode->i_size;
146 loff_t offs = *off; 144 loff_t offs = *off;
147 int count = min_t(size_t, bytes, PAGE_SIZE); 145 int count = min_t(size_t, bytes, PAGE_SIZE);
148 char *temp; 146 char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
165 163
166 memcpy(bb->buffer, temp, count); 164 memcpy(bb->buffer, temp, count);
167 165
168 count = flush_write(dentry, bb->buffer, offs, count); 166 count = flush_write(file, bb->buffer, offs, count);
169 mutex_unlock(&bb->mutex); 167 mutex_unlock(&bb->mutex);
170 168
171 if (count > 0) 169 if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
363 if (!attr->mmap) 361 if (!attr->mmap)
364 goto out_put; 362 goto out_put;
365 363
366 rc = attr->mmap(kobj, attr, vma); 364 rc = attr->mmap(file, kobj, attr, vma);
367 if (rc) 365 if (rc)
368 goto out_put; 366 goto out_put;
369 367
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
501void sysfs_remove_bin_file(struct kobject *kobj, 499void sysfs_remove_bin_file(struct kobject *kobj,
502 const struct bin_attribute *attr) 500 const struct bin_attribute *attr)
503{ 501{
504 sysfs_hash_and_remove(kobj->sd, attr->attr.name); 502 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
505} 503}
506 504
507EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 505EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
380{ 380{
381 struct sysfs_inode_attrs *ps_iattr; 381 struct sysfs_inode_attrs *ps_iattr;
382 382
383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
384 return -EEXIST; 384 return -EEXIST;
385 385
386 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
533 * Pointer to sysfs_dirent if found, NULL if not. 533 * Pointer to sysfs_dirent if found, NULL if not.
534 */ 534 */
535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
536 const void *ns,
536 const unsigned char *name) 537 const unsigned char *name)
537{ 538{
538 struct sysfs_dirent *sd; 539 struct sysfs_dirent *sd;
539 540
540 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) 541 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
542 if (ns && sd->s_ns && (sd->s_ns != ns))
543 continue;
541 if (!strcmp(sd->s_name, name)) 544 if (!strcmp(sd->s_name, name))
542 return sd; 545 return sd;
546 }
543 return NULL; 547 return NULL;
544} 548}
545 549
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
558 * Pointer to sysfs_dirent if found, NULL if not. 562 * Pointer to sysfs_dirent if found, NULL if not.
559 */ 563 */
560struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 564struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
565 const void *ns,
561 const unsigned char *name) 566 const unsigned char *name)
562{ 567{
563 struct sysfs_dirent *sd; 568 struct sysfs_dirent *sd;
564 569
565 mutex_lock(&sysfs_mutex); 570 mutex_lock(&sysfs_mutex);
566 sd = sysfs_find_dirent(parent_sd, name); 571 sd = sysfs_find_dirent(parent_sd, ns, name);
567 sysfs_get(sd); 572 sysfs_get(sd);
568 mutex_unlock(&sysfs_mutex); 573 mutex_unlock(&sysfs_mutex);
569 574
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
572EXPORT_SYMBOL_GPL(sysfs_get_dirent); 577EXPORT_SYMBOL_GPL(sysfs_get_dirent);
573 578
574static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, 579static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
575 const char *name, struct sysfs_dirent **p_sd) 580 enum kobj_ns_type type, const void *ns, const char *name,
581 struct sysfs_dirent **p_sd)
576{ 582{
577 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 583 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
578 struct sysfs_addrm_cxt acxt; 584 struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
583 sd = sysfs_new_dirent(name, mode, SYSFS_DIR); 589 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
584 if (!sd) 590 if (!sd)
585 return -ENOMEM; 591 return -ENOMEM;
592
593 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
594 sd->s_ns = ns;
586 sd->s_dir.kobj = kobj; 595 sd->s_dir.kobj = kobj;
587 596
588 /* link in */ 597 /* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
601int sysfs_create_subdir(struct kobject *kobj, const char *name, 610int sysfs_create_subdir(struct kobject *kobj, const char *name,
602 struct sysfs_dirent **p_sd) 611 struct sysfs_dirent **p_sd)
603{ 612{
604 return create_dir(kobj, kobj->sd, name, p_sd); 613 return create_dir(kobj, kobj->sd,
614 KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
615}
616
617/**
618 * sysfs_read_ns_type: return associated ns_type
619 * @kobj: the kobject being queried
620 *
621 * Each kobject can be tagged with exactly one namespace type
622 * (i.e. network or user). Return the ns_type associated with
623 * this object if any
624 */
625static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
626{
627 const struct kobj_ns_type_operations *ops;
628 enum kobj_ns_type type;
629
630 ops = kobj_child_ns_ops(kobj);
631 if (!ops)
632 return KOBJ_NS_TYPE_NONE;
633
634 type = ops->type;
635 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
636 BUG_ON(type >= KOBJ_NS_TYPES);
637 BUG_ON(!kobj_ns_type_registered(type));
638
639 return type;
605} 640}
606 641
607/** 642/**
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
610 */ 645 */
611int sysfs_create_dir(struct kobject * kobj) 646int sysfs_create_dir(struct kobject * kobj)
612{ 647{
648 enum kobj_ns_type type;
613 struct sysfs_dirent *parent_sd, *sd; 649 struct sysfs_dirent *parent_sd, *sd;
650 const void *ns = NULL;
614 int error = 0; 651 int error = 0;
615 652
616 BUG_ON(!kobj); 653 BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
620 else 657 else
621 parent_sd = &sysfs_root; 658 parent_sd = &sysfs_root;
622 659
623 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); 660 if (sysfs_ns_type(parent_sd))
661 ns = kobj->ktype->namespace(kobj);
662 type = sysfs_read_ns_type(kobj);
663
664 error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
624 if (!error) 665 if (!error)
625 kobj->sd = sd; 666 kobj->sd = sd;
626 return error; 667 return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
630 struct nameidata *nd) 671 struct nameidata *nd)
631{ 672{
632 struct dentry *ret = NULL; 673 struct dentry *ret = NULL;
633 struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; 674 struct dentry *parent = dentry->d_parent;
675 struct sysfs_dirent *parent_sd = parent->d_fsdata;
634 struct sysfs_dirent *sd; 676 struct sysfs_dirent *sd;
635 struct inode *inode; 677 struct inode *inode;
678 enum kobj_ns_type type;
679 const void *ns;
636 680
637 mutex_lock(&sysfs_mutex); 681 mutex_lock(&sysfs_mutex);
638 682
639 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); 683 type = sysfs_ns_type(parent_sd);
684 ns = sysfs_info(dir->i_sb)->ns[type];
685
686 sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
640 687
641 /* no such entry */ 688 /* no such entry */
642 if (!sd) { 689 if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
735} 782}
736 783
737int sysfs_rename(struct sysfs_dirent *sd, 784int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name) 785 struct sysfs_dirent *new_parent_sd, const void *new_ns,
786 const char *new_name)
739{ 787{
740 const char *dup_name = NULL; 788 const char *dup_name = NULL;
741 int error; 789 int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
743 mutex_lock(&sysfs_mutex); 791 mutex_lock(&sysfs_mutex);
744 792
745 error = 0; 793 error = 0;
746 if ((sd->s_parent == new_parent_sd) && 794 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
747 (strcmp(sd->s_name, new_name) == 0)) 795 (strcmp(sd->s_name, new_name) == 0))
748 goto out; /* nothing to rename */ 796 goto out; /* nothing to rename */
749 797
750 error = -EEXIST; 798 error = -EEXIST;
751 if (sysfs_find_dirent(new_parent_sd, new_name)) 799 if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
752 goto out; 800 goto out;
753 801
754 /* rename sysfs_dirent */ 802 /* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
770 sd->s_parent = new_parent_sd; 818 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd); 819 sysfs_link_sibling(sd);
772 } 820 }
821 sd->s_ns = new_ns;
773 822
774 error = 0; 823 error = 0;
775 out: 824 out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
780 829
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name) 830int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{ 831{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); 832 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
833 const void *new_ns = NULL;
834
835 if (sysfs_ns_type(parent_sd))
836 new_ns = kobj->ktype->namespace(kobj);
837
838 return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
784} 839}
785 840
786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 841int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
787{ 842{
788 struct sysfs_dirent *sd = kobj->sd; 843 struct sysfs_dirent *sd = kobj->sd;
789 struct sysfs_dirent *new_parent_sd; 844 struct sysfs_dirent *new_parent_sd;
845 const void *new_ns = NULL;
790 846
791 BUG_ON(!sd->s_parent); 847 BUG_ON(!sd->s_parent);
848 if (sysfs_ns_type(sd->s_parent))
849 new_ns = kobj->ktype->namespace(kobj);
792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 850 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
793 new_parent_kobj->sd : &sysfs_root; 851 new_parent_kobj->sd : &sysfs_root;
794 852
795 return sysfs_rename(sd, new_parent_sd, sd->s_name); 853 return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
796} 854}
797 855
798/* Relationship between s_mode and the DT_xxx types */ 856/* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
807 return 0; 865 return 0;
808} 866}
809 867
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, 868static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
811 ino_t ino, struct sysfs_dirent *pos) 869 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
812{ 870{
813 if (pos) { 871 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && 872 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd && 873 pos->s_parent == parent_sd &&
816 ino == pos->s_ino; 874 ino == pos->s_ino;
817 sysfs_put(pos); 875 sysfs_put(pos);
818 if (valid) 876 if (!valid)
819 return pos; 877 pos = NULL;
820 } 878 }
821 pos = NULL; 879 if (!pos && (ino > 1) && (ino < INT_MAX)) {
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children; 880 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino)) 881 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling; 882 pos = pos->s_sibling;
826 } 883 }
884 while (pos && pos->s_ns && pos->s_ns != ns)
885 pos = pos->s_sibling;
827 return pos; 886 return pos;
828} 887}
829 888
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, 889static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
831 ino_t ino, struct sysfs_dirent *pos) 890 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
832{ 891{
833 pos = sysfs_dir_pos(parent_sd, ino, pos); 892 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
834 if (pos) 893 if (pos)
835 pos = pos->s_sibling; 894 pos = pos->s_sibling;
895 while (pos && pos->s_ns && pos->s_ns != ns)
896 pos = pos->s_sibling;
836 return pos; 897 return pos;
837} 898}
838 899
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
841 struct dentry *dentry = filp->f_path.dentry; 902 struct dentry *dentry = filp->f_path.dentry;
842 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 903 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
843 struct sysfs_dirent *pos = filp->private_data; 904 struct sysfs_dirent *pos = filp->private_data;
905 enum kobj_ns_type type;
906 const void *ns;
844 ino_t ino; 907 ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 602 mutex_lock(&sysfs_mutex);
600 603
601 rc = -ENOENT; 604 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 605 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 606 if (!sd)
604 goto out; 607 goto out;
605 608
@@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 627
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 628void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 629{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 630 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 631}
629 632
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 633void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 649 struct sysfs_dirent *dir_sd;
647 650
648 if (group) 651 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 652 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 653 else
651 dir_sd = sysfs_get(kobj->sd); 654 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 655 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 656 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 657 sysfs_put(dir_sd);
655 } 658 }
656} 659}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a4a0a9419711..bbd77e95cf7f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -324,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
324 sysfs_put(sd); 324 sysfs_put(sd);
325} 325}
326 326
327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
328{ 328{
329 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
330 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
334 334
335 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
336 336
337 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
338 if (sd && (sd->s_ns != ns))
339 sd = NULL;
338 if (sd) 340 if (sd)
339 sysfs_remove_one(&acxt, sd); 341 sysfs_remove_one(&acxt, sd);
340 342
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 776137828dca..281c0c9bc39f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
72 return 0; 72 return 0;
73} 73}
74 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
75static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
76 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
77{ 100{
78 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
79} 146}
80 147
81static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
82 .name = "sysfs", 149 .name = "sysfs",
83 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
84 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
85}; 152};
86 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
164 * list the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
175
87int __init sysfs_init(void) 176int __init sysfs_init(void)
88{ 177{
89 int err = -ENOMEM; 178 int err = -ENOMEM;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 942f239a2132..f71246bebfe4 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -58,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
58 if (!sd) 58 if (!sd)
59 goto out_put; 59 goto out_put;
60 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
61 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
62 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
63 65
@@ -107,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
107} 109}
108 110
109/** 111/**
112 * sysfs_delete_link - remove symlink in object's directory.
113 * @kobj: object we're acting for.
114 * @targ: object we're pointing to.
115 * @name: name of the symlink to remove.
116 *
117 * Unlike sysfs_remove_link sysfs_delete_link has enough information
118 * to successfully delete symlinks in tagged directories.
119 */
120void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
121 const char *name)
122{
123 const void *ns = NULL;
124 spin_lock(&sysfs_assoc_lock);
125 if (targ->sd)
126 ns = targ->sd->s_ns;
127 spin_unlock(&sysfs_assoc_lock);
128 sysfs_hash_and_remove(kobj->sd, ns, name);
129}
130
131/**
110 * sysfs_remove_link - remove symlink in object's directory. 132 * sysfs_remove_link - remove symlink in object's directory.
111 * @kobj: object we're acting for. 133 * @kobj: object we're acting for.
112 * @name: name of the symlink to remove. 134 * @name: name of the symlink to remove.
@@ -121,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
121 else 143 else
122 parent_sd = kobj->sd; 144 parent_sd = kobj->sd;
123 145
124 sysfs_hash_and_remove(parent_sd, name); 146 sysfs_hash_and_remove(parent_sd, NULL, name);
125} 147}
126 148
127/** 149/**
@@ -137,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new) 159 const char *old, const char *new)
138{ 160{
139 struct sysfs_dirent *parent_sd, *sd = NULL; 161 struct sysfs_dirent *parent_sd, *sd = NULL;
162 const void *old_ns = NULL, *new_ns = NULL;
140 int result; 163 int result;
141 164
142 if (!kobj) 165 if (!kobj)
@@ -144,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
144 else 167 else
145 parent_sd = kobj->sd; 168 parent_sd = kobj->sd;
146 169
170 if (targ->sd)
171 old_ns = targ->sd->s_ns;
172
147 result = -ENOENT; 173 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old); 174 sd = sysfs_get_dirent(parent_sd, old_ns, old);
149 if (!sd) 175 if (!sd)
150 goto out; 176 goto out;
151 177
@@ -155,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 181 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out; 182 goto out;
157 183
158 result = sysfs_rename(sd, parent_sd, new); 184 if (sysfs_ns_type(parent_sd))
185 new_ns = targ->ktype->namespace(targ);
186
187 result = sysfs_rename(sd, parent_sd, new_ns, new);
159 188
160out: 189out:
161 sysfs_put(sd); 190 sysfs_put(sd);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..6a13105b5594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
133 * Each sb is associated with a set of namespace tags (i.e.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..bbd69bdb0fa8 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 161 dirty_sb(sb);
162 162 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 163 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 164 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 165 inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 168 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 169 mark_inode_dirty(inode);
178 170
179 inode->i_mode = mode; /* for sysv_write_inode() */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */ 171 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 172 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 173 /* That's it. */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current_fsuid(); 107 inode_init_owner(inode, dir, mode);
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 109 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..3a84455c2a77 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -209,6 +209,6 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .read = generic_read_dir, 210 .read = generic_read_dir,
211 .readdir = udf_readdir, 211 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 212 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 213 .fsync = simple_fsync,
214}; 214};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 4b6a46ccbf46..baae3a723946 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -37,6 +37,7 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/aio.h> 39#include <linux/aio.h>
40#include <linux/smp_lock.h>
40 41
41#include "udf_i.h" 42#include "udf_i.h"
42#include "udf_sb.h" 43#include "udf_sb.h"
@@ -144,50 +145,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 return retval; 145 return retval;
145} 146}
146 147
147int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 148long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
148 unsigned long arg)
149{ 149{
150 struct inode *inode = filp->f_dentry->d_inode;
150 long old_block, new_block; 151 long old_block, new_block;
151 int result = -EINVAL; 152 int result = -EINVAL;
152 153
154 lock_kernel();
155
153 if (file_permission(filp, MAY_READ) != 0) { 156 if (file_permission(filp, MAY_READ) != 0) {
154 udf_debug("no permission to access inode %lu\n", 157 udf_debug("no permission to access inode %lu\n", inode->i_ino);
155 inode->i_ino); 158 result = -EPERM;
156 return -EPERM; 159 goto out;
157 } 160 }
158 161
159 if (!arg) { 162 if (!arg) {
160 udf_debug("invalid argument to udf_ioctl\n"); 163 udf_debug("invalid argument to udf_ioctl\n");
161 return -EINVAL; 164 result = -EINVAL;
165 goto out;
162 } 166 }
163 167
164 switch (cmd) { 168 switch (cmd) {
165 case UDF_GETVOLIDENT: 169 case UDF_GETVOLIDENT:
166 if (copy_to_user((char __user *)arg, 170 if (copy_to_user((char __user *)arg,
167 UDF_SB(inode->i_sb)->s_volume_ident, 32)) 171 UDF_SB(inode->i_sb)->s_volume_ident, 32))
168 return -EFAULT; 172 result = -EFAULT;
169 else 173 else
170 return 0; 174 result = 0;
175 goto out;
171 case UDF_RELOCATE_BLOCKS: 176 case UDF_RELOCATE_BLOCKS:
172 if (!capable(CAP_SYS_ADMIN)) 177 if (!capable(CAP_SYS_ADMIN)) {
173 return -EACCES; 178 result = -EACCES;
174 if (get_user(old_block, (long __user *)arg)) 179 goto out;
175 return -EFAULT; 180 }
181 if (get_user(old_block, (long __user *)arg)) {
182 result = -EFAULT;
183 goto out;
184 }
176 result = udf_relocate_blocks(inode->i_sb, 185 result = udf_relocate_blocks(inode->i_sb,
177 old_block, &new_block); 186 old_block, &new_block);
178 if (result == 0) 187 if (result == 0)
179 result = put_user(new_block, (long __user *)arg); 188 result = put_user(new_block, (long __user *)arg);
180 return result; 189 goto out;
181 case UDF_GETEASIZE: 190 case UDF_GETEASIZE:
182 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); 191 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
183 break; 192 goto out;
184 case UDF_GETEABLOCK: 193 case UDF_GETEABLOCK:
185 result = copy_to_user((char __user *)arg, 194 result = copy_to_user((char __user *)arg,
186 UDF_I(inode)->i_ext.i_data, 195 UDF_I(inode)->i_ext.i_data,
187 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; 196 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
188 break; 197 goto out;
189 } 198 }
190 199
200out:
201 unlock_kernel();
191 return result; 202 return result;
192} 203}
193 204
@@ -207,7 +218,7 @@ static int udf_release_file(struct inode *inode, struct file *filp)
207const struct file_operations udf_file_operations = { 218const struct file_operations udf_file_operations = {
208 .read = do_sync_read, 219 .read = do_sync_read,
209 .aio_read = generic_file_aio_read, 220 .aio_read = generic_file_aio_read,
210 .ioctl = udf_ioctl, 221 .unlocked_ioctl = udf_ioctl,
211 .open = dquot_file_open, 222 .open = dquot_file_open,
212 .mmap = generic_file_mmap, 223 .mmap = generic_file_mmap,
213 .write = do_sync_write, 224 .write = do_sync_write,
@@ -227,7 +238,7 @@ int udf_setattr(struct dentry *dentry, struct iattr *iattr)
227 if (error) 238 if (error)
228 return error; 239 return error;
229 240
230 if (iattr->ia_valid & ATTR_SIZE) 241 if (is_quota_modification(inode, iattr))
231 dquot_initialize(inode); 242 dquot_initialize(inode);
232 243
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..2b5586c7f02a 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
124 udf_updated_lvid(sb); 124 udf_updated_lvid(sb);
125 } 125 }
126 mutex_unlock(&sbi->s_alloc_mutex); 126 mutex_unlock(&sbi->s_alloc_mutex);
127 inode->i_mode = mode; 127
128 inode->i_uid = current_fsuid(); 128 inode_init_owner(inode, dir, mode);
129 if (dir->i_mode & S_ISGID) {
130 inode->i_gid = dir->i_gid;
131 if (S_ISDIR(mode))
132 mode |= S_ISGID;
133 } else {
134 inode->i_gid = current_fsgid();
135 }
136 129
137 iinfo->i_location.logicalBlockNum = block; 130 iinfo->i_location.logicalBlockNum = block;
138 iinfo->i_location.partitionReferenceNum = 131 iinfo->i_location.partitionReferenceNum =
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95f..585f733615dc 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
579 inode->i_data.a_ops = &udf_aops; 579 inode->i_data.a_ops = &udf_aops;
580 inode->i_op = &udf_file_inode_operations; 580 inode->i_op = &udf_file_inode_operations;
581 inode->i_fop = &udf_file_operations; 581 inode->i_fop = &udf_file_operations;
582 inode->i_mode = mode;
583 mark_inode_dirty(inode); 582 mark_inode_dirty(inode);
584 583
585 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 584 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 goto out; 626 goto out;
628 627
629 iinfo = UDF_I(inode); 628 iinfo = UDF_I(inode);
630 inode->i_uid = current_fsuid();
631 init_special_inode(inode, mode, rdev); 629 init_special_inode(inode, mode, rdev);
632 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 630 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
633 if (!fi) { 631 if (!fi) {
@@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
674 goto out; 672 goto out;
675 673
676 err = -EIO; 674 err = -EIO;
677 inode = udf_new_inode(dir, S_IFDIR, &err); 675 inode = udf_new_inode(dir, S_IFDIR | mode, &err);
678 if (!inode) 676 if (!inode)
679 goto out; 677 goto out;
680 678
@@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
697 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; 695 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
698 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); 696 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
699 brelse(fibh.sbh); 697 brelse(fibh.sbh);
700 inode->i_mode = S_IFDIR | mode;
701 if (dir->i_mode & S_ISGID)
702 inode->i_mode |= S_ISGID;
703 mark_inode_dirty(inode); 698 mark_inode_dirty(inode);
704 699
705 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 700 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 dquot_initialize(dir); 907 dquot_initialize(dir);
913 908
914 lock_kernel(); 909 lock_kernel();
915 inode = udf_new_inode(dir, S_IFLNK, &err); 910 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
916 if (!inode) 911 if (!inode)
917 goto out; 912 goto out;
918 913
@@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 } 918 }
924 919
925 iinfo = UDF_I(inode); 920 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 921 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &udf_symlink_inode_operations; 922 inode->i_op = &udf_symlink_inode_operations;
929 923
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 702a1148e702..9079ff7d6255 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,8 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
130 uint8_t *, uint8_t *); 130 uint8_t *, uint8_t *);
131 131
132/* file.c */ 132/* file.c */
133extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134 unsigned long);
135extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); 134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
136/* inode.c */ 135/* inode.c */
137extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..3a959d55084d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -303,15 +303,7 @@ cg_found:
303 sb->s_dirt = 1; 303 sb->s_dirt = 1;
304 304
305 inode->i_ino = cg * uspi->s_ipg + bit; 305 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 306 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 307 inode->i_blocks = 0;
316 inode->i_generation = 0; 308 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 309 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..cffa756f1047 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -603,7 +603,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 603 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 604 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 605 else {
606 inode->i_op = &page_symlink_inode_operations; 606 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 607 inode->i_mapping->a_ops = &ufs_aops;
608 } 608 }
609 } else 609 } else
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..eabc02eb1294 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -148,7 +148,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 148
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 150 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 151 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 152 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 153 err = page_symlink(inode, symname, l);
154 if (err) 154 if (err)
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..f294c44577dc 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -508,7 +508,7 @@ out:
508 * - there is no way to know old size 508 * - there is no way to know old size
509 * - there is no way inform user about error, if it happens in `truncate' 509 * - there is no way inform user about error, if it happens in `truncate'
510 */ 510 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr) 511int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 512{
513 struct inode *inode = dentry->d_inode; 513 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 514 unsigned int ia_valid = attr->ia_valid;
@@ -518,18 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 518 if (error)
519 return error; 519 return error;
520 520
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr); 526 error = dquot_transfer(inode, attr);
524 if (error) 527 if (error)
525 return error; 528 return error;
526 } 529 }
527 if (ia_valid & ATTR_SIZE && 530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 531 loff_t old_i_size = inode->i_size;
530 532
531 dquot_initialize(inode);
532
533 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
534 if (error) 534 if (error)
535 return error; 535 return error;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..179ae6b3180a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d60..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
440 return error; 440 return error;
441} 441}
442 442
443struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
444 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
445 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
446 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
448}; 448};
449 449
450struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c43..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
@@ -97,7 +98,7 @@ xfs_fs_set_xstate(
97} 98}
98 99
99STATIC int 100STATIC int
100xfs_fs_get_xquota( 101xfs_fs_get_dqblk(
101 struct super_block *sb, 102 struct super_block *sb,
102 int type, 103 int type,
103 qid_t id, 104 qid_t id,
@@ -114,7 +115,7 @@ xfs_fs_get_xquota(
114} 115}
115 116
116STATIC int 117STATIC int
117xfs_fs_set_xquota( 118xfs_fs_set_dqblk(
118 struct super_block *sb, 119 struct super_block *sb,
119 int type, 120 int type,
120 qid_t id, 121 qid_t id,
@@ -135,6 +136,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 136const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 137 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 138 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 139 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 140 .set_dqblk = xfs_fs_set_dqblk,
140}; 141};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e9002513e08f..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
@@ -725,7 +735,8 @@ void
725xfs_blkdev_issue_flush( 735xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 736 xfs_buftarg_t *buftarg)
727{ 737{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 738 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
739 BLKDEV_IFL_WAIT);
729} 740}
730 741
731STATIC void 742STATIC void
@@ -1754,7 +1765,7 @@ xfs_init_zones(void)
1754 * but it is much faster. 1765 * but it is much faster.
1755 */ 1766 */
1756 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1757 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1758 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1759 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1760 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
86 86
87extern const struct export_operations xfs_export_operations; 87extern const struct export_operations xfs_export_operations;
88extern struct xattr_handler *xfs_xattr_handlers[]; 88extern const struct xattr_handler *xfs_xattr_handlers[];
89extern const struct quotactl_ops xfs_quotactl_operations; 89extern const struct quotactl_ops xfs_quotactl_operations;
90 90
91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd2901..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1059 1059
1060); 1060);
1061 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1062TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1063 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1064 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1065 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1066 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1067 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1068 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1069 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1070 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1071 __field(int, slot) 1077 __field(int, sync)
1072 ), 1078 ),
1073 TP_fast_assign( 1079 TP_fast_assign(
1074 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1075 __entry->agno = agno; 1083 __entry->agno = agno;
1076 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1077 __entry->len = len; 1085 __entry->len = len;
1078 __entry->slot = slot; 1086 __entry->sync = sync;
1079 ), 1087 ),
1080 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1081 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1082 __entry->agno, 1092 __entry->agno,
1083 __entry->agbno, 1093 __entry->agbno,
1084 __entry->len, 1094 __entry->len,
1085 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1086 1096
1087); 1097);
1088 1098
1089#define XFS_BUSY_STATES \
1090 { 0, "found" }, \
1091 { 1, "missing" }
1092
1093TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1094 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1095 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1096 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1097 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1098 __field(dev_t, dev) 1104 __field(dev_t, dev)
1099 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1100 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1101 __field(int, found) 1107 __field(xfs_extlen_t, len)
1102 ), 1108 ),
1103 TP_fast_assign( 1109 TP_fast_assign(
1104 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1105 __entry->agno = agno; 1111 __entry->agno = agno;
1106 __entry->slot = slot; 1112 __entry->agbno = agbno;
1107 __entry->found = found; 1113 __entry->len = len;
1108 ), 1114 ),
1109 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1110 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1111 __entry->agno, 1117 __entry->agno,
1112 __entry->slot, 1118 __entry->agbno,
1113 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1114); 1120);
1115 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1116TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1117 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1118 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1119 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1120 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1121 __field(dev_t, dev) 1131 __field(dev_t, dev)
1122 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1123 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1124 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1125 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1126 ), 1136 ),
1127 TP_fast_assign( 1137 TP_fast_assign(
1128 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1129 __entry->agno = agno; 1139 __entry->agno = agno;
1130 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1131 __entry->len = len; 1141 __entry->len = len;
1132 __entry->lsn = lsn; 1142 __entry->found = found;
1133 ), 1143 ),
1134 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1135 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1136 __entry->agno, 1146 __entry->agno,
1137 __entry->agbno, 1147 __entry->agbno,
1138 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1139 __entry->lsn) 1168 __entry->lsn)
1140); 1169);
1141 1170
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df0129..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
345 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
346 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
347 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
348 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
349 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
351} 351}
352 352
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 26fa43140f2e..92b002f1805f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -448,6 +448,9 @@ xfs_qm_scall_getqstat(
448 return 0; 448 return 0;
449} 449}
450 450
451#define XFS_DQ_MASK \
452 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
453
451/* 454/*
452 * Adjust quota limits, and start/stop timers accordingly. 455 * Adjust quota limits, and start/stop timers accordingly.
453 */ 456 */
@@ -465,9 +468,10 @@ xfs_qm_scall_setqlim(
465 int error; 468 int error;
466 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
467 470
468 if ((newlim->d_fieldmask & 471 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
469 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 return EINVAL;
470 return (0); 473 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
474 return 0;
471 475
472 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 476 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
473 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 477 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
226 int pag_ici_reclaimable; /* reclaimable inodes */ 233 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 234#endif
228 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 236} xfs_perag_t;
231 237
232/* 238/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * there can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extxpnet free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID for identifying transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different. i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, then mark the same
2568 * extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to insure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial or exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727b..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not 334/* You would think we need to bump the nvecs here too, but we do not
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
489} 509}
490 510
491/* 511/*
492 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
493 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
494 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
495 * buf log item and remove the reference to it in the 515 *
496 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
497 * 518 *
498 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
499 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
500 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
501 * 524 *
502 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
503 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
504 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
505 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
506 */ 529 */
507STATIC void 530STATIC void
508xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
514 537
515 bp = bip->bli_buf; 538 bp = bip->bli_buf;
516 539
517 /* 540 /* Clear the buffer's association with this transaction. */
518 * Clear the buffer's association with this transaction.
519 */
520 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
521 542
522 /* 543 /*
523 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
524 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
525 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
526 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
527 * cycle if we abort inside commit.
528 */ 548 */
529 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
530 550
531 /* 551 /*
532 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
533 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
534 * buffer is unpinned for the last time.
535 */ 554 */
536 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
537 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
538 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
539 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
540 if (!aborted)
541 return;
542 }
543 559
544 /* 560 /*
545 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
546 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
547 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
548 * the transaction is really through with the buffer.
549 */ 564 */
550 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
551 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
552 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
553 /* 568 if (!aborted) {
554 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
555 * transaction state. 570 return;
556 */ 571 }
557 bip->bli_flags &= ~XFS_BLI_LOGGED;
558 } 572 }
559 573
560 /*
561 * Before possibly freeing the buf item, determine if we should
562 * release the buffer at the end of this routine.
563 */
564 hold = bip->bli_flags & XFS_BLI_HOLD;
565 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
566 575
567 /* 576 /*
568 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
569 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
570 */ 579 */
571 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
572 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
573 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
574 } else if (hold) { 583 else
575 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
576 }
577 585
578 /* 586 if (!hold)
579 * Release the buffer if XFS_BLI_HOLD was not set.
580 */
581 if (!hold) {
582 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
583 }
584} 588}
585 589
586/* 590/*
@@ -717,12 +721,12 @@ xfs_buf_item_init(
717 } 721 }
718 722
719 /* 723 /*
720 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
721 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
722 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
723 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
724 */ 728 */
725 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
726 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
727 731
728 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
790 /* 794 /*
791 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
792 */ 796 */
793 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
794 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
795 799
796 /* 800 /*
797 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f73..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c0744..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72a..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
56STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
59 xlog_in_core_t **commit_iclog, uint flags);
60 57
61/* local state machine functions */ 58/* local state machine functions */
62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
86STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
87 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
88 85
89
90/* local ticket functions */
91STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
92 int unit_bytes,
93 int count,
94 char clientid,
95 uint flags);
96
97#if defined(DEBUG) 86#if defined(DEBUG)
98STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
99STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
360 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
361 internal_ticket = *ticket; 350 internal_ticket = *ticket;
362 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
363 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
364 362
365 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -367,7 +365,8 @@ xfs_log_reserve(
367 } else { 365 } else {
368 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
369 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
370 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
371 if (!internal_ticket) 370 if (!internal_ticket)
372 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
373 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
452 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
453 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
454 453
454 /*
455 * Now the log has been fully initialised and we know were our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
455 return 0; 461 return 0;
456 462
457out_destroy_ail: 463out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
658 item->li_ailp = mp->m_ail; 664 item->li_ailp = mp->m_ail;
659 item->li_type = type; 665 item->li_type = type;
660 item->li_ops = ops; 666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
661} 671}
662 672
663/* 673/*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1168 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1169 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1170 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1171 return log; 1184 return log;
1172 1185
1173out_free_iclog: 1186out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1494 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1495 int i; 1508 int i;
1496 1509
1510 xlog_cil_destroy(log);
1511
1497 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1498 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1499 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1536 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1537 * the reservation 1552 * the reservation
1538 */ 1553 */
1539STATIC void 1554void
1540xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1541{ 1558{
1542 uint i; 1559 uint i;
1543 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1637 "bad-rtype" : res_type_str[r_type-1]), 1654 "bad-rtype" : res_type_str[r_type-1]),
1638 ticket->t_res_arr[i].r_len); 1655 ticket->t_res_arr[i].r_len);
1639 } 1656 }
1657
1658 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1659 "xfs_log_write: reservation ran out. Need to up reservation");
1660 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1640} 1661}
1641 1662
1642/* 1663/*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
1865 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1866 * bytes have been written out. 1887 * bytes have been written out.
1867 */ 1888 */
1868STATIC int 1889int
1869xlog_write( 1890xlog_write(
1870 struct log *log, 1891 struct log *log,
1871 struct xfs_log_vec *log_vector, 1892 struct xfs_log_vec *log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
1889 *start_lsn = 0; 1910 *start_lsn = 0;
1890 1911
1891 len = xlog_write_calc_vec_length(ticket, log_vector); 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1892 if (ticket->t_curr_res < len) { 1913 if (log->l_cilp) {
1893 xlog_print_tic_res(log->l_mp, ticket); 1914 /*
1894#ifdef DEBUG 1915 * Region headers and bytes are already accounted for.
1895 xlog_panic( 1916 * We only need to take into account start records and
1896 "xfs_log_write: reservation ran out. Need to up reservation"); 1917 * split regions in this function.
1897#else 1918 */
1898 /* Customer configurable panic */ 1919 if (ticket->t_flags & XLOG_TIC_INITED)
1899 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, 1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1900 "xfs_log_write: reservation ran out. Need to up reservation");
1901 1921
1902 /* If we did not panic, shutdown the filesystem */ 1922 /*
1903 xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); 1923 * Commit record headers need to be accounted for. These
1904#endif 1924 * come in as separate writes so are easy to detect.
1905 } 1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1906 1930
1907 ticket->t_curr_res -= len; 1931 if (ticket->t_curr_res < 0)
1932 xlog_print_tic_res(log->l_mp, ticket);
1908 1933
1909 index = 0; 1934 index = 0;
1910 lv = log_vector; 1935 lv = log_vector;
@@ -3000,6 +3025,8 @@ _xfs_log_force(
3000 3025
3001 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
3002 3027
3028 xlog_cil_push(log, 1);
3029
3003 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
3004 3031
3005 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
3149 3176
3150 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
3151 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
3152try_again: 3185try_again:
3153 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
3154 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
3313 return ticket; 3346 return ticket;
3314} 3347}
3315 3348
3349xlog_tid_t
3350xfs_log_get_trans_ident(
3351 struct xfs_trans *tp)
3352{
3353 return tp->t_ticket->t_tid;
3354}
3355
3316/* 3356/*
3317 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3318 */ 3358 */
3319STATIC xlog_ticket_t * 3359xlog_ticket_t *
3320xlog_ticket_alloc( 3360xlog_ticket_alloc(
3321 struct log *log, 3361 struct log *log,
3322 int unit_bytes, 3362 int unit_bytes,
3323 int cnt, 3363 int cnt,
3324 char client, 3364 char client,
3325 uint xflags) 3365 uint xflags,
3366 int alloc_flags)
3326{ 3367{
3327 struct xlog_ticket *tic; 3368 struct xlog_ticket *tic;
3328 uint num_headers; 3369 uint num_headers;
3329 int iclog_space; 3370 int iclog_space;
3330 3371
3331 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3372 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3332 if (!tic) 3373 if (!tic)
3333 return NULL; 3374 return NULL;
3334 3375
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
3647 * c. nothing new gets queued up after (a) and (b) are done. 3688 * c. nothing new gets queued up after (a) and (b) are done.
3648 * d. if !logerror, flush the iclogs to disk, then seal them off 3689 * d. if !logerror, flush the iclogs to disk, then seal them off
3649 * for business. 3690 * for business.
3691 *
3692 * Note: for delayed logging the !logerror case needs to flush the regions
3693 * held in memory out to the iclogs before flushing them to disk. This needs
3694 * to be done before the log is marked as shutdown, otherwise the flush to the
3695 * iclogs will fail.
3650 */ 3696 */
3651int 3697int
3652xfs_log_force_umount( 3698xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
3680 return 1; 3726 return 1;
3681 } 3727 }
3682 retval = 0; 3728 retval = 0;
3729
3730 /*
3731 * Flush the in memory commit item list before marking the log as
3732 * being shut down. We need to do it in this order to ensure all the
3733 * completed transactions are flushed to disk with the xfs_log_force()
3734 * call below.
3735 */
3736 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3737 xlog_cil_push(log, 1);
3738
3683 /* 3739 /*
3684 * We must hold both the GRANT lock and the LOG lock, 3740 * We must hold both the GRANT lock and the LOG lock,
3685 * before we mark the filesystem SHUTDOWN and wake 3741 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9a..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -114,6 +113,9 @@ struct xfs_log_vec {
114 struct xfs_log_vec *lv_next; /* next lv in build list */ 113 struct xfs_log_vec *lv_next; /* next lv in build list */
115 int lv_niovecs; /* number of iovecs in lv */ 114 int lv_niovecs; /* number of iovecs in lv */
116 struct xfs_log_iovec *lv_iovecp; /* iovec array */ 115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
117}; 119};
118 120
119/* 121/*
@@ -134,6 +136,7 @@ struct xlog_in_core;
134struct xlog_ticket; 136struct xlog_ticket;
135struct xfs_log_item; 137struct xfs_log_item;
136struct xfs_item_ops; 138struct xfs_item_ops;
139struct xfs_trans;
137 140
138void xfs_log_item_init(struct xfs_mount *mp, 141void xfs_log_item_init(struct xfs_mount *mp,
139 struct xfs_log_item *item, 142 struct xfs_log_item *item,
@@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
187 190
188void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
189 192
190struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
191void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
192 195
196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
202
193#endif 203#endif
194 204
195 205
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure the log->l_cilp is null so
38 * we can check this conditional to determine if we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we addded to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket,
164 struct xfs_log_item *item,
165 struct xfs_log_vec *lv)
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item, space used is a delta */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t);
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil);
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
212 * Now transfer enough transaction reservation to the context ticket
213 * for the checkpoint. The context ticket is special - the unit
214 * reservation has to grow as well as the current reservation as we
215 * steal from tickets so we can correctly determine the space used
216 * during the transaction commit.
217 */
218 if (ctx->ticket->t_curr_res == 0) {
219 /* first commit in checkpoint, steal the header reservation */
220 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
221 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
222 ticket->t_curr_res -= ctx->ticket->t_unit_res;
223 }
224
225 /* do we need space for more log record headers? */
226 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
227 if (len > 0 && (ctx->space_used / iclog_space !=
228 (ctx->space_used + len) / iclog_space)) {
229 int hdrs;
230
231 hdrs = (len + iclog_space - 1) / iclog_space;
232 /* need to take into account split region headers, too */
233 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
234 ctx->ticket->t_unit_res += hdrs;
235 ctx->ticket->t_curr_res += hdrs;
236 ticket->t_curr_res -= hdrs;
237 ASSERT(ticket->t_curr_res >= len);
238 }
239 ticket->t_curr_res -= len;
240 ctx->space_used += len;
241
242 spin_unlock(&cil->xc_cil_lock);
243}
244
245/*
246 * Format log item into a flat buffers
247 *
248 * For delayed logging, we need to hold a formatted buffer containing all the
249 * changes on the log item. This enables us to relog the item in memory and
250 * write it out asynchronously without needing to relock the object that was
251 * modified at the time it gets written into the iclog.
252 *
253 * This function builds a vector for the changes in each log item in the
254 * transaction. It then works out the length of the buffer needed for each log
255 * item, allocates them and formats the vector for the item into the buffer.
256 * The buffer is then attached to the log item are then inserted into the
257 * Committed Item List for tracking until the next checkpoint is written out.
258 *
259 * We don't set up region headers during this process; we simply copy the
260 * regions into the flat buffer. We can do this because we still have to do a
261 * formatting step to write the regions into the iclog buffer. Writing the
262 * ophdrs during the iclog write means that we can support splitting large
263 * regions across iclog boundares without needing a change in the format of the
264 * item/region encapsulation.
265 *
266 * Hence what we need to do now is change the rewrite the vector array to point
267 * to the copied region inside the buffer we just allocated. This allows us to
268 * format the regions into the iclog as though they are being formatted
269 * directly out of the objects themselves.
270 */
271static void
272xlog_cil_format_items(
273 struct log *log,
274 struct xfs_log_vec *log_vector,
275 struct xlog_ticket *ticket,
276 xfs_lsn_t *start_lsn)
277{
278 struct xfs_log_vec *lv;
279
280 if (start_lsn)
281 *start_lsn = log->l_cilp->xc_ctx->sequence;
282
283 ASSERT(log_vector);
284 for (lv = log_vector; lv; lv = lv->lv_next) {
285 void *ptr;
286 int index;
287 int len = 0;
288
289 /* build the vector array and calculate it's length */
290 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
291 for (index = 0; index < lv->lv_niovecs; index++)
292 len += lv->lv_iovecp[index].i_len;
293
294 lv->lv_buf_len = len;
295 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
296 ptr = lv->lv_buf;
297
298 for (index = 0; index < lv->lv_niovecs; index++) {
299 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
300
301 memcpy(ptr, vec->i_addr, vec->i_len);
302 vec->i_addr = ptr;
303 ptr += vec->i_len;
304 }
305 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
306
307 xlog_cil_insert(log, ticket, lv->lv_item, lv);
308 }
309}
310
311static void
312xlog_cil_free_logvec(
313 struct xfs_log_vec *log_vector)
314{
315 struct xfs_log_vec *lv;
316
317 for (lv = log_vector; lv; ) {
318 struct xfs_log_vec *next = lv->lv_next;
319 kmem_free(lv->lv_buf);
320 kmem_free(lv);
321 lv = next;
322 }
323}
324
325/*
326 * Commit a transaction with the given vector to the Committed Item List.
327 *
328 * To do this, we need to format the item, pin it in memory if required and
329 * account for the space used by the transaction. Once we have done that we
330 * need to release the unused reservation for the transaction, attach the
331 * transaction to the checkpoint context so we carry the busy extents through
332 * to checkpoint completion, and then unlock all the items in the transaction.
333 *
334 * For more specific information about the order of operations in
335 * xfs_log_commit_cil() please refer to the comments in
336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
341 */
342int
343xfs_log_commit_cil(
344 struct xfs_mount *mp,
345 struct xfs_trans *tp,
346 struct xfs_log_vec *log_vector,
347 xfs_lsn_t *commit_lsn,
348 int flags)
349{
350 struct log *log = mp->m_log;
351 int log_flags = 0;
352 int push = 0;
353
354 if (flags & XFS_TRANS_RELEASE_LOG_RES)
355 log_flags = XFS_LOG_REL_PERM_RESERV;
356
357 if (XLOG_FORCED_SHUTDOWN(log)) {
358 xlog_cil_free_logvec(log_vector);
359 return XFS_ERROR(EIO);
360 }
361
362 /* lock out background commit */
363 down_read(&log->l_cilp->xc_ctx_lock);
364 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
365
366 /* check we didn't blow the reservation */
367 if (tp->t_ticket->t_curr_res < 0)
368 xlog_print_tic_res(log->l_mp, tp->t_ticket);
369
370 /* attach the transaction to the CIL if it has any busy extents */
371 if (!list_empty(&tp->t_busy)) {
372 spin_lock(&log->l_cilp->xc_cil_lock);
373 list_splice_init(&tp->t_busy,
374 &log->l_cilp->xc_ctx->busy_extents);
375 spin_unlock(&log->l_cilp->xc_cil_lock);
376 }
377
378 tp->t_commit_lsn = *commit_lsn;
379 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
380 xfs_trans_unreserve_and_mod_sb(tp);
381
382 /* check for background commit before unlock */
383 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
384 push = 1;
385 up_read(&log->l_cilp->xc_ctx_lock);
386
387 /*
388 * We need to push CIL every so often so we don't cache more than we
389 * can fit in the log. The limit really is that a checkpoint can't be
390 * more than half the log (the current checkpoint is not allowed to
391 * overwrite the previous checkpoint), but commit latency and memory
392 * usage limit this to a smaller size in most cases.
393 */
394 if (push)
395 xlog_cil_push(log, 0);
396 return 0;
397}
398
399/*
400 * Mark all items committed and clear busy extents. We free the log vector
401 * chains in a separate pass so that we unpin the log items as quickly as
402 * possible.
403 */
404static void
405xlog_cil_committed(
406 void *args,
407 int abort)
408{
409 struct xfs_cil_ctx *ctx = args;
410 struct xfs_log_vec *lv;
411 int abortflag = abort ? XFS_LI_ABORTED : 0;
412 struct xfs_busy_extent *busyp, *n;
413
414 /* unpin all the log items */
415 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
416 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
417 abortflag);
418 }
419
420 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
421 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
422
423 spin_lock(&ctx->cil->xc_cil_lock);
424 list_del(&ctx->committing);
425 spin_unlock(&ctx->cil->xc_cil_lock);
426
427 xlog_cil_free_logvec(ctx->lv_chain);
428 kmem_free(ctx);
429}
430
431/*
432 * Push the Committed Item List to the log. If the push_now flag is not set,
433 * then it is a background flush and so we can chose to ignore it.
434 */
435int
436xlog_cil_push(
437 struct log *log,
438 int push_now)
439{
440 struct xfs_cil *cil = log->l_cilp;
441 struct xfs_log_vec *lv;
442 struct xfs_cil_ctx *ctx;
443 struct xfs_cil_ctx *new_ctx;
444 struct xlog_in_core *commit_iclog;
445 struct xlog_ticket *tic;
446 int num_lv;
447 int num_iovecs;
448 int len;
449 int error = 0;
450 struct xfs_trans_header thdr;
451 struct xfs_log_iovec lhdr;
452 struct xfs_log_vec lvhdr = { NULL };
453 xfs_lsn_t commit_lsn;
454
455 if (!cil)
456 return 0;
457
458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
459 new_ctx->ticket = xlog_cil_ticket_alloc(log);
460
461 /* lock out transaction commit, but don't block on background push */
462 if (!down_write_trylock(&cil->xc_ctx_lock)) {
463 if (!push_now)
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx;
468
469 /* check if we've anything to push */
470 if (list_empty(&cil->xc_cil))
471 goto out_skip;
472
473 /* check for spurious background flush */
474 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
475 goto out_skip;
476
477 /*
478 * pull all the log vectors off the items in the CIL, and
479 * remove the items from the CIL. We don't need the CIL lock
480 * here because it's only needed on the transaction commit
481 * side which is currently locked out by the flush lock.
482 */
483 lv = NULL;
484 num_lv = 0;
485 num_iovecs = 0;
486 len = 0;
487 while (!list_empty(&cil->xc_cil)) {
488 struct xfs_log_item *item;
489 int i;
490
491 item = list_first_entry(&cil->xc_cil,
492 struct xfs_log_item, li_cil);
493 list_del_init(&item->li_cil);
494 if (!ctx->lv_chain)
495 ctx->lv_chain = item->li_lv;
496 else
497 lv->lv_next = item->li_lv;
498 lv = item->li_lv;
499 item->li_lv = NULL;
500
501 num_lv++;
502 num_iovecs += lv->lv_niovecs;
503 for (i = 0; i < lv->lv_niovecs; i++)
504 len += lv->lv_iovecp[i].i_len;
505 }
506
507 /*
508 * initialise the new context and attach it to the CIL. Then attach
509 * the current context to the CIL committing lsit so it can be found
510 * during log forces to extract the commit lsn of the sequence that
511 * needs to be forced.
512 */
513 INIT_LIST_HEAD(&new_ctx->committing);
514 INIT_LIST_HEAD(&new_ctx->busy_extents);
515 new_ctx->sequence = ctx->sequence + 1;
516 new_ctx->cil = cil;
517 cil->xc_ctx = new_ctx;
518
519 /*
520 * The switch is now done, so we can drop the context lock and move out
521 * of a shared context. We can't just go straight to the commit record,
522 * though - we need to synchronise with previous and future commits so
523 * that the commit records are correctly ordered in the log to ensure
524 * that we process items during log IO completion in the correct order.
525 *
526 * For example, if we get an EFI in one checkpoint and the EFD in the
527 * next (e.g. due to log forces), we do not want the checkpoint with
528 * the EFD to be committed before the checkpoint with the EFI. Hence
529 * we must strictly order the commit records of the checkpoints so
530 * that: a) the checkpoint callbacks are attached to the iclogs in the
531 * correct order; and b) the checkpoints are replayed in correct order
532 * in log recovery.
533 *
534 * Hence we need to add this context to the committing context list so
535 * that higher sequences will wait for us to write out a commit record
536 * before they do.
537 */
538 spin_lock(&cil->xc_cil_lock);
539 list_add(&ctx->committing, &cil->xc_committing);
540 spin_unlock(&cil->xc_cil_lock);
541 up_write(&cil->xc_ctx_lock);
542
543 /*
544 * Build a checkpoint transaction header and write it to the log to
545 * begin the transaction. We need to account for the space used by the
546 * transaction header here as it is not accounted for in xlog_write().
547 *
548 * The LSN we need to pass to the log items on transaction commit is
549 * the LSN reported by the first log vector write. If we use the commit
550 * record lsn then we can move the tail beyond the grant write head.
551 */
552 tic = ctx->ticket;
553 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
554 thdr.th_type = XFS_TRANS_CHECKPOINT;
555 thdr.th_tid = tic->t_tid;
556 thdr.th_num_items = num_iovecs;
557 lhdr.i_addr = (xfs_caddr_t)&thdr;
558 lhdr.i_len = sizeof(xfs_trans_header_t);
559 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
560 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
561
562 lvhdr.lv_niovecs = 1;
563 lvhdr.lv_iovecp = &lhdr;
564 lvhdr.lv_next = ctx->lv_chain;
565
566 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
567 if (error)
568 goto out_abort;
569
570 /*
571 * now that we've written the checkpoint into the log, strictly
572 * order the commit records so replay will get them in the right order.
573 */
574restart:
575 spin_lock(&cil->xc_cil_lock);
576 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
577 /*
578 * Higher sequences will wait for this one so skip them.
579 * Don't wait for own own sequence, either.
580 */
581 if (new_ctx->sequence >= ctx->sequence)
582 continue;
583 if (!new_ctx->commit_lsn) {
584 /*
585 * It is still being pushed! Wait for the push to
586 * complete, then start again from the beginning.
587 */
588 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
589 goto restart;
590 }
591 }
592 spin_unlock(&cil->xc_cil_lock);
593
594 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
595 if (error || commit_lsn == -1)
596 goto out_abort;
597
598 /* attach all the transactions w/ busy extents to iclog */
599 ctx->log_cb.cb_func = xlog_cil_committed;
600 ctx->log_cb.cb_arg = ctx;
601 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
602 if (error)
603 goto out_abort;
604
605 /*
606 * now the checkpoint commit is complete and we've attached the
607 * callbacks to the iclog we can assign the commit LSN to the context
608 * and wake up anyone who is waiting for the commit to complete.
609 */
610 spin_lock(&cil->xc_cil_lock);
611 ctx->commit_lsn = commit_lsn;
612 sv_broadcast(&cil->xc_commit_wait);
613 spin_unlock(&cil->xc_cil_lock);
614
615 /* release the hounds! */
616 return xfs_log_release_iclog(log->l_mp, commit_iclog);
617
618out_skip:
619 up_write(&cil->xc_ctx_lock);
620out_free_ticket:
621 xfs_log_ticket_put(new_ctx->ticket);
622 kmem_free(new_ctx);
623 return 0;
624
625out_abort:
626 xlog_cil_committed(ctx, XFS_LI_ABORTED);
627 return XFS_ERROR(EIO);
628}
629
630/*
631 * Conditionally push the CIL based on the sequence passed in.
632 *
633 * We only need to push if we haven't already pushed the sequence
634 * number given. Hence the only time we will trigger a push here is
635 * if the push sequence is the same as the current context.
636 *
637 * We return the current commit lsn to allow the callers to determine if a
638 * iclog flush is necessary following this call.
639 *
640 * XXX: Initially, just push the CIL unconditionally and return whatever
641 * commit lsn is there. It'll be empty, so this is broken for now.
642 */
643xfs_lsn_t
644xlog_cil_push_lsn(
645 struct log *log,
646 xfs_lsn_t push_seq)
647{
648 struct xfs_cil *cil = log->l_cilp;
649 struct xfs_cil_ctx *ctx;
650 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
651
652restart:
653 down_write(&cil->xc_ctx_lock);
654 ASSERT(push_seq <= cil->xc_ctx->sequence);
655
656 /* check to see if we need to force out the current context */
657 if (push_seq == cil->xc_ctx->sequence) {
658 up_write(&cil->xc_ctx_lock);
659 xlog_cil_push(log, 1);
660 goto restart;
661 }
662
663 /*
664 * See if we can find a previous sequence still committing.
665 * We can drop the flush lock as soon as we have the cil lock
666 * because we are now only comparing contexts protected by
667 * the cil lock.
668 *
669 * We need to wait for all previous sequence commits to complete
670 * before allowing the force of push_seq to go ahead. Hence block
671 * on commits for those as well.
672 */
673 spin_lock(&cil->xc_cil_lock);
674 up_write(&cil->xc_ctx_lock);
675 list_for_each_entry(ctx, &cil->xc_committing, committing) {
676 if (ctx->sequence > push_seq)
677 continue;
678 if (!ctx->commit_lsn) {
679 /*
680 * It is still being pushed! Wait for the push to
681 * complete, then start again from the beginning.
682 */
683 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
684 goto restart;
685 }
686 if (ctx->sequence != push_seq)
687 continue;
688 /* found it! */
689 commit_lsn = ctx->commit_lsn;
690 }
691 spin_unlock(&cil->xc_cil_lock);
692 return commit_lsn;
693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL, we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf695154451..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well be
381 * passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have a the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425};
426
427/*
428 * The amount of log space we should the CIL to aggregate is difficult to size.
429 * Whatever we chose we have to make we can get a reservation for the log space
430 * effectively, that it is large enough to capture sufficient relogging to
431 * reduce log buffer IO significantly, but it is not too large for the log or
432 * induces too much latency when writing out through the iclogs. We track both
433 * space consumed and the number of vectors in the checkpoint context, so we
434 * need to decide which to use for limiting.
435 *
436 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of
438 * at least 512 bytes per 32k of log space just for the LR headers. That means
439 * 16KB of reservation per megabyte of delayed logging space we will consume,
440 * plus various headers. The number of headers will vary based on the num of
441 * io vectors, so limiting on a specific number of vectors is going to result
442 * in transactions of varying size. IOWs, it is more consistent to track and
443 * limit space consumed in the log rather than by the number of objects being
444 * logged in order to prevent checkpoint ticket overruns.
445 *
446 * Further, use of static reservations through the log grant mechanism is
447 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
448 * grant) and a significant deadlock potential because regranting write space
449 * can block on log pushes. Hence if we have to regrant log space during a log
450 * push, we can deadlock.
451 *
452 * However, we can avoid this by use of a dynamic "reservation stealing"
453 * technique during transaction commit whereby unused reservation space in the
454 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
455 * space needed by the checkpoint transaction. This means that we never need to
456 * specifically reserve space for the CIL checkpoint transaction, nor do we
457 * need to regrant space once the checkpoint completes. This also means the
458 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself.
460 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the
462 * checkpoint size so long as they don't violate any other size rules. Hence
463 * the initial maximum size for the checkpoint transaction will be set to a
464 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
465 * right now based on the latency of writing out a large amount of data through
466 * the circular iclog buffers.
467 */
468
469#define XLOG_CIL_SPACE_LIMIT(log) \
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471
472/*
382 * The reservation head lsn is not made up of a cycle number and block number. 473 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 474 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 475 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 479 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 480 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 481 struct xfs_ail *l_ailp; /* AIL log is working with */
482 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 483 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 484 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
438 530
439#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
440 532
441
442/* common routines */ 533/* common routines */
443extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
444extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
445extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
446extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
447 538
448extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
449 544
450static inline void 545static inline void
451xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) 546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
455 *off += bytes; 550 *off += bytes;
456} 551}
457 552
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
567
458/* 568/*
459 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
460 * It's value must be outside the range of XFS_TRANS_* values. 570 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e366315..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans(
1576 1576
1577 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1578 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1579 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log, 1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass); 1581 trans, item, pass);
1582 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1(
1638 /* 1638 /*
1639 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1640 */ 1640 */
1641 if (!(flags & XFS_BLI_CANCEL)) { 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1643 return; 1643 return;
1644 } 1644 }
@@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1(
1696 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1697 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1698 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1699 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1700 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1701 * if this is the last reference. 1701 * if this is the last reference.
1702 * 1702 *
@@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled(
1721 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1722 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1723 */ 1723 */
1724 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1725 return 0; 1725 return 0;
1726 } 1726 }
1727 1727
@@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled(
1733 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1734 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1735 */ 1735 */
1736 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1737 return 0; 1737 return 0;
1738 } 1738 }
1739 1739
@@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled(
1752 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1753 * last reference. 1753 * last reference.
1754 */ 1754 */
1755 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1756 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1757 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1758 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled(
1772 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1773 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1774 */ 1774 */
1775 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1776 return 0; 1776 return 0;
1777} 1777}
1778 1778
@@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1874 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1875 bit); 1875 bit);
1876 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1877 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1878 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1879 item_index++; 1879 item_index++;
1880 } 1880 }
1881 1881
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1889 } 1889 }
1890 1890
1891 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1894 1894
1895 /* 1895 /*
@@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1955 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1956 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1957 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1958 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1959 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1960 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1961 1961
1962 /* 1962 /*
1963 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1966 */ 1966 */
1967 error = 0; 1967 error = 0;
1968 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1969 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1970 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1971 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1972 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1987 } 1987 }
1988 1988
1989 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1990 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1991 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1992 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1993 next: 1993 next:
1994 i++; 1994 i++;
1995 bit += nbits; 1995 bit += nbits;
@@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2148 } 2148 }
2149 2149
2150 type = 0; 2150 type = 0;
2151 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2152 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2153 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2154 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2155 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2156 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2157 /* 2157 /*
2158 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2173 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2174 * 2174 *
2175 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2176 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2177 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2178 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2179 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2207 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2208 /* 2208 /*
2209 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2210 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2211 */ 2211 */
2212 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2213 return 0; 2213 return 0;
@@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2244 2244
2245 mp = log->l_mp; 2245 mp = log->l_mp;
2246 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2247 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2248 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2249 2249
2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans(
2257 } 2257 }
2258 2258
2259 error = 0; 2259 error = 0;
2260 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2262 } else if (flags & 2262 } else if (flags &
2263 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2265 } else { 2265 } else {
2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af2..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,6 +44,7 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47#include "xfs_trace.h"
47 48
48kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
49 50
@@ -243,9 +244,8 @@ _xfs_trans_alloc(
243 tp->t_type = type; 244 tp->t_type = type;
244 tp->t_mountp = mp; 245 tp->t_mountp = mp;
245 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
246 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
247 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
248 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
249 return tp; 249 return tp;
250} 250}
251 251
@@ -255,8 +255,13 @@ _xfs_trans_alloc(
255 */ 255 */
256STATIC void 256STATIC void
257xfs_trans_free( 257xfs_trans_free(
258 xfs_trans_t *tp) 258 struct xfs_trans *tp)
259{ 259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
260 atomic_dec(&tp->t_mountp->m_active_trans); 265 atomic_dec(&tp->t_mountp->m_active_trans);
261 xfs_trans_free_dqinfo(tp); 266 xfs_trans_free_dqinfo(tp);
262 kmem_zone_free(xfs_trans_zone, tp); 267 kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +290,8 @@ xfs_trans_dup(
285 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
286 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
287 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
288 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
289 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
290 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
291 295
292 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
293 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -423,7 +427,6 @@ undo_blocks:
423 return error; 427 return error;
424} 428}
425 429
426
427/* 430/*
428 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
429 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas(
652 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
653 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
654 */ 657 */
655STATIC void 658void
656xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
657 xfs_trans_t *tp) 660 xfs_trans_t *tp)
658{ 661{
@@ -880,7 +883,7 @@ xfs_trans_fill_vecs(
880 * they could be immediately flushed and we'd have to race with the flusher 883 * they could be immediately flushed and we'd have to race with the flusher
881 * trying to pull the item from the AIL as we add it. 884 * trying to pull the item from the AIL as we add it.
882 */ 885 */
883static void 886void
884xfs_trans_item_committed( 887xfs_trans_item_committed(
885 struct xfs_log_item *lip, 888 struct xfs_log_item *lip,
886 xfs_lsn_t commit_lsn, 889 xfs_lsn_t commit_lsn,
@@ -930,26 +933,6 @@ xfs_trans_item_committed(
930 IOP_UNPIN(lip); 933 IOP_UNPIN(lip);
931} 934}
932 935
933/* Clear all the per-AG busy list items listed in this transaction */
934static void
935xfs_trans_clear_busy_extents(
936 struct xfs_trans *tp)
937{
938 xfs_log_busy_chunk_t *lbcp;
939 xfs_log_busy_slot_t *lbsp;
940 int i;
941
942 for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
943 i = 0;
944 for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
945 if (XFS_LBC_ISFREE(lbcp, i))
946 continue;
947 xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
948 }
949 }
950 xfs_trans_free_busy(tp);
951}
952
953/* 936/*
954 * This is typically called by the LM when a transaction has been fully 937 * This is typically called by the LM when a transaction has been fully
955 * committed to disk. It needs to unpin the items which have 938 * committed to disk. It needs to unpin the items which have
@@ -984,7 +967,6 @@ xfs_trans_committed(
984 kmem_free(licp); 967 kmem_free(licp);
985 } 968 }
986 969
987 xfs_trans_clear_busy_extents(tp);
988 xfs_trans_free(tp); 970 xfs_trans_free(tp);
989} 971}
990 972
@@ -1012,8 +994,7 @@ xfs_trans_uncommit(
1012 xfs_trans_unreserve_and_mod_sb(tp); 994 xfs_trans_unreserve_and_mod_sb(tp);
1013 xfs_trans_unreserve_and_mod_dquots(tp); 995 xfs_trans_unreserve_and_mod_dquots(tp);
1014 996
1015 xfs_trans_free_items(tp, flags); 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1016 xfs_trans_free_busy(tp);
1017 xfs_trans_free(tp); 998 xfs_trans_free(tp);
1018} 999}
1019 1000
@@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog(
1075 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1056 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
1076 1057
1077 tp->t_commit_lsn = *commit_lsn; 1058 tp->t_commit_lsn = *commit_lsn;
1059 trace_xfs_trans_commit_lsn(tp);
1060
1078 if (nvec > XFS_TRANS_LOGVEC_COUNT) 1061 if (nvec > XFS_TRANS_LOGVEC_COUNT)
1079 kmem_free(log_vector); 1062 kmem_free(log_vector);
1080 1063
@@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog(
1161 return xfs_log_release_iclog(mp, commit_iclog); 1144 return xfs_log_release_iclog(mp, commit_iclog);
1162} 1145}
1163 1146
1147/*
1148 * Walk the log items and allocate log vector structures for
1149 * each item large enough to fit all the vectors they require.
1150 * Note that this format differs from the old log vector format in
1151 * that there is no transaction header in these log vectors.
1152 */
1153STATIC struct xfs_log_vec *
1154xfs_trans_alloc_log_vecs(
1155 xfs_trans_t *tp)
1156{
1157 xfs_log_item_desc_t *lidp;
1158 struct xfs_log_vec *lv = NULL;
1159 struct xfs_log_vec *ret_lv = NULL;
1160
1161 lidp = xfs_trans_first_item(tp);
1162
1163 /* Bail out if we didn't find a log item. */
1164 if (!lidp) {
1165 ASSERT(0);
1166 return NULL;
1167 }
1168
1169 while (lidp != NULL) {
1170 struct xfs_log_vec *new_lv;
1171
1172 /* Skip items which aren't dirty in this transaction. */
1173 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1174 lidp = xfs_trans_next_item(tp, lidp);
1175 continue;
1176 }
1177
1178 /* Skip items that do not have any vectors for writing */
1179 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1180 if (!lidp->lid_size) {
1181 lidp = xfs_trans_next_item(tp, lidp);
1182 continue;
1183 }
1184
1185 new_lv = kmem_zalloc(sizeof(*new_lv) +
1186 lidp->lid_size * sizeof(struct xfs_log_iovec),
1187 KM_SLEEP);
1188
1189 /* The allocated iovec region lies beyond the log vector. */
1190 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1191 new_lv->lv_niovecs = lidp->lid_size;
1192 new_lv->lv_item = lidp->lid_item;
1193 if (!ret_lv)
1194 ret_lv = new_lv;
1195 else
1196 lv->lv_next = new_lv;
1197 lv = new_lv;
1198 lidp = xfs_trans_next_item(tp, lidp);
1199 }
1200
1201 return ret_lv;
1202}
1203
1204static int
1205xfs_trans_commit_cil(
1206 struct xfs_mount *mp,
1207 struct xfs_trans *tp,
1208 xfs_lsn_t *commit_lsn,
1209 int flags)
1210{
1211 struct xfs_log_vec *log_vector;
1212 int error;
1213
1214 /*
1215 * Get each log item to allocate a vector structure for
1216 * the log item to to pass to the log write code. The
1217 * CIL commit code will format the vector and save it away.
1218 */
1219 log_vector = xfs_trans_alloc_log_vecs(tp);
1220 if (!log_vector)
1221 return ENOMEM;
1222
1223 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1224 if (error)
1225 return error;
1226
1227 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1228
1229 /* xfs_trans_free_items() unlocks them first */
1230 xfs_trans_free_items(tp, *commit_lsn, 0);
1231 xfs_trans_free(tp);
1232 return 0;
1233}
1164 1234
1165/* 1235/*
1166 * xfs_trans_commit 1236 * xfs_trans_commit
@@ -1221,7 +1291,11 @@ _xfs_trans_commit(
1221 xfs_trans_apply_sb_deltas(tp); 1291 xfs_trans_apply_sb_deltas(tp);
1222 xfs_trans_apply_dquot_deltas(tp); 1292 xfs_trans_apply_dquot_deltas(tp);
1223 1293
1224 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); 1294 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1295 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1296 else
1297 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1298
1225 if (error == ENOMEM) { 1299 if (error == ENOMEM) {
1226 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1300 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1227 error = XFS_ERROR(EIO); 1301 error = XFS_ERROR(EIO);
@@ -1259,8 +1333,7 @@ out_unreserve:
1259 error = XFS_ERROR(EIO); 1333 error = XFS_ERROR(EIO);
1260 } 1334 }
1261 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1335 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1262 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); 1336 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1263 xfs_trans_free_busy(tp);
1264 xfs_trans_free(tp); 1337 xfs_trans_free(tp);
1265 1338
1266 XFS_STATS_INC(xs_trans_empty); 1339 XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1411,7 @@ xfs_trans_cancel(
1338 /* mark this thread as no longer being in a transaction */ 1411 /* mark this thread as no longer being in a transaction */
1339 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1412 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1340 1413
1341 xfs_trans_free_items(tp, flags); 1414 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1342 xfs_trans_free_busy(tp);
1343 xfs_trans_free(tp); 1415 xfs_trans_free(tp);
1344} 1416}
1345 1417
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921e..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
106#define XFS_TRANS_GROWFSRT_FREE 39 106#define XFS_TRANS_GROWFSRT_FREE 39
107#define XFS_TRANS_SWAPEXT 40 107#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 108#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_TYPE_MAX 41 109#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42
110/* new transaction types need to be reflected in xfs_logprint(8) */ 111/* new transaction types need to be reflected in xfs_logprint(8) */
111 112
112#define XFS_TRANS_TYPES \ 113#define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 149 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 150 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ 151 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
151 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 153 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
152 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 154 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
153 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 155 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
@@ -813,6 +815,7 @@ struct xfs_log_item_desc;
813struct xfs_mount; 815struct xfs_mount;
814struct xfs_trans; 816struct xfs_trans;
815struct xfs_dquot_acct; 817struct xfs_dquot_acct;
818struct xfs_busy_extent;
816 819
817typedef struct xfs_log_item { 820typedef struct xfs_log_item {
818 struct list_head li_ail; /* AIL pointers */ 821 struct list_head li_ail; /* AIL pointers */
@@ -828,6 +831,11 @@ typedef struct xfs_log_item {
828 /* buffer item iodone */ 831 /* buffer item iodone */
829 /* callback func */ 832 /* callback func */
830 struct xfs_item_ops *li_ops; /* function list */ 833 struct xfs_item_ops *li_ops; /* function list */
834
835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */
838 xfs_lsn_t li_seq; /* CIL commit seq */
831} xfs_log_item_t; 839} xfs_log_item_t;
832 840
833#define XFS_LI_IN_AIL 0x1 841#define XFS_LI_IN_AIL 0x1
@@ -872,34 +880,6 @@ typedef struct xfs_item_ops {
872#define XFS_ITEM_PUSHBUF 3 880#define XFS_ITEM_PUSHBUF 3
873 881
874/* 882/*
875 * This structure is used to maintain a list of block ranges that have been
876 * freed in the transaction. The ranges are listed in the perag[] busy list
877 * between when they're freed and the transaction is committed to disk.
878 */
879
880typedef struct xfs_log_busy_slot {
881 xfs_agnumber_t lbc_ag;
882 ushort lbc_idx; /* index in perag.busy[] */
883} xfs_log_busy_slot_t;
884
885#define XFS_LBC_NUM_SLOTS 31
886typedef struct xfs_log_busy_chunk {
887 struct xfs_log_busy_chunk *lbc_next;
888 uint lbc_free; /* free slots bitmask */
889 ushort lbc_unused; /* first unused */
890 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
891} xfs_log_busy_chunk_t;
892
893#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
894#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
895
896#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
897#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
898#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
899#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
900#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
901
902/*
903 * This is the type of function which can be given to xfs_trans_callback() 883 * This is the type of function which can be given to xfs_trans_callback()
904 * to be called upon the transaction's commit to disk. 884 * to be called upon the transaction's commit to disk.
905 */ 885 */
@@ -950,8 +930,7 @@ typedef struct xfs_trans {
950 unsigned int t_items_free; /* log item descs free */ 930 unsigned int t_items_free; /* log item descs free */
951 xfs_log_item_chunk_t t_items; /* first log item desc chunk */ 931 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
952 xfs_trans_header_t t_header; /* header for in-log trans */ 932 xfs_trans_header_t t_header; /* header for in-log trans */
953 unsigned int t_busy_free; /* busy descs free */ 933 struct list_head t_busy; /* list of busy extents */
954 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
955 unsigned long t_pflags; /* saved process flags state */ 934 unsigned long t_pflags; /* saved process flags state */
956} xfs_trans_t; 935} xfs_trans_t;
957 936
@@ -1025,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
1025void xfs_trans_cancel(xfs_trans_t *, int); 1004void xfs_trans_cancel(xfs_trans_t *, int);
1026int xfs_trans_ail_init(struct xfs_mount *); 1005int xfs_trans_ail_init(struct xfs_mount *);
1027void xfs_trans_ail_destroy(struct xfs_mount *); 1006void xfs_trans_ail_destroy(struct xfs_mount *);
1028xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1029 xfs_agnumber_t ag,
1030 xfs_extlen_t idx);
1031 1007
1032extern kmem_zone_t *xfs_trans_zone; 1008extern kmem_zone_t *xfs_trans_zone;
1033 1009
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3a..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
114 xfs_buf_item_init(bp, tp->t_mountp); 114 xfs_buf_item_init(bp, tp->t_mountp);
115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
119 if (reset_recur) 119 if (reset_recur)
120 bip->bli_recur = 0; 120 bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
515 ASSERT(atomic_read(&bip->bli_refcount) > 0); 515 ASSERT(atomic_read(&bip->bli_refcount) > 0);
516 516
517 /* 517 /*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
619 619
620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
623 ASSERT(atomic_read(&bip->bli_refcount) > 0); 623 ASSERT(atomic_read(&bip->bli_refcount) > 0);
624 bip->bli_flags |= XFS_BLI_HOLD; 624 bip->bli_flags |= XFS_BLI_HOLD;
625 trace_xfs_trans_bhold(bip); 625 trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
641 641
642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
645 ASSERT(atomic_read(&bip->bli_refcount) > 0); 645 ASSERT(atomic_read(&bip->bli_refcount) > 0);
646 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 646 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
647 bip->bli_flags &= ~XFS_BLI_HOLD; 647 bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
704 bip->bli_flags &= ~XFS_BLI_STALE; 704 bip->bli_flags &= ~XFS_BLI_STALE;
705 ASSERT(XFS_BUF_ISSTALE(bp)); 705 ASSERT(XFS_BUF_ISSTALE(bp));
706 XFS_BUF_UNSTALE(bp); 706 XFS_BUF_UNSTALE(bp);
707 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; 707 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
708 } 708 }
709 709
710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); 710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
763 ASSERT(XFS_BUF_ISSTALE(bp)); 763 ASSERT(XFS_BUF_ISSTALE(bp));
764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); 765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 766 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
769 return; 769 return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
774 * in the buf log item. The STALE flag will be used in 774 * in the buf log item. The STALE flag will be used in
775 * xfs_buf_item_unpin() to determine if it should clean up 775 * xfs_buf_item_unpin() to determine if it should clean up
776 * when the last reference to the buf item is given up. 776 * when the last reference to the buf item is given up.
777 * We set the XFS_BLI_CANCEL flag in the buf log format structure 777 * We set the XFS_BLF_CANCEL flag in the buf log format structure
778 * and log the buf item. This will be used at recovery time 778 * and log the buf item. This will be used at recovery time
779 * to determine that copies of the buffer in the log before 779 * to determine that copies of the buffer in the log before
780 * this should not be replayed. 780 * this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
792 XFS_BUF_UNDELAYWRITE(bp); 792 XFS_BUF_UNDELAYWRITE(bp);
793 XFS_BUF_STALE(bp); 793 XFS_BUF_STALE(bp);
794 bip->bli_flags |= XFS_BLI_STALE; 794 bip->bli_flags |= XFS_BLI_STALE;
795 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 795 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; 796 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 bip->bli_format.blf_flags |= XFS_BLI_CANCEL; 797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798 memset((char *)(bip->bli_format.blf_data_map), 0, 798 memset((char *)(bip->bli_format.blf_data_map), 0,
799 (bip->bli_format.blf_map_size * sizeof(uint))); 799 (bip->bli_format.blf_map_size * sizeof(uint)));
800 lidp->lid_flags |= XFS_LID_DIRTY; 800 lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
802} 802}
803 803
804/* 804/*
805 * This call is used to indicate that the buffer contains on-disk 805 * This call is used to indicate that the buffer contains on-disk inodes which
806 * inodes which must be handled specially during recovery. They 806 * must be handled specially during recovery. They require special handling
807 * require special handling because only the di_next_unlinked from 807 * because only the di_next_unlinked from the inodes in the buffer should be
808 * the inodes in the buffer should be recovered. The rest of the 808 * recovered. The rest of the data in the buffer is logged via the inodes
809 * data in the buffer is logged via the inodes themselves. 809 * themselves.
810 * 810 *
811 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log 811 * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
812 * format structure so that we'll know what to do at recovery time. 812 * transferred to the buffer's log format structure so that we'll know what to
813 * do at recovery time.
813 */ 814 */
814/* ARGSUSED */
815void 815void
816xfs_trans_inode_buf( 816xfs_trans_inode_buf(
817 xfs_trans_t *tp, 817 xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
827 ASSERT(atomic_read(&bip->bli_refcount) > 0); 827 ASSERT(atomic_read(&bip->bli_refcount) > 0);
828 828
829 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; 829 bip->bli_flags |= XFS_BLI_INODE_BUF;
830} 830}
831 831
832/* 832/*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
908 ASSERT(XFS_BUF_ISBUSY(bp)); 908 ASSERT(XFS_BUF_ISBUSY(bp));
909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
911 ASSERT(type == XFS_BLI_UDQUOT_BUF || 911 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
912 type == XFS_BLI_PDQUOT_BUF || 912 type == XFS_BLF_PDQUOT_BUF ||
913 type == XFS_BLI_GDQUOT_BUF); 913 type == XFS_BLF_GDQUOT_BUF);
914 914
915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
916 ASSERT(atomic_read(&bip->bli_refcount) > 0); 916 ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299void 299void
300xfs_trans_free_items( 300xfs_trans_free_items(
301 xfs_trans_t *tp, 301 xfs_trans_t *tp,
302 xfs_lsn_t commit_lsn,
302 int flags) 303 int flags)
303{ 304{
304 xfs_log_item_chunk_t *licp; 305 xfs_log_item_chunk_t *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 * Special case the embedded chunk so we don't free it below. 312 * Special case the embedded chunk so we don't free it below.
312 */ 313 */
313 if (!xfs_lic_are_all_free(licp)) { 314 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 xfs_lic_all_free(licp); 316 xfs_lic_all_free(licp);
316 licp->lic_unused = 0; 317 licp->lic_unused = 0;
317 } 318 }
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 */ 323 */
323 while (licp != NULL) { 324 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp)); 325 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 326 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 next_licp = licp->lic_next; 327 next_licp = licp->lic_next;
327 kmem_free(licp); 328 kmem_free(licp);
328 licp = next_licp; 329 licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
438 439
439 return freed; 440 return freed;
440} 441}
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); 35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, 36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *); 37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int); 38
39void xfs_trans_unlock_items(struct xfs_trans *, 39void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 xfs_lsn_t); 40void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41void xfs_trans_free_busy(xfs_trans_t *tp); 41 int flags);
42xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 42
43 xfs_agnumber_t ag, 43void xfs_trans_item_committed(struct xfs_log_item *lip,
44 xfs_extlen_t idx); 44 xfs_lsn_t commit_lsn, int aborted);
45void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
45 46
46/* 47/*
47 * AIL traversal cursor. 48 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ 76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77 77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78/* 80/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 81 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 82 * Disk based types: