path: root/fs
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_dir.c | 8
-rw-r--r--  fs/9p/vfs_file.c | 11
-rw-r--r--  fs/9p/vfs_inode.c | 111
-rw-r--r--  fs/9p/vfs_super.c | 55
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/afs/dir.c | 6
-rw-r--r--  fs/afs/file.c | 64
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/mntpt.c | 6
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/autofs4/root.c | 22
-rw-r--r--  fs/bfs/dir.c | 4
-rw-r--r--  fs/block_dev.c | 330
-rw-r--r--  fs/btrfs/acl.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/btrfs/inode.c | 11
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/btrfs/xattr.h | 6
-rw-r--r--  fs/buffer.c | 26
-rw-r--r--  fs/ceph/addr.c | 11
-rw-r--r--  fs/ceph/auth.c | 9
-rw-r--r--  fs/ceph/auth.h | 2
-rw-r--r--  fs/ceph/auth_none.c | 1
-rw-r--r--  fs/ceph/auth_x.c | 19
-rw-r--r--  fs/ceph/caps.c | 24
-rw-r--r--  fs/ceph/ceph_fs.h | 62
-rw-r--r--  fs/ceph/ceph_strings.c | 16
-rw-r--r--  fs/ceph/debugfs.c | 13
-rw-r--r--  fs/ceph/dir.c | 45
-rw-r--r--  fs/ceph/export.c | 14
-rw-r--r--  fs/ceph/file.c | 19
-rw-r--r--  fs/ceph/inode.c | 97
-rw-r--r--  fs/ceph/ioctl.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 385
-rw-r--r--  fs/ceph/mds_client.h | 6
-rw-r--r--  fs/ceph/messenger.c | 91
-rw-r--r--  fs/ceph/messenger.h | 10
-rw-r--r--  fs/ceph/mon_client.c | 257
-rw-r--r--  fs/ceph/mon_client.h | 27
-rw-r--r--  fs/ceph/msgpool.c | 180
-rw-r--r--  fs/ceph/msgpool.h | 12
-rw-r--r--  fs/ceph/msgr.h | 21
-rw-r--r--  fs/ceph/osd_client.c | 98
-rw-r--r--  fs/ceph/pagelist.c | 2
-rw-r--r--  fs/ceph/rados.h | 23
-rw-r--r--  fs/ceph/snap.c | 2
-rw-r--r--  fs/ceph/super.c | 128
-rw-r--r--  fs/ceph/super.h | 30
-rw-r--r--  fs/ceph/xattr.c | 35
-rw-r--r--  fs/coda/file.c | 2
-rw-r--r--  fs/coda/pioctl.c | 76
-rw-r--r--  fs/coda/psdev.c | 5
-rw-r--r--  fs/dcache.c | 20
-rw-r--r--  fs/devpts/inode.c | 9
-rw-r--r--  fs/dlm/lock.c | 5
-rw-r--r--  fs/dlm/user.c | 88
-rw-r--r--  fs/drop_caches.c | 24
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/file.c | 4
-rw-r--r--  fs/ecryptfs/inode.c | 48
-rw-r--r--  fs/ecryptfs/main.c | 166
-rw-r--r--  fs/ecryptfs/mmap.c | 19
-rw-r--r--  fs/ecryptfs/read_write.c | 13
-rw-r--r--  fs/ecryptfs/super.c | 22
-rw-r--r--  fs/exec.c | 7
-rw-r--r--  fs/exofs/dir.c | 2
-rw-r--r--  fs/exofs/inode.c | 41
-rw-r--r--  fs/ext2/acl.c | 4
-rw-r--r--  fs/ext2/balloc.c | 6
-rw-r--r--  fs/ext2/ialloc.c | 21
-rw-r--r--  fs/ext2/inode.c | 7
-rw-r--r--  fs/ext2/super.c | 99
-rw-r--r--  fs/ext2/xattr.c | 12
-rw-r--r--  fs/ext2/xattr.h | 12
-rw-r--r--  fs/ext2/xattr_security.c | 2
-rw-r--r--  fs/ext2/xattr_trusted.c | 2
-rw-r--r--  fs/ext2/xattr_user.c | 2
-rw-r--r--  fs/ext3/acl.c | 4
-rw-r--r--  fs/ext3/balloc.c | 6
-rw-r--r--  fs/ext3/fsync.c | 23
-rw-r--r--  fs/ext3/ialloc.c | 13
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/super.c | 77
-rw-r--r--  fs/ext3/xattr.c | 10
-rw-r--r--  fs/ext3/xattr.h | 12
-rw-r--r--  fs/ext3/xattr_security.c | 2
-rw-r--r--  fs/ext3/xattr_trusted.c | 2
-rw-r--r--  fs/ext3/xattr_user.c | 2
-rw-r--r--  fs/ext4/acl.c | 4
-rw-r--r--  fs/ext4/fsync.c | 6
-rw-r--r--  fs/ext4/ialloc.c | 12
-rw-r--r--  fs/ext4/inode.c | 2
-rw-r--r--  fs/ext4/xattr.c | 10
-rw-r--r--  fs/ext4/xattr.h | 12
-rw-r--r--  fs/ext4/xattr_security.c | 2
-rw-r--r--  fs/ext4/xattr_trusted.c | 2
-rw-r--r--  fs/ext4/xattr_user.c | 2
-rw-r--r--  fs/fat/cache.c | 13
-rw-r--r--  fs/fat/dir.c | 28
-rw-r--r--  fs/fat/fat.h | 16
-rw-r--r--  fs/fat/file.c | 19
-rw-r--r--  fs/fat/inode.c | 8
-rw-r--r--  fs/fat/misc.c | 22
-rw-r--r--  fs/fcntl.c | 71
-rw-r--r--  fs/fs-writeback.c | 108
-rw-r--r--  fs/generic_acl.c | 4
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 17
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/export.c | 2
-rw-r--r--  fs/gfs2/file.c | 7
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/incore.h | 11
-rw-r--r--  fs/gfs2/inode.c | 103
-rw-r--r--  fs/gfs2/inode.h | 4
-rw-r--r--  fs/gfs2/log.c | 160
-rw-r--r--  fs/gfs2/log.h | 30
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 5
-rw-r--r--  fs/gfs2/ops_fstype.c | 19
-rw-r--r--  fs/gfs2/quota.c | 114
-rw-r--r--  fs/gfs2/rgrp.c | 81
-rw-r--r--  fs/gfs2/super.c | 11
-rw-r--r--  fs/gfs2/super.h | 2
-rw-r--r--  fs/gfs2/sys.c | 6
-rw-r--r--  fs/gfs2/trans.c | 18
-rw-r--r--  fs/gfs2/xattr.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 3
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/ioctl.c | 12
-rw-r--r--  fs/inode.c | 26
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/ioctl.c | 15
-rw-r--r--  fs/jbd/commit.c | 8
-rw-r--r--  fs/jbd/journal.c | 33
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 4
-rw-r--r--  fs/jffs2/background.c | 3
-rw-r--r--  fs/jffs2/erase.c | 12
-rw-r--r--  fs/jffs2/fs.c | 10
-rw-r--r--  fs/jffs2/gc.c | 17
-rw-r--r--  fs/jffs2/nodelist.h | 10
-rw-r--r--  fs/jffs2/nodemgmt.c | 28
-rw-r--r--  fs/jffs2/os-linux.h | 3
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/security.c | 2
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/jffs2/xattr.c | 8
-rw-r--r--  fs/jffs2/xattr.h | 8
-rw-r--r--  fs/jffs2/xattr_trusted.c | 2
-rw-r--r--  fs/jffs2/xattr_user.c | 2
-rw-r--r--  fs/jfs/file.c | 2
-rw-r--r--  fs/jfs/jfs_inode.c | 12
-rw-r--r--  fs/logfs/inode.c | 9
-rw-r--r--  fs/minix/bitmap.c | 5
-rw-r--r--  fs/minix/minix.h | 2
-rw-r--r--  fs/minix/namei.c | 11
-rw-r--r--  fs/namei.c | 5
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/ncpfs/file.c | 2
-rw-r--r--  fs/ncpfs/ioctl.c | 27
-rw-r--r--  fs/nfs/super.c | 4
-rw-r--r--  fs/nfsd/nfs4recover.c | 87
-rw-r--r--  fs/nfsd/nfsctl.c | 4
-rw-r--r--  fs/nfsd/vfs.c | 5
-rw-r--r--  fs/nilfs2/alloc.c | 154
-rw-r--r--  fs/nilfs2/alloc.h | 7
-rw-r--r--  fs/nilfs2/btree.c | 91
-rw-r--r--  fs/nilfs2/btree.h | 23
-rw-r--r--  fs/nilfs2/inode.c | 15
-rw-r--r--  fs/nilfs2/recovery.c | 2
-rw-r--r--  fs/nilfs2/segbuf.c | 70
-rw-r--r--  fs/nilfs2/segbuf.h | 10
-rw-r--r--  fs/nilfs2/segment.c | 157
-rw-r--r--  fs/nilfs2/segment.h | 6
-rw-r--r--  fs/nilfs2/super.c | 218
-rw-r--r--  fs/nilfs2/the_nilfs.c | 14
-rw-r--r--  fs/notify/inotify/inotify.c | 88
-rw-r--r--  fs/ntfs/file.c | 28
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 908
-rw-r--r--  fs/ocfs2/alloc.h | 12
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/blockcheck.c | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 3
-rw-r--r--  fs/ocfs2/dir.c | 75
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 28
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 30
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 16
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 3
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/file.c | 236
-rw-r--r--  fs/ocfs2/inode.c | 45
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/journal.c | 26
-rw-r--r--  fs/ocfs2/journal.h | 15
-rw-r--r--  fs/ocfs2/localalloc.c | 275
-rw-r--r--  fs/ocfs2/localalloc.h | 3
-rw-r--r--  fs/ocfs2/mmap.c | 48
-rw-r--r--  fs/ocfs2/namei.c | 100
-rw-r--r--  fs/ocfs2/ocfs2.h | 22
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 144
-rw-r--r--  fs/ocfs2/quota.h | 12
-rw-r--r--  fs/ocfs2/quota_global.c | 351
-rw-r--r--  fs/ocfs2/quota_local.c | 183
-rw-r--r--  fs/ocfs2/refcounttree.c | 74
-rw-r--r--  fs/ocfs2/refcounttree.h | 4
-rw-r--r--  fs/ocfs2/reservations.c | 847
-rw-r--r--  fs/ocfs2/reservations.h | 159
-rw-r--r--  fs/ocfs2/resize.c | 19
-rw-r--r--  fs/ocfs2/suballoc.c | 688
-rw-r--r--  fs/ocfs2/suballoc.h | 21
-rw-r--r--  fs/ocfs2/super.c | 92
-rw-r--r--  fs/ocfs2/super.h | 7
-rw-r--r--  fs/ocfs2/xattr.c | 115
-rw-r--r--  fs/ocfs2/xattr.h | 12
-rw-r--r--  fs/omfs/inode.c | 4
-rw-r--r--  fs/open.c | 166
-rw-r--r--  fs/partitions/acorn.c | 68
-rw-r--r--  fs/partitions/acorn.h | 10
-rw-r--r--  fs/partitions/amiga.c | 13
-rw-r--r--  fs/partitions/amiga.h | 2
-rw-r--r--  fs/partitions/atari.c | 8
-rw-r--r--  fs/partitions/atari.h | 2
-rw-r--r--  fs/partitions/check.c | 84
-rw-r--r--  fs/partitions/check.h | 12
-rw-r--r--  fs/partitions/efi.c | 93
-rw-r--r--  fs/partitions/efi.h | 2
-rw-r--r--  fs/partitions/ibm.c | 21
-rw-r--r--  fs/partitions/ibm.h | 2
-rw-r--r--  fs/partitions/karma.c | 4
-rw-r--r--  fs/partitions/karma.h | 2
-rw-r--r--  fs/partitions/ldm.c | 107
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/partitions/mac.c | 13
-rw-r--r--  fs/partitions/mac.h | 2
-rw-r--r--  fs/partitions/msdos.c | 87
-rw-r--r--  fs/partitions/msdos.h | 2
-rw-r--r--  fs/partitions/osf.c | 4
-rw-r--r--  fs/partitions/osf.h | 2
-rw-r--r--  fs/partitions/sgi.c | 6
-rw-r--r--  fs/partitions/sgi.h | 2
-rw-r--r--  fs/partitions/sun.c | 6
-rw-r--r--  fs/partitions/sun.h | 2
-rw-r--r--  fs/partitions/sysv68.c | 6
-rw-r--r--  fs/partitions/sysv68.h | 2
-rw-r--r--  fs/partitions/ultrix.c | 4
-rw-r--r--  fs/partitions/ultrix.h | 2
-rw-r--r--  fs/pipe.c | 122
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/quota/dquot.c | 275
-rw-r--r--  fs/quota/quota.c | 95
-rw-r--r--  fs/quota/quota_tree.c | 50
-rw-r--r--  fs/quota/quota_tree.h | 6
-rw-r--r--  fs/quota/quota_v1.c | 4
-rw-r--r--  fs/quota/quota_v2.c | 6
-rw-r--r--  fs/ramfs/inode.c | 22
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/reiserfs/inode.c | 3
-rw-r--r--  fs/reiserfs/namei.c | 18
-rw-r--r--  fs/reiserfs/xattr.c | 16
-rw-r--r--  fs/reiserfs/xattr_acl.c | 4
-rw-r--r--  fs/reiserfs/xattr_security.c | 2
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 2
-rw-r--r--  fs/reiserfs/xattr_user.c | 2
-rw-r--r--  fs/smbfs/dir.c | 2
-rw-r--r--  fs/smbfs/file.c | 2
-rw-r--r--  fs/smbfs/ioctl.c | 10
-rw-r--r--  fs/smbfs/proto.h | 2
-rw-r--r--  fs/smbfs/symlink.c | 1
-rw-r--r--  fs/splice.c | 151
-rw-r--r--  fs/statfs.c | 196
-rw-r--r--  fs/super.c | 321
-rw-r--r--  fs/sync.c | 88
-rw-r--r--  fs/sysfs/bin.c | 26
-rw-r--r--  fs/sysfs/dir.c | 114
-rw-r--r--  fs/sysfs/file.c | 17
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysfs/inode.c | 6
-rw-r--r--  fs/sysfs/mount.c | 95
-rw-r--r--  fs/sysfs/symlink.c | 36
-rw-r--r--  fs/sysfs/sysfs.h | 34
-rw-r--r--  fs/sysv/ialloc.c | 11
-rw-r--r--  fs/timerfd.c | 25
-rw-r--r--  fs/ubifs/dir.c | 9
-rw-r--r--  fs/ubifs/io.c | 1
-rw-r--r--  fs/udf/dir.c | 2
-rw-r--r--  fs/udf/file.c | 45
-rw-r--r--  fs/udf/ialloc.c | 11
-rw-r--r--  fs/udf/namei.c | 10
-rw-r--r--  fs/udf/udfdecl.h | 3
-rw-r--r--  fs/ufs/ialloc.c | 10
-rw-r--r--  fs/ufs/inode.c | 2
-rw-r--r--  fs/ufs/namei.c | 2
-rw-r--r--  fs/ufs/symlink.c | 8
-rw-r--r--  fs/ufs/truncate.c | 10
-rw-r--r--  fs/ufs/ufs.h | 2
-rw-r--r--  fs/xattr.c | 14
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 231
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 36
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 91
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 233
-rw-r--r--  fs/xfs/linux-2.6/xfs_xattr.c | 8
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 199
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 35
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 30
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 609
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 23
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 162
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 102
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 29
-rw-r--r--  fs/xfs/xfs_acl.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 24
-rw-r--r--  fs/xfs/xfs_alloc.c | 357
-rw-r--r--  fs/xfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 221
-rw-r--r--  fs/xfs/xfs_buf_item.h | 20
-rw-r--r--  fs/xfs/xfs_error.c | 32
-rw-r--r--  fs/xfs/xfs_error.h | 9
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 21
-rw-r--r--  fs/xfs/xfs_iomap.c | 123
-rw-r--r--  fs/xfs/xfs_iomap.h | 47
-rw-r--r--  fs/xfs/xfs_log.c | 796
-rw-r--r--  fs/xfs/xfs_log.h | 27
-rw-r--r--  fs/xfs/xfs_log_cil.c | 725
-rw-r--r--  fs/xfs/xfs_log_priv.h | 130
-rw-r--r--  fs/xfs/xfs_log_recover.c | 355
-rw-r--r--  fs/xfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 810
-rw-r--r--  fs/xfs/xfs_trans.h | 58
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 233
-rw-r--r--  fs/xfs/xfs_trans_item.c | 114
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 15
-rw-r--r--  fs/xfs/xfs_types.h | 2
369 files changed, 10972 insertions, 7789 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..25b300e1c9d7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -296,3 +296,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f2434fc9d2c4..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
 #include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -253,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -275,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -434,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
 	int retval;
 	struct inode *file_inode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		  rmdir);
 
 	file_inode = file->d_inode;
-	v9ses = v9fs_inode2v9ses(file_inode);
 	v9fid = v9fs_fid_clone(file);
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
@@ -484,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_clone(dentry->d_parent);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
-		dfid = NULL;
-		goto error;
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
 	}
 
 	/* clone a fid to use for creation */
@@ -497,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		ofid = NULL;
-		goto error;
+		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -508,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 0);
+	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
-	} else
-		dfid = NULL;
+	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -538,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	return ofid;
 
 error:
-	if (dfid)
-		p9_client_clunk(dfid);
-
 	if (ofid)
 		p9_client_clunk(ofid);
 
@@ -675,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid)) {
 		result = PTR_ERR(fid);
 		if (result == -ENOENT) {
-			d_add(dentry, NULL);
-			return NULL;
+			inode = NULL;
+			goto inst_out;
 		}
 
 		return ERR_PTR(result);
@@ -693,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (result < 0)
 		goto error;
 
-	if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+	if (v9ses->cache)
 		dentry->d_op = &v9fs_cached_dentry_operations;
 	else
 		dentry->d_op = &v9fs_dentry_operations;
@@ -772,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_rename(oldfid, newdirfid,
+					(char *) new_dentry->d_name.name);
+		if (retval != -ENOSYS)
+			goto clunk_newdir;
+	}
+
 	/* 9P can only handle file rename in the same directory */
 	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1197,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISFIFO(mode))
 		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
 	else {
 		__putname(name);
 		return -EINVAL;
@@ -1208,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1239,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -1246,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
+
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -45,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
 /**
  * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	sb->s_op = &v9fs_super_ops;
+	if (v9fs_proto_dotl(v9ses))
+		sb->s_op = &v9fs_super_ops_dotl;
+	else
+		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
 	v9fs_session_begin_cancel(v9ses);
 }
 
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_rstatfs rs;
+	int res;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		res = PTR_ERR(fid);
+		goto done;
+	}
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	if (v9fs_proto_dotl(v9ses)) {
+		res = p9_client_statfs(fid, &rs);
+		if (res == 0) {
+			buf->f_type = rs.type;
+			buf->f_bsize = rs.bsize;
+			buf->f_blocks = rs.blocks;
+			buf->f_bfree = rs.bfree;
+			buf->f_bavail = rs.bavail;
+			buf->f_files = rs.files;
+			buf->f_ffree = rs.ffree;
+			buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+			buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+			buf->f_namelen = rs.namelen;
+		}
+		if (res != -ENOSYS)
+			goto done;
+	}
+	res = simple_statfs(dentry, buf);
+done:
+	return res;
+}
+
 static const struct super_operations v9fs_super_ops = {
 #ifdef CONFIG_9P_FSCACHE
 	.alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
 	.umount_begin = v9fs_umount_begin,
 };
 
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
+	.statfs = v9fs_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
 	.get_sb = v9fs_get_sb,
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o
+		stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y	+= buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b57..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 				     struct key *key)
 {
 	struct page *page;
-	struct file file = {
-		.private_data = key,
-	};
-
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_mapping_page(dir->i_mapping, index, &file);
+	page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
 	if (!IS_ERR(page)) {
 		kmap(page);
 		if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724d..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
 #endif
 
 /*
- * AFS read page from file, directory or symlink
+ * read page from file, directory or symlink, given a key to use
  */
-static int afs_readpage(struct file *file, struct page *page)
+int afs_page_filler(void *data, struct page *page)
 {
-	struct afs_vnode *vnode;
-	struct inode *inode;
-	struct key *key;
+	struct inode *inode = page->mapping->host;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct key *key = data;
 	size_t len;
 	off_t offset;
 	int ret;
 
-	inode = page->mapping->host;
-
-	if (file) {
-		key = file->private_data;
-		ASSERT(key != NULL);
-	} else {
-		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
-		if (IS_ERR(key)) {
-			ret = PTR_ERR(key);
-			goto error_nokey;
-		}
-	}
-
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
-	vnode = AFS_FS_I(inode);
-
 	BUG_ON(!PageLocked(page));
 
 	ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
-	if (!file)
-		key_put(key);
 	_leave(" = 0");
 	return 0;
 
 error:
 	SetPageError(page);
 	unlock_page(page);
-	if (!file)
-		key_put(key);
-error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
 
 /*
+ * read page from file, directory or symlink, given a file to nominate the key
+ * to be used
+ */
+static int afs_readpage(struct file *file, struct page *page)
+{
+	struct key *key;
+	int ret;
+
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+		ret = afs_page_filler(key, page);
+	} else {
+		struct inode *inode = page->mapping->host;
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+		} else {
+			ret = afs_page_filler(key, page);
+			key_put(key);
+		}
+	}
+	return ret;
+}
+
+/*
  * read a set of pages
  */
 static int afs_readpages(struct file *file, struct address_space *mapping,
 			 struct list_head *pages, unsigned nr_pages)
 {
+	struct key *key = file->private_data;
 	struct afs_vnode *vnode;
 	int ret = 0;
 
-	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
+	_enter("{%d},{%lu},,%d",
+	       key_serial(key), mapping->host->i_ino, nr_pages);
+
+	ASSERT(key != NULL);
 
 	vnode = AFS_FS_I(mapping->host);
 	if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 	}
 
 	/* load the missing pages from the network */
-	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
 
 	_leave(" = %d [netting]", ret);
 	return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844f..807f284cc75e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations;
 
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
+extern int afs_page_filler(void *, struct page *);
 
 /*
  * flock.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d6..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
  */
 int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
-	struct file file = {
-		.private_data = key,
-	};
 	struct page *page;
 	size_t size;
 	char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
+	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
+			       afs_page_filler, key);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..9bd4b3876c99 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
 	 * that it already _is_ on the dirty list.
 	 */
 	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_flags |= S_PRIVATE;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/smp_lock.h>
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
-static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
+static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
-	.ioctl		= autofs4_root_ioctl,
+	.unlocked_ioctl	= autofs4_root_ioctl,
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
  * ioctl()'s on the root directory is the chief method for the daemon to
  * generate kernel reactions
  */
-static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg)
+static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
+				       unsigned int cmd, unsigned long arg)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
 	void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
 		return -ENOSYS;
 	}
 }
+
+static long autofs4_root_ioctl(struct file *filp,
+			       unsigned int cmd, unsigned long arg)
+{
+	long ret;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	lock_kernel();
+	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..8f73841fc974 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	}
 	set_bit(ino, info->si_imap);
 	info->si_freei--;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
 	inode->i_fop = &bfs_file_operations;
 	inode->i_mapping->a_ops = &bfs_aops;
-	inode->i_mode = mode;
 	inode->i_ino = ino;
 	BFS_I(inode)->i_dsk_ino = ino;
 	BFS_I(inode)->i_sblock = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..26e5f5026620 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	sb = get_active_super(bdev);
 	if (!sb)
 		goto out;
-	if (sb->s_flags & MS_RDONLY) {
-		sb->s_frozen = SB_FREEZE_TRANS;
-		up_write(&sb->s_umount);
+	error = freeze_super(sb);
+	if (error) {
+		deactivate_super(sb);
+		bdev->bd_fsfreeze_count--;
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-
-	sb->s_frozen = SB_FREEZE_WRITE;
-	smp_wmb();
-
-	sync_filesystem(sb);
-
-	sb->s_frozen = SB_FREEZE_TRANS;
-	smp_wmb();
-
-	sync_blockdev(sb->s_bdev);
-
-	if (sb->s_op->freeze_fs) {
-		error = sb->s_op->freeze_fs(sb);
-		if (error) {
-			printk(KERN_ERR
-				"VFS:Filesystem freeze failed\n");
-			sb->s_frozen = SB_UNFROZEN;
-			deactivate_locked_super(sb);
-			bdev->bd_fsfreeze_count--;
-			mutex_unlock(&bdev->bd_fsfreeze_mutex);
-			return ERR_PTR(error);
-		}
+		return ERR_PTR(error);
 	}
-	up_write(&sb->s_umount);
-
+	deactivate_super(sb);
  out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (!bdev->bd_fsfreeze_count)
-		goto out_unlock;
+		goto out;
 
 	error = 0;
 	if (--bdev->bd_fsfreeze_count > 0)
-		goto out_unlock;
+		goto out;
 
 	if (!sb)
-		goto out_unlock;
-
-	BUG_ON(sb->s_bdev != bdev);
-	down_write(&sb->s_umount);
-	if (sb->s_flags & MS_RDONLY)
-		goto out_unfrozen;
-
-	if (sb->s_op->unfreeze_fs) {
-		error = sb->s_op->unfreeze_fs(sb);
-		if (error) {
-			printk(KERN_ERR
-				"VFS:Filesystem thaw failed\n");
-			sb->s_frozen = SB_FREEZE_TRANS;
-			bdev->bd_fsfreeze_count++;
-			mutex_unlock(&bdev->bd_fsfreeze_mutex);
-			return error;
-		}
-	}
-
-out_unfrozen:
-	sb->s_frozen = SB_UNFROZEN;
-	smp_wmb();
-	wake_up(&sb->s_wait_unfrozen);
+		goto out;
 
-	if (sb)
-		deactivate_locked_super(sb);
-out_unlock:
+	error = thaw_super(sb);
+	if (error) {
+		bdev->bd_fsfreeze_count++;
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return error;
+	}
+out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return 0;
 }
@@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, NULL);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -668,41 +627,209 @@ void bd_forget(struct inode *inode)
 	iput(bdev->bd_inode);
 }
 
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whther @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+			 void *holder)
 {
-	int res;
-	spin_lock(&bdev_lock);
-
-	/* first decide result */
 	if (bdev->bd_holder == holder)
-		res = 0;	 /* already a holder */
+		return true;	 /* already a holder */
 	else if (bdev->bd_holder != NULL)
-		res = -EBUSY;	 /* held by someone else */
+		return false;	 /* held by someone else */
 	else if (bdev->bd_contains == bdev)
-		res = 0;	 /* is a whole device which isn't held */
+		return true;	 /* is a whole device which isn't held */
 
-	else if (bdev->bd_contains->bd_holder == bd_claim)
-		res = 0; 	 /* is a partition of a device that is being partitioned */
-	else if (bdev->bd_contains->bd_holder != NULL)
-		res = -EBUSY;	 /* is a partition of a held device */
+	else if (whole->bd_holder == bd_claim)
+		return true; 	 /* is a partition of a device that is being partitioned */
+	else if (whole->bd_holder != NULL)
+		return false;	 /* is a partition of a held device */
 	else
-		res = 0;	 /* is a partition of an un-held device */
+		return true;	 /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+			       struct block_device *whole, void *holder)
+{
+retry:
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder))
+		return -EBUSY;
+
+	/* if someone else is claiming, wait for it to finish */
+	if (whole->bd_claiming && whole->bd_claiming != holder) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		spin_lock(&bdev_lock);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	return 0;
+}
 
-	/* now impose change */
-	if (res==0) {
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming().  If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder)
+{
+	struct gendisk *disk;
+	struct block_device *whole;
+	int partno, err;
+
+	might_sleep();
+
+	/*
+	 * @bdev might not have been initialized properly yet, look up
+	 * and grab the outer block device the hard way.
+	 */
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		return ERR_PTR(-ENXIO);
+
+	whole = bdget_disk(disk, 0);
+	put_disk(disk);
+	if (!whole)
+		return ERR_PTR(-ENOMEM);
+
+	/* prepare to claim, if successful, mark claiming in progress */
+	spin_lock(&bdev_lock);
+
+	err = bd_prepare_to_claim(bdev, whole, holder);
+	if (err == 0) {
+		whole->bd_claiming = holder;
+		spin_unlock(&bdev_lock);
+		return whole;
+	} else {
+		spin_unlock(&bdev_lock);
+		bdput(whole);
+		return ERR_PTR(err);
+	}
+}
+
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+
+	spin_unlock(&bdev_lock);
+	bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming().  Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully.  This
+ * function may be called with or without preceding
+ * blk_start_claiming().  In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+	struct block_device *whole = bdev->bd_contains;
+	int res;
+
+	might_sleep();
+
+	spin_lock(&bdev_lock);
+
+	res = bd_prepare_to_claim(bdev, whole, holder);
+	if (res == 0) {
 		/* note that for a whole device bd_holders
 		 * will be incremented twice, and bd_holder will
 		 * be set to bd_claim before being set to holder
 		 */
-		bdev->bd_contains->bd_holders ++;
-		bdev->bd_contains->bd_holder = bd_claim;
+		whole->bd_holders++;
+		whole->bd_holder = bd_claim;
 		bdev->bd_holders++;
 		bdev->bd_holder = holder;
 	}
-	spin_unlock(&bdev_lock);
+
+	if (whole->bd_claiming)
+		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
+	else
+		spin_unlock(&bdev_lock);
+
 	return res;
 }
-
 EXPORT_SYMBOL(bd_claim);
 
 void bd_release(struct block_device *bdev)
@@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
+	struct block_device *whole = NULL;
 	struct block_device *bdev;
 	int res;
 
@@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 		return -ENOMEM;
 
+	if (filp->f_mode & FMODE_EXCL) {
+		whole = bd_start_claiming(bdev, filp);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
 	res = blkdev_get(bdev, filp->f_mode);
-	if (res)
-		return res;
 
-	if (filp->f_mode & FMODE_EXCL) {
-		res = bd_claim(bdev, filp);
-		if (res)
-			goto out_blkdev_put;
+	if (whole) {
+		if (res == 0)
+			BUG_ON(bd_claim(bdev, filp) != 0);
+		else
+			bd_abort_claiming(whole, filp);
 	}
 
-	return 0;
-
- out_blkdev_put:
-	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
@@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-	struct block_device *bdev;
-	int error = 0;
+	struct block_device *bdev, *whole;
+	int error;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
+	whole = bd_start_claiming(bdev, holder);
+	if (IS_ERR(whole)) {
+		bdput(bdev);
+		return whole;
+	}
+
 	error = blkdev_get(bdev, mode);
 	if (error)
-		return ERR_PTR(error);
+		goto out_abort_claiming;
+
 	error = -EACCES;
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto blkdev_put;
-	error = bd_claim(bdev, holder);
-	if (error)
-		goto blkdev_put;
+		goto out_blkdev_put;
 
+	BUG_ON(bd_claim(bdev, holder) != 0);
 	return bdev;
 
-blkdev_put:
+out_blkdev_put:
 	blkdev_put(bdev, mode);
+out_abort_claiming:
+	bd_abort_claiming(whole, holder);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724ec..8d432cd9d580 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode)
 	return ret;
 }
 
-struct xattr_handler btrfs_xattr_acl_default_handler = {
+const struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.get	= btrfs_xattr_acl_get,
 	.set	= btrfs_xattr_acl_set,
 };
 
-struct xattr_handler btrfs_xattr_acl_access_handler = {
+const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.get	= btrfs_xattr_acl_get,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			DISCARD_FL_BARRIER);
+			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..d601629b85d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	inode->i_uid = current_fsuid();
-
-	if (dir && (dir->i_mode & S_ISGID)) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..59acd3eb288a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -282,7 +282,7 @@ err:
  * List of handlers for synthetic system.* attributes.  All real ondisk
  * attributes are handled directly.
  */
-struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler *btrfs_xattr_handlers[] = {
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..e8aa7081d25c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
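
The buffer.c hunk retires an open-coded walk of the super_blocks list, with its manual s_count bumping, sb_lock juggling, and restart logic, in favor of iterate_supers(), which invokes a callback for every mounted superblock with s_umount held for read. A hedged usage sketch (the callback and counter here are hypothetical):

    /* Sketch: iterate_supers() handles refcounting and s_umount locking. */
    static void count_one(struct super_block *sb, void *arg)
    {
            unsigned long *n = arg;
            (*n)++;
    }

    static unsigned long count_supers(void)
    {
            unsigned long n = 0;
            iterate_supers(count_one, &n);
            return n;
    }
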
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 275 int rc = 0;
276 struct page **pages; 276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 277 loff_t offset;
279 u64 len; 278 u64 len;
280 279
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 296 if (rc < 0)
298 goto out; 297 goto out;
299 298
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 301 struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 310 }
314 311
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page); 313 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 315 inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 320 flush_dcache_page(page);
324 SetPageUptodate(page); 321 SetPageUptodate(page);
325 unlock_page(page); 322 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 324 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 325 rc = 0;
331 326
332out: 327out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 567 else
573 kfree(req->r_pages); 568 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
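
The ceph_readpages() hunk drops the manual pagevec batching because add_to_page_cache_lru() inserts the page into the page cache and onto the LRU in a single call; the caller then releases only its own reference. A sketch of the resulting per-page shape (fragment, assuming page and mapping from the surrounding loop):

    /* Sketch: cache + LRU insertion in one step, per the hunk above. */
    if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
            page_cache_release(page);       /* insertion failed; skip page */
    } else {
            /* ... fill the page with data ... */
            flush_dcache_page(page);
            SetPageUptodate(page);
            unlock_page(page);
            page_cache_release(page);       /* cache/LRU hold their own refs */
    }
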
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..9f46de2ba7a7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 150
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 151 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 152 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 153 pr_err("error %d building auth method %s request\n", ret,
154 ac->ops->name);
154 return ret; 155 return ret;
155 } 156 }
156 dout(" built request %d bytes\n", ret); 157 dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
216 if (ac->protocol != protocol) { 217 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol); 218 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) { 219 if (ret) {
219 pr_err("error %d on auth protocol %d init\n", 220 pr_err("error %d on auth method %s init\n",
220 ret, protocol); 221 ret, ac->ops->name);
221 goto out; 222 goto out;
222 } 223 }
223 } 224 }
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 230 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 231 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 232 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 233 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 234 return ret;
234 } 235 }
235 return 0; 236 return 0;
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..4429a707c021 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..24407c119291 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 94}
95 95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .name = "none",
97 .reset = reset, 98 .reset = reset,
98 .destroy = destroy, 99 .destroy = destroy,
99 .is_authenticated = is_authenticated, 100 .is_authenticated = is_authenticated,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..7b206231566d 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 127 int ret;
128 char *dbuf; 128 char *dbuf;
129 char *ticket_buf; 129 char *ticket_buf;
130 u8 struct_v; 130 u8 reply_struct_v;
131 131
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 133 if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 139 goto out_dbuf;
140 140
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 142 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 143 if (reply_struct_v != 1)
144 goto bad; 144 goto bad;
145 num = ceph_decode_32(&p); 145 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 146 dout("%d tickets\n", num);
147 while (num--) { 147 while (num--) {
148 int type; 148 int type;
149 u8 struct_v; 149 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 150 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 151 void *dp, *dend;
152 int dlen; 152 int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 165 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 167
168 struct_v = ceph_decode_8(&p); 168 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 169 if (tkt_struct_v != 1)
170 goto bad; 170 goto bad;
171 171
172 th = get_ticket_handler(ac, type); 172 th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 186 dend = dbuf + dlen;
187 dp = dbuf; 187 dp = dbuf;
188 188
189 struct_v = ceph_decode_8(&dp); 189 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 190 if (tkt_struct_v != 1)
191 goto bad; 191 goto bad;
192 192
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 193 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 224 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 225 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 227 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 228 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 230 if (ret)
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
618 618
619 619
620static const struct ceph_auth_client_ops ceph_x_ops = { 620static const struct ceph_auth_client_ops ceph_x_ops = {
621 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 622 .is_authenticated = ceph_x_is_authenticated,
622 .build_request = ceph_x_build_request, 623 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 624 .handle_reply = ceph_x_handle_reply,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..0dd0b81e64f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 867{
868 struct ceph_mds_session *session = cap->session; 868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 870 struct ceph_mds_client *mdsc =
871 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 872 int removed = 0;
872 873
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 874 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 938 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 939 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 940
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 941 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 942 if (!msg)
942 return PTR_ERR(msg); 943 return -ENOMEM;
943 944
944 msg->hdr.tid = cpu_to_le64(flush_tid); 945 msg->hdr.tid = cpu_to_le64(flush_tid);
945 946
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1299 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1300void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1301{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1302 struct ceph_mds_client *mdsc =
1303 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1304 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1305 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1306 int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1338static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1339 struct ceph_mds_session *session)
1338{ 1340{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1341 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1342 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1343 int flushing;
1342 1344
@@ -1663,7 +1665,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1665static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1666 unsigned *flush_tid)
1665{ 1667{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1668 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1669 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1670 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1671 int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1718static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1719{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1720 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1721 int i, ret = 1;
1720 1722
1721 spin_lock(&inode->i_lock); 1723 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1724 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1725 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1726 ci->i_cap_flush_tid[i] <= tid) {
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1830 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1831 caps_are_flushed(inode, flush_tid));
1831 } else { 1832 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1833 struct ceph_mds_client *mdsc =
1834 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1835
1834 spin_lock(&inode->i_lock); 1836 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1837 if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2413 __releases(inode->i_lock)
2412{ 2414{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2415 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2416 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2417 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2418 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2419 int cleaned = 0;
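
A theme across the ceph hunks in this series: ceph_msg_new() drops two arguments in favor of an explicit gfp_t, and it now signals failure with NULL rather than ERR_PTR(), so every caller's error check flips. A sketch of the new convention (fragment; fc and session come from the surrounding function, as in send_cap_msg() above):

    /* Sketch: post-change message allocation and error handling. */
    struct ceph_msg *msg;

    msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
    if (!msg)               /* NULL, not IS_ERR(), now means failure */
            return -ENOMEM;
    ceph_con_send(&session->s_con, msg);
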
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..3b9eeed097b3 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
19 * Ceph release version 19 * Ceph release version
20 */ 20 */
21#define CEPH_VERSION_MAJOR 0 21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19 22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0 23#define CEPH_VERSION_PATCH 0
24 24
25#define _CEPH_STRINGIFY(x) #x 25#define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
36 * client-facing protocol. 36 * client-facing protocol.
37 */ 37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 39#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +53,18 @@
53/* 53/*
54 * feature bits 54 * feature bits
55 */ 55 */
56#define CEPH_FEATURE_SUPPORTED 0 56#define CEPH_FEATURE_UID 1
57#define CEPH_FEATURE_REQUIRED 0 57#define CEPH_FEATURE_NOSRCADDR 2
58#define CEPH_FEATURE_FLOCK 4
59
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
58 68
59 69
60/* 70/*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
91#define CEPH_AUTH_NONE 0x1 101#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 102#define CEPH_AUTH_CEPHX 0x2
93 103
104#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
105
94 106
95/********************************************* 107/*********************************************
96 * message layer 108 * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 140#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 141#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 142
143/* pool ops */
144#define CEPH_MSG_POOLOP_REPLY 48
145#define CEPH_MSG_POOLOP 49
146
147
131/* osd */ 148/* osd */
132#define CEPH_MSG_OSD_MAP 41 149#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 150#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 151#define CEPH_MSG_OSD_OPREPLY 43
135 152
153/* pool operations */
154enum {
155 POOL_OP_CREATE = 0x01,
156 POOL_OP_DELETE = 0x02,
157 POOL_OP_AUID_CHANGE = 0x03,
158 POOL_OP_CREATE_SNAP = 0x11,
159 POOL_OP_DELETE_SNAP = 0x12,
160 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
161 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
162};
163
136struct ceph_mon_request_header { 164struct ceph_mon_request_header {
137 __le64 have_version; 165 __le64 have_version;
138 __le16 session_mon; 166 __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 183 struct ceph_statfs st;
156} __attribute__ ((packed)); 184} __attribute__ ((packed));
157 185
186const char *ceph_pool_op_name(int op);
187
188struct ceph_mon_poolop {
189 struct ceph_mon_request_header monhdr;
190 struct ceph_fsid fsid;
191 __le32 pool;
192 __le32 op;
193 __le64 auid;
194 __le64 snapid;
195 __le32 name_len;
196} __attribute__ ((packed));
197
198struct ceph_mon_poolop_reply {
199 struct ceph_mon_request_header monhdr;
200 struct ceph_fsid fsid;
201 __le32 reply_code;
202 __le32 epoch;
203 char has_data;
204 char data[0];
205} __attribute__ ((packed));
206
207struct ceph_mon_unmanaged_snap {
208 __le64 snapid;
209} __attribute__ ((packed));
210
158struct ceph_osd_getmap { 211struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 212 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 213 struct ceph_fsid fsid;
@@ -308,6 +361,7 @@ union ceph_mds_request_args {
308 struct { 361 struct {
309 __le32 frag; /* which dir fragment */ 362 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 363 __le32 max_entries; /* how many dentries to grab */
364 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 365 } __attribute__ ((packed)) readdir;
312 struct { 366 struct {
313 __le32 mode; 367 __le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 44 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 45 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 46 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
47 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 48
49 case CEPH_OSD_OP_PULL: return "pull"; 49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 50 case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
174 } 174 }
175 return "???"; 175 return "???";
176} 176}
177
178const char *ceph_pool_op_name(int op)
179{
180 switch (op) {
181 case POOL_OP_CREATE: return "create";
182 case POOL_OP_DELETE: return "delete";
183 case POOL_OP_AUID_CHANGE: return "auid change";
184 case POOL_OP_CREATE_SNAP: return "create snap";
185 case POOL_OP_DELETE_SNAP: return "delete snap";
186 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
187 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
188 }
189 return "???";
190}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..4fd30900eff7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 128 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 129 di = ceph_dentry(dentry);
127 while (1) { 130 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 131 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
132 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 133 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 134 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 135 fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 233 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 234 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 235 const int max_entries = client->mount_args->max_readdir;
236 const int max_bytes = client->mount_args->max_readdir_bytes;
232 237
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 239 if (fi->at_end)
@@ -312,6 +317,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 317 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 318 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 319 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
320 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 321 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 322 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 323 if (err < 0) {
@@ -335,7 +341,7 @@ more:
335 if (req->r_reply_info.dir_end) { 341 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 342 kfree(fi->last_name);
337 fi->last_name = NULL; 343 fi->last_name = NULL;
338 fi->next_offset = 0; 344 fi->next_offset = 2;
339 } else { 345 } else {
340 rinfo = &req->r_reply_info; 346 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 347 err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 484struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 485 struct dentry *dentry, int err)
480{ 486{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 487 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 488 struct inode *parent = dentry->d_parent->d_inode;
483 489
484 /* .snap dir? */ 490 /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 574 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 575 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 576 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 577 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 578 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 579 d_add(dentry, NULL);
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 893
889 /* ensure target dentry is invalidated, despite 894 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 895 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 896 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 897 }
894 ceph_mdsc_put_request(req); 898 ceph_mdsc_put_request(req);
895 return err; 899 return err;
896} 900}
897 901
902/*
903 * Ensure a dentry lease will no longer revalidate.
904 */
905void ceph_invalidate_dentry_lease(struct dentry *dentry)
906{
907 spin_lock(&dentry->d_lock);
908 dentry->d_time = jiffies;
909 ceph_dentry(dentry)->lease_shared_gen = 0;
910 spin_unlock(&dentry->d_lock);
911}
898 912
899/* 913/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 914 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 986{
973 struct inode *dir = dentry->d_parent->d_inode; 987 struct inode *dir = dentry->d_parent->d_inode;
974 988
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 989 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 990 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
991 ceph_dentry(dentry)->offset);
977 992
978 /* always trust cached snapped dentries, snapdir dentry */ 993 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 994 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1065 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1066 int left;
1052 1067
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1068 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1069 return -EISDIR;
1055 1070
1056 if (!cf->dir_info) { 1071 if (!cf->dir_info) {
@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1167 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1168 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1169 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1170 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1171 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1172 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1173 mdsc->num_dentry++;
@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1180 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1181 struct ceph_mds_client *mdsc;
1167 1182
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1183 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1184 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1185 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1186 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1187 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1188 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1189 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1198 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1199 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1200 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1201 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1202 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1203 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1204 mdsc->num_dentry--;
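
Two details in the dir.c hunks: the new ceph_invalidate_dentry_lease() performs the d_time/lease_shared_gen reset under d_lock rather than inline and unlocked, and a finished directory fragment now restarts fi->next_offset at 2 because readdir offsets 0 and 1 are reserved for "." and "..". A hedged sketch of the lease test such invalidation defeats (illustrative; the real check lives in dentry_lease_is_valid() and differs in detail):

    /* Sketch: a lease holds only while the shared gen still matches and
     * the time-based lease is unexpired; the new helper clears both. */
    static int lease_valid_sketch(struct dentry *dentry, u32 dir_gen,
                                  unsigned long ttl)
    {
            if (ceph_dentry(dentry)->lease_shared_gen != dir_gen)
                    return 0;       /* generation bumped: invalid */
            return time_before(jiffies, dentry->d_time + ttl);
    }
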
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..17447644d675 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
94 94
95 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
96 if (!dentry) { 96 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 98 fh->ino, inode);
99 iput(inode); 99 iput(inode);
100 return ERR_PTR(-ENOMEM); 100 return dentry;
101 } 101 }
102 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
103 103
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
117{ 117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 119 struct inode *inode;
120 struct dentry *dentry; 120 struct dentry *dentry;
121 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 149 }
150 150
151 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
152 if (!dentry) { 152 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 154 cfh->ino, inode);
155 iput(inode); 155 iput(inode);
156 return ERR_PTR(-ENOMEM); 156 return dentry;
157 } 157 }
158 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
159 if (err < 0) { 159 if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
203 203
204 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
205 if (!dentry) { 205 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 207 cfh->ino, inode);
208 iput(inode); 208 iput(inode);
209 return ERR_PTR(-ENOMEM); 209 return dentry;
210 } 210 }
211 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
212 if (err < 0) { 212 if (err < 0) {
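
All three export.c fixes correct the same misunderstanding: d_obtain_alias() never returns NULL; it returns either a valid dentry or an ERR_PTR() value, so the old `if (!dentry)` branches were dead code and allocation errors escaped unhandled. The corrected pattern, as a fragment (the hunks above additionally iput() the inode reference they took before returning):

    /* Sketch: failure comes back encoded in the pointer itself. */
    dentry = d_obtain_alias(inode);
    if (IS_ERR(dentry))
            return dentry;  /* propagate the ERR_PTR unchanged */
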
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..6512b6701b9e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -668,7 +668,7 @@ more:
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
809 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 814 int got = 0;
815 int ret, err; 815 int ret, err;
@@ -844,8 +844,7 @@ retry_snap:
844 if ((ret >= 0 || ret == -EIOCBQUEUED) && 844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry, 847 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
848 pos, pos + ret - 1, 1);
849 if (err < 0) 848 if (err < 0)
850 ret = err; 849 ret = err;
851 } 850 }
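
Two interface updates in file.c: the page-vector allocator is exported as ceph_alloc_page_vector() with an explicit gfp_t (using __page_cache_alloc() so allocation obeys the caller's flags), and vfs_fsync_range() loses its dentry argument, now taking just the file, the byte range, and the datasync flag. A fragment sketching the new fsync call, with pos and ret standing in for the write position and bytes written:

    /* Sketch: flush only the range just written, as in ceph_aio_write(). */
    err = vfs_fsync_range(file, pos, pos + ret - 1, 1 /* datasync */);
    if (err < 0)
            ret = err;
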
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..a81b8b662c7b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
622 } 623 }
623 624
624 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 628
628 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 630 case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 675 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
681 } 683 }
682 684
683 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
686 break; 688 break;
687 default: 689 default:
@@ -802,6 +804,37 @@ out_unlock:
802} 804}
803 805
804/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
805 * splice a dentry to an inode. 838 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
807 * 840 *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
814{ 847{
815 struct dentry *realdn; 848 struct dentry *realdn;
816 849
850 BUG_ON(dn->d_inode);
851
817 /* dn must be unhashed */ 852 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
819 d_drop(dn); 854 d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 870 dn = realdn;
836 } else { 871 } else {
837 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 875 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
844out: 879out:
845 return dn; 880 return dn;
846} 881}
847 882
848/* 883/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 886 * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 941
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 946 return 0;
945 } 947 }
946 948
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
1014 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1020
1018 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
1021 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1029 in = dn->d_inode;
1023 } 1030 }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1059 goto done; 1066 goto done;
1060 } 1067 }
1061 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1069 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1102 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1103 goto done; 1109 goto done;
1104 } 1110 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1112 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1434{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1436
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1440 igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1527 int issued;
1523 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1524 int mask = 0; 1529 int mask = 0;
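
One subtle inode.c fix is the single added line `xattr_blob = NULL;` after the xattr data is copied in: once the buffer has been consumed, clearing the local pointer keeps the function's shared exit path, which frees an unconsumed blob, from freeing it a second time. The general shape, as a hedged sketch (install_blob/free_blob are hypothetical stand-ins):

    /* Sketch: forget a local pointer the moment another structure owns
     * its buffer, so common cleanup cannot double-free it. */
    install_blob(ci, xattr_blob);   /* hypothetical consumer */
    xattr_blob = NULL;              /* ownership transferred */
    /* ... */
    /* shared exit path */
    if (xattr_blob)
            free_blob(xattr_blob);  /* hypothetical release; only if unconsumed */
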
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..885aa5710cfd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
40static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 41 struct list_head *head);
42 42
43const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
44 44
45 45
46/* 46/*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
667 667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 669 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
672 } 672 }
673 h = msg->front.iov_base; 673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 687 struct ceph_msg *msg;
688 int mstate; 688 int mstate;
689 int mds = session->s_mds; 689 int mds = session->s_mds;
690 int err = 0;
691 690
692 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 697
699 /* send connect message */ 698 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 700 if (!msg)
702 err = PTR_ERR(msg); 701 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 703 return 0;
709} 704}
710 705
@@ -804,12 +799,49 @@ out:
804} 799}
805 800
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 802 void *arg)
808{ 803{
808{ 803{
809 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
810 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
813 return 0; 845 return 0;
814} 846}
815 847
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
825} 858}
826 859
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 918 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 919 if (!msg)
887 return PTR_ERR(msg); 920 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
889 return 0; 922 return 0;
890} 923}
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
932{ 965{
933 struct ceph_msg *msg; 966 struct ceph_msg *msg;
934 int err = 0;
935 967
936 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 970 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 972 if (!msg)
941 err = PTR_ERR(msg); 973 return -ENOMEM;
942 else 974 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 975 return 0;
944 return err;
945} 976}
946 977
947/* 978/*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1093 GFP_NOFS);
1063 if (!msg) 1094 if (!msg)
1064 goto out_unlocked; 1095 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1151 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1152 1183
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1185 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1165 } 1195 }
1166 spin_unlock(&session->s_cap_lock); 1196 spin_unlock(&session->s_cap_lock);
1167} 1197}
1168 1198
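Editor's note: the rewritten send_cap_releases() holds s_cap_lock while testing and detaching a queued message, drops it only around the send, and re-takes it before looking at the list again. A pthreads sketch of that drain pattern (the list and send function are stand-ins):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *done_list;          /* protected by lock */

static void send_msg(struct node *n)    /* may sleep; lock not held */
{
        printf("sending release msg %d\n", n->id);
        free(n);
}

static void send_cap_releases(void)
{
        pthread_mutex_lock(&lock);
        while (done_list) {
                struct node *n = done_list;

                done_list = n->next;            /* detach under lock */
                pthread_mutex_unlock(&lock);
                send_msg(n);                    /* send without lock */
                pthread_mutex_lock(&lock);      /* re-take, re-test */
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->id = i;
                n->next = done_list;
                done_list = n;
        }
        send_cap_releases();
        return 0;
}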
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1232 }
1233
1234 spin_unlock(&session->s_cap_lock);
1235}
1236
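Editor's note: discard_cap_releases() zeroes the per-message entry count (head->num) and credits those entries back to s_num_cap_releases so the same buffers can be refilled after a reconnect. A rough user-space model of just the accounting ("slots" stands in for s_num_cap_releases, "pending" for each message's head->num):

#include <stdio.h>

static unsigned slots;                  /* cf. s_num_cap_releases */
static unsigned pending[2] = { 3, 5 };  /* cf. per-message head->num */

static void discard_cap_releases(void)
{
        for (int i = 0; i < 2; i++) {
                slots += pending[i];    /* credit entries back */
                pending[i] = 0;         /* zero the message count */
        }
}

int main(void)
{
        discard_cap_releases();
        printf("slots available for refill: %u\n", slots);  /* 8 */
        return 0;
}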
1169/* 1237/*
1170 * requests 1238 * requests
1171 */ 1239 */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1249 if (!req)
1182 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1183 1251
1252 mutex_init(&req->r_fill_mutex);
1184 req->r_started = jiffies; 1253 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1321 temp = temp->d_parent;
1253 if (temp == NULL) { 1322 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1256 } 1325 }
1257 } 1326 }
@@ -1267,7 +1336,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1268 1337
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1340 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
1278 break; 1347 break;
1279 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1349 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1350 }
1284 if (pos) 1351 if (pos)
1285 path[--pos] = '/'; 1352 path[--pos] = '/';
1286 temp = temp->d_parent; 1353 temp = temp->d_parent;
1287 if (temp == NULL) { 1354 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1356 kfree(path);
1290 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1291 } 1358 }
1292 } 1359 }
1293 if (pos != 0) { 1360 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
1303 1370
1304 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1372 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1375 return path;
1309} 1376}
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1428 1495
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1499 goto out_free2;
1500 }
1432 1501
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1503
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1517 } 1586 }
1518 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1523 } 1592 }
1524 req->r_request = msg; 1593 req->r_request = msg;
1525 1594
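Editor's note: the one-character fix above is easy to miss — PTR_ERR() already yields a negative errno, so the old `return -PTR_ERR(msg)` flipped it into a positive value that callers would not recognize as an error. A standalone demonstration (ERR_PTR/PTR_ERR re-implemented here for illustration):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        void *msg = ERR_PTR(-ENOMEM);

        if (IS_ERR(msg)) {
                printf("PTR_ERR(msg)  = %ld  (correct: already negative)\n",
                       PTR_ERR(msg));
                printf("-PTR_ERR(msg) = %ld  (old bug: positive 'error')\n",
                       -PTR_ERR(msg));
        }
        return 0;
}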
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1621 int mds = -1;
1553 int err = -EAGAIN; 1622 int err = -EAGAIN;
1554 1623
1555 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1556 goto out; 1625 goto out;
1557 1626
1558 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
1609 return err; 1678 return err;
1610 1679
1611finish: 1680finish:
1612 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1613 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1614 goto out; 1683 goto out;
1615} 1684}
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1699
1631/* 1700/*
1632 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1703 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1705{
1638 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1639 struct rb_node *p; 1707 struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1691 1759
1692 /* wait */ 1760 if (req->r_err) {
1693 if (!req->r_reply) { 1761 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1765 }
1710 1766
1711 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1712 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_interruptible_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_interruptible(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1714 1780
1715 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1782 if (req->r_got_result) {
1717 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1786
1719 if (req->r_locked_dir && 1787 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1723 1796
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1800 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1742 } 1802 }
1743 mutex_unlock(&mdsc->mutex);
1744 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1807 return err;
1747} 1808}
1748 1809
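Editor's note: the rewritten wait path only marks a request aborted if no real reply beat it, and it does so under r_fill_mutex so the abort cannot interleave with ceph_fill_trace. A simplified pthreads model of that decision (it collapses mdsc->mutex and r_fill_mutex into one lock; field names mirror the patch but this is not the kernel code):

#include <pthread.h>
#include <stdio.h>

struct request {
        pthread_mutex_t fill_mutex;
        int err, aborted, got_result, result;
};

static void waiter_interrupted(struct request *req)
{
        pthread_mutex_lock(&req->fill_mutex);
        if (!req->got_result) {         /* didn't race with a reply */
                req->err = -4;          /* -EINTR, say */
                req->aborted = 1;
        }
        pthread_mutex_unlock(&req->fill_mutex);
}

static void handle_reply(struct request *req, int result)
{
        pthread_mutex_lock(&req->fill_mutex);
        if (!req->aborted) {            /* too late if already aborted */
                req->result = result;
                req->got_result = 1;
        }
        pthread_mutex_unlock(&req->fill_mutex);
}

int main(void)
{
        struct request req = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0, 0 };

        handle_reply(&req, 0);
        waiter_interrupted(&req);       /* no-op: result already in */
        printf("aborted=%d got_result=%d\n", req.aborted, req.got_result);
        return 0;
}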
1749/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
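Editor's note: clearing CEPH_I_COMPLETE alone would not be enough, because a readdir already in flight could set it again; bumping i_release_count acts as a generation counter that lets the late writer notice its snapshot is stale. A user-space sketch of the trick (simplified model, not the kernel code):

#include <stdio.h>

static int complete;
static unsigned release_count;          /* cf. ci->i_release_count */

static void invalidate(void)
{
        complete = 0;
        release_count++;
}

static void readdir_finish(unsigned count_at_start)
{
        /* only mark complete if nothing was invalidated meanwhile */
        if (count_at_start == release_count)
                complete = 1;
}

int main(void)
{
        unsigned snap = release_count;  /* readdir starts */

        invalidate();                   /* aborted request races in */
        readdir_finish(snap);           /* refuses stale completion */
        printf("complete=%d\n", complete);      /* 0 */
        return 0;
}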
1831/*
1750 * Handle mds reply. 1832 * Handle mds reply.
1751 * 1833 *
1752 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1798 goto out; 1880 goto out;
1799 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1800 1888
1801 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1802 1890
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1838 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1839 goto out; 1927 goto out;
1840 } 1928 }
1841 } 1929 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 1932 }
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 1955 }
1872 1956
1873 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 1960 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1879 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1880 1966
1881 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1882out_err: 1968out_err:
1883 if (err) { 1969 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1885 } else { 1978 } else {
1886 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1889 1982
1890 add_cap_releases(mdsc, req->r_session, -1); 1983 add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
@@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2077
1985 switch (op) { 2078 switch (op) {
1986 case CEPH_SESSION_OPEN: 2079 case CEPH_SESSION_OPEN:
2080 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2081 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2082 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2083 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2084 wake = 1;
@@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2092 break;
1998 2093
1999 case CEPH_SESSION_CLOSE: 2094 case CEPH_SESSION_CLOSE:
2095 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2096 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2097 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2098 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2099 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2100 kick_requests(mdsc, mds);
2004 break; 2101 break;
2005 2102
2006 case CEPH_SESSION_STALE: 2103 case CEPH_SESSION_STALE:
@@ -2132,54 +2229,44 @@ out:
2132 * 2229 *
2133 * called with mdsc->mutex held. 2230 * called with mdsc->mutex held.
2134 */ 2231 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2232static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2233 struct ceph_mds_session *session)
2136{ 2234{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2235 struct ceph_msg *reply;
2139 struct rb_node *p; 2236 struct rb_node *p;
2237 int mds = session->s_mds;
2140 int err = -ENOMEM; 2238 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2239 struct ceph_pagelist *pagelist;
2142 2240
2143 pr_info("reconnect to recovering mds%d\n", mds); 2241 pr_info("mds%d reconnect start\n", mds);
2144 2242
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2243 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2244 if (!pagelist)
2147 goto fail_nopagelist; 2245 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2246 ceph_pagelist_init(pagelist);
2149 2247
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2248 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2249 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2250 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159 2251
2160 if (session) { 2252 mutex_lock(&session->s_mutex);
2161 mutex_lock(&session->s_mutex); 2253 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2254 session->s_seq = 0;
2162 2255
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2256 ceph_con_open(&session->s_con,
2164 session->s_seq = 0; 2257 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2165 2258
2166 ceph_con_open(&session->s_con, 2259 /* replay unsafe requests */
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2260 replay_unsafe_requests(mdsc, session);
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2261
2176 down_read(&mdsc->snap_rwsem); 2262 down_read(&mdsc->snap_rwsem);
2177 2263
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2264 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2265 session_state_name(session->s_state));
2182 2266
2267 /* drop old cap expires; we're about to reestablish that state */
2268 discard_cap_releases(mdsc, session);
2269
2183 /* traverse this session's caps */ 2270 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2271 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2272 if (err)
@@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2295 goto fail;
2209 } 2296 }
2210 2297
2211send:
2212 reply->pagelist = pagelist; 2298 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2299 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2300 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2301 ceph_con_send(&session->s_con, reply);
2216 2302
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2303 mutex_unlock(&session->s_mutex);
2219 2304
2220 mutex_lock(&mdsc->mutex); 2305 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2306 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2307 mutex_unlock(&mdsc->mutex);
2223 2308
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2309 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2310 return;
2229 2311
2230fail: 2312fail:
2231 ceph_msg_put(reply); 2313 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2314 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2315 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2316fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2317 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2318 kfree(pagelist);
2238fail_nopagelist: 2319fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2320 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2321 return;
2242} 2322}
2243 2323
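Editor's note: send_mds_reconnect() now takes the session from its caller instead of looking it up, and the caller (see the check_new_map hunk below) drops mdsc->mutex around the call rather than having the callee release and re-take it behind the caller's back. A pthreads sketch of that lock hand-off shape (not the kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t map_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t session_mutex = PTHREAD_MUTEX_INITIALIZER;

static void send_reconnect(void)        /* called without map_mutex */
{
        pthread_mutex_lock(&session_mutex);
        printf("reconnect sent\n");
        pthread_mutex_unlock(&session_mutex);
}

static void check_new_map(void)
{
        pthread_mutex_lock(&map_mutex);
        /* ... scan sessions ... */
        pthread_mutex_unlock(&map_mutex);       /* drop for the call */
        send_reconnect();
        pthread_mutex_lock(&map_mutex);         /* re-take, continue */
        /* ... */
        pthread_mutex_unlock(&map_mutex);
}

int main(void)
{
        check_new_map();
        return 0;
}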
@@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2370 }
2291 2371
2292 /* kick any requests waiting on the recovering mds */ 2372 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2373 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2374 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2375 continue; /* nothing new with this mds */
2296 } 2376 }
@@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2379 * send reconnect?
2300 */ 2380 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2381 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2382 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2383 mutex_unlock(&mdsc->mutex);
2384 send_mds_reconnect(mdsc, s);
2385 mutex_lock(&mdsc->mutex);
2386 }
2304 2387
2305 /* 2388 /*
2306 * kick requests on any mds that has gone active. 2389 * kick request on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2390 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2391 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2392 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2393 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2394 oldstate != CEPH_MDS_STATE_STARTING)
2395 pr_info("mds%d recovery completed\n", s->s_mds);
2396 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2397 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2398 wake_up_session_caps(s, 1);
2320 } 2399 }
@@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2536 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2537 len += dnamelen;
2459 2538
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2539 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2540 if (!msg)
2462 return; 2541 return;
2463 lease = msg->front.iov_base; 2542 lease = msg->front.iov_base;
2464 lease->action = action; 2543 lease->action = action;
@@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work)
2603 else 2682 else
2604 ceph_con_keepalive(&s->s_con); 2683 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2684 add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s); 2685 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2686 s->s_state == CEPH_MDS_SESSION_HUNG)
2687 send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2688 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2689 ceph_put_mds_session(s);
2609 2690
@@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2701 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2702 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2703 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2704 if (mdsc->mdsmap == NULL)
2705 return -ENOMEM;
2706
2623 init_completion(&mdsc->safe_umount_waiters); 2707 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2708 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2709 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2729 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2730 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2731 INIT_LIST_HEAD(&mdsc->dentry_lru);
2732
2648 return 0; 2733 return 0;
2649} 2734}
2650 2735
@@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 2825{
2741 u64 want_tid, want_flush; 2826 u64 want_tid, want_flush;
2742 2827
2828 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2829 return;
2830
2743 dout("sync\n"); 2831 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 2832 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 2833 want_tid = mdsc->last_tid;
@@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3010static void peer_reset(struct ceph_connection *con)
2923{ 3011{
2924 struct ceph_mds_session *s = con->private; 3012 struct ceph_mds_session *s = con->private;
3013 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3014
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3015 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3016 send_mds_reconnect(mdsc, s);
2928} 3017}
2929 3018
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3019static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3120 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3121}
3033 3122
3034const static struct ceph_connection_operations mds_con_ops = { 3123static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3124 .get = con_get,
3036 .put = con_put, 3125 .put = con_put,
3037 .dispatch = dispatch, 3126 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..60b74839ebec 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 328 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 329 con->out_msg = NULL;
342 } 330 }
331 con->out_keepalive_pending = false;
343 con->in_seq = 0; 332 con->in_seq = 0;
344 con->in_seq_acked = 0; 333 con->in_seq_acked = 0;
345} 334}
@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 346 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 347 mutex_lock(&con->mutex);
359 reset_connection(con); 348 reset_connection(con);
349 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 350 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 351 mutex_unlock(&con->mutex);
362 queue_con(con); 352 queue_con(con);
@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 651 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 652 con->connect_seq, global_seq, proto);
663 653
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 654 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 655 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 656 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 657 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
1124 1114
1125static int process_connect(struct ceph_connection *con) 1115static int process_connect(struct ceph_connection *con)
1126{ 1116{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1117 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED; 1118 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features); 1119 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130 1120
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1121 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1223 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1224 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1225 con->connect_seq++;
1226 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1227 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1228 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1229 le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1393 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1394 if (skip) {
1404 /* skip this message */ 1395 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1396 dout("alloc_msg said skip message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len - 1397 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1398 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1399 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1400 con->in_seq++;
1410 return 0; 1401 return 0;
1411 } 1402 }
1412 if (IS_ERR(con->in_msg)) { 1403 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1404 con->error_msg =
1416 "error allocating memory for incoming message"; 1405 "error allocating memory for incoming message";
1417 return ret; 1406 return -ENOMEM;
1418 } 1407 }
1419 m = con->in_msg; 1408 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1409 m->front.iov_len = 0; /* haven't read it yet */
@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
1514 1503
1515 /* if first message, set peer_name */ 1504 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1505 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1506 con->peer_name = msg->hdr.src;
1518 1507
1519 con->in_seq++; 1508 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1509 mutex_unlock(&con->mutex);
1521 1510
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1511 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1512 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1513 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1514 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1515 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1516 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1535 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1536 atomic_read(&con->nref));
1548 1537
1549 mutex_lock(&con->mutex);
1550more: 1538more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1539 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1540
@@ -1639,7 +1627,6 @@ do_next:
1639done: 1627done:
1640 ret = 0; 1628 ret = 0;
1641out: 1629out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1630 dout("try_write done on %p\n", con);
1644 return ret; 1631 return ret;
1645} 1632}
@@ -1651,7 +1638,6 @@ out:
1651 */ 1638 */
1652static int try_read(struct ceph_connection *con) 1639static int try_read(struct ceph_connection *con)
1653{ 1640{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1641 int ret = -1;
1656 1642
1657 if (!con->sock) 1643 if (!con->sock)
@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1647 return 0;
1662 1648
1663 dout("try_read start on %p\n", con); 1649 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1650
1668more: 1651more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1652 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1741,6 @@ more:
1758done: 1741done:
1759 ret = 0; 1742 ret = 0;
1760out: 1743out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1744 dout("try_read done on %p\n", con);
1763 return ret; 1745 return ret;
1764 1746
@@ -1830,6 +1812,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1812 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1813 clear_bit(QUEUED, &con->state);
1832 1814
1815 mutex_lock(&con->mutex);
1816
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1817 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1818 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1819 con_close_socket(con);
@@ -1844,11 +1828,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1828 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1829 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1830 try_write(con) < 0) {
1831 mutex_unlock(&con->mutex);
1847 backoff = 1; 1832 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1833 ceph_fault(con); /* error/fault path */
1834 goto done_unlocked;
1849 } 1835 }
1850 1836
1851done: 1837done:
1838 mutex_unlock(&con->mutex);
1839
1840done_unlocked:
1852 clear_bit(BUSY, &con->state); 1841 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1842 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1843 if (test_bit(QUEUED, &con->state)) {
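Editor's note: with the mutex_lock/mutex_unlock pairs removed from try_read() and try_write() above, con_work() now owns con->mutex for the whole pass and the helpers assume it is held; the fault path runs only after the lock is dropped (the new done_unlocked label). A sketch of the resulting locking shape (pthreads stand-ins, not the kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t con_mutex = PTHREAD_MUTEX_INITIALIZER;

static int try_io(void)                 /* caller holds con_mutex */
{
        return 0;                       /* <0 would signal a fault */
}

static void fault(void)                 /* runs without con_mutex */
{
        printf("fault path\n");
}

static void con_work(void)
{
        pthread_mutex_lock(&con_mutex);
        if (try_io() < 0) {
                pthread_mutex_unlock(&con_mutex);
                fault();
                return;                 /* "done_unlocked" */
        }
        pthread_mutex_unlock(&con_mutex);       /* "done" */
}

int main(void)
{
        con_work();
        return 0;
}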
@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1936
1948 /* the zero page is needed if a request is "canceled" while the message 1937 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1938 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1939 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1940 if (!msgr->zero_page) {
1952 kfree(msgr); 1941 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1942 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1976 }
1988 1977
1989 /* set src+dst */ 1978 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1979 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1980
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1981 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1982
@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2070 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2071 * the new msg has a ref count of 1.
2085 */ 2072 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2073struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2074{
2089 struct ceph_msg *m; 2075 struct ceph_msg *m;
2090 2076
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2077 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2078 if (m == NULL)
2093 goto out; 2079 goto out;
2094 kref_init(&m->kref); 2080 kref_init(&m->kref);
@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2086 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2087 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2088 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2089 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2090 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2091 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2092 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2093 m->footer.middle_crc = 0;
@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2101 /* front */
2116 if (front_len) { 2102 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2103 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2104 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2105 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2106 m->front_is_vmalloc = true;
2121 } else { 2107 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2108 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2109 }
2124 if (m->front.iov_base == NULL) { 2110 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2111 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2121 m->middle = NULL;
2136 2122
2137 /* data */ 2123 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2124 m->nr_pages = 0;
2139 m->pages = pages; 2125 m->pages = NULL;
2140 m->pagelist = NULL; 2126 m->pagelist = NULL;
2141 2127
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2128 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2129 return m;
2145 2130
2146out2: 2131out2:
2147 ceph_msg_put(m); 2132 ceph_msg_put(m);
2148out: 2133out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2134 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2135 return NULL;
2151} 2136}
2152 2137
2153/* 2138/*
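Editor's note: the constructor loses its page parameters — data pages are now attached by the caller after allocation — and gains a gfp_t that is plumbed through to every allocation instead of hard-coding GFP_NOFS. A user-space sketch of the new shape (malloc stands in for kmalloc/vmalloc; the flags argument is illustrative):

#include <stdio.h>
#include <stdlib.h>

struct msg { int type; void *front; size_t front_len; };

static struct msg *msg_new(int type, size_t front_len, int flags)
{
        struct msg *m = malloc(sizeof(*m));     /* kmalloc(..., flags) */

        if (!m)
                return NULL;
        m->type = type;
        m->front_len = front_len;
        m->front = malloc(front_len);   /* kernel: vmalloc if > page */
        if (!m->front) {
                free(m);
                return NULL;
        }
        (void)flags;    /* cf. GFP_NOFS vs GFP_KERNEL call sites */
        return m;
}

int main(void)
{
        struct msg *m = msg_new(1, 8192, 0);

        if (m) {
                printf("front %zu bytes, no page params\n", m->front_len);
                free(m->front);
                free(m);
        }
        return 0;
}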
@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2175 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2176 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2177 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2178 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2179 return NULL;
2198 } 2180 }
2199 if (!msg) { 2181 if (!msg) {
2200 *skip = 0; 2182 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2183 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2184 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2185 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2186 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2187 return NULL;
2206 } 2188 }
2207 } 2189 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2190 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2191
2210 if (middle_len) { 2192 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2193 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2194 if (ret < 0) {
2214 ceph_msg_put(msg); 2195 ceph_msg_put(msg);
2215 return msg; 2196 return NULL;
2216 } 2197 }
2217 } 2198 }
2218 2199
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..00a9430b1ffc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 233extern void ceph_con_put(struct ceph_connection *con);
236 234
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 235extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 236extern void ceph_msg_kfree(struct ceph_msg *m);
241 237
242 238
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..f6510a476e7e 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -353,14 +351,14 @@ out:
353/* 351/*
354 * statfs 352 * statfs
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403}
404
405static void put_generic_request(struct ceph_mon_generic_request *req)
406{
407 kref_put(&req->kref, release_generic_request);
408}
409
410static void get_generic_request(struct ceph_mon_generic_request *req)
411{
412 kref_get(&req->kref);
413}
414
415static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
416 struct ceph_msg_header *hdr,
417 int *skip)
418{
419 struct ceph_mon_client *monc = con->private;
420 struct ceph_mon_generic_request *req;
421 u64 tid = le64_to_cpu(hdr->tid);
422 struct ceph_msg *m;
423
424 mutex_lock(&monc->mutex);
425 req = __lookup_generic_req(monc, tid);
426 if (!req) {
427 dout("get_generic_reply %lld dne\n", tid);
428 *skip = 1;
429 m = NULL;
430 } else {
431 dout("get_generic_reply %lld got %p\n", tid, req->reply);
432 m = ceph_msg_get(req->reply);
433 /*
434 * we don't need to track the connection reading into
435 * this reply because we only have one open connection
436 * at a time, ever.
437 */
438 }
439 mutex_unlock(&monc->mutex);
440 return m;
394} 441}
395 442
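Editor's note: get_generic_reply() matches an incoming reply frame by tid against the registered requests, hands the request's pre-allocated reply buffer back to the messenger, and marks frames for unknown tids as skipped. A compact model of that dispatch (a fixed array stands in for the rbtree; not the kernel code):

#include <stdio.h>

struct generic_req { unsigned long tid; char reply[64]; int used; };

static struct generic_req table[4];

static char *get_generic_reply(unsigned long tid, int *skip)
{
        for (int i = 0; i < 4; i++)
                if (table[i].used && table[i].tid == tid) {
                        *skip = 0;
                        return table[i].reply;  /* read into this */
                }
        *skip = 1;                              /* tid dne: drop frame */
        return NULL;
}

int main(void)
{
        int skip;

        table[0] = (struct generic_req){ .tid = 42, .used = 1 };
        get_generic_reply(42, &skip);
        printf("tid 42 skip=%d\n", skip);       /* 0 */
        get_generic_reply(7, &skip);
        printf("tid 7  skip=%d\n", skip);       /* 1 */
        return 0;
}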
396static void handle_statfs_reply(struct ceph_mon_client *monc, 443static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 444 struct ceph_msg *msg)
398{ 445{
399 struct ceph_mon_statfs_request *req; 446 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 447 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 448 u64 tid = le64_to_cpu(msg->hdr.tid);
402 449
403 if (msg->front.iov_len != sizeof(*reply)) 450 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 451 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 452 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 453
408 mutex_lock(&monc->mutex); 454 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 455 req = __lookup_generic_req(monc, tid);
410 if (req) { 456 if (req) {
411 *req->buf = reply->st; 457 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 458 req->result = 0;
459 get_generic_request(req);
413 } 460 }
414 mutex_unlock(&monc->mutex); 461 mutex_unlock(&monc->mutex);
415 if (req) 462 if (req) {
416 complete(&req->completion); 463 complete(&req->completion);
464 put_generic_request(req);
465 }
417 return; 466 return;
418 467
419bad: 468bad:
420 pr_err("corrupt statfs reply, no tid\n"); 469 pr_err("corrupt generic reply, no tid\n");
421 ceph_msg_dump(msg); 470 ceph_msg_dump(msg);
422} 471}
423 472
424/* 473/*
425 * (re)send a statfs request 474 * Do a synchronous statfs().
426 */ 475 */
427static int send_statfs(struct ceph_mon_client *monc, 476int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 477{
430 struct ceph_msg *msg; 478 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 479 struct ceph_mon_statfs *h;
480 int err;
432 481
433 dout("send_statfs tid %llu\n", req->tid); 482 req = kzalloc(sizeof(*req), GFP_NOFS);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 483 if (!req)
435 if (IS_ERR(msg)) 484 return -ENOMEM;
436 return PTR_ERR(msg); 485
437 req->request = msg; 486 kref_init(&req->kref);
438 msg->hdr.tid = cpu_to_le64(req->tid); 487 req->buf = buf;
439 h = msg->front.iov_base; 488 init_completion(&req->completion);
489
490 err = -ENOMEM;
491 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
492 if (!req->request)
493 goto out;
494 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
495 if (!req->reply)
496 goto out;
497
498 /* fill out request */
499 h = req->request->front.iov_base;
440 h->monhdr.have_version = 0; 500 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 501 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 502 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 503 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463 504
464 /* register request */ 505 /* register request */
465 mutex_lock(&monc->mutex); 506 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid; 507 req->tid = ++monc->last_tid;
467 req.last_attempt = jiffies; 508 req->request->hdr.tid = cpu_to_le64(req->tid);
468 req.delay = BASE_DELAY_INTERVAL; 509 __insert_generic_request(monc, req);
469 __insert_statfs(monc, &req); 510 monc->num_generic_requests++;
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex); 511 mutex_unlock(&monc->mutex);
472 512
473 /* send request and wait */ 513 /* send request and wait */
474 err = send_statfs(monc, &req); 514 ceph_con_send(monc->con, ceph_msg_get(req->request));
475 if (!err) 515 err = wait_for_completion_interruptible(&req->completion);
476 err = wait_for_completion_interruptible(&req.completion);
477 516
478 mutex_lock(&monc->mutex); 517 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 518 rb_erase(&req->node, &monc->generic_request_tree);
480 monc->num_statfs_requests--; 519 monc->num_generic_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex); 520 mutex_unlock(&monc->mutex);
483 521
484 if (!err) 522 if (!err)
485 err = req.result; 523 err = req->result;
524
525out:
526 kref_put(&req->kref, release_generic_request);
486 return err; 527 return err;
487} 528}
488 529
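Editor's note: the request previously lived on the waiter's stack; it now lives on the heap behind a kref, so handle_statfs_reply() can take a reference under the mutex and complete the request safely even as the waiting thread tears down. A minimal refcount sketch of that lifetime (not the kernel kref API):

#include <stdio.h>
#include <stdlib.h>

struct generic_req { int refs; int result; };

static struct generic_req *req_alloc(void)
{
        struct generic_req *r = calloc(1, sizeof(*r));
        if (r)
                r->refs = 1;            /* the waiter's reference */
        return r;
}

static void req_get(struct generic_req *r) { r->refs++; }

static void req_put(struct generic_req *r)
{
        if (--r->refs == 0) {
                printf("last ref: freeing request\n");
                free(r);
        }
}

int main(void)
{
        struct generic_req *r = req_alloc();

        req_get(r);     /* reply handler pins it under the mutex ... */
        req_put(r);     /* ... and drops it after complete() */
        req_put(r);     /* waiter's final put frees the request */
        return 0;
}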
489/* 530/*
490 * Resend pending statfs requests. 531 * Resend pending statfs requests.
491 */ 532 */
492static void __resend_statfs(struct ceph_mon_client *monc) 533static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 534{
494 struct ceph_mon_statfs_request *req; 535 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 536 struct rb_node *p;
496 537
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 538 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 539 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 540 ceph_con_revoke(monc->con, req->request);
541 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 542 }
501} 543}
502 544
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 628 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 629 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 630
589 /* msg pools */ 631 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 632 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 633 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 634 sizeof(struct ceph_mon_subscribe_ack),
635 GFP_NOFS);
636 if (!monc->m_subscribe_ack)
593 goto out_monmap; 637 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 638
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 639 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 640 if (!monc->m_subscribe)
597 goto out_pool1; 641 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 642
599 if (err < 0) 643 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 644 if (!monc->m_auth_reply)
601 645 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 646
647 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 648 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 649 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 650 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 651
610 monc->cur_mon = -1; 652 monc->cur_mon = -1;
611 monc->hunting = true; 653 monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 655 monc->sub_sent = 0;
614 656
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 657 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 658 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 659 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 660 monc->last_tid = 0;
619 661
620 monc->have_mdsmap = 0; 662 monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 664 monc->want_next_osdmap = 1;
623 return 0; 665 return 0;
624 666
625out_pool3: 667out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 668 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 669out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 670 ceph_msg_put(monc->m_subscribe);
629out_pool1: 671out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 672 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 673out_monmap:
632 kfree(monc->monmap); 674 kfree(monc->monmap);
633out: 675out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 693 ceph_auth_destroy(monc->auth);
652 694
653 ceph_msg_put(monc->m_auth); 695 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 696 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 697 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 698 ceph_msg_put(monc->m_subscribe_ack);
657 699
658 kfree(monc->monmap); 700 kfree(monc->monmap);
659} 701}
@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 723 monc->client->msgr->inst.name.num = monc->auth->global_id;
682 724
683 __send_subscribe(monc); 725 __send_subscribe(monc);
684 __resend_statfs(monc); 726 __resend_generic_request(monc);
685 } 727 }
686 mutex_unlock(&monc->mutex); 728 mutex_unlock(&monc->mutex);
687} 729}
@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 812
771 switch (type) { 813 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 814 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 815 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 816 break;
775 case CEPH_MSG_STATFS_REPLY: 817 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 818 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 819 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 820 m = ceph_msg_get(monc->m_auth_reply);
780 break; 821 break;
781 case CEPH_MSG_MON_MAP: 822 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 823 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 824 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 825 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 826 break;
786 } 827 }
787 828
@@ -826,7 +867,7 @@ out:
826 mutex_unlock(&monc->mutex); 867 mutex_unlock(&monc->mutex);
827} 868}
828 869
829const static struct ceph_connection_operations mon_con_ops = { 870static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 871 .get = ceph_con_get,
831 .put = ceph_con_put, 872 .put = ceph_con_put,
832 .dispatch = dispatch, 873 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
 44 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
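
[Editor's note] The statfs-specific request machinery above is generalized into a kref-counted ceph_mon_generic_request so that other monitor round trips (the poolop requests named in the comment) can share the tid tree and completion plumbing. Below is a minimal sketch of the caller-side lifecycle implied by the hunks above; the placement of kref_init/init_completion and the sketch's function name are assumptions, since the patch does not show them:

	/* Sketch of the generic-request lifecycle; not part of the patch. */
	static int do_generic_request_sketch(struct ceph_mon_client *monc,
					     struct ceph_mon_generic_request *req)
	{
		int err;

		kref_init(&req->kref);                /* caller holds one ref (assumed) */
		init_completion(&req->completion);    /* assumed caller-side init */

		mutex_lock(&monc->mutex);
		req->tid = ++monc->last_tid;
		req->request->hdr.tid = cpu_to_le64(req->tid);
		__insert_generic_request(monc, req);  /* rbtree keyed by tid */
		monc->num_generic_requests++;
		mutex_unlock(&monc->mutex);

		/* the connection takes its own ref while the msg is in flight */
		ceph_con_send(monc->con, ceph_msg_get(req->request));
		err = wait_for_completion_interruptible(&req->completion);

		mutex_lock(&monc->mutex);
		rb_erase(&req->node, &monc->generic_request_tree);
		monc->num_generic_requests--;
		mutex_unlock(&monc->mutex);

		if (!err)
			err = req->result;
		kref_put(&req->kref, release_generic_request);
		return err;
	}
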
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
 13 * conditions at unexpected times. We use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
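
[Editor's note] With the rewrite above, a msgpool is a thin wrapper over mempool_t. A hedged usage sketch of the new API follows; the pool name, sizes, and error handling are illustrative rather than taken from the patch:

	struct ceph_msgpool pool;
	struct ceph_msg *msg;

	/* preallocate 8 messages with 512-byte fronts; fails with -ENOMEM */
	if (ceph_msgpool_init(&pool, 512, 8, true, "example") < 0)
		return -ENOMEM;

	msg = ceph_msgpool_get(&pool, 0);  /* mempool_alloc(GFP_NOFS) underneath */
	/* ... fill msg->front and send it ... */
	ceph_msgpool_put(&pool, msg);      /* resets front_len, retakes single ref */
	ceph_msgpool_destroy(&pool);

Passing a front_len larger than the pool's preallocated size falls back to a fresh ceph_msg_new() with a WARN_ON, as shown in ceph_msgpool_get above.
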
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..afa7bb3895c4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 715 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 716 * it from an updated osd map.
717 */ 717 */
718 while (!list_empty(&osdc->req_lru)) { 718 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 720 r_req_lru_item);
721 721
@@ -1078,6 +1078,7 @@ done:
1078 if (newmap) 1078 if (newmap)
1079 kick_requests(osdc, NULL); 1079 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1080 up_read(&osdc->map_sem);
1081 wake_up(&osdc->client->auth_wq);
1081 return; 1082 return;
1082 1083
1083bad: 1084bad:
@@ -1087,45 +1088,6 @@ bad:
1087 return; 1088 return;
1088} 1089}
1089 1090
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1091/*
1130 * Register request, send initial attempt. 1092 * Register request, send initial attempt.
1131 */ 1093 */
@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1214 if (!osdc->req_mempool)
1253 goto out; 1215 goto out;
1254 1216
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1217 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1218 "osd_op");
1256 if (err < 0) 1219 if (err < 0)
1257 goto out_mempool; 1220 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1221 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1222 OSD_OPREPLY_FRONT_LEN, 10, true,
1223 "osd_op_reply");
1260 if (err < 0) 1224 if (err < 0)
1261 goto out_msgpool; 1225 goto out_msgpool;
1262 return 0; 1226 return 0;
@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1266 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1267 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1268 false, 1);
1305 if (IS_ERR(req)) 1269 if (!req)
1306 return PTR_ERR(req); 1270 return -ENOMEM;
1307 1271
1308 /* it may be a short read due to an object boundary */ 1272 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1273 req->r_pages = pages;
@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1309 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1310 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1311 nofail, 1);
1348 if (IS_ERR(req)) 1312 if (!req)
1349 return PTR_ERR(req); 1313 return -ENOMEM;
1350 1314
1351 /* it may be a short write due to an object boundary */ 1315 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1316 req->r_pages = pages;
@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1394} 1358}
1395 1359
1396/* 1360/*
1397 * lookup and return message for incoming reply 1361 * lookup and return message for incoming reply. set up reply message
1362 * pages.
1398 */ 1363 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1364static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1365 struct ceph_msg_header *hdr,
@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1372 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1373 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1374 u64 tid;
1410 int err;
1411 1375
1412 tid = le64_to_cpu(hdr->tid); 1376 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1377 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1389 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1390 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1391 ceph_con_put(req->r_con_filling_msg);
1392 req->r_con_filling_msg = NULL;
1428 } 1393 }
1429 1394
1430 if (front > req->r_reply->front.iov_len) { 1395 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1396 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1397 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1398 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1399 if (!m)
1435 goto out; 1400 goto out;
1436 ceph_msg_put(req->r_reply); 1401 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1402 req->r_reply = m;
@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1404 m = ceph_msg_get(req->r_reply);
1440 1405
1441 if (data_len > 0) { 1406 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1407 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1408 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1409
1410 if (unlikely(req->r_num_pages < want)) {
1411 pr_warning("tid %lld reply %d > expected %d pages\n",
1412 tid, want, m->nr_pages);
1444 *skip = 1; 1413 *skip = 1;
1445 ceph_msg_put(m); 1414 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1415 m = NULL;
1416 goto out;
1447 } 1417 }
1418 m->pages = req->r_pages;
1419 m->nr_pages = req->r_num_pages;
1448 } 1420 }
1449 *skip = 0; 1421 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1422 req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1438
1467 switch (type) { 1439 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1440 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1441 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1442 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1443 return get_reply(con, hdr, skip);
1472 default: 1444 default:
@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1524 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1525}
1554 1526
1555const static struct ceph_connection_operations osd_con_ops = { 1527static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1528 .get = get_osd_con,
1557 .put = put_osd_con, 1529 .put = put_osd_con,
1558 .dispatch = dispatch, 1530 .dispatch = dispatch,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 22{
23 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 24 if (!page)
25 return -ENOMEM; 25 return -ENOMEM;
26 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -208,6 +208,7 @@ enum {
208 /* read */ 208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 212
212 /* write */ 213 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
308/* 325/*
309 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
310 * payload 327 * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
321 struct { 338 struct {
322 __le32 name_len; 339 __le32 name_len;
323 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
325 struct { 344 struct {
326 __u8 class_len; 345 __u8 class_len;
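
[Editor's note] The cmp_op/cmp_mode fields added to the xattr op let CEPH_OSD_OP_CMPXATTR gate an operation on an attribute comparison. A hedged sketch of filling one in; the attribute name and value length are illustrative, and anything beyond the struct layout shown above is an assumption:

	struct ceph_osd_op op = { 0 };

	op.op = cpu_to_le16(CEPH_OSD_OP_CMPXATTR);
	op.xattr.name_len = cpu_to_le32(strlen("user.version"));   /* illustrative */
	op.xattr.value_len = cpu_to_le32(value_len);               /* illustrative */
	op.xattr.cmp_op = CEPH_OSD_CMPXATTR_OP_EQ;         /* succeed only on match */
	op.xattr.cmp_mode = CEPH_OSD_CMPXATTR_MODE_STRING; /* compare as a string */
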
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..7c663d9b9f81 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/statfs.h> 14#include <linux/statfs.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 16
20#include "decode.h" 17#include "decode.h"
21#include "super.h" 18#include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
107static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
108{ 105{
109 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
113 return 0; 110 return 0;
114} 111}
115 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
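[Editor's note] As a worked example of the formula just above: with 4 KB pages, a 1 GB machine has totalram_pages of about 262144, int_sqrt gives 512, and (16*512) << (12-10) = 32768 kB, matching the 1GB row in the table.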
116 141
117/** 142/**
118 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 197 if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
162} 216}
163 217
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 218static int __init init_caches(void)
194{ 219{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
308 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 339 Opt_congestion_kb,
313 Opt_last_int, 340 Opt_last_int,
314 /* int args above */ 341 /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 373 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
394 424
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 528 args->max_readdir = intval;
499 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
500 case Opt_congestion_kb: 533 case Opt_congestion_kb:
501 args->congestion_kb = intval; 534 args->congestion_kb = intval;
502 break; 535 break;
@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 715/*
683 * true if we have the mon map (and have thus joined the cluster) 716 * true if we have the mon map (and have thus joined the cluster)
684 */ 717 */
685static int have_mon_map(struct ceph_client *client) 718static int have_mon_and_osd_map(struct ceph_client *client)
686{ 719{
687 return client->monc.monmap && client->monc.monmap->epoch; 720 return client->monc.monmap && client->monc.monmap->epoch &&
721 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 722}
689 723
690/* 724/*
@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 796 if (err < 0)
763 goto out; 797 goto out;
764 798
765 while (!have_mon_map(client)) { 799 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 800 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 801 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 802 goto out;
@@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 804 /* wait */
771 dout("mount waiting for mon_map\n"); 805 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 806 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 807 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 808 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 809 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 810 goto out;
777 if (client->auth_err < 0) { 811 if (client->auth_err < 0) {
@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 918/*
885 * construct our own bdi so we can control readahead, etc. 919 * construct our own bdi so we can control readahead, etc.
886 */ 920 */
921static atomic_long_t bdi_seq = ATOMIC_INIT(0);
922
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 923static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 924{
889 int err; 925 int err;
@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 929 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 930 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 931 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 932 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
933 atomic_long_inc_return(&bdi_seq));
897 if (!err) 934 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 935 sb->s_bdi = &client->backing_dev_info;
899 return err; 936 return err;
@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 969 goto out;
933 } 970 }
934 971
935 if (ceph_client(sb) != client) { 972 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 973 ceph_destroy_client(client);
937 client = ceph_client(sb); 974 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 975 dout("get_sb got existing client %p\n", client);
939 } else { 976 } else {
940 dout("get_sb using new client %p\n", client); 977 dout("get_sb using new client %p\n", client);
@@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
952 989
953out_splat: 990out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc); 991 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount); 992 deactivate_locked_super(sb);
956 deactivate_super(sb);
957 goto out_final; 993 goto out_final;
958 994
959out: 995out:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..3725c9ee9d08 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -52,24 +52,25 @@
52 52
53struct ceph_mount_args { 53struct ceph_mount_args {
54 int sb_flags; 54 int sb_flags;
55 int flags;
56 struct ceph_fsid fsid;
57 struct ceph_entity_addr my_addr;
55 int num_mon; 58 int num_mon;
56 struct ceph_entity_addr *mon_addr; 59 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 60 int mount_timeout;
59 int osd_idle_ttl; 61 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 62 int osd_timeout;
68 int osd_keepalive_timeout; 63 int osd_keepalive_timeout;
64 int wsize;
65 int rsize; /* max readahead */
66 int congestion_kb; /* max writeback in flight */
67 int caps_wanted_delay_min, caps_wanted_delay_max;
68 int cap_release_safety;
 69 int max_readdir; /* max readdir result (entries) */
70 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 71 char *snapdir_name; /* default ".snap" */
70 char *name; 72 char *name;
71 char *secret; 73 char *secret;
72 int cap_release_safety;
73}; 74};
74 75
75/* 76/*
@@ -80,13 +81,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 81#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 82#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 83#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
84#define CEPH_MAX_READDIR_DEFAULT 1024
85#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 86
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 88#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 89
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 90#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 91#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 92/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 93 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 94 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +98,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 99#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 100
101#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 102
100/* mount state */ 103/* mount state */
101enum { 104enum {
@@ -160,12 +163,6 @@ struct ceph_client {
160#endif 163#endif
161}; 164};
162 165
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 166/*
170 * File i/o capability. This tracks shared state with the metadata 167 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 168 * server that allows us to cache or writeback attributes or to read
@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 868extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 869extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 870extern void ceph_dentry_lru_del(struct dentry *dn);
871extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 872
875/* 873/*
876 * our d_ops vary depending on whether the inode is live, 874 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
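
[Editor's note] With the prefix change, the virtual xattrs are now visible to userspace under "ceph." rather than "user.ceph.". A hedged userspace sketch of reading one; the mount path and buffer size are illustrative:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/xattr.h>

	/* Read the ceph.dir.entries vxattr for a directory on a ceph mount. */
	static void show_dir_entries(const char *path)
	{
		char buf[64];
		ssize_t n = getxattr(path, "ceph.dir.entries",
				     buf, sizeof(buf) - 1);

		if (n >= 0) {
			buf[n] = '\0';
			printf("%s: %s entries\n", path, buf);
		}
	}
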
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 570
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 573 goto list_xattr;
579 } else { 574 } else {
580 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
624{ 619{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 636 return -ENOMEM;
642 err = -ENOMEM; 637 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 640 if (!pages[i]) {
646 nr_pages = i; 641 nr_pages = i;
647 goto out; 642 goto out;
@@ -779,7 +774,7 @@ out:
779 774
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 776{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc52..7196077b1688 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
218 host_file = cfi->cfi_container; 218 host_file = cfi->cfi_container;
219 219
220 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); 220 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 221 if ( !err && !datasync ) {
222 lock_kernel(); 222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
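
[Editor's note] The pioctl conversion above follows the standard .ioctl to .unlocked_ioctl pushdown: the handler now takes the BKL itself instead of relying on the VFS to hold it, and derives the inode from the file pointer; the same signature change is applied to coda_psdev_ioctl below. A minimal sketch of the pattern, with hypothetical handler names:

	/* Generic shape of the BKL pushdown used above; names are illustrative. */
	static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
					   unsigned long arg)
	{
		long ret;

		lock_kernel();             /* previously taken by the old .ioctl path */
		ret = example_do_ioctl(filp, cmd, arg);   /* hypothetical helper */
		unlock_kernel();
		return ret;
	}
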
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp,
-			    unsigned int cmd, unsigned long arg)
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int data;
 
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
 	.poll		= coda_psdev_poll,
-	.ioctl		= coda_psdev_ioctl,
+	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
 };
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..d96047b4a633 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
  */
 static void prune_dcache(int count)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 	int w_count;
 	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
 	if (unused == 0 || count == 0)
 		return;
 	spin_lock(&dcache_lock);
-restart:
 	if (count >= unused)
 		prune_ratio = 1;
 	else
 		prune_ratio = unused / count;
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_nr_dentry_unused == 0)
 			continue;
 		sb->s_count++;
@@ -590,14 +591,10 @@ restart:
 		}
 		spin_lock(&sb_lock);
 		count -= pruned;
-		/*
-		 * restart only when sb is no longer on the list and
-		 * we have more work to do.
-		 */
-		if (__put_super_and_need_restart(sb) && count > 0) {
-			spin_unlock(&sb_lock);
-			goto restart;
-		}
+		__put_super(sb);
+		/* more work left to do? */
+		if (count <= 0)
+			break;
 	}
 	spin_unlock(&sb_lock);
 	spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
+		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
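
The prune_dcache() change replaces the restart-from-the-top pattern with the safe-iterator idiom: __put_super() can only remove sb itself from super_blocks, so list_for_each_entry_safe() keeps a valid next pointer and the walk simply continues, skipping superblocks whose s_instances list is already empty (i.e. ones being torn down). A sketch of the idiom under those assumptions:

	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;		/* already dying, skip */
		sb->s_count++;			/* pin sb across unlocked work */
		spin_unlock(&sb_lock);
		/* ... prune this superblock's unused dentries ... */
		spin_lock(&sb_lock);
		__put_super(sb);		/* may free sb; 'n' stays valid */
	}
	spin_unlock(&sb_lock);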
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c0..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, s);
-
 	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
 
 	error = mknod_ptmx(s);
 	if (error)
-		goto out_dput;
+		goto out_undo_sget;
+
+	simple_set_mnt(mnt, s);
 
 	return 0;
 
-out_dput:
-	dput(s->s_root);	/* undo dget() in simple_set_mnt() */
-
 out_undo_sget:
 	deactivate_locked_super(s);
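
The devpts hunk is a publish-last reordering: simple_set_mnt() takes a dentry reference and exposes the superblock through the vfsmount, so calling it before mknod_ptmx() forced the error path to undo that with dput(). Doing every fallible step first and publishing only on success leaves a single unwind label. Schematically (a sketch, not the full function):

	error = mknod_ptmx(s);		/* all fallible setup first */
	if (error)
		goto out_undo_sget;	/* nothing published yet */

	simple_set_mnt(mnt, s);		/* publish last */
	return 0;

out_undo_sget:
	deactivate_locked_super(s);	/* one simple unwind */
	return error;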
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 17903b491298..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
 		if (lkb->lkb_rqmode < mode)
 			break;
 
-	if (!lkb)
-		list_add_tail(new, head);
-	else
-		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+	__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
 }
 
 /* add/remove lkb to rsb's grant/convert/wait queue */
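
The removed "if (!lkb)" branch was dead code: a list_for_each_entry()-style scan that runs off the end leaves the cursor's embedded node aliasing the list head, so inserting before it is exactly list_add_tail(). A compilable userspace model of the same ordered insert (simplified list primitives, not the kernel's):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void __list_add(struct list_head *n,
		       struct list_head *prev, struct list_head *next)
{
	next->prev = n; n->next = next;
	n->prev = prev; prev->next = n;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct lkb { int rqmode; struct list_head node; };

/* insert so the list stays sorted by descending rqmode */
static void add_ordered(struct lkb *new_lkb, struct list_head *head)
{
	struct list_head *p;

	for (p = head->next; p != head; p = p->next)
		if (container_of(p, struct lkb, node)->rqmode <
		    new_lkb->rqmode)
			break;
	/* p == head when we fell off the end: insert-before == add-tail */
	__list_add(&new_lkb->node, p->prev, p);
}

int main(void)
{
	struct list_head head = { &head, &head }, *p;
	struct lkb a = { 5 }, b = { 3 }, c = { 4 };

	add_ordered(&a, &head);
	add_ordered(&b, &head);
	add_ordered(&c, &head);
	for (p = head.next; p != &head; p = p->next)
		printf("%d\n", container_of(p, struct lkb, node)->rqmode);
	return 0;	/* prints 5, 4, 3 */
}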
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 8b6e73c47435..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		lkb->lkb_ast_first = type;
 		wake_up_interruptible(&proc->wait);
 	}
 	if (type == AST_COMP && (ast_type & AST_COMP))
@@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 
 	eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
 	if (eol) {
-		lkb->lkb_ast_type &= ~AST_BAST;
 		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
 	}
 
@@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
 }
 
 static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
-			       int bmode, char __user *buf, size_t count)
+			       int mode, char __user *buf, size_t count)
 {
 #ifdef CONFIG_COMPAT
 	struct dlm_lock_result32 result32;
@@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	if (type == AST_BAST) {
 		result.user_astaddr = ua->bastaddr;
 		result.user_astparam = ua->bastparam;
-		result.bast_mode = bmode;
+		result.bast_mode = mode;
 	} else {
 		result.user_astaddr = ua->castaddr;
 		result.user_astparam = ua->castparam;
@@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_user_proc *proc = file->private_data;
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
-	int error, type=0, bmode=0, removed = 0;
+	int error = 0, removed;
+	int ret_type, ret_mode;
+	int bastmode, castmode, do_bast, do_cast;
 
 	if (count == sizeof(struct dlm_device_version)) {
 		error = copy_version_to_user(buf, count);
@@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 #endif
 		return -EINVAL;
 
+ try_another:
+
 	/* do we really need this? can a read happen after a close? */
 	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
 		return -EINVAL;
@@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 
 	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
 
-	if (lkb->lkb_ast_type & AST_COMP) {
-		lkb->lkb_ast_type &= ~AST_COMP;
-		type = AST_COMP;
-	} else if (lkb->lkb_ast_type & AST_BAST) {
-		lkb->lkb_ast_type &= ~AST_BAST;
-		type = AST_BAST;
-		bmode = lkb->lkb_bastmode;
+	removed = 0;
+	ret_type = 0;
+	ret_mode = 0;
+	do_bast = lkb->lkb_ast_type & AST_BAST;
+	do_cast = lkb->lkb_ast_type & AST_COMP;
+	bastmode = lkb->lkb_bastmode;
+	castmode = lkb->lkb_castmode;
+
+	/* when both are queued figure out which to do first and
+	   switch first so the other goes in the next read */
+
+	if (do_cast && do_bast) {
+		if (lkb->lkb_ast_first == AST_COMP) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = AST_BAST;
+		} else {
+			ret_type = AST_BAST;
+			ret_mode = bastmode;
+			lkb->lkb_ast_type &= ~AST_BAST;
+			lkb->lkb_ast_first = AST_COMP;
+		}
+	} else {
+		ret_type = lkb->lkb_ast_first;
+		ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
+		lkb->lkb_ast_type &= ~ret_type;
+		lkb->lkb_ast_first = 0;
+	}
+
+	/* if we're doing a bast but the bast is unnecessary, then
+	   switch to do nothing or do a cast if that was needed next */
+
+	if ((ret_type == AST_BAST) &&
+	    dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
+		ret_type = 0;
+		ret_mode = 0;
+
+		if (do_cast) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = 0;
+		}
+	}
+
+	if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
+		log_print("device_read %x ast_first %x ast_type %x",
+			  lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
 	}
 
 	if (!lkb->lkb_ast_type) {
@@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	}
 	spin_unlock(&proc->asts_spin);
 
-	error = copy_result_to_user(lkb->lkb_ua,
-			test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-			type, bmode, buf, count);
+	if (ret_type) {
+		error = copy_result_to_user(lkb->lkb_ua,
+				test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				ret_type, ret_mode, buf, count);
+
+		if (ret_type == AST_COMP)
+			lkb->lkb_castmode_done = castmode;
+		if (ret_type == AST_BAST)
+			lkb->lkb_bastmode_done = bastmode;
+	}
 
 	/* removes reference for the proc->asts lists added by
 	   dlm_user_add_ast() and may result in the lkb being freed */
+
 	if (removed)
 		dlm_put_lkb(lkb);
 
+	/* the bast that was queued was eliminated (see unnecessary above),
+	   leaving nothing to return */
+
+	if (!ret_type)
+		goto try_another;
+
 	return error;
 }
 
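
The lkb_ast_first bookkeeping above fixes ordering when a completion ast (AST_COMP) and a blocking ast (AST_BAST) are both queued on one lkb: the first-queued type is returned by this read and the flag flips so the next read returns the other. A compilable userspace model of just that arbitration (a sketch of the rule, not the kernel code):

#include <stdio.h>

#define AST_COMP 1
#define AST_BAST 2

struct lkb { int ast_type, ast_first; };

static int pick_ast(struct lkb *lkb)
{
	int ret;

	if ((lkb->ast_type & AST_COMP) && (lkb->ast_type & AST_BAST)) {
		ret = lkb->ast_first;			/* older one first */
		lkb->ast_type &= ~ret;
		lkb->ast_first = (ret == AST_COMP) ? AST_BAST : AST_COMP;
	} else {
		ret = lkb->ast_first;
		lkb->ast_type &= ~ret;
		lkb->ast_first = 0;
	}
	return ret;
}

int main(void)
{
	struct lkb lkb = { AST_COMP | AST_BAST, AST_COMP };

	/* bast queued after cast: reads return COMP (1) then BAST (2) */
	printf("%d %d\n", pick_ast(&lkb), pick_ast(&lkb));
	return 0;
}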
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static void drop_pagecache_sb(struct super_block *sb)
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
 	iput(toput_inode);
 }
 
-static void drop_pagecache(void)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			drop_pagecache_sb(sb);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
 static void drop_slab(void)
 {
 	int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
 	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (write) {
 		if (sysctl_drop_caches & 1)
-			drop_pagecache();
+			iterate_supers(drop_pagecache_sb, NULL);
 		if (sysctl_drop_caches & 2)
 			drop_slab();
 	}
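
drop_pagecache() disappears because iterate_supers() now encapsulates exactly that walk (pin the superblock, drop sb_lock, take s_umount, invoke the callback, unpin); callers only supply a void (*f)(struct super_block *, void *) plus an opaque argument, which is why drop_pagecache_sb() grows the otherwise-unused void * parameter. Usage collapses to a one-liner:

	/* callback signature required by iterate_supers() */
	static void drop_pagecache_sb(struct super_block *sb, void *unused);

	/* the hand-rolled super_blocks loop becomes: */
	iterate_supers(drop_pagecache_sb, NULL);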
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f00..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
 int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 				      struct page *page_for_lower,
 				      size_t offset_in_page, size_t size);
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
-		   size_t size);
+int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
 int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
 			struct inode *ecryptfs_inode);
 int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 				     pgoff_t page_index,
 				     size_t offset_in_page, size_t size,
 				     struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index);
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
 int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
 int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
 				 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5ebf..3bdddbcc785f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	return vfs_fsync(ecryptfs_file_to_lower(file),
-			 ecryptfs_dentry_to_lower(dentry),
-			 datasync);
+	return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
 }
 
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affac..65dee2f336ae 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
 static int grow_file(struct dentry *ecryptfs_dentry)
 {
 	struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
-	struct file fake_file;
-	struct ecryptfs_file_info tmp_file_info;
 	char zero_virt[] = { 0x00 };
 	int rc = 0;
 
-	memset(&fake_file, 0, sizeof(fake_file));
-	fake_file.f_path.dentry = ecryptfs_dentry;
-	memset(&tmp_file_info, 0, sizeof(tmp_file_info));
-	ecryptfs_set_file_private(&fake_file, &tmp_file_info);
-	ecryptfs_set_file_lower(
-		&fake_file,
-		ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
-	rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
+	rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
 	i_size_write(ecryptfs_inode, 0);
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 {
 	int rc = 0;
 	struct inode *inode = dentry->d_inode;
-	struct dentry *lower_dentry;
-	struct file fake_ecryptfs_file;
 	struct ecryptfs_crypt_stat *crypt_stat;
 	loff_t i_size = i_size_read(inode);
 	loff_t lower_size_before_truncate;
@@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		goto out;
 	}
 	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
-	/* Set up a fake ecryptfs file, this is used to interface with
-	 * the file in the underlying filesystem so that the
-	 * truncation has an effect there as well. */
-	memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
-	fake_ecryptfs_file.f_path.dentry = dentry;
-	/* Released at out_free: label */
-	ecryptfs_set_file_private(&fake_ecryptfs_file,
-				  kmem_cache_alloc(ecryptfs_file_info_cache,
-						   GFP_KERNEL));
-	if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	ecryptfs_set_file_lower(
-		&fake_ecryptfs_file,
-		ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
 	/* Switch on growing or shrinking file */
 	if (ia->ia_size > i_size) {
 		char zero[] = { 0x00 };
@@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		 * this triggers code that will fill in 0's throughout
 		 * the intermediate portion of the previous end of the
 		 * file and the new and of the file */
-		rc = ecryptfs_write(&fake_ecryptfs_file, zero,
+		rc = ecryptfs_write(inode, zero,
 				    (ia->ia_size - 1), 1);
 	} else { /* ia->ia_size < i_size_read(inode) */
 		/* We're chopping off all the pages down to the page
@@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 			rc = vmtruncate(inode, ia->ia_size);
 			if (rc)
-				goto out_free;
+				goto out;
 			lower_ia->ia_size = ia->ia_size;
 			lower_ia->ia_valid |= ATTR_SIZE;
-			goto out_free;
+			goto out;
 		}
 		if (num_zeros) {
 			char *zeros_virt;
@@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 			zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
 			if (!zeros_virt) {
 				rc = -ENOMEM;
-				goto out_free;
+				goto out;
 			}
-			rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
+			rc = ecryptfs_write(inode, zeros_virt,
 					    ia->ia_size, num_zeros);
 			kfree(zeros_virt);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to zero out "
 				       "the remainder of the end page on "
 				       "reducing truncate; rc = [%d]\n", rc);
-				goto out_free;
+				goto out;
 			}
 		}
 		vmtruncate(inode, ia->ia_size);
@@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 			printk(KERN_ERR "Problem with "
 			       "ecryptfs_write_inode_size_to_metadata; "
 			       "rc = [%d]\n", rc);
-			goto out_free;
+			goto out;
 		}
 		/* We are reducing the size of the ecryptfs file, and need to
 		 * know if we need to reduce the size of the lower file. */
@@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 	} else
 		lower_ia->ia_valid &= ~ATTR_SIZE;
 	}
-out_free:
-	if (ecryptfs_file_to_private(&fake_ecryptfs_file))
-		kmem_cache_free(ecryptfs_file_info_cache,
-				ecryptfs_file_to_private(&fake_ecryptfs_file));
 out:
 	return rc;
 }
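
The theme of these ecryptfs hunks (continued in main.c, mmap.c and read_write.c below) is that ecryptfs_write() and ecryptfs_get_locked_page() only ever needed the inode, so the on-stack struct file mock-ups, their kmem_cache-allocated private data, and the whole out_free unwind can be deleted. A caller now reads, schematically:

	/* no fake struct file to assemble or free afterwards */
	rc = ecryptfs_write(inode, zeros_virt, ia->ia_size, num_zeros);
	if (rc)
		goto out;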
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25e..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
  *
  * Returns zero on success; non-zero on error
  */
-static int ecryptfs_parse_options(struct super_block *sb, char *options)
+static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
 {
 	char *p;
 	int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	int fn_cipher_key_bytes;
 	int fn_cipher_key_bytes_set = 0;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+		&sbi->mount_crypt_stat;
 	substring_t args[MAX_OPT_ARGS];
 	int token;
 	char *sig_src;
@@ -483,68 +483,7 @@ out:
 }
 
 struct kmem_cache *ecryptfs_sb_info_cache;
-
-/**
- * ecryptfs_fill_super
- * @sb: The ecryptfs super block
- * @raw_data: The options passed to mount
- * @silent: Not used but required by function prototype
- *
- * Sets up what we can of the sb, rest is done in ecryptfs_read_super
- *
- * Returns zero on success; non-zero otherwise
- */
-static int
-ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-	struct ecryptfs_sb_info *esi;
-	int rc = 0;
-
-	/* Released in ecryptfs_put_super() */
-	ecryptfs_set_superblock_private(sb,
-			kmem_cache_zalloc(ecryptfs_sb_info_cache,
-					  GFP_KERNEL));
-	esi = ecryptfs_superblock_to_private(sb);
-	if (!esi) {
-		ecryptfs_printk(KERN_WARNING, "Out of memory\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
-	if (rc)
-		goto out;
-
-	sb->s_bdi = &esi->bdi;
-	sb->s_op = &ecryptfs_sops;
-	/* Released through deactivate_super(sb) from get_sb_nodev */
-	sb->s_root = d_alloc(NULL, &(const struct qstr) {
-			     .hash = 0,.name = "/",.len = 1});
-	if (!sb->s_root) {
-		ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	sb->s_root->d_op = &ecryptfs_dops;
-	sb->s_root->d_sb = sb;
-	sb->s_root->d_parent = sb->s_root;
-	/* Released in d_release when dput(sb->s_root) is called */
-	/* through deactivate_super(sb) from get_sb_nodev() */
-	ecryptfs_set_dentry_private(sb->s_root,
-				    kmem_cache_zalloc(ecryptfs_dentry_info_cache,
-						      GFP_KERNEL));
-	if (!ecryptfs_dentry_to_private(sb->s_root)) {
-		ecryptfs_printk(KERN_ERR,
-				"dentry_info_cache alloc failed\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	rc = 0;
-out:
-	/* Should be able to rely on deactivate_super called from
-	 * get_sb_nodev */
-	return rc;
-}
+static struct file_system_type ecryptfs_fs_type;
 
 /**
  * ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
 		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
 		goto out;
 	}
+	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Mount on filesystem of type "
+			"eCryptfs explicitly disallowed due to "
+			"known incompatibilities\n");
+		goto out_free;
+	}
 	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
 	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
  * @dev_name: The path to mount over
  * @raw_data: The options passed into the kernel
  *
- * The whole ecryptfs_get_sb process is broken into 4 functions:
+ * The whole ecryptfs_get_sb process is broken into 3 functions:
  * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
- *                        with as much information as it can before needing
- *                        the lower filesystem.
 * ecryptfs_read_super(): this accesses the lower filesystem and uses
 *                        ecryptfs_interpose to perform most of the linking
 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 			const char *dev_name, void *raw_data,
 			struct vfsmount *mnt)
 {
+	struct super_block *s;
+	struct ecryptfs_sb_info *sbi;
+	struct ecryptfs_dentry_info *root_info;
+	const char *err = "Getting sb failed";
 	int rc;
-	struct super_block *sb;
 
-	rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
-	if (rc < 0) {
-		printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
+	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
+	if (!sbi) {
+		rc = -ENOMEM;
 		goto out;
 	}
-	sb = mnt->mnt_sb;
-	rc = ecryptfs_parse_options(sb, raw_data);
+
+	rc = ecryptfs_parse_options(sbi, raw_data);
 	if (rc) {
-		printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
-		goto out_abort;
+		err = "Error parsing options";
+		goto out;
+	}
+
+	s = sget(fs_type, NULL, set_anon_super, NULL);
+	if (IS_ERR(s)) {
+		rc = PTR_ERR(s);
+		goto out;
 	}
-	rc = ecryptfs_read_super(sb, dev_name);
+
+	s->s_flags = flags;
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
 	if (rc) {
-		printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
-		goto out_abort;
+		deactivate_locked_super(s);
+		goto out;
 	}
-	goto out;
-out_abort:
-	dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
-	deactivate_locked_super(sb);
+
+	ecryptfs_set_superblock_private(s, sbi);
+	s->s_bdi = &sbi->bdi;
+
+	/* ->kill_sb() will take care of sbi after that point */
+	sbi = NULL;
+	s->s_op = &ecryptfs_sops;
+
+	rc = -ENOMEM;
+	s->s_root = d_alloc(NULL, &(const struct qstr) {
+			     .hash = 0,.name = "/",.len = 1});
+	if (!s->s_root) {
+		deactivate_locked_super(s);
+		goto out;
+	}
+	s->s_root->d_op = &ecryptfs_dops;
+	s->s_root->d_sb = s;
+	s->s_root->d_parent = s->s_root;
+
+	root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+	if (!root_info) {
+		deactivate_locked_super(s);
+		goto out;
+	}
+	/* ->kill_sb() will take care of root_info */
+	ecryptfs_set_dentry_private(s->s_root, root_info);
+	s->s_flags |= MS_ACTIVE;
+	rc = ecryptfs_read_super(s, dev_name);
+	if (rc) {
+		deactivate_locked_super(s);
+		err = "Reading sb failed";
+		goto out;
+	}
+	simple_set_mnt(mnt, s);
+	return 0;
+
 out:
+	if (sbi) {
+		ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
+		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
+	}
+	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
 	return rc;
 }
 
@@ -633,11 +624,16 @@ out:
  * @sb: The ecryptfs super block
  *
  * Used to bring the superblock down and free the private data.
- * Private data is free'd in ecryptfs_put_super()
 */
 static void ecryptfs_kill_block_super(struct super_block *sb)
 {
-	generic_shutdown_super(sb);
+	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+	kill_anon_super(sb);
+	if (!sb_info)
+		return;
+	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
+	bdi_destroy(&sb_info->bdi);
+	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 }
 
 static struct file_system_type ecryptfs_fs_type = {
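
The rewritten ecryptfs_get_sb() is the open-coded replacement for get_sb_nodev(): allocate and parse the fs-private info before any superblock exists, create an anonymous superblock with sget(), and once the private pointer is installed let ->kill_sb() own all teardown, which is why ecryptfs_put_super() is removed in super.c below. A skeleton of the pattern, with hypothetical example_* helpers standing in for the eCryptfs specifics:

static int example_get_sb(struct file_system_type *fs_type, int flags,
			  void *raw_data, struct vfsmount *mnt)
{
	struct super_block *s;
	struct example_sb_info *sbi;
	int rc;

	sbi = kmem_cache_zalloc(example_sb_info_cache, GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;
	rc = example_parse_options(sbi, raw_data);	/* sb-independent */
	if (rc)
		goto out_free;

	s = sget(fs_type, NULL, set_anon_super, NULL);
	if (IS_ERR(s)) {
		rc = PTR_ERR(s);
		goto out_free;
	}
	s->s_flags = flags;
	example_set_superblock_private(s, sbi);
	sbi = NULL;			/* ->kill_sb() owns it from here */

	rc = example_finish_setup(s);	/* root dentry, lower fs, ... */
	if (rc) {
		deactivate_locked_super(s);	/* runs ->kill_sb() */
		return rc;
	}
	s->s_flags |= MS_ACTIVE;
	simple_set_mnt(mnt, s);
	return 0;

out_free:
	kmem_cache_free(example_sb_info_cache, sbi);
	return rc;
}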
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68c..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
  * Returns locked and up-to-date page (if ok), with increased
  * refcnt.
  */
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index)
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
 {
-	struct dentry *dentry;
-	struct inode *inode;
-	struct address_space *mapping;
-	struct page *page;
-
-	dentry = file->f_path.dentry;
-	inode = dentry->d_inode;
-	mapping = inode->i_mapping;
-	page = read_mapping_page(mapping, index, (void *)file);
+	struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
 	if (!IS_ERR(page))
 		lock_page(page);
 	return page;
@@ -198,7 +190,7 @@ out:
 static int ecryptfs_readpage(struct file *file, struct page *page)
 {
 	struct ecryptfs_crypt_stat *crypt_stat =
-		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+		&ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
 	int rc = 0;
 
 	if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
 
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
-			&ecryptfs_inode_to_private(
-				file->f_path.dentry->d_inode)->crypt_stat;
+			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
 
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
 		    || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
 	unsigned to = from + copied;
 	struct inode *ecryptfs_inode = mapping->host;
 	struct ecryptfs_crypt_stat *crypt_stat =
-		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+		&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
 	int rc;
 
 	if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 
 /**
  * ecryptfs_write
- * @ecryptfs_file: The eCryptfs file into which to write
+ * @ecryptfs_inode: The eCryptfs file into which to write
  * @data: Virtual address where data to write is located
  * @offset: Offset in the eCryptfs file at which to begin writing the
  *          data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
+int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 		   size_t size)
 {
 	struct page *ecryptfs_page;
 	struct ecryptfs_crypt_stat *crypt_stat;
-	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	char *ecryptfs_page_virt;
 	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 			if (num_bytes > total_remaining_zeros)
 				num_bytes = total_remaining_zeros;
 		}
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
 							 ecryptfs_page_idx);
 		if (IS_ERR(ecryptfs_page)) {
 			rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 int ecryptfs_read(char *data, loff_t offset, size_t size,
 		  struct file *ecryptfs_file)
 {
+	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	struct page *ecryptfs_page;
 	char *ecryptfs_page_virt;
-	loff_t ecryptfs_file_size =
-		i_size_read(ecryptfs_file->f_dentry->d_inode);
+	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
 	loff_t pos;
 	int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
 
 		if (num_bytes > total_remaining_bytes)
 			num_bytes = total_remaining_bytes;
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
 							 ecryptfs_page_idx);
 		if (IS_ERR(ecryptfs_page)) {
 			rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d231..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
 }
 
 /**
- * ecryptfs_put_super
- * @sb: Pointer to the ecryptfs super block
- *
- * Final actions when unmounting a file system.
- * This will handle deallocation and release of our private data.
- */
-static void ecryptfs_put_super(struct super_block *sb)
-{
-	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
-
-	lock_kernel();
-
-	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
-	bdi_destroy(&sb_info->bdi);
-	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
-	ecryptfs_set_superblock_private(sb, NULL);
-
-	unlock_kernel();
-}
-
-/**
  * ecryptfs_statfs
  * @sb: The ecryptfs super block
  * @buf: The struct kstatfs to fill in with stats
@@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
 	.alloc_inode = ecryptfs_alloc_inode,
 	.destroy_inode = ecryptfs_destroy_inode,
 	.drop_inode = generic_delete_inode,
-	.put_super = ecryptfs_put_super,
 	.statfs = ecryptfs_statfs,
 	.remount_fs = NULL,
 	.clear_inode = ecryptfs_clear_inode,
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..9badbc0bfb1d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * use STACK_TOP because that can depend on attributes which aren't
 	 * configured yet.
 	 */
+	BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else if (executable_stack == EXSTACK_DISABLE_X)
 		vm_flags &= ~VM_EXEC;
 	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
 	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 			vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
 		goto out_unlock;
 	}
 
+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
 	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 	stack_size = vma->vm_end - vma->vm_start;
 	/*
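
The exec.c hunks implement a mark-then-clear scheme: the freshly inserted stack VMA carries VM_STACK_INCOMPLETE_SETUP while its final flags and protections are still being decided, so other code can refuse to touch a half-built stack (the BUG_ON guards against the marker overlapping a real VM_STACK_FLAGS bit). Once mprotect_fixup() has settled the flags, a plain mask clears the marker. Schematically:

	/* creation: visible in the mm, but flagged as under construction */
	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
	err = insert_vm_struct(mm, vma);

	/* ... later, after the final protections are applied ... */
	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;	/* fully set up */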
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 	de->inode_no = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
 	exofs_set_de_type(de, inode);
-	kunmap_atomic(page, KM_USER0);
+	kunmap_atomic(kaddr, KM_USER0);
 	err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
 	page_cache_release(page);
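
The exofs one-liner is a real bug fix: in this kernel generation kunmap_atomic() takes the mapped virtual address that kmap_atomic() returned, not the struct page, so passing page unmapped the wrong thing. The correct pairing looks like:

	void *kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr, 0, chunk_size);
	/* ... fill in the "." and ".." entries through kaddr ... */
	kunmap_atomic(kaddr, KM_USER0);	/* same address, not the page */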
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93e..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
+static int exofs_releasepage(struct page *page, gfp_t gfp)
+{
+	EXOFS_DBGMSG("page 0x%lx\n", page->index);
+	WARN_ON(1);
+	return try_to_free_buffers(page);
+}
+
+static void exofs_invalidatepage(struct page *page, unsigned long offset)
+{
+	EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+	WARN_ON(1);
+
+	block_invalidatepage(page, offset);
+}
+
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
 	.writepages	= exofs_writepages,
 	.write_begin	= exofs_write_begin_export,
 	.write_end	= exofs_write_end,
+	.releasepage	= exofs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.invalidatepage = exofs_invalidatepage,
+
+	/* Not implemented Yet */
+	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
+	.direct_IO	= NULL, /* TODO: Should be trivial to do */
+
+	/* With these NULL has special meaning or default is not exported */
+	.sync_page	= NULL,
+	.get_xip_mem	= NULL,
+	.migratepage	= NULL,
+	.launder_page	= NULL,
+	.is_partially_uptodate = NULL,
+	.error_remove_page = NULL,
 };
 
 /******************************************************************************
@@ -1123,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	sbi = sb->s_fs_info;
 
 	sb->s_dirt = 1;
-	inode->i_uid = current->cred->fsuid;
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else {
-		inode->i_gid = current->cred->fsgid;
-	}
-	inode->i_mode = mode;
-
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
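
inode_init_owner() centralizes the ownership rules this hunk used to open-code: the new inode gets the creator's fsuid, while a setgid directory donates its gid and keeps S_ISGID on new subdirectories. A compilable userspace model of the rule (a sketch, not the kernel helper):

#include <stdio.h>
#include <sys/stat.h>

struct fake_inode { unsigned uid, gid; mode_t mode; };

static void init_owner(struct fake_inode *inode,
		       const struct fake_inode *dir, mode_t mode,
		       unsigned fsuid, unsigned fsgid)
{
	inode->uid = fsuid;
	if (dir && (dir->mode & S_ISGID)) {
		inode->gid = dir->gid;		/* inherit from directory */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* setgid propagates to dirs */
	} else {
		inode->gid = fsgid;
	}
	inode->mode = mode;
}

int main(void)
{
	struct fake_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
	struct fake_inode sub;

	init_owner(&sub, &dir, S_IFDIR | 0755, 1000, 1000);
	printf("gid=%u setgid=%d\n", sub.gid, !!(sub.mode & S_ISGID));
	return 0;	/* gid=100 setgid=1, inherited from the directory */
}

The ext2 version of this conversion (fs/ext2/ialloc.c below) keeps one extra case: with the GRPID mount option the gid always comes from the directory, so that branch stays open-coded and only the default path calls inode_init_owner().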
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..ca7e2a0ed98a 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext2_xattr_acl_access_handler = {
+const struct xattr_handler ext2_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
 	.set	= ext2_xattr_set_acl,
 };
 
-struct xattr_handler ext2_xattr_acl_default_handler = {
+const struct xattr_handler ext2_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 3cf038c055d7..e8766a396776 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1332,6 +1332,12 @@ retry_alloc:
 
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
 	struct super_block * sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
-	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bitmap_bh;
 	unsigned long block_group;
 	unsigned long bit;
 	struct ext2_super_block * es;
@@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode)
 	    ino > le32_to_cpu(es->s_inodes_count)) {
 		ext2_error (sb, "ext2_free_inode",
 			    "reserved or nonexistent inode %lu", ino);
-		goto error_return;
+		return;
 	}
 	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
-	brelse(bitmap_bh);
 	bitmap_bh = read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh)
-		goto error_return;
+		return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode)
 	mark_buffer_dirty(bitmap_bh);
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
-error_return:
+
 	brelse(bitmap_bh);
 }
 
@@ -550,16 +549,12 @@ got:
 
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh2);
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..527c46d9bc1f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
  *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
-#include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
@@ -1406,11 +1405,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 			/* If this is the first large file
			 * created, add a flag to the superblock.
			 */
-			lock_kernel();
+			spin_lock(&EXT2_SB(sb)->s_lock);
			ext2_update_dynamic_rev(sb);
			EXT2_SET_RO_COMPAT_FEATURE(sb,
				EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
-			unlock_kernel();
+			spin_unlock(&EXT2_SB(sb)->s_lock);
			ext2_write_super(sb);
		}
	}
@@ -1467,7 +1466,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, iattr))
 		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
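
Both ext2 hunks belong to the same BKL-removal series as the super.c changes below: fields of the in-memory superblock that lock_kernel() used to guard are now protected by a per-fs spinlock, EXT2_SB(sb)->s_lock (initialized in ext2_fill_super()). The rule the series follows is to hold s_lock only around in-memory updates and to drop it before any buffer I/O, since a spinlock cannot be held across sleeping operations. A sketch, assuming the new s_lock field:

	spin_lock(&EXT2_SB(sb)->s_lock);
	ext2_update_dynamic_rev(sb);	/* must be called with s_lock held */
	EXT2_SET_RO_COMPAT_FEATURE(sb, EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
	spin_unlock(&EXT2_SB(sb)->s_lock);	/* unlock before we do IO */
	ext2_write_super(sb);			/* may sleep on buffers */

The companion change, is_quota_modification() replacing the bare ATTR_SIZE test, makes dquot_initialize() run for uid/gid changes as well as size changes, which is what quota transfer on chown needs.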
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..71e9eb1fa696 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
 #include <linux/random.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
@@ -39,7 +38,7 @@
 #include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
-			    struct ext2_super_block *es);
+			    struct ext2_super_block *es, int wait);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
 	struct ext2_super_block *es = sbi->s_es;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		spin_lock(&sbi->s_lock);
 		sbi->s_mount_state |= EXT2_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 
 	va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 	va_end(args);
 }
 
+/*
+ * This must be called with sbi->s_lock held.
+ */
 void ext2_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,8 +119,6 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		ext2_write_super(sb);
 
@@ -124,8 +126,10 @@ static void ext2_put_super (struct super_block * sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
+		spin_lock(&sbi->s_lock);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
@@ -140,8 +144,6 @@ static void ext2_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
@@ -209,6 +211,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct ext2_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
 
+	spin_lock(&sbi->s_lock);
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 
 	if (sbi->s_sb_block != 1)
@@ -281,6 +284,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
 
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
@@ -606,7 +610,6 @@ static int ext2_setup_super (struct super_block * sb,
 	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
-	ext2_write_super(sb);
 	if (test_opt (sb, DEBUG))
 		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
 			"bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +770,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
+	spin_lock_init(&sbi->s_lock);
+
 	/*
 	 * See what the current blocksize for the device is, and
 	 * use that as the blocksize.  Otherwise (or if the blocksize
@@ -1079,7 +1084,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
-	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+	ext2_write_super(sb);
 	return 0;
 
 cantfind_ext2:
@@ -1120,30 +1127,26 @@ static void ext2_clear_super_error(struct super_block *sb)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
-		       "superblock detected", sb->s_id);
+		ext2_msg(sb, KERN_ERR,
+		       "previous I/O error to superblock detected\n");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
 }
 
-static void ext2_commit_super (struct super_block * sb,
-			       struct ext2_super_block * es)
-{
-	ext2_clear_super_error(sb);
-	es->s_wtime = cpu_to_le32(get_seconds());
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sb->s_dirt = 0;
-}
-
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
+static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait)
 {
 	ext2_clear_super_error(sb);
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
+	/* unlock before we do IO */
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+	if (wait)
+		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
 	sb->s_dirt = 0;
 }
 
@@ -1157,43 +1160,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 * may have been checked while mounted and e2fsck may have
 * set s_state to EXT2_VALID_FS after some corrections.
 */
-
 static int ext2_sync_fs(struct super_block *sb, int wait)
 {
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
-	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
-
-	lock_kernel();
-	if (buffer_write_io_error(sbh)) {
-		/*
-		 * Oh, dear.  A previous attempt to write the
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		ext2_msg(sb, KERN_ERR,
-		       "previous I/O error to superblock detected\n");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
-	}
 
+	spin_lock(&sbi->s_lock);
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
 		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-		es->s_free_blocks_count =
-			cpu_to_le32(ext2_count_free_blocks(sb));
-		es->s_free_inodes_count =
-			cpu_to_le32(ext2_count_free_inodes(sb));
-		es->s_mtime = cpu_to_le32(get_seconds());
-		ext2_sync_super(sb, es);
-	} else {
-		ext2_commit_super(sb, es);
 	}
-	sb->s_dirt = 0;
-	unlock_kernel();
-
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, es, wait);
 	return 0;
 }
 
@@ -1215,7 +1193,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
-	lock_kernel();
+	spin_lock(&sbi->s_lock);
 
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
@@ -1254,13 +1232,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1255 } 1233 }
1256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1234 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1257 unlock_kernel(); 1235 spin_unlock(&sbi->s_lock);
1258 return 0; 1236 return 0;
1259 } 1237 }
1260 if (*flags & MS_RDONLY) { 1238 if (*flags & MS_RDONLY) {
1261 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || 1239 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
1262 !(sbi->s_mount_state & EXT2_VALID_FS)) { 1240 !(sbi->s_mount_state & EXT2_VALID_FS)) {
1263 unlock_kernel(); 1241 spin_unlock(&sbi->s_lock);
1264 return 0; 1242 return 0;
1265 } 1243 }
1266 /* 1244 /*
@@ -1269,6 +1247,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 */ 1247 */
1270 es->s_state = cpu_to_le16(sbi->s_mount_state); 1248 es->s_state = cpu_to_le16(sbi->s_mount_state);
1271 es->s_mtime = cpu_to_le32(get_seconds()); 1249 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock);
1251 ext2_sync_super(sb, es, 1);
1272 } else { 1252 } else {
1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1274 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1254 ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1268,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1288 sbi->s_mount_state = le16_to_cpu(es->s_state); 1268 sbi->s_mount_state = le16_to_cpu(es->s_state);
1289 if (!ext2_setup_super (sb, es, 0)) 1269 if (!ext2_setup_super (sb, es, 0))
1290 sb->s_flags &= ~MS_RDONLY; 1270 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock);
1272 ext2_write_super(sb);
1291 } 1273 }
1292 ext2_sync_super(sb, es);
1293 unlock_kernel();
1294 return 0; 1274 return 0;
1295restore_opts: 1275restore_opts:
1296 sbi->s_mount_opt = old_opts.s_mount_opt; 1276 sbi->s_mount_opt = old_opts.s_mount_opt;
1297 sbi->s_resuid = old_opts.s_resuid; 1277 sbi->s_resuid = old_opts.s_resuid;
1298 sbi->s_resgid = old_opts.s_resgid; 1278 sbi->s_resgid = old_opts.s_resgid;
1299 sb->s_flags = old_sb_flags; 1279 sb->s_flags = old_sb_flags;
1300 unlock_kernel(); 1280 spin_unlock(&sbi->s_lock);
1301 return err; 1281 return err;
1302} 1282}
1303 1283
@@ -1308,6 +1288,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1308 struct ext2_super_block *es = sbi->s_es; 1288 struct ext2_super_block *es = sbi->s_es;
1309 u64 fsid; 1289 u64 fsid;
1310 1290
1291 spin_lock(&sbi->s_lock);
1292
1311 if (test_opt (sb, MINIX_DF)) 1293 if (test_opt (sb, MINIX_DF))
1312 sbi->s_overhead_last = 0; 1294 sbi->s_overhead_last = 0;
1313 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { 1295 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1344,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1362 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1344 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
1363 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 1345 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1364 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 1346 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1347 spin_unlock(&sbi->s_lock);
1365 return 0; 1348 return 0;
1366} 1349}
1367 1350
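The conversion above repeats one idiom everywhere: superblock fields are updated under the new sbi->s_lock spinlock, and the lock is dropped before any buffer I/O is started. A minimal sketch of that idiom, using a hypothetical helper (ext2_update_sb_field is illustrative, not part of the patch):

	/* Hypothetical helper showing the s_lock idiom: mutate the in-memory
	 * superblock under the spinlock, then drop it before the buffer is
	 * dirtied and written, since sync_dirty_buffer() blocks on I/O. */
	static void ext2_update_sb_field(struct super_block *sb, __le32 wtime)
	{
		struct ext2_sb_info *sbi = EXT2_SB(sb);

		spin_lock(&sbi->s_lock);	/* serializes sbi->s_es updates */
		sbi->s_es->s_wtime = wtime;	/* cheap, non-sleeping update */
		spin_unlock(&sbi->s_lock);	/* never held across I/O */
		mark_buffer_dirty(sbi->s_sbh);
		sync_dirty_buffer(sbi->s_sbh);	/* may block; lock already dropped */
	}

This is also why ext2_sync_super() above takes the lock, fills in the free-block and free-inode counts, and releases it at the "unlock before we do IO" comment before touching the buffer.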
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
 
 static struct mb_cache *ext2_xattr_cache;
 
-static struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler *ext2_xattr_handlers[] = {
 	&ext2_xattr_user_handler,
 	&ext2_xattr_trusted_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext2_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
 		handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
 	/* list the attribute names */
 	for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
 	     entry = EXT2_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext2_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 }
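The same constification is applied verbatim to the ext3 and ext4 copies below. A self-contained sketch of the table-dispatch shape it hardens (all names here are invented for illustration; only the placement of const mirrors the patch):

	#include <stdio.h>

	struct xattr_handler {
		const char *prefix;
		int (*get)(const char *name);
	};

	static int demo_get(const char *name)
	{
		printf("get %s\n", name);
		return 0;
	}

	/* Handler objects and the map are const: they are shared and never
	 * written at runtime, so the compiler can place them in .rodata. */
	static const struct xattr_handler demo_user_handler = {
		.prefix = "user.",
		.get = demo_get,
	};

	static const struct xattr_handler *demo_handler_map[] = {
		[1] = &demo_user_handler,	/* cf. EXT2_XATTR_INDEX_USER */
	};

	static const struct xattr_handler *demo_handler(int name_index)
	{
		const struct xattr_handler *handler = NULL;

		if (name_index > 0 &&
		    name_index < (int)(sizeof(demo_handler_map) /
				       sizeof(demo_handler_map[0])))
			handler = demo_handler_map[name_index];
		return handler;
	}

	int main(void)
	{
		const struct xattr_handler *h = demo_handler(1);
		return h ? h->get("user.test") : 1;
	}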
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
 
 # ifdef CONFIG_EXT2_FS_XATTR
 
-extern struct xattr_handler ext2_xattr_user_handler;
-extern struct xattr_handler ext2_xattr_trusted_handler;
-extern struct xattr_handler ext2_xattr_acl_access_handler;
-extern struct xattr_handler ext2_xattr_acl_default_handler;
-extern struct xattr_handler ext2_xattr_security_handler;
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_acl_access_handler;
+extern const struct xattr_handler ext2_xattr_acl_default_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
 
 extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
 
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
 extern int init_ext2_xattr(void);
 extern void exit_ext2_xattr(void);
 
-extern struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler *ext2_xattr_handlers[];
 
 # else  /* CONFIG_EXT2_FS_XATTR */
 
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6d..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext2_xattr_security_handler = {
+const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ext2_xattr_security_list,
 	.get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_trusted_handler = {
+const struct xattr_handler ext2_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ext2_xattr_trusted_list,
 	.get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_user_handler = {
+const struct xattr_handler ext2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ext2_xattr_user_list,
 	.get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..01552abbca3c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext3_xattr_acl_access_handler = {
+const struct xattr_handler ext3_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
 	.set = ext3_xattr_set_acl,
 };
 
-struct xattr_handler ext3_xattr_acl_default_handler = {
+const struct xattr_handler ext3_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a177122a1b25..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1584,6 +1584,12 @@ retry_alloc:
 			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..fcf7487734b6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,7 +48,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	struct inode *inode = dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
-	int ret = 0;
+	int ret, needs_barrier = 0;
 	tid_t commit_tid;
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 * (they were dirtied by commit). But that's OK - the blocks are
 	 * safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext3_should_journal_data(inode)) {
-		ret = ext3_force_commit(inode->i_sb);
-		goto out;
-	}
+	if (ext3_should_journal_data(inode))
+		return ext3_force_commit(inode->i_sb);
 
 	if (datasync)
 		commit_tid = atomic_read(&ei->i_datasync_tid);
 	else
 		commit_tid = atomic_read(&ei->i_sync_tid);
 
-	if (log_start_commit(journal, commit_tid)) {
-		log_wait_commit(journal, commit_tid);
-		goto out;
-	}
+	if (test_opt(inode->i_sb, BARRIER) &&
+	    !journal_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = 1;
+	log_start_commit(journal, commit_tid);
+	ret = log_wait_commit(journal, commit_tid);
 
 	/*
 	 * In case we didn't commit a transaction, we have to flush
 	 * disk caches manually so that data really is on persistent
 	 * storage
 	 */
-	if (test_opt(inode->i_sb, BARRIER))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
-out:
+	if (needs_barrier)
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				   BLKDEV_IFL_WAIT);
 	return ret;
 }
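The rewritten ext3_sync_file() now starts the commit unconditionally and decides its barrier requirement up front. Restated as a standalone predicate (illustrative only; in the patch the second operand is supplied by journal_trans_will_send_data_barrier(), and the flush itself is the new four-argument blkdev_issue_flush()):

	#include <stdio.h>

	/* Issue an explicit cache flush only when barriers are enabled and
	 * the commit being waited on will not already send one. */
	static int needs_explicit_flush(int barrier_enabled,
					int commit_sends_barrier)
	{
		return barrier_enabled && !commit_sends_barrier;
	}

	int main(void)
	{
		for (int b = 0; b <= 1; b++)
			for (int c = 0; c <= 1; c++)
				printf("barrier=%d commit_flushes=%d -> flush=%d\n",
				       b, c, needs_explicit_flush(b, c));
		return 0;
	}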
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff6..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
 
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
+
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
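inode_init_owner() centralizes the ownership rules that were previously open-coded here: take fsuid/fsgid from the caller, but inherit the group from an SGID directory and keep SGID on new subdirectories. A runnable model of the mode half of that rule (plain mode_t stands in for struct inode; this is a simplification, not the kernel helper):

	#include <stdio.h>
	#include <sys/stat.h>

	/* If the parent directory is SGID, a new directory inherits the
	 * SGID bit (and, in the kernel, the parent's gid). */
	static mode_t demo_init_mode(mode_t dir_mode, mode_t mode)
	{
		if ((dir_mode & S_ISGID) && S_ISDIR(mode))
			mode |= S_ISGID;
		return mode;
	}

	int main(void)
	{
		mode_t m = demo_init_mode(S_IFDIR | S_ISGID | 0775,
					  S_IFDIR | 0755);
		printf("%o\n", (unsigned int)m);	/* prints 42755 */
		return 0;
	}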
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ea33bdf0a300..735f0190ec2a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3151,7 +3151,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if (ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
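is_quota_modification() widens the trigger for dquot_initialize() from size changes to anything that moves quota usage. A simplified, self-contained restatement (the real helper in the quota headers also compares the requested values against the inode's current ones, which this sketch omits):

	#include <stdio.h>

	#define ATTR_UID  (1 << 1)	/* values mirror include/linux/fs.h */
	#define ATTR_GID  (1 << 2)
	#define ATTR_SIZE (1 << 3)

	static int demo_is_quota_modification(unsigned int ia_valid)
	{
		return (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) != 0;
	}

	int main(void)
	{
		printf("%d %d\n", demo_is_quota_modification(ATTR_SIZE),
		       demo_is_quota_modification(0));
		return 0;
	}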
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..0fc1293d0e96 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -653,8 +653,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
-	if (test_opt(sb, BARRIER))
-		seq_puts(seq, ",barrier=1");
+
+	/*
+	 * Always display barrier state so it's clear what the status is.
+	 */
+	seq_puts(seq, ",barrier=");
+	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 
@@ -810,8 +814,8 @@ enum {
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
-	Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
-	Opt_usrquota, Opt_grpquota
+	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_resize, Opt_usrquota, Opt_grpquota
 };
 
 static const match_table_t tokens = {
@@ -865,6 +869,8 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_resize, "resize"},
 	{Opt_err, NULL},
 };
@@ -967,7 +973,11 @@ static int parse_options (char *options, struct super_block *sb,
 		int token;
 		if (!*p)
 			continue;
-
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
@@ -1215,9 +1225,15 @@ set_qf_format:
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
 			break;
+		case Opt_nobarrier:
+			clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option))
-				return 0;
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1890,21 +1906,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext3_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext3_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext3_count_dirs(sb));
-	}
-	if (err) {
-		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
-		goto failed_mount3;
-	}
-
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
 	sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1946,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount3;
+			goto failed_mount2;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount3;
+			goto failed_mount2;
 	} else {
 		if (!silent)
 			ext3_msg(sb, KERN_ERR,
 				"error: no journal found. "
 				"mounting ext3 over ext2?");
+		goto failed_mount2;
+	}
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+			ext3_count_free_blocks(sb));
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext3_count_free_inodes(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext3_count_dirs(sb));
+	}
+	if (err) {
+		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
 
@@ -1978,7 +1993,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			ext3_msg(sb, KERN_ERR,
 				"error: journal does not support "
 				"requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount3;
 		}
 	default:
 		break;
@@ -2001,19 +2016,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (IS_ERR(root)) {
 		ext3_msg(sb, KERN_ERR, "error: get root inode failed");
 		ret = PTR_ERR(root);
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
 		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
 		ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
 		iput(root);
 		ret = -ENOMEM;
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2054,11 @@ cantfind_ext3:
 			sb->s_id);
 	goto failed_mount;
 
-failed_mount4:
-	journal_destroy(sbi->s_journal);
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	journal_destroy(sbi->s_journal);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -2278,6 +2292,9 @@ static int ext3_load_journal(struct super_block *sb,
 		return -EINVAL;
 	}
 
+	if (!(journal->j_flags & JFS_BARRIER))
+		printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
+
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = journal_update_format(journal);
 		if (err) {
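The parse_options() change is the subtle part of the barrier work: "barrier" may now appear with or without "=N", so args[0] is zeroed before match_token() and an absent argument defaults to 1. A userspace sketch of the same decision table, without the match_token() machinery (names invented):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* "nobarrier" clears, bare "barrier" defaults to on, and
	 * "barrier=N" parses N; anything else leaves the flag alone. */
	static int parse_barrier(const char *opt, int barrier)
	{
		if (!strcmp(opt, "nobarrier"))
			return 0;
		if (!strcmp(opt, "barrier"))
			return 1;
		if (!strncmp(opt, "barrier=", 8))
			return atoi(opt + 8) != 0;
		return barrier;
	}

	int main(void)
	{
		printf("%d %d %d\n", parse_barrier("barrier", 0),
		       parse_barrier("barrier=0", 1),
		       parse_barrier("nobarrier", 1));	/* prints 1 0 0 */
		return 0;
	}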
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
 
 static struct mb_cache *ext3_xattr_cache;
 
-static struct xattr_handler *ext3_xattr_handler_map[] = {
+static const struct xattr_handler *ext3_xattr_handler_map[] = {
 	[EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext3_xattr_handlers[] = {
+const struct xattr_handler *ext3_xattr_handlers[] = {
 	&ext3_xattr_user_handler,
 	&ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext3_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
 		handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
 	size_t rest = buffer_size;
 
 	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext3_xattr_handler(entry->e_name_index);
 
 		if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
 
 # ifdef CONFIG_EXT3_FS_XATTR
 
-extern struct xattr_handler ext3_xattr_user_handler;
-extern struct xattr_handler ext3_xattr_trusted_handler;
-extern struct xattr_handler ext3_xattr_acl_access_handler;
-extern struct xattr_handler ext3_xattr_acl_default_handler;
-extern struct xattr_handler ext3_xattr_security_handler;
+extern const struct xattr_handler ext3_xattr_user_handler;
+extern const struct xattr_handler ext3_xattr_trusted_handler;
+extern const struct xattr_handler ext3_xattr_acl_access_handler;
+extern const struct xattr_handler ext3_xattr_acl_default_handler;
+extern const struct xattr_handler ext3_xattr_security_handler;
 
 extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
 
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
 extern int init_ext3_xattr(void);
 extern void exit_ext3_xattr(void);
 
-extern struct xattr_handler *ext3_xattr_handlers[];
+extern const struct xattr_handler *ext3_xattr_handlers[];
 
 # else  /* CONFIG_EXT3_FS_XATTR */
 
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476dff..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext3_xattr_security_handler = {
+const struct xattr_handler ext3_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ext3_xattr_security_list,
 	.get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext3_xattr_trusted_handler = {
+const struct xattr_handler ext3_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ext3_xattr_trusted_list,
 	.get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext3_xattr_user_handler = {
+const struct xattr_handler ext3_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ext3_xattr_user_list,
 	.get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext4_xattr_acl_access_handler = {
+const struct xattr_handler ext4_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
 	.set = ext4_xattr_set_acl,
 };
 
-struct xattr_handler ext4_xattr_acl_default_handler = {
+const struct xattr_handler ext4_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		if (ext4_should_writeback_data(inode) &&
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
 		jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd6..1a0e183a2f04 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -979,16 +979,12 @@ got:
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
-	inode->i_uid = current_fsuid();
-	if (test_opt(sb, GRPID))
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d605412844..3e0f6af9d08d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5425,7 +5425,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if (ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..2de0e9515089 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
 
 static struct mb_cache *ext4_xattr_cache;
 
-static struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext4_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
 		handler = ext4_xattr_handler_map[name_index];
@@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
 	size_t rest = buffer_size;
 
 	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext4_xattr_handler(entry->e_name_index);
 
 		if (handler) {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
 
 # ifdef CONFIG_EXT4_FS_XATTR
 
-extern struct xattr_handler ext4_xattr_user_handler;
-extern struct xattr_handler ext4_xattr_trusted_handler;
-extern struct xattr_handler ext4_xattr_acl_access_handler;
-extern struct xattr_handler ext4_xattr_acl_default_handler;
-extern struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_acl_access_handler;
+extern const struct xattr_handler ext4_xattr_acl_default_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
 
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 extern int init_ext4_xattr(void);
 extern void exit_ext4_xattr(void);
 
-extern struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler *ext4_xattr_handlers[];
 
 # else  /* CONFIG_EXT4_FS_XATTR */
 
92 92
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df07..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext4_xattr_security_handler = {
+const struct xattr_handler ext4_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ext4_xattr_security_list,
 	.get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_trusted_handler = {
+const struct xattr_handler ext4_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ext4_xattr_trusted_list,
 	.get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_user_handler = {
+const struct xattr_handler ext4_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ext4_xattr_user_list,
 	.get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 	while (*fclus < cluster) {
 		/* prevent the infinite loop of cluster chain */
 		if (*fclus > limit) {
-			fat_fs_error(sb, "%s: detected the cluster chain loop"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb,
+					"%s: detected the cluster chain loop"
+					" (i_pos %lld)", __func__,
+					MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		}
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 		if (nr < 0)
 			goto out;
 		else if (nr == FAT_ENT_FREE) {
-			fat_fs_error(sb, "%s: invalid cluster chain"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
+					       " (i_pos %lld)", __func__,
+					       MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include <linux/kernel.h>
 #include "fat.h"
 
 /*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 {
 	const wchar_t *ip;
 	wchar_t ec;
-	unsigned char *op, nc;
+	unsigned char *op;
 	int charlen;
-	int k;
 
 	ip = uni;
 	op = ascii;
 
 	while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
 		ec = *ip++;
-		if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+		if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
 			op += charlen;
 			len -= charlen;
 		} else {
 			if (uni_xlate == 1) {
-				*op = ':';
-				for (k = 4; k > 0; k--) {
-					nc = ec & 0xF;
-					op[k] = nc > 9 ? nc + ('a' - 10)
-							: nc + '0';
-					ec >>= 4;
-				}
-				op += 5;
+				*op++ = ':';
+				op = pack_hex_byte(op, ec >> 8);
+				op = pack_hex_byte(op, ec);
 				len -= 5;
 			} else {
 				*op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 	return ret;
 }
 
-static int fat_dir_ioctl(struct inode *inode, struct file *filp,
-			 unsigned int cmd, unsigned long arg)
+static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
 	int short_only, both;
 
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
 		both = 1;
 		break;
 	default:
-		return fat_generic_ioctl(inode, filp, cmd, arg);
+		return fat_generic_ioctl(filp, cmd, arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 		both = 1;
 		break;
 	default:
-		return -ENOIOCTLCMD;
+		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = fat_readdir,
-	.ioctl = fat_dir_ioctl,
+	.unlocked_ioctl = fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = fat_compat_dir_ioctl,
 #endif
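The rewritten escape path in uni16_to_x8() emits an untranslatable 16-bit character as ":xxxx", five bytes, which is why the unchanged "len -= 5" still holds. pack_hex_byte() (from linux/kernel.h, hence the new include) writes one byte as two lowercase hex digits and returns the advanced pointer; a userspace equivalent:

	#include <stdio.h>

	static unsigned char *demo_pack_hex_byte(unsigned char *buf,
						 unsigned char byte)
	{
		static const char hex[] = "0123456789abcdef";

		*buf++ = hex[byte >> 4];
		*buf++ = hex[byte & 0x0f];
		return buf;
	}

	int main(void)
	{
		unsigned short ec = 0x263a;	/* sample untranslatable char */
		unsigned char out[6], *op = out;

		*op++ = ':';
		op = demo_pack_hex_byte(op, ec >> 8);
		op = demo_pack_hex_byte(op, ec & 0xff);
		*op = '\0';
		printf("%s\n", (char *)out);	/* prints ":263a" */
		return 0;
	}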
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..53dba57b49a1 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
 #include <linux/nls.h>
 #include <linux/fs.h>
 #include <linux/mutex.h>
+#include <linux/ratelimit.h>
 #include <linux/msdos_fs.h>
 
 /*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
 	struct fatent_operations *fatent_ops;
 	struct inode *fat_inode;
 
+	struct ratelimit_state ratelimit;
+
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
 };
@@ -298,8 +301,8 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
 extern int fat_count_free_clusters(struct super_block *sb);
 
 /* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg);
+extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
+			      unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 			    struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3))) __cold;
+extern void
+__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(s, fmt, args...) \
+	__fat_fs_error(s, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(s, fmt, args...) \
+	__fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
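The two new macros gate reporting on __ratelimit(&MSDOS_SB(s)->ratelimit), which allows a burst of messages per interval and suppresses the rest. A toy userspace version of that interval/burst scheme (simplified; the kernel version also tracks how many messages it suppressed):

	#include <stdio.h>
	#include <time.h>

	struct demo_ratelimit {
		time_t begin;
		int interval;	/* seconds, cf. DEFAULT_RATELIMIT_INTERVAL */
		int burst;	/* cf. DEFAULT_RATELIMIT_BURST */
		int printed;
	};

	/* Returns 1 if the caller may emit its message, 0 to suppress. */
	static int demo_ratelimit(struct demo_ratelimit *rs)
	{
		time_t now = time(NULL);

		if (now - rs->begin >= rs->interval) {
			rs->begin = now;	/* window expired: reset */
			rs->printed = 0;
		}
		if (rs->printed < rs->burst) {
			rs->printed++;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		struct demo_ratelimit rs = { .interval = 5, .burst = 2 };

		for (int i = 0; i < 4; i++)
			if (demo_ratelimit(&rs))
				printf("error %d reported\n", i);
		return 0;	/* only errors 0 and 1 are reported */
	}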
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..a14c2f6a489e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
 	return err;
 }
 
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
-		      unsigned int cmd, unsigned long arg)
+long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 __user *user_attr = (u32 __user *)arg;
 
 	switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
+				      unsigned long arg)
+
+{
+	return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 static int fat_file_release(struct inode *inode, struct file *filp)
 {
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
 	.release = fat_file_release,
-	.ioctl = fat_generic_ioctl,
+	.unlocked_ioctl = fat_generic_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = fat_generic_compat_ioctl,
+#endif
 	.fsync = fat_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
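The fat_generic_ioctl() conversion follows the standard recipe for dropping the BKL-taking .ioctl hook: the unlocked entry point derives the inode from the file itself and does its own locking, and the compat entry point only rewrites the 32-bit pointer argument. A generic skeleton of that recipe (demo_* names are placeholders, not FAT code):

	/* Sketch of the conversion pattern; any needed locking must now be
	 * done per command inside the handler itself. */
	static long demo_unlocked_ioctl(struct file *filp, unsigned int cmd,
					unsigned long arg)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;

		if (!inode)
			return -EBADF;
		/* ... per-command handling using inode ... */
		return -ENOTTY;
	}

	#ifdef CONFIG_COMPAT
	static long demo_compat_ioctl(struct file *filp, unsigned int cmd,
				      unsigned long arg)
	{
		/* compat_ptr() converts a 32-bit user pointer for 64-bit use */
		return demo_unlocked_ioctl(filp, cmd,
					   (unsigned long)compat_ptr(arg));
	}
	#endif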
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..ed33904926ee 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
 	sbi->dir_ops = fs_dir_inode_ops;
+	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
 
 	error = parse_options(data, isvfat, silent, &debug, &sbi->options);
 	if (error)
@@ -1497,10 +1499,8 @@ out_fail:
 	iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
-	if (sbi->nls_io)
-		unload_nls(sbi->nls_io);
-	if (sbi->nls_disk)
-		unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+	unload_nls(sbi->nls_disk);
 	if (sbi->options.iocharset != fat_default_iocharset)
 		kfree(sbi->options.iocharset);
 	sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
  * In case the file system is remounted read-only, it can be made writable
  * again by remounting it.
  */
-void fat_fs_error(struct super_block *s, const char *fmt, ...)
+void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
 {
 	struct fat_mount_options *opts = &MSDOS_SB(s)->options;
 	va_list args;
 
-	printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
+	if (report) {
+		printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
 
-	printk(KERN_ERR "    ");
-	va_start(args, fmt);
-	vprintk(fmt, args);
-	va_end(args);
-	printk("\n");
+		printk(KERN_ERR "    ");
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+		printk("\n");
+	}
 
 	if (opts->errors == FAT_ERRORS_PANIC)
-		panic("    FAT fs panic from previous error\n");
+		panic("FAT: fs panic from previous error\n");
 	else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
 		s->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "    File system has been set read-only\n");
+		printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
 	}
 }
-EXPORT_SYMBOL_GPL(fat_fs_error);
+EXPORT_SYMBOL_GPL(__fat_fs_error);
 
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..f74d270ba155 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 413 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 414 err = fcntl_dirnotify(fd, filp, arg);
414 break; 415 break;
416 case F_SETPIPE_SZ:
417 case F_GETPIPE_SZ:
418 err = pipe_fcntl(filp, cmd, arg);
419 break;
415 default: 420 default:
416 break; 421 break;
417 } 422 }
@@ -614,9 +619,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 619 return ret;
615} 620}
616 621
617static DEFINE_RWLOCK(fasync_lock); 622static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 623static struct kmem_cache *fasync_cache __read_mostly;
619 624
625static void fasync_free_rcu(struct rcu_head *head)
626{
627 kmem_cache_free(fasync_cache,
628 container_of(head, struct fasync_struct, fa_rcu));
629}
630
620/* 631/*
621 * Remove a fasync entry. If successfully removed, return 632 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 633 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +636,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
  * NOTE! It is very important that the FASYNC flag always
  * match the state "is the filp on a fasync list".
  *
- * We always take the 'filp->f_lock', in since fasync_lock
- * needs to be irq-safe.
  */
 static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
@@ -634,17 +643,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 	int result = 0;
 
 	spin_lock(&filp->f_lock);
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file != filp)
 			continue;
+
+		spin_lock_irq(&fa->fa_lock);
+		fa->fa_file = NULL;
+		spin_unlock_irq(&fa->fa_lock);
+
 		*fp = fa->fa_next;
-		kmem_cache_free(fasync_cache, fa);
+		call_rcu(&fa->fa_rcu, fasync_free_rcu);
 		filp->f_flags &= ~FASYNC;
 		result = 1;
 		break;
 	}
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
 	return result;
 }
@@ -666,25 +680,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 		return -ENOMEM;
 
 	spin_lock(&filp->f_lock);
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file != filp)
 			continue;
+
+		spin_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
+		spin_unlock_irq(&fa->fa_lock);
+
 		kmem_cache_free(fasync_cache, new);
 		goto out;
 	}
 
+	spin_lock_init(&new->fa_lock);
 	new->magic = FASYNC_MAGIC;
 	new->fa_file = filp;
 	new->fa_fd = fd;
 	new->fa_next = *fapp;
-	*fapp = new;
+	rcu_assign_pointer(*fapp, new);
 	result = 1;
 	filp->f_flags |= FASYNC;
 
 out:
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
 	return result;
 }
@@ -704,37 +723,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 
 EXPORT_SYMBOL(fasync_helper);
 
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
-		struct fown_struct * fown;
+		struct fown_struct *fown;
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		fown = &fa->fa_file->f_owner;
-		/* Don't send SIGURG to processes which have not set a
-		   queued signum: SIGURG has its own default signalling
-		   mechanism. */
-		if (!(sig == SIGURG && fown->signum == 0))
-			send_sigio(fown, fa->fa_fd, band);
-		fa = fa->fa_next;
+		spin_lock(&fa->fa_lock);
+		if (fa->fa_file) {
+			fown = &fa->fa_file->f_owner;
+			/* Don't send SIGURG to processes which have not set a
+			   queued signum: SIGURG has its own default signalling
+			   mechanism. */
+			if (!(sig == SIGURG && fown->signum == 0))
+				send_sigio(fown, fa->fa_fd, band);
+		}
+		spin_unlock(&fa->fa_lock);
+		fa = rcu_dereference(fa->fa_next);
 	}
 }
 
-EXPORT_SYMBOL(__kill_fasync);
-
 void kill_fasync(struct fasync_struct **fp, int sig, int band)
 {
 	/* First a quick test without locking: usually
 	 * the list is empty.
 	 */
 	if (*fp) {
-		read_lock(&fasync_lock);
-		/* reread *fp after obtaining the lock */
-		__kill_fasync(*fp, sig, band);
-		read_unlock(&fasync_lock);
+		rcu_read_lock();
+		kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+		rcu_read_unlock();
 	}
 }
 EXPORT_SYMBOL(kill_fasync);
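
The net effect of the fcntl.c change: kill_fasync() no longer takes a global rwlock on every signal-driven I/O event. Readers walk the list under rcu_read_lock() and take only the per-entry fa_lock, and entries are freed through call_rcu(), so a concurrent reader can never touch recycled memory. A simplified read-side sketch (illustrative only; it omits the magic-number and SIGURG checks that the real kill_fasync_rcu() keeps):

	struct fasync_struct *fa;

	rcu_read_lock();
	for (fa = rcu_dereference(*fp); fa; fa = rcu_dereference(fa->fa_next)) {
		spin_lock(&fa->fa_lock);
		if (fa->fa_file)	/* NULL once fasync_remove_entry() detached it */
			send_sigio(&fa->fa_file->f_owner, fa->fa_fd, band);
		spin_unlock(&fa->fa_lock);
	}
	rcu_read_unlock();
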
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..ea8592b90696 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,9 +42,10 @@ struct wb_writeback_args {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
-	int for_kupdate:1;
-	int range_cyclic:1;
-	int for_background:1;
+	unsigned int for_kupdate:1;
+	unsigned int range_cyclic:1;
+	unsigned int for_background:1;
+	unsigned int sb_pinned:1;
 };
 
 /*
50/* 51/*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args)
+				 struct wb_writeback_args *args,
+				 int wait)
 {
 	struct bdi_work *work;
 
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	if (work) {
 		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
+		if (wait)
+			bdi_wait_on_work_clear(work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
+		/*
+		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+		 * lets make it explicitly clear.
+		 */
+		.sb_pinned	= 1,
 	};
 	struct bdi_work work;
 
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+			 long nr_pages, int sb_locked)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
+		.sb_pinned	= sb_locked,
 	};
 
 	/*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 		args.for_background = 1;
 	}
 
-	bdi_alloc_queue_work(bdi, &args);
+	bdi_alloc_queue_work(bdi, &args, sb_locked);
 }
 
 /*
@@ -398,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	do {
+	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
-	} while (inode->i_state & I_SYNC);
+	}
 }
 
 /*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	BUG_ON(inode->i_state & I_SYNC);
 
-	/* Set I_SYNC, reset I_DIRTY */
-	dirty = inode->i_state & I_DIRTY;
+	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY;
-
+	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		ret = err;
 	}
 
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode_lock);
+	dirty = inode->i_state & I_DIRTY;
+	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return SB_NOT_PINNED;
 	}
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate		= args->for_kupdate,
 		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
+		.sb_pinned		= args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	long nr_pages;
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 		struct wb_writeback_args args = work->args;
+		int post_clear;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
+		post_clear = WB_SYNC_ALL || args.sb_pinned;
+
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (args.sync_mode == WB_SYNC_NONE)
+		if (!post_clear)
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
+		if (post_clear)
 			wb_clear_pending(wb, work);
 	}
 
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 			break;
 		}
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
+		if (dirty_writeback_interval) {
+			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout_interruptible(wait_jiffies);
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (list_empty_careful(&wb->bdi->work_list) &&
+			    !kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
 		try_to_freeze();
 	}
 
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, &args);
+		bdi_alloc_queue_work(bdi, &args, 0);
 	}
 
 	rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
 		iput(old_inode);
 }
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
+
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	__writeback_inodes_sb(sb, 0);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
+ * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+	__writeback_inodes_sb(sb, 1);
+}
+
+/**
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * @sb: the superblock
  *
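
A minimal caller sketch for the new entry point (hypothetical function; it assumes the caller already holds sb->s_umount, as the kerneldoc above requires):

	static void example_flush_locked(struct super_block *sb)
	{
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		writeback_inodes_sb_locked(sb);
	}

Because sb_locked is passed through as the wait argument to bdi_alloc_queue_work(), this variant also blocks in bdi_wait_on_work_clear() rather than returning as soon as the work is queued.
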
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df5457656..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
 		return -EAGAIN;
 }
 
-struct xattr_handler generic_acl_access_handler = {
+const struct xattr_handler generic_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= generic_acl_list,
@@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
 	.set	= generic_acl_set,
 };
 
-struct xattr_handler generic_acl_default_handler = {
+const struct xattr_handler generic_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= generic_acl_list,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
 			     void *buffer, size_t size, int xtype)
 {
 	struct inode *inode = dentry->d_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct posix_acl *acl;
 	int type;
 	int error;
 
+	if (!sdp->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+
 	type = gfs2_acl_type(name);
 	if (type < 0)
 		return type;
@@ -335,7 +339,7 @@ out:
 	return error;
 }
 
-struct xattr_handler gfs2_xattr_system_handler = {
+const struct xattr_handler gfs2_xattr_system_handler = {
 	.prefix = XATTR_SYSTEM_PREFIX,
 	.flags  = GFS2_EATYPE_SYS,
 	.get    = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
 extern int gfs2_check_acl(struct inode *inode, int mask);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern struct xattr_handler gfs2_xattr_system_handler;
+extern const struct xattr_handler gfs2_xattr_system_handler;
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..a739a0a48067 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
 	struct buffer_head *dibh;
+	u64 dsize = i_size_read(&ip->i_inode);
 	void *kaddr;
 	int error;
 
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 		return error;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-	       ip->i_disksize);
-	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+		dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+	memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
 	kunmap_atomic(kaddr, KM_USER0);
 	flush_dcache_page(page);
 	brelse(dibh);
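
Both this hunk and the matching one in bmap.c below clamp the copy length for stuffed inodes to what the dinode block can actually hold, so a bogus on-disk size can no longer overrun the buffer. A compact equivalent of the open-coded test (illustrative sketch, not part of the patch):

	static u64 stuffed_copy_size(const struct gfs2_inode *ip,
				     const struct buffer_head *dibh)
	{
		/* never copy past the end of the on-disk dinode block */
		return min_t(u64, i_size_read(&ip->i_inode),
			     dibh->b_size - sizeof(struct gfs2_dinode));
	}
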
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e411d5f4697..4a48c0f4b402 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	if (!PageUptodate(page)) {
 		void *kaddr = kmap(page);
+		u64 dsize = i_size_read(inode);
+
+		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
 
-		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-		       ip->i_disksize);
-		memset(kaddr + ip->i_disksize, 0,
-		       PAGE_CACHE_SIZE - ip->i_disksize);
+		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+		memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
 		kunmap(page);
 
 		SetPageUptodate(page);
@@ -1038,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		goto out;
 
 	if (gfs2_is_stuffed(ip)) {
-		ip->i_disksize = size;
+		u64 dsize = size + sizeof(struct gfs2_inode);
 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
-		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+		if (dsize > dibh->b_size)
+			dsize = dibh->b_size;
+		gfs2_buffer_clear_tail(dibh, dsize);
 		error = 1;
-
 	} else {
 		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
 			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..8295c5b5d4a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
 			inode = gfs2_inode_lookup(dir->i_sb,
 				be16_to_cpu(dent->de_type),
 				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+				be64_to_cpu(dent->de_inum.no_formal_ino));
 			brelse(bh);
 			return inode;
 		}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index c22c21174833..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	if (error)
 		goto fail;
 
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
 	if (IS_ERR(inode)) {
 		error = PTR_ERR(inode);
 		goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..b20bfcc9fa2d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
+	error = -EACCES;
+	if (!is_owner_or_cap(inode))
+		goto out;
+
+	error = 0;
 	flags = ip->i_diskflags;
 	new_flags = (flags & ~mask) | (reqflags & mask);
 	if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 fsflags, gfsflags;
+
 	if (get_user(fsflags, ptr))
 		return -EFAULT;
+
 	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
 	if (!S_ISDIR(inode->i_mode)) {
 		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..ddcdbf493536 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 	gh->gh_flags = flags;
 	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	if (gh->gh_owner_pid)
+		put_pid(gh->gh_owner_pid);
+	gh->gh_owner_pid = get_pid(task_pid(current));
 }
 
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f6853e..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
 struct gfs2_tune {
 	spinlock_t gt_spin;
 
-	unsigned int gt_incore_log_blocks;
-	unsigned int gt_log_flush_secs;
-
 	unsigned int gt_logd_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
 	SDF_SHUTDOWN		= 2,
 	SDF_NOBARRIERS		= 3,
 	SDF_NORECOVERY		= 4,
+	SDF_DEMOTE		= 5,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
 	unsigned int sd_log_commited_databuf;
 	int sd_log_commited_revoke;
 
+	atomic_t sd_log_pinned;
 	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
 	unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
 	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
 
+	atomic_t sd_log_thresh1;
+	atomic_t sd_log_thresh2;
 	atomic_t sd_log_blks_free;
-	struct mutex sd_log_reserve_mutex;
+	wait_queue_head_t sd_log_waitq;
+	wait_queue_head_t sd_logd_waitq;
 
 	u64 sd_log_sequence;
 	unsigned int sd_log_head;
 	unsigned int sd_log_tail;
 	int sd_log_idle;
 
-	unsigned long sd_log_flush_time;
 	struct rw_semaphore sd_log_flush_lock;
 	atomic_t sd_log_in_flight;
 	wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
  * @sb: The super block
  * @no_addr: The inode number
  * @type: The type of the inode
- * @skip_freeing: set this not return an inode if it is currently being freed.
  *
  * Returns: A VFS inode, or an error
  */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
 struct inode *gfs2_inode_lookup(struct super_block *sb,
 				unsigned int type,
 				u64 no_addr,
-				u64 no_formal_ino, int skip_freeing)
+				u64 no_formal_ino)
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
 	struct gfs2_glock *io_gl;
 	int error;
 
-	if (skip_freeing)
-		inode = gfs2_iget_skip(sb, no_addr);
-	else
-		inode = gfs2_iget(sb, no_addr);
+	inode = gfs2_iget(sb, no_addr);
 	ip = GFS2_I(inode);
 
 	if (!inode)
@@ -234,11 +230,102 @@ fail_glock:
 fail_iopen:
 	gfs2_glock_put(io_gl);
 fail_put:
+	if (inode->i_state & I_NEW)
+		ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+fail:
+	if (inode->i_state & I_NEW)
+		iget_failed(inode);
+	else
+		iput(inode);
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+ *                               and try to reclaim it by doing iput.
+ *
+ * This function assumes no rgrp locks are currently held.
+ *
+ * @sb: The super block
+ * no_addr: The inode number
+ *
+ */
+
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
+{
+	struct gfs2_sbd *sdp;
+	struct gfs2_inode *ip;
+	struct gfs2_glock *io_gl;
+	int error;
+	struct gfs2_holder gh;
+	struct inode *inode;
+
+	inode = gfs2_iget_skip(sb, no_addr);
+
+	if (!inode)
+		return;
+
+	/* If it's not a new inode, someone's using it, so leave it alone. */
+	if (!(inode->i_state & I_NEW)) {
+		iput(inode);
+		return;
+	}
+
+	ip = GFS2_I(inode);
+	sdp = GFS2_SB(inode);
+	ip->i_no_formal_ino = -1;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+	if (unlikely(error))
+		goto fail;
+	ip->i_gl->gl_object = ip;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+	if (unlikely(error))
+		goto fail_put;
+
+	set_bit(GIF_INVALID, &ip->i_flags);
+	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
+				   &ip->i_iopen_gh);
+	if (unlikely(error))
+		goto fail_iopen;
+
+	ip->i_iopen_gh.gh_gl->gl_object = ip;
+	gfs2_glock_put(io_gl);
+
+	inode->i_mode = DT2IF(DT_UNKNOWN);
+
+	/*
+	 * We must read the inode in order to work out its type in
+	 * this case. Note that this doesn't happen often as we normally
+	 * know the type beforehand. This code path only occurs during
+	 * unlinked inode recovery (where it is safe to do this glock,
+	 * which is not true in the general case).
+	 */
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
+				   &gh);
+	if (unlikely(error))
+		goto fail_glock;
+
+	/* Inode is now uptodate */
+	gfs2_glock_dq_uninit(&gh);
+	gfs2_set_iop(inode);
+
+	/* The iput will cause it to be deleted. */
+	iput(inode);
+	return;
+
+fail_glock:
+	gfs2_glock_dq(&ip->i_iopen_gh);
+fail_iopen:
+	gfs2_glock_put(io_gl);
+fail_put:
 	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
 fail:
 	iget_failed(inode);
-	return ERR_PTR(error);
+	return;
 }
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -862,7 +949,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 		goto fail_gunlock2;
 
 	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
-				  inum.no_formal_ino, 0);
+				  inum.no_formal_ino);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
-				       u64 no_addr, u64 no_formal_ino,
-				       int skip_freeing);
+				       u64 no_addr, u64 no_formal_ino);
+extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b59d46e..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 	return list_empty(&ai->ai_ail1_list);
 }
 
-static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
 {
 	struct list_head *head;
 	u64 sync_gen;
-	struct list_head *first;
-	struct gfs2_ail *first_ai, *ai, *tmp;
+	struct gfs2_ail *ai;
 	int done = 0;
 
 	gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
 	}
 	sync_gen = sdp->sd_ail_sync_gen++;
 
-	first = head->prev;
-	first_ai = list_entry(first, struct gfs2_ail, ai_list);
-	first_ai->ai_sync_gen = sync_gen;
-	gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
-
-	if (flags & DIO_ALL)
-		first = NULL;
-
 	while(!done) {
-		if (first && (head->prev != first ||
-		    gfs2_ail1_empty_one(sdp, first_ai, 0)))
-			break;
-
 		done = 1;
-		list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+		list_for_each_entry_reverse(ai, head, ai_list) {
 			if (ai->ai_sync_gen >= sync_gen)
 				continue;
 			ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
  * flush time, so we ensure that we have just enough free blocks at all
  * times to avoid running out during a log flush.
  *
+ * We no longer flush the log here, instead we wake up logd to do that
+ * for us. To avoid the thundering herd and to ensure that we deal fairly
+ * with queued waiters, we use an exclusive wait. This means that when we
+ * get woken with enough journal space to get our reservation, we need to
+ * wake the next waiter on the list.
+ *
  * Returns: errno
  */
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 {
-	unsigned int try = 0;
 	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned wanted = blks + reserved_blks;
+	DEFINE_WAIT(wait);
+	int did_wait = 0;
+	unsigned int free_blocks;
 
 	if (gfs2_assert_warn(sdp, blks) ||
 	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
 		return -EINVAL;
-
-	mutex_lock(&sdp->sd_log_reserve_mutex);
-	gfs2_log_lock(sdp);
-	while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
-		gfs2_log_unlock(sdp);
-		gfs2_ail1_empty(sdp, 0);
-		gfs2_log_flush(sdp, NULL);
-
-		if (try++)
-			gfs2_ail1_start(sdp, 0);
-		gfs2_log_lock(sdp);
+retry:
+	free_blocks = atomic_read(&sdp->sd_log_blks_free);
+	if (unlikely(free_blocks <= wanted)) {
+		do {
+			prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			wake_up(&sdp->sd_logd_waitq);
+			did_wait = 1;
+			if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
+				io_schedule();
+			free_blocks = atomic_read(&sdp->sd_log_blks_free);
+		} while(free_blocks <= wanted);
+		finish_wait(&sdp->sd_log_waitq, &wait);
 	}
-	atomic_sub(blks, &sdp->sd_log_blks_free);
+	if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
+				free_blocks - blks) != free_blocks)
+		goto retry;
 	trace_gfs2_log_blocks(sdp, -blks);
-	gfs2_log_unlock(sdp);
-	mutex_unlock(&sdp->sd_log_reserve_mutex);
+
+	/*
+	 * If we waited, then so might others, wake them up _after_ we get
+	 * our share of the log.
+	 */
+	if (unlikely(did_wait))
+		wake_up(&sdp->sd_log_waitq);
 
 	down_read(&sdp->sd_log_flush_lock);
 
 	return 0;
 }
 
-/**
- * gfs2_log_release - Release a given number of log blocks
- * @sdp: The GFS2 superblock
- * @blks: The number of blocks
- *
- */
-
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
-{
-
-	gfs2_log_lock(sdp);
-	atomic_add(blks, &sdp->sd_log_blks_free);
-	trace_gfs2_log_blocks(sdp, blks);
-	gfs2_assert_withdraw(sdp,
-			     atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
-	gfs2_log_unlock(sdp);
-	up_read(&sdp->sd_log_flush_lock);
-}
-
 static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 {
 	struct gfs2_journal_extent *je;
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
 
 	ail2_empty(sdp, new_tail);
 
-	gfs2_log_lock(sdp);
 	atomic_add(dist, &sdp->sd_log_blks_free);
 	trace_gfs2_log_blocks(sdp, dist);
-	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
-	gfs2_log_unlock(sdp);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+			     sdp->sd_jdesc->jd_blocks);
 
 	sdp->sd_log_tail = new_tail;
 }
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
 		set_buffer_uptodate(bh);
+		fs_info(sdp, "barrier sync failed - disabling barriers\n");
 		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 		lock_buffer(bh);
 skip_barrier:
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
  *
  */
 
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
 	struct gfs2_ail *ai;
 
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  * @sdp: the filesystem
  * @tr: the transaction
  *
+ * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
+ * or the total number of used blocks (pinned blocks plus AIL blocks)
+ * is greater than thresh2.
+ *
+ * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
+ * journal size.
+ *
  * Returns: errno
  */
 
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 
 	up_read(&sdp->sd_log_flush_lock);
 
-	gfs2_log_lock(sdp);
-	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks))
-		wake_up_process(sdp->sd_logd_process);
-	gfs2_log_unlock(sdp);
+	if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
+	    ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
+	    atomic_read(&sdp->sd_log_thresh2)))
+		wake_up(&sdp->sd_logd_waitq);
 }
 
 /**
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 {
 	gfs2_log_flush(sdp, NULL);
 	for (;;) {
-		gfs2_ail1_start(sdp, DIO_ALL);
+		gfs2_ail1_start(sdp);
 		if (gfs2_ail1_empty(sdp, DIO_ALL))
 			break;
 		msleep(10);
 	}
 }
 
+static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
+{
+	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+}
+
+static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
+{
+	unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
+	return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
+}
 
 /**
  * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 int gfs2_logd(void *data)
 {
 	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-	int need_flush;
+	unsigned long t = 1;
+	DEFINE_WAIT(wait);
+	unsigned preflush;
 
 	while (!kthread_should_stop()) {
-		/* Advance the log tail */
 
-		t = sdp->sd_log_flush_time +
-		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+		preflush = atomic_read(&sdp->sd_log_pinned);
+		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
+			gfs2_ail1_empty(sdp, DIO_ALL);
+			gfs2_log_flush(sdp, NULL);
+			gfs2_ail1_empty(sdp, DIO_ALL);
+		}
 
-		gfs2_ail1_empty(sdp, DIO_ALL);
-		gfs2_log_lock(sdp);
-		need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
-		gfs2_log_unlock(sdp);
-		if (need_flush || time_after_eq(jiffies, t)) {
+		if (gfs2_ail_flush_reqd(sdp)) {
+			gfs2_ail1_start(sdp);
+			io_schedule();
+			gfs2_ail1_empty(sdp, 0);
 			gfs2_log_flush(sdp, NULL);
-			sdp->sd_log_flush_time = jiffies;
+			gfs2_ail1_empty(sdp, DIO_ALL);
 		}
 
+		wake_up(&sdp->sd_log_waitq);
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
 		if (freezing(current))
 			refrigerator();
-		schedule_timeout_interruptible(t);
+
+		do {
+			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!gfs2_ail_flush_reqd(sdp) &&
+			    !gfs2_jrnl_flush_reqd(sdp) &&
+			    !kthread_should_stop())
+				t = schedule_timeout(t);
+		} while(t && !gfs2_ail_flush_reqd(sdp) &&
+			!gfs2_jrnl_flush_reqd(sdp) &&
+			!kthread_should_stop());
+		finish_wait(&sdp->sd_logd_waitq, &wait);
 	}
 
 	return 0;
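
The reservation fast path is now lock-free. The shape of the algorithm, reduced to its essentials (names illustrative, not the literal patch):

	static void reserve(atomic_t *free, unsigned int want,
			    wait_queue_head_t *waitq, wait_queue_head_t *logd)
	{
		DEFINE_WAIT(wait);
		unsigned int cur;
	retry:
		cur = atomic_read(free);
		if (cur <= want) {
			do {	/* exclusive wait: wake one reserver at a time */
				prepare_to_wait_exclusive(waitq, &wait,
							  TASK_UNINTERRUPTIBLE);
				wake_up(logd);	/* ask logd to make space */
				if (atomic_read(free) <= want)
					io_schedule();
				cur = atomic_read(free);
			} while (cur <= want);
			finish_wait(waitq, &wait);
		}
		/* claim the blocks; a racing reserver forces a retry */
		if (atomic_cmpxchg(free, cur, cur - want) != cur)
			goto retry;
	}

The cmpxchg retry loop replaces sd_log_reserve_mutex, and the exclusive wait plus the post-claim wake_up() hands the queue on to the next waiter fairly.
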
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 	sdp->sd_log_head = sdp->sd_log_tail = value;
 }
 
-unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
-			    unsigned int ssize);
+extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+				    unsigned int ssize);
 
-int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 
-struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
-struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
-				      struct buffer_head *real);
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
-
-static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
-{
-	if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
-		__gfs2_log_flush(sbd, gl);
-}
-
-void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-
-void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-int gfs2_logd(void *data);
+extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+					     struct buffer_head *real);
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+
+extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+extern int gfs2_logd(void *data);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	if (bd->bd_ail)
 		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
 	get_bh(bh);
+	atomic_inc(&sdp->sd_log_pinned);
 	trace_gfs2_pin(bd, 1);
 }
 
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	trace_gfs2_pin(bd, 0);
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
+	atomic_dec(&sdp->sd_log_pinned);
 }
 
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_glock_cachep)
 		goto fail;
 
-	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
+	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
 					sizeof(struct gfs2_glock) +
 					sizeof(struct address_space),
 					0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
 
 static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int err;
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
 	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	} while (bh != head);
 	unlock_page(page);
 
-	err = 0;
 	if (nr_underway == 0)
 		end_page_writeback(page);
 
-	return err;
+	return 0;
 }
 
 const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	struct gfs2_bufdata *bd = bh->b_private;
 
 	if (test_clear_buffer_pinned(bh)) {
+		atomic_dec(&sdp->sd_log_pinned);
 		list_del_init(&bd->bd_le.le_list);
 		if (meta) {
 			gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed1c496..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_incore_log_blocks = 1024;
-	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	spin_lock_init(&sdp->sd_trunc_lock);
 
 	spin_lock_init(&sdp->sd_log_lock);
-
+	atomic_set(&sdp->sd_log_pinned, 0);
 	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
 	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
 	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
 
-	mutex_init(&sdp->sd_log_reserve_mutex);
+	init_waitqueue_head(&sdp->sd_log_waitq);
+	init_waitqueue_head(&sdp->sd_logd_waitq);
 	INIT_LIST_HEAD(&sdp->sd_ail1_list);
 	INIT_LIST_HEAD(&sdp->sd_ail2_list);
 
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
 	struct dentry *dentry;
 	struct inode *inode;
 
-	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
 	if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 	} else {
 		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
 			fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			goto fail_jinode_gh;
 		}
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 
 		/* Map the extents for this journal's blocks */
 		map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_quotad;
 
-	sdp->sd_log_flush_time = jiffies;
-
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
 	if (error) {
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 					       GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
-	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
 	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
 	if (sdp->sd_args.ar_statfs_quantum) {
 		sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 60;
+	args.ar_commit = 30;
 	args.ar_statfs_quantum = 30;
 	args.ar_quota_quantum = 60;
 	args.ar_errors = GFS2_ERRORS_DEFAULT;
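
For scale, a worked example with a hypothetical 32768-block journal: the init_journal() lines above give thresh1 = 2*32768/5 = 13107 blocks and thresh2 = 4*32768/5 = 26214 blocks, i.e. 2/5 and 4/5 of the journal (the comment added in log.c approximates these as 1/3 and 2/3). These are the trigger points that gfs2_log_commit() and gfs2_logd test against.
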
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..49667d68769e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	unsigned blocksize, iblock, pos;
 	struct buffer_head *bh, *dibh;
 	struct page *page;
-	void *kaddr;
-	struct gfs2_quota *qp;
-	s64 value;
-	int err = -EIO;
+	void *kaddr, *ptr;
+	struct gfs2_quota q, *qp;
+	int err, nbytes;
 	u64 size;
 
 	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
 
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+	if (err < 0)
+		return err;
+
+	err = -EIO;
+	qp = &q;
+	qp->qu_value = be64_to_cpu(qp->qu_value);
+	qp->qu_value += change;
+	qp->qu_value = cpu_to_be64(qp->qu_value);
+	qd->qd_qb.qb_value = qp->qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+			qd->qd_qb.qb_warn = qp->qu_warn;
+		}
+		if (fdq->d_fieldmask & FS_DQ_BHARD) {
+			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+			qd->qd_qb.qb_limit = qp->qu_limit;
+		}
+	}
+
+	/* Write the quota into the quota file on disk */
+	ptr = qp;
+	nbytes = sizeof(struct gfs2_quota);
+get_a_page:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	if (!buffer_mapped(bh)) {
 		gfs2_block_map(inode, iblock, bh, 1);
 		if (!buffer_mapped(bh))
-			goto unlock;
+			goto unlock_out;
+		/* If it's a newly allocated disk block for quota, zero it */
+		if (buffer_new(bh)) {
+			memset(bh->b_data, 0, bh->b_size);
+			set_buffer_uptodate(bh);
+		}
 	}
 
 	if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
677 ll_rw_block(READ_META, 1, &bh); 707 ll_rw_block(READ_META, 1, &bh);
678 wait_on_buffer(bh); 708 wait_on_buffer(bh);
679 if (!buffer_uptodate(bh)) 709 if (!buffer_uptodate(bh))
680 goto unlock; 710 goto unlock_out;
681 } 711 }
682 712
683 gfs2_trans_add_bh(ip->i_gl, bh, 0); 713 gfs2_trans_add_bh(ip->i_gl, bh, 0);
684 714
685 kaddr = kmap_atomic(page, KM_USER0); 715 kaddr = kmap_atomic(page, KM_USER0);
686 qp = kaddr + offset; 716 if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
687 value = (s64)be64_to_cpu(qp->qu_value) + change; 717 nbytes = PAGE_CACHE_SIZE - offset;
688 qp->qu_value = cpu_to_be64(value); 718 memcpy(kaddr + offset, ptr, nbytes);
689 qd->qd_qb.qb_value = qp->qu_value;
690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
700 flush_dcache_page(page); 719 flush_dcache_page(page);
701 kunmap_atomic(kaddr, KM_USER0); 720 kunmap_atomic(kaddr, KM_USER0);
721 unlock_page(page);
722 page_cache_release(page);
702 723
724 /* If quota straddles page boundary, we need to update the rest of the
725 * quota at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
727 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0;
730 index++;
731 goto get_a_page;
732 }
733
734 /* Update the disk inode timestamp and size (if extended) */
703 err = gfs2_meta_inode_buffer(ip, &dibh); 735 err = gfs2_meta_inode_buffer(ip, &dibh);
704 if (err) 736 if (err)
705 goto unlock; 737 goto out;
706 738
707 size = loc + sizeof(struct gfs2_quota); 739 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) { 740 if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
715 brelse(dibh); 747 brelse(dibh);
716 mark_inode_dirty(inode); 748 mark_inode_dirty(inode);
717 749
718unlock: 750out:
751 return err;
752unlock_out:
719 unlock_page(page); 753 unlock_page(page);
720 page_cache_release(page); 754 page_cache_release(page);
721 return err; 755 return err;
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
779 * rgrp since it won't be allocated during the transaction 813 * rgrp since it won't be allocated during the transaction
780 */ 814 */
781 al->al_requested = 1; 815 al->al_requested = 1;
782 /* +1 in the end for block requested above for unstuffing */ 816 /* +3 at the end: one for the unstuffing block, one for the inode
783 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; 817 * size update block, and one more in case the quota straddles a
818 * page boundary and two blocks need updating instead of one */
819 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
784 820
785 if (nalloc) 821 if (nalloc)
786 al->al_requested += nalloc * (data_blocks + ind_blocks); 822 al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1418 1454
1419 memset(fqs, 0, sizeof(struct fs_quota_stat)); 1455 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION; 1456 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) 1457
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1458 switch (sdp->sd_args.ar_quota) {
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) 1459 case GFS2_QUOTA_ON:
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1460 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1461 /*FALLTHRU*/
1462 case GFS2_QUOTA_ACCOUNT:
1463 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1464 break;
1465 case GFS2_QUOTA_OFF:
1466 break;
1467 }
1468
1425 if (sdp->sd_quota_inode) { 1469 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; 1470 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; 1471 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1432,8 +1476,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1432 return 0; 1476 return 0;
1433} 1477}
1434 1478
1435static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, 1479static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1436 struct fs_disk_quota *fdq) 1480 struct fs_disk_quota *fdq)
1437{ 1481{
1438 struct gfs2_sbd *sdp = sb->s_fs_info; 1482 struct gfs2_sbd *sdp = sb->s_fs_info;
1439 struct gfs2_quota_lvb *qlvb; 1483 struct gfs2_quota_lvb *qlvb;
@@ -1477,8 +1521,8 @@ out:
1477/* GFS2 only supports a subset of the XFS fields */ 1521/* GFS2 only supports a subset of the XFS fields */
1478#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1522#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1479 1523
1480static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, 1524static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1481 struct fs_disk_quota *fdq) 1525 struct fs_disk_quota *fdq)
1482{ 1526{
1483 struct gfs2_sbd *sdp = sb->s_fs_info; 1527 struct gfs2_sbd *sdp = sb->s_fs_info;
1484 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1528 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1585,7 +1629,7 @@ out_put:
1585const struct quotactl_ops gfs2_quotactl_ops = { 1629const struct quotactl_ops gfs2_quotactl_ops = {
1586 .quota_sync = gfs2_quota_sync, 1630 .quota_sync = gfs2_quota_sync,
1587 .get_xstate = gfs2_quota_get_xstate, 1631 .get_xstate = gfs2_quota_get_xstate,
1588 .get_xquota = gfs2_xquota_get, 1632 .get_dqblk = gfs2_get_dqblk,
1589 .set_xquota = gfs2_xquota_set, 1633 .set_dqblk = gfs2_set_dqblk,
1590}; 1634};
1591 1635
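
A note on the gfs2_adjust_quota() rewrite above: rather than patching a
single field through kmap_atomic(), the function now reads the whole
on-disk record into a stack copy via gfs2_internal_read(), applies the
value change and any FS_DQ_BSOFT/FS_DQ_BHARD limits there, and writes
the record back through the page cache. Since a struct gfs2_quota may
begin close enough to the end of a page that it does not fit, the
write-back is split into at most two chunks by the get_a_page loop. A
minimal sketch of that split, with write_chunk() as a hypothetical
stand-in for the patch's grab_cache_page()/kmap_atomic()/memcpy()/
kunmap_atomic() sequence:

/* Sketch only; needs <linux/pagemap.h> for PAGE_CACHE_*.  write_chunk()
 * is a hypothetical helper, not a kernel API. */
static int write_quota_record(struct address_space *mapping, loff_t loc,
			      const struct gfs2_quota *q)
{
	const char *ptr = (const char *)q;
	unsigned long index = loc >> PAGE_CACHE_SHIFT;
	unsigned int offset = loc & (PAGE_CACHE_SIZE - 1);
	unsigned int nbytes = sizeof(struct gfs2_quota);

	while (nbytes) {
		unsigned int chunk = nbytes;

		if (offset + chunk > PAGE_CACHE_SIZE)
			chunk = PAGE_CACHE_SIZE - offset;
		if (write_chunk(mapping, index, offset, ptr, chunk))
			return -EIO;
		ptr += chunk;
		nbytes -= chunk;
		offset = 0;	/* any second chunk starts the next page */
		index++;
	}
	return 0;
}
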
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
@@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes 949 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
949 * @rgd: The rgrp 950 * @rgd: The rgrp
950 * 951 *
951 * Returns: The inode, if one has been found 952 * Returns: The block address of an unlinked
953 * inode to reclaim, or 0 if none was found.
952 */ 954 */
953 955
954static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 956static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
955 u64 skip) 957 u64 skip)
956{ 958{
957 struct inode *inode;
958 u32 goal = 0, block; 959 u32 goal = 0, block;
959 u64 no_addr; 960 u64 no_addr;
960 struct gfs2_sbd *sdp = rgd->rd_sbd; 961 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
979 if (no_addr == skip) 980 if (no_addr == skip)
980 continue; 981 continue;
981 *last_unlinked = no_addr; 982 *last_unlinked = no_addr;
982 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 983 return no_addr;
983 no_addr, -1, 1);
984 if (!IS_ERR(inode))
985 return inode;
986 } 984 }
987 985
988 rgd->rd_flags &= ~GFS2_RDF_CHECK; 986 rgd->rd_flags &= ~GFS2_RDF_CHECK;
989 return NULL; 987 return 0;
990} 988}
991 989
992/** 990/**
@@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1067 * Try to acquire rgrp in way which avoids contending with others. 1065 * Try to acquire rgrp in way which avoids contending with others.
1068 * 1066 *
1069 * Returns: errno 1067 * Returns: errno
1068 * unlinked: the block address of an unlinked inode to be reclaimed
1070 */ 1069 */
1071 1070
1072static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) 1071static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1072 u64 *last_unlinked)
1073{ 1073{
1074 struct inode *inode = NULL;
1075 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1074 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1076 struct gfs2_rgrpd *rgd, *begin = NULL; 1075 struct gfs2_rgrpd *rgd, *begin = NULL;
1077 struct gfs2_alloc *al = ip->i_alloc; 1076 struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1080 int loops = 0; 1079 int loops = 0;
1081 int error, rg_locked; 1080 int error, rg_locked;
1082 1081
1082 *unlinked = 0;
1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1084 1084
1085 while (rgd) { 1085 while (rgd) {
@@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1096 case 0: 1096 case 0:
1097 if (try_rgrp_fit(rgd, al)) 1097 if (try_rgrp_fit(rgd, al))
1098 goto out; 1098 goto out;
1099 if (rgd->rd_flags & GFS2_RDF_CHECK) 1099 /* If the rg came in already locked, there's no
1100 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1100 way we can recover from a failed try_rgrp_unlink
1101 because that would require an iput which can only
1102 happen after the rgrp is unlocked. */
1103 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1104 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1105 ip->i_no_addr);
1101 if (!rg_locked) 1106 if (!rg_locked)
1102 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1107 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1103 if (inode) 1108 if (*unlinked)
1104 return inode; 1109 return -EAGAIN;
1105 /* fall through */ 1110 /* fall through */
1106 case GLR_TRYFAILED: 1111 case GLR_TRYFAILED:
1107 rgd = recent_rgrp_next(rgd); 1112 rgd = recent_rgrp_next(rgd);
1108 break; 1113 break;
1109 1114
1110 default: 1115 default:
1111 return ERR_PTR(error); 1116 return error;
1112 } 1117 }
1113 } 1118 }
1114 1119
@@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1130 case 0: 1135 case 0:
1131 if (try_rgrp_fit(rgd, al)) 1136 if (try_rgrp_fit(rgd, al))
1132 goto out; 1137 goto out;
1133 if (rgd->rd_flags & GFS2_RDF_CHECK) 1138 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1134 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1139 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1140 ip->i_no_addr);
1135 if (!rg_locked) 1141 if (!rg_locked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1142 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1143 if (*unlinked)
1138 return inode; 1144 return -EAGAIN;
1139 break; 1145 break;
1140 1146
1141 case GLR_TRYFAILED: 1147 case GLR_TRYFAILED:
@@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1143 break; 1149 break;
1144 1150
1145 default: 1151 default:
1146 return ERR_PTR(error); 1152 return error;
1147 } 1153 }
1148 1154
1149 rgd = gfs2_rgrpd_get_next(rgd); 1155 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1152 1158
1153 if (rgd == begin) { 1159 if (rgd == begin) {
1154 if (++loops >= 3) 1160 if (++loops >= 3)
1155 return ERR_PTR(-ENOSPC); 1161 return -ENOSPC;
1156 if (!skipped) 1162 if (!skipped)
1157 loops++; 1163 loops++;
1158 flags = 0; 1164 flags = 0;
@@ -1172,7 +1178,7 @@ out:
1172 forward_rgrp_set(sdp, rgd); 1178 forward_rgrp_set(sdp, rgd);
1173 } 1179 }
1174 1180
1175 return NULL; 1181 return 0;
1176} 1182}
1177 1183
1178/** 1184/**
@@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1186{ 1192{
1187 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1188 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1189 struct inode *inode;
1190 int error = 0; 1195 int error = 0;
1191 u64 last_unlinked = NO_BLOCK; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1192 1197
1193 if (gfs2_assert_warn(sdp, al->al_requested)) 1198 if (gfs2_assert_warn(sdp, al->al_requested))
1194 return -EINVAL; 1199 return -EINVAL;
@@ -1204,17 +1209,27 @@ try_again:
1204 if (error) 1209 if (error)
1205 return error; 1210 return error;
1206 1211
1207 inode = get_local_rgrp(ip, &last_unlinked); 1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1208 if (inode) { 1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contain its block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) {
1209 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1210 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1211 if (IS_ERR(inode)) 1220 if (error != -EAGAIN)
1212 return PTR_ERR(inode); 1221 return error;
1213 iput(inode); 1222
1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1224 /* regardless of whether or not gfs2_process_unlinked_inode
1225 was successful, we don't want to repeat it again. */
1226 last_unlinked = unlinked;
1214 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1228 error = 0;
1229
1215 goto try_again; 1230 goto try_again;
1216 } 1231 }
1217 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1218 al->al_file = file; 1233 al->al_file = file;
1219 al->al_line = line; 1234 al->al_line = line;
1220 1235
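
The net effect of the rgrp.c changes above is a new contract between
get_local_rgrp() and its caller: an unlinked-but-allocated dinode is no
longer looked up (and thus iput()) while rgrp locks may still be held.
Instead its block address comes back alongside -EAGAIN and the reclaim
happens outside. The caller loop from gfs2_inplace_reserve_i() above,
distilled (rindex locking and error paths trimmed):

	u64 last_unlinked = NO_BLOCK, unlinked;
	int error;

try_again:
	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
	if (error == -EAGAIN) {
		/* Reclaim with the rgrp glock dropped; remember the
		 * block so the same dinode is never retried. */
		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
		last_unlinked = unlinked;
		gfs2_log_flush(sdp, NULL);
		goto try_again;
	}
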
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1113 int error; 1113 int error;
1114 1114
1115 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1116 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_logd_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum; 1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow) 1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0; 1119 args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1160 else 1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); 1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1162 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1163 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_logd_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum; 1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) { 1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0; 1166 gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1305 } 1305 }
1306 if (args->ar_discard) 1306 if (args->ar_discard)
1307 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1308 val = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_logd_secs;
1309 if (val != 60) 1309 if (val != 30)
1310 seq_printf(s, ",commit=%d", val); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum; 1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30) 1312 if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1334 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier"); 1336 seq_printf(s, ",nobarrier");
1337 1337 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1338 seq_printf(s, ",demote_interface_used");
1338 return 0; 1339 return 0;
1339} 1340}
1340 1341
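
A consistency point across the two gfs2/super.c hunks above: since
gfs2_show_options() prints only values that differ from the mount-time
defaults, halving the default commit interval from 60 to 30 seconds in
gfs2_get_sb() (ops_fstype.c earlier in this diff) requires the matching
threshold change here, or every default mount would report a spurious
commit= option. Condensed:

	/* Defaults (gfs2_get_sb) and print thresholds
	 * (gfs2_show_options) must move in lockstep: */
	args.ar_commit = 30;			/* new default */
	/* ... later, in gfs2_show_options(): */
	val = sdp->sd_tune.gt_logd_secs;
	if (val != 30)				/* print non-defaults only */
		seq_printf(s, ",commit=%d", val);
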
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[]; 57extern const struct xattr_handler *gfs2_xattr_handlers[];
58 58
59#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
60 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 54fd98425991..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -232,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
232 glops = gfs2_glops_list[gltype]; 232 glops = gfs2_glops_list[gltype];
233 if (glops == NULL) 233 if (glops == NULL)
234 return -EINVAL; 234 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
236 fs_info(sdp, "demote interface used\n");
235 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); 237 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
236 if (rv) 238 if (rv)
237 return rv; 239 return rv;
@@ -468,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
468} \ 470} \
469TUNE_ATTR_2(name, name##_store) 471TUNE_ATTR_2(name, name##_store)
470 472
471TUNE_ATTR(incore_log_blocks, 0);
472TUNE_ATTR(log_flush_secs, 0);
473TUNE_ATTR(quota_warn_period, 0); 473TUNE_ATTR(quota_warn_period, 0);
474TUNE_ATTR(quota_quantum, 0); 474TUNE_ATTR(quota_quantum, 0);
475TUNE_ATTR(max_readahead, 0); 475TUNE_ATTR(max_readahead, 0);
@@ -481,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
482 482
483static struct attribute *tune_attrs[] = { 483static struct attribute *tune_attrs[] = {
484 &tune_attr_incore_log_blocks.attr,
485 &tune_attr_log_flush_secs.attr,
486 &tune_attr_quota_warn_period.attr, 484 &tune_attr_quota_warn_period.attr,
487 &tune_attr_quota_quantum.attr, 485 &tune_attr_quota_quantum.attr,
488 &tune_attr_max_readahead.attr, 486 &tune_attr_max_readahead.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
23#include "meta_io.h" 23#include "meta_io.h"
24#include "trans.h" 24#include "trans.h"
25#include "util.h" 25#include "util.h"
26#include "trace_gfs2.h"
26 27
27int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
28 unsigned int revokes) 29 unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
75 return error; 76 return error;
76} 77}
77 78
79/**
80 * gfs2_log_release - Release a given number of log blocks
81 * @sdp: The GFS2 superblock
82 * @blks: The number of blocks
83 *
84 */
85
86static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
87{
88
89 atomic_add(blks, &sdp->sd_log_blks_free);
90 trace_gfs2_log_blocks(sdp, blks);
91 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
92 sdp->sd_jdesc->jd_blocks);
93 up_read(&sdp->sd_log_flush_lock);
94}
95
78void gfs2_trans_end(struct gfs2_sbd *sdp) 96void gfs2_trans_end(struct gfs2_sbd *sdp)
79{ 97{
80 struct gfs2_trans *tr = current->journal_info; 98 struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
1535 return error; 1535 return error;
1536} 1536}
1537 1537
1538static struct xattr_handler gfs2_xattr_user_handler = { 1538static const struct xattr_handler gfs2_xattr_user_handler = {
1539 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1540 .flags = GFS2_EATYPE_USR, 1540 .flags = GFS2_EATYPE_USR,
1541 .get = gfs2_xattr_get, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set, 1542 .set = gfs2_xattr_set,
1543}; 1543};
1544 1544
1545static struct xattr_handler gfs2_xattr_security_handler = { 1545static const struct xattr_handler gfs2_xattr_security_handler = {
1546 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1547 .flags = GFS2_EATYPE_SECURITY, 1547 .flags = GFS2_EATYPE_SECURITY,
1548 .get = gfs2_xattr_get, 1548 .get = gfs2_xattr_get,
1549 .set = gfs2_xattr_set, 1549 .set = gfs2_xattr_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552const struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler, 1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler, 1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler, 1555 &gfs2_xattr_system_handler,
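
The const-ification above (matched by the super.h declaration, and
mirrored for jffs2 further down) lets every xattr handler table live in
read-only data. Nothing in the VFS ever writes through these pointers;
generic lookup just walks a NULL-terminated array of const pointers,
roughly as in this sketch (modeled loosely on the prefix matching in
fs/xattr.c, not a quote of it):

#include <linux/string.h>
#include <linux/xattr.h>

static const struct xattr_handler *
find_xattr_handler(const struct xattr_handler **handlers, const char *name)
{
	const struct xattr_handler *h;

	for (; (h = *handlers) != NULL; handlers++) {
		if (!strncmp(h->prefix, name, strlen(h->prefix)))
			return h;
	}
	return NULL;
}
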
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
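
The hfsplus hunks above are a standard BKL pushdown of this era: the
->ioctl method (called with the big kernel lock held) becomes
->unlocked_ioctl, the inode is derived from the file, and lock_kernel()
is taken explicitly so later patches can shrink or remove it. The
per-branch unlock_kernel() calls the patch sprinkles in are exactly why
a single-exit shape is often preferred for such conversions; a sketch,
with do_fs_ioctl() as a hypothetical helper rather than an hfsplus
function:

#include <linux/fs.h>
#include <linux/smp_lock.h>

static long fs_unlocked_ioctl(struct file *filp, unsigned int cmd,
			      unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	lock_kernel();		/* what the old .ioctl method implied */
	ret = do_fs_ioctl(inode, filp, cmd, arg);
	unlock_kernel();	/* one unlock, no per-branch copies */
	return ret;
}
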
diff --git a/fs/inode.c b/fs/inode.c
index 258ec22bb298..2bee20ae3d65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
286 */ 286 */
287void __iget(struct inode *inode) 287void __iget(struct inode *inode)
288{ 288{
289 if (atomic_read(&inode->i_count)) { 289 if (atomic_inc_return(&inode->i_count) != 1)
290 atomic_inc(&inode->i_count);
291 return; 290 return;
292 } 291
293 atomic_inc(&inode->i_count);
294 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 292 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
295 list_move(&inode->i_list, &inode_in_use); 293 list_move(&inode->i_list, &inode_in_use);
296 inodes_stat.nr_unused--; 294 inodes_stat.nr_unused--;
@@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1608 inode->i_ino); 1606 inode->i_ino);
1609} 1607}
1610EXPORT_SYMBOL(init_special_inode); 1608EXPORT_SYMBOL(init_special_inode);
1609
1610/**
1611 * inode_init_owner - Init uid,gid,mode for new inode according to POSIX standards
1612 * @inode: New inode
1613 * @dir: Directory inode
1614 * @mode: mode of the new inode
1615 */
1616void inode_init_owner(struct inode *inode, const struct inode *dir,
1617 mode_t mode)
1618{
1619 inode->i_uid = current_fsuid();
1620 if (dir && dir->i_mode & S_ISGID) {
1621 inode->i_gid = dir->i_gid;
1622 if (S_ISDIR(mode))
1623 mode |= S_ISGID;
1624 } else
1625 inode->i_gid = current_fsgid();
1626 inode->i_mode = mode;
1627}
1628EXPORT_SYMBOL(inode_init_owner);
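
Two independent cleanups above: __iget() replaces a read-then-increment
pair with a single atomic_inc_return(), and the new inode_init_owner()
helper centralizes uid/gid/mode setup, including the setgid-directory
rule (files inherit the directory's gid; new directories also inherit
S_ISGID). A typical call site, as the jfs conversion later in this diff
shows, collapses to:

	/* Sketch of a filesystem inode-allocation path; error handling
	 * and fs-specific fields omitted. */
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	inode_init_owner(inode, dir, mode);	/* uid, gid, mode, setgid rule */
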
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 87 * super.c
88 */ 88 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 89extern int do_remount_sb(struct super_block *, int, void *, int);
90extern void __put_super(struct super_block *sb);
91extern void put_super(struct super_block *sb);
90 92
91/* 93/*
92 * open.c 94 * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da939..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
525 if (sb->s_op->freeze_fs == NULL) 525 if (sb->s_op->freeze_fs == NULL)
526 return -EOPNOTSUPP; 526 return -EOPNOTSUPP;
527 527
528 /* If a blockdevice-backed filesystem isn't specified, return. */
529 if (sb->s_bdev == NULL)
530 return -EINVAL;
531
532 /* Freeze */ 528 /* Freeze */
533 sb = freeze_bdev(sb->s_bdev); 529 return freeze_super(sb);
534 if (IS_ERR(sb))
535 return PTR_ERR(sb);
536 return 0;
537} 530}
538 531
539static int ioctl_fsthaw(struct file *filp) 532static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
543 if (!capable(CAP_SYS_ADMIN)) 536 if (!capable(CAP_SYS_ADMIN))
544 return -EPERM; 537 return -EPERM;
545 538
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 /* Thaw */ 539 /* Thaw */
551 return thaw_bdev(sb->s_bdev, sb); 540 return thaw_super(sb);
552} 541}
553 542
554/* 543/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ecb44c94ba8d..28a9ddaa0c49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -786,6 +786,12 @@ wait_for_iobuf:
786 786
787 jbd_debug(3, "JBD: commit phase 6\n"); 787 jbd_debug(3, "JBD: commit phase 6\n");
788 788
789 /* All metadata is written, now write commit record and do cleanup */
790 spin_lock(&journal->j_state_lock);
791 J_ASSERT(commit_transaction->t_state == T_COMMIT);
792 commit_transaction->t_state = T_COMMIT_RECORD;
793 spin_unlock(&journal->j_state_lock);
794
789 if (journal_write_commit_record(journal, commit_transaction)) 795 if (journal_write_commit_record(journal, commit_transaction))
790 err = -EIO; 796 err = -EIO;
791 797
@@ -923,7 +929,7 @@ restart_loop:
923 929
924 jbd_debug(3, "JBD: commit phase 8\n"); 930 jbd_debug(3, "JBD: commit phase 8\n");
925 931
926 J_ASSERT(commit_transaction->t_state == T_COMMIT); 932 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
927 933
928 commit_transaction->t_state = T_FINISHED; 934 commit_transaction->t_state = T_FINISHED;
929 J_ASSERT(commit_transaction == journal->j_committing_transaction); 935 J_ASSERT(commit_transaction == journal->j_committing_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..93d1e47647bd 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent a barrier request
569 * connected with a transaction commit. If 0 is returned, the transaction
570 * may or may not have sent the barrier. Used to avoid sending the barrier
571 * twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
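
journal_trans_will_send_data_barrier() relies on the T_COMMIT_RECORD
state added in the jbd/commit.c hunk above: past T_COMMIT the commit
record (and with it the barrier) may already be on its way, so the
function can only promise a future barrier while the transaction sits
at or before T_COMMIT. The point of exporting it is to let a
filesystem's fsync path skip a redundant cache flush; presumably the
fs/ext3/fsync.c change in this same series uses it along these lines
(a sketch of the assumed caller shape, not a quote of that hunk):

	int needs_barrier = 0;

	if (test_opt(inode->i_sb, BARRIER) &&
	    !journal_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = 1;	/* the commit won't flush for us */
	ret = log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
				   BLKDEV_IFL_WAIT);
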
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
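
The jbd and jbd2 hunks above both track the blkdev_issue_flush()
interface change: the function now takes an allocation mask and a flags
word, with waiting made explicit through BLKDEV_IFL_WAIT. The
2.6.35-era prototype and the synchronous idiom used here:

#include <linux/blkdev.h>

/* Prototype as of this series (see linux/blkdev.h): */
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
		       sector_t *error_sector, unsigned long flags);

/* Synchronous flush of the journal device, as in the hunks above: */
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
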
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..a33aab6b5e68 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 419 return rc;
420} 420}
421 421
422struct xattr_handler jffs2_acl_access_xattr_handler = { 422const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 424 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
@@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 427 .set = jffs2_acl_setxattr,
428}; 428};
429 429
430struct xattr_handler jffs2_acl_default_xattr_handler = { 430const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 432 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
23 23
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 25{
26 spin_lock(&c->erase_completion_lock); 26 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 27 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 28 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 29}
31 30
32/* This must only ever be called when no GC thread is currently running */ 31/* This must only ever be called when no GC thread is currently running */
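
jffs2_garbage_collect_trigger() now asserts erase_completion_lock
rather than taking it, since nearly every call site (nodemgmt.c,
wbuf.c, erase.c, scan.c below) already holds that lock when it decides
to wake the GC thread. The caller-side contract is therefore always:

	spin_lock(&c->erase_completion_lock);
	jffs2_garbage_collect_trigger(c);
	spin_unlock(&c->erase_completion_lock);
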
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 103 jffs2_erase_failed(c, jeb, bad_offset);
104} 104}
105 105
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 106int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 107{
108 struct jffs2_eraseblock *jeb; 108 struct jffs2_eraseblock *jeb;
109 int work_done = 0;
109 110
110 mutex_lock(&c->erase_free_sem); 111 mutex_lock(&c->erase_free_sem);
111 112
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 122 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 123 jffs2_mark_erased_block(c, jeb);
123 124
125 work_done++;
124 if (!--count) { 126 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 127 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 128 goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 159 mutex_unlock(&c->erase_free_sem);
158 done: 160 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 161 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
162 return work_done;
160} 163}
161 164
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 165static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 168 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 169 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 170 list_move_tail(&jeb->list, &c->erase_complete_list);
171 /* Wake the GC thread to mark them clean */
172 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 173 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 174 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 175 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 176}
173 177
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 178static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
487 491
488refile: 492refile:
489 /* Stick it back on the list from whence it came and come back later */ 493 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 494 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 495 spin_lock(&c->erase_completion_lock);
496 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 497 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 498 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 499 mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..86e0821fc989 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 313 case S_IFBLK:
314 case S_IFCHR: 314 case S_IFCHR:
315 /* Read the device numbers from the media */ 315 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 316 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 317 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 319 goto error_io;
320 } 320 }
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 326 goto error;
327 } 327 }
328 if (f->metadata->size == sizeof(jdev.old)) 328 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 329 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 330 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 331 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 332
333 case S_IFSOCK: 333 case S_IFSOCK:
334 case S_IFIFO: 334 case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 214 return ret;
215 } 215 }
216 216
217 /* If there are any blocks which need erasing, erase them now */
218 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) {
223 mutex_unlock(&c->alloc_sem);
224 return 0;
225 }
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock);
228 }
229
217 /* First, work out which block we're garbage-collecting */ 230 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 231 jeb = c->gcblock;
219 232
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 235
223 if (!jeb) { 236 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 237 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 238 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 239 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 240 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 241 return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 448 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 449 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 450 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 451 jffs2_garbage_collect_trigger(c);
439 } 452 }
440 spin_unlock(&c->erase_completion_lock); 453 spin_unlock(&c->erase_completion_lock);
441 454
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 313{
314 if (old_valid_dev(rdev)) { 314 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 315 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 316 return sizeof(jdev->old_id);
317 } else { 317 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 318 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 319 return sizeof(jdev->new_id);
320 } 320 }
321} 321}
322 322
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 464int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 465
466/* erase.c */ 466/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 467int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 469
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 191359dde4e1..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
116 116
117 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
118 118
119 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
120 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
121 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
122 return ret; 134 return ret;
123 135
124 cond_resched(); 136 cond_resched();
@@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
217 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
218 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
219 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
220 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
221 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
222 ejeb->offset)); 234 ejeb->offset));
223 } 235 }
@@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
469void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
470{ 482{
471 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
472 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
473 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
474} 488}
475 489
@@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
611 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
612 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
613 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
614 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
615 } else { 629 } else {
616 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
617 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
732 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
733 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
734 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
735 if (c->unchecked_size) { 753 if (c->unchecked_size) {
736 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
737 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
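
The new wait in jffs2_reserve_space() above is the classic open-coded
waitqueue sleep: queue the waiter and set the task state before
dropping the lock, so the wake_up(&c->erase_wait) added in erase.c
cannot slip in between the emptiness test and schedule(). One caveat:
the on-stack wait entry must also come off the queue once the task
wakes; the hunk above omits that step, and mainline performs
remove_wait_queue() right after schedule(). The full pattern:

	DECLARE_WAITQUEUE(wait, current);

	set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&c->erase_wait, &wait);
	spin_unlock(&c->erase_completion_lock);	/* queued: no lost wakeup */
	schedule();		/* woken by wake_up(&c->erase_wait) */
	remove_wait_queue(&c->erase_wait, &wait);
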
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..035a767f958b 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..a2d58c96f1b4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_trusted_xattr_handler = {
+const struct xattr_handler jffs2_trusted_xattr_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = jffs2_trusted_listxattr,
 	.set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_user_xattr_handler = {
+const struct xattr_handler jffs2_user_xattr_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = jffs2_user_listxattr,
 	.set = jffs2_user_setxattr,
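
The jffs2 hunks above are one mechanical change: every xattr_handler table and handler instance gains a const qualifier so these function-pointer tables can live in read-only data. A minimal sketch of the resulting shape — the myfs_* names are hypothetical, only the const placement is the point:

/* Sketch: const-qualified xattr handler, as in the hunks above. */
static const struct xattr_handler myfs_user_xattr_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= myfs_user_listxattr,
	.get	= myfs_user_getxattr,
	.set	= myfs_user_setxattr,
};

/* NULL-terminated table of pointers to const handlers; the VFS only
 * ever reads this, so nothing here needs to be writable. */
static const struct xattr_handler *myfs_xattr_handlers[] = {
	&myfs_user_xattr_handler,
	NULL
};
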
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..85d9ec659225 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -98,7 +98,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (rc)
 		return rc;
 
-	if (iattr->ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, iattr))
 		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
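
The one-line change above widens when jfs pre-initializes quota: the old test only caught truncation (ATTR_SIZE), while uid/gid changes also move usage between dquots. is_quota_modification() bundles all three cases; paraphrased from the quotaops header of this era, its test is roughly:

/* Paraphrase of is_quota_modification(): true when the setattr would
 * change what quota is charged — a real size change or ownership change. */
static inline int is_quota_modification(struct inode *inode, struct iattr *ia)
{
	return (ia->ia_valid & ATTR_SIZE && ia->ia_size != inode->i_size) ||
	       (ia->ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
	       (ia->ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid);
}
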
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		goto fail_unlock;
 	}
 
-	inode->i_uid = current_fsuid();
-	if (parent->i_mode & S_ISGID) {
-		inode->i_gid = parent->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
+	inode_init_owner(inode, parent, mode);
 	/*
 	 * New inodes need to save sane values on disk when
 	 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	if (rc)
 		goto fail_drop;
 
-	inode->i_mode = mode;
 	/* inherit flags from parent */
 	jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
 
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		if (S_ISLNK(mode))
 			jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
 	}
-	jfs_inode->mode2 |= mode;
+	jfs_inode->mode2 |= inode->i_mode;
 
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
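
This hunk, and the logfs, minix, and nilfs2 hunks below, replace the same open-coded owner setup with the new inode_init_owner() helper. Its effect is essentially the block each caller deletes — a sketch, not the verbatim helper:

/* Rough equivalent of inode_init_owner(): owner comes from the caller's
 * credentials; the parent directory's setgid bit propagates its group,
 * and new subdirectories inherit setgid as well. */
void inode_init_owner(struct inode *inode, const struct inode *dir, int mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}

Centralizing this also removes per-filesystem drift: note how jfs, logfs, and minix each open-coded the rule slightly differently before the change.
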
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 755a92e8daa7..f602e230e162 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
 	inode->i_mode = mode;
 	logfs_set_ino_generation(sb, inode);
 
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	}
-
+	inode_init_owner(inode, dir, mode);
 	logfs_inode_setops(inode);
 	insert_inode_hash(inode);
 
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
 	clear_inode(inode);	/* clear in-memory copy */
 }
 
-struct inode * minix_new_inode(const struct inode * dir, int * error)
+struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
 {
 	struct super_block *sb = dir->i_sb;
 	struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 		iput(inode);
 		return NULL;
 	}
-	inode->i_uid = current_fsuid();
-	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
 extern struct inode *minix_iget(struct super_block *, unsigned long);
 extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
 extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct inode * minix_new_inode(const struct inode * dir, int * error);
+extern struct inode * minix_new_inode(const struct inode *, int, int *);
 extern void minix_free_inode(struct inode * inode);
 extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
 extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	inode = minix_new_inode(dir, &error);
+	inode = minix_new_inode(dir, mode, &error);
 
 	if (inode) {
-		inode->i_mode = mode;
 		minix_set_inode(inode, rdev);
 		mark_inode_dirty(inode);
 		error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
 	if (i > dir->i_sb->s_blocksize)
 		goto out;
 
-	inode = minix_new_inode(dir, &err);
+	inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
 	if (!inode)
 		goto out;
 
-	inode->i_mode = S_IFLNK | 0777;
 	minix_set_inode(inode, 0);
 	err = page_symlink(inode, symname, i);
 	if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
 
 	inode_inc_link_count(dir);
 
-	inode = minix_new_inode(dir, &err);
+	inode = minix_new_inode(dir, mode, &err);
 	if (!inode)
 		goto out_dir;
 
-	inode->i_mode = S_IFDIR | mode;
-	if (dir->i_mode & S_ISGID)
-		inode->i_mode |= S_ISGID;
 	minix_set_inode(inode, 0);
 
 	inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc3..48e1f60520ea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 {
 	dput(nd->path.dentry);
-	if (nd->path.mnt != path->mnt)
+	if (nd->path.mnt != path->mnt) {
 		mntput(nd->path.mnt);
-	nd->path.mnt = path->mnt;
+		nd->path.mnt = path->mnt;
+	}
 	nd->path.dentry = path->dentry;
 }
 
531 532
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..92dde6f8d893 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -51,7 +51,7 @@ const struct file_operations ncp_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= ncp_readdir,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..b93870892892 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
 	.llseek		= ncp_remote_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 #include <linux/ncp_fs.h>
 
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-static int __ncp_ioctl(struct inode *inode, struct file *filp,
-		       unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
 	}
 }
 
-int ncp_ioctl(struct inode *inode, struct file *filp,
-	      unsigned int cmd, unsigned long arg)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	int ret;
+	long ret;
 
+	lock_kernel();
 	if (ncp_ioctl_need_write(cmd)) {
 		/*
 		 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
 		 * -EACCESS, so it seems consistent to keep
 		 * that here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt))
-			return -EACCES;
+		if (mnt_want_write(filp->f_path.mnt)) {
+			ret = -EACCES;
+			goto out;
+		}
 	}
-	ret = __ncp_ioctl(inode, filp, cmd, arg);
+	ret = __ncp_ioctl(filp, cmd, arg);
 	if (ncp_ioctl_need_write(cmd))
 		mnt_drop_write(filp->f_path.mnt);
+
+out:
+	unlock_kernel();
 	return ret;
 }
 
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
+	long ret;
 
 	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
-	ret = ncp_ioctl(inode, file, cmd, arg);
+	ret = ncp_ioctl(file, cmd, arg);
 	unlock_kernel();
 	return ret;
 }
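
The change above is the standard ->ioctl to ->unlocked_ioctl migration of this period: the new hook receives no inode and is no longer called under the BKL, so the function fetches the inode from the file itself and takes lock_kernel()/unlock_kernel() explicitly, preserving the old serialization while freeing the VFS from implicit BKL use. Reduced to a hypothetical skeleton:

/* Sketch of an ->ioctl to ->unlocked_ioctl conversion of this era.
 * The BKL is now taken explicitly; the inode comes from the file. */
static long myfs_unlocked_ioctl(struct file *filp, unsigned int cmd,
				unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	lock_kernel();		/* preserve the old ->ioctl locking */
	ret = myfs_do_ioctl(inode, filp, cmd, arg);
	unlock_kernel();
	return ret;
}
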
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
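
Only the macro name changes here and in the nfsd hunks below: USHORT_MAX was renamed USHRT_MAX tree-wide to match the libc SHRT_MAX/USHRT_MAX family. The validation itself is untouched; with USHRT_MAX equal to 65535 it behaves like this worked instance:

/* Worked example of the bound check: any parse failure or value above
 * 65535 cannot be a TCP/UDP port.  (nfsd additionally rejects 0.) */
static int check_port(int rc, unsigned long option)
{
	if (rc != 0 || option > USHRT_MAX)
		return -EINVAL;	/* e.g. option == 70000 is rejected */
	return 0;
}
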
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4b..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct path rec_dir;
-static int rec_dir_init = 0;
+static struct file *rec_file;
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
 	return status;
 }
 
-static void
-nfsd4_sync_rec_dir(void)
-{
-	vfs_fsync(NULL, rec_dir.dentry, 0);
-}
-
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
 
-	if (!rec_dir_init || clp->cl_firststate)
+	if (!rec_file || clp->cl_firststate)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
+	dir = rec_file->f_path.dentry;
 	/* lock the parent */
-	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_lock(&dir->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.mnt);
+	status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_file->f_path.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
-		nfsd4_sync_rec_dir();
+		vfs_fsync(rec_file, 0);
 	}
 	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	struct dentry *dentry;
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+	filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
 static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	dir = rec_file->f_path.dentry;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, dir, namlen);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
+	status = vfs_rmdir(dir->d_inode, dentry);
 out:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	const struct cred *original_cred;
 	int status;
 
-	if (!rec_dir_init || !clp->cl_firststate)
+	if (!rec_file || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_creds(original_cred);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
 nfsd4_recdir_purge_old(void) {
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 }
 
 static int
@@ -355,10 +350,13 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+	if (!rec_file)
+		return 0;
+
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 	return status;
 }
 
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
-	BUG_ON(rec_dir_init);
+	BUG_ON(rec_file);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
-			&rec_dir);
-	if (status)
+	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
 				rec_dirname);
+		rec_file = NULL;
+	}
 
-	if (!status)
-		rec_dir_init = 1;
 	nfs4_reset_creds(original_cred);
 }
 
 void
 nfsd4_shutdown_recdir(void)
 {
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	rec_dir_init = 0;
-	path_put(&rec_dir);
+	fput(rec_file);
+	rec_file = NULL;
 }
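
Replacing the (rec_dir, rec_dir_init) pair with one open struct file does three jobs at once: the pointer's NULL-ness becomes the initialization flag, f_path supplies the dentry and vfsmount everywhere rec_dir was consulted, and it provides the struct file that the reworked vfs_fsync() now requires (the deleted helper had to pass a NULL file plus a dentry). A reduced sketch of the pattern, with hypothetical names:

/* Sketch: one struct file serves as handle and "initialized" flag. */
static struct file *state_dir;	/* NULL until init succeeds */

static int state_dir_init(const char *name)
{
	state_dir = filp_open(name, O_RDONLY | O_DIRECTORY, 0);
	if (IS_ERR(state_dir)) {
		state_dir = NULL;	/* keep the "not ready" encoding */
		return -ENOENT;
	}
	return 0;
}

static void state_dir_sync(void)
{
	if (state_dir)
		vfs_fsync(state_dir, 0);	/* new two-argument form */
}
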
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (sscanf(buf, "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX)
+	if (port < 1 || port > USHRT_MAX)
 		return -EINVAL;
 
 	err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
 	if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
+	if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
 		return -EINVAL;
 
 	xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23c06f77f4ca..ebbf3b6b2457 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file)
 
 	if (inode->i_state & I_DIRTY) {
 		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-		err = vfs_fsync(file, file->f_path.dentry, 0);
+		err = vfs_fsync(file, 0);
 	}
 	last_ino = inode->i_ino;
 	last_dev = inode->i_sb->s_dev;
@@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = vfs_fsync_range(file, file->f_path.dentry,
-				offset, end, 0);
+		int err2 = vfs_fsync_range(file, offset, end, 0);
 
 		if (err2 != -EINVAL)
 			err = nfserrno(err2);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7cfb87e692da..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -31,6 +31,11 @@
 #include "alloc.h"
 
 
+/**
+ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
+ * descriptor block can maintain
+ * @inode: inode of metadata file using this allocator
+ */
 static inline unsigned long
 nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 {
@@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 		sizeof(struct nilfs_palloc_group_desc);
 }
 
+/**
+ * nilfs_palloc_groups_count - get maximum number of groups
+ * @inode: inode of metadata file using this allocator
+ */
 static inline unsigned long
 nilfs_palloc_groups_count(const struct inode *inode)
 {
 	return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
 }
 
+/**
+ * nilfs_palloc_init_blockgroup - initialize private variables for allocator
+ * @inode: inode of metadata file using this allocator
+ * @entry_size: size of the persistent object
+ */
 int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
 	return 0;
 }
 
+/**
+ * nilfs_palloc_group - get group number and offset from an entry number
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @offset: pointer to store offset number in the group
+ */
 static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
 					unsigned long *offset)
 {
@@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
 	return group;
 }
 
+/**
+ * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
+ * block which contains a descriptor of the specified group.
+ */
 static unsigned long
 nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
 {
@@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
 	return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
 }
 
+/**
+ * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
+ * block used to allocate/deallocate entries in the specified group.
+ */
 static unsigned long
 nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 {
@@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 		desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
 }
 
+/**
+ * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ */
 static unsigned long
 nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
 			       const struct nilfs_palloc_group_desc *desc)
@@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
 	return nfree;
 }
 
+/**
+ * nilfs_palloc_group_desc_add_entries - adjust count of free entries
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ * @n: delta to be added
+ */
 static void
 nilfs_palloc_group_desc_add_entries(struct inode *inode,
 				    unsigned long group,
@@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
 	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
 }
 
+/**
+ * nilfs_palloc_entry_blkoff - get block offset of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ */
 static unsigned long
 nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 {
@@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 		group_offset / NILFS_MDT(inode)->mi_entries_per_block;
 }
 
+/**
+ * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
+ * @inode: inode of metadata file
+ * @bh: buffer head of the buffer to be initialized
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 static void nilfs_palloc_desc_block_init(struct inode *inode,
 					 struct buffer_head *bh, void *kaddr)
 {
@@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
 	return ret;
 }
 
+/**
+ * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
@@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
 				   bhp, &cache->prev_desc, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
@@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 				   &cache->prev_bitmap, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_get_entry_block - get buffer head of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
@@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				   &cache->prev_entry, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @bh: buffer head of the buffer storing the group descriptor block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 static struct nilfs_palloc_group_desc *
 nilfs_palloc_block_get_group_desc(const struct inode *inode,
 				  unsigned long group,
@@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
 		group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
+/**
+ * nilfs_palloc_block_get_entry - get kernel address of an entry
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @bh: buffer head of the buffer storing the entry block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 		entry_offset * NILFS_MDT(inode)->mi_entry_size;
 }
 
+/**
+ * nilfs_palloc_find_available_slot - find available slot in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @target: offset number of an entry in the group (start point)
+ * @bitmap: bitmap of the group
+ * @bsize: size in bits
+ */
 static int nilfs_palloc_find_available_slot(struct inode *inode,
 					    unsigned long group,
 					    unsigned long target,
 					    unsigned char *bitmap,
-					    int bsize)  /* size in bits */
+					    int bsize)
 {
 	int curr, pos, end, i;
 
@@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
 	return -ENOSPC;
 }
 
+/**
+ * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
+ * in a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @curr: current group number
+ * @max: maximum number of groups
+ */
 static unsigned long
 nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 				       unsigned long curr, unsigned long max)
@@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 		     max - curr + 1);
 }
 
+/**
+ * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
@@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 	return ret;
 }
 
+/**
+ * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
@@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 	brelse(req->pr_desc_bh);
 }
 
+/**
+ * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 void nilfs_palloc_commit_free_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	brelse(req->pr_desc_bh);
 }
 
+/**
+ * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	req->pr_desc_bh = NULL;
 }
 
+/**
+ * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 int nilfs_palloc_prepare_free_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
 	return 0;
 }
 
+/**
+ * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 void nilfs_palloc_abort_free_entry(struct inode *inode,
 				   struct nilfs_palloc_req *req)
 {
@@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
 	req->pr_desc_bh = NULL;
 }
 
+/**
+ * nilfs_palloc_group_is_in - judge if an entry is in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @nr: serial number of the entry (e.g. inode number)
+ */
static int
 nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
 {
@@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
 	return (nr >= first) && (nr <= last);
 }
 
+/**
+ * nilfs_palloc_freev - deallocate a set of persistent objects
+ * @inode: inode of metadata file using this allocator
+ * @entry_nrs: array of entry numbers to be deallocated
+ * @nitems: number of entries stored in @entry_nrs
+ */
 int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 {
 	struct buffer_head *desc_bh, *bitmap_bh;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf874d692..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 
+/**
+ * nilfs_palloc_entries_per_group - get the number of entries per group
+ * @inode: inode of metadata file using this allocator
+ *
+ * The number of entries per group is defined by the number of bits
+ * that a bitmap block can maintain.
+ */
 static inline unsigned long
 nilfs_palloc_entries_per_group(const struct inode *inode)
 {
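
The new kernel-doc pins down where group geometry comes from: one bit per entry in a bitmap block, so entries-per-group equals bits-per-block. A worked instance of the formula used here:

/* With 4 KiB blocks, i_blkbits == 12 and a bitmap block holds
 * (1 << 12) bytes * 8 bits/byte = 1 << (12 + 3) = 32768 bits,
 * hence 32768 entries per group; "+ 3" is log2(8), as in the code. */
static unsigned long entries_per_group(unsigned int blkbits)
{
	return 1UL << (blkbits + 3);
}
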
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 76c38e3e19d2..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
 #include "alloc.h"
 #include "dat.h"
 
-/**
- * struct nilfs_btree_path - A path on which B-tree operations are executed
- * @bp_bh: buffer head of node block
- * @bp_sib_bh: buffer head of sibling node block
- * @bp_index: index of child node
- * @bp_oldreq: ptr end request for old ptr
- * @bp_newreq: ptr alloc request for new ptr
- * @bp_op: rebalance operation
- */
-struct nilfs_btree_path {
-	struct buffer_head *bp_bh;
-	struct buffer_head *bp_sib_bh;
-	int bp_index;
-	union nilfs_bmap_ptr_req bp_oldreq;
-	union nilfs_bmap_ptr_req bp_newreq;
-	struct nilfs_btnode_chkey_ctxt bp_ctxt;
-	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
-		      int, __u64 *, __u64 *);
-};
-
-/*
- * B-tree path operations
- */
-
-static struct kmem_cache *nilfs_btree_path_cache;
-
-int __init nilfs_btree_path_cache_init(void)
-{
-	nilfs_btree_path_cache =
-		kmem_cache_create("nilfs2_btree_path_cache",
-				  sizeof(struct nilfs_btree_path) *
-				  NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
-	return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
-}
-
-void nilfs_btree_path_cache_destroy(void)
-{
-	kmem_cache_destroy(nilfs_btree_path_cache);
-}
-
-static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
-{
-	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
-}
-
-static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
-{
-	kmem_cache_free(nilfs_btree_path_cache, path);
-}
-
-static void nilfs_btree_init_path(struct nilfs_btree_path *path)
+static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	int level;
+	struct nilfs_btree_path *path;
+	int level = NILFS_BTREE_LEVEL_DATA;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
+	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	if (path == NULL)
+		goto out;
+
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
 		path[level].bp_bh = NULL;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
 		path[level].bp_op = NULL;
 	}
+
+out:
+	return path;
 }
 
-static void nilfs_btree_release_path(struct nilfs_btree_path *path)
+static void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
-	int level;
+	int level = NILFS_BTREE_LEVEL_DATA;
 
-	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
-	     level++)
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++)
 		brelse(path[level].bp_bh);
+
+	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
 /*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
+
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 	if (ret < 0)
 		goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
+
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
 	if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(bh)) {
 		node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
 	if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
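
Folding init into alloc and release into free halves the call-site protocol: there is no window where an allocated path is unusable, and a forgotten release can no longer leak buffer heads, since free now brelse()s everything before returning the array to the slab cache (the cache itself becomes extern in the btree.h hunk below). Every converted call site reduces to the same shape:

	path = nilfs_btree_alloc_path();	/* allocates and initializes */
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);

	nilfs_btree_free_path(path);	/* brelse()s held bhs, frees path */
	return ret;
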
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..af638d59e3bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
 #include "btnode.h"
 #include "bmap.h"
 
-struct nilfs_btree;
-struct nilfs_btree_path;
-
 /**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
 };
 
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
 
 #define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN	((__u64)0)
 #define NILFS_BTREE_KEY_MAX	(~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0957b58f909d..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic_inc(&sbi->s_inodes_count);
-
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
@@ -451,7 +442,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -511,7 +502,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
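
Switching from new_encode_dev() to huge_encode_dev() is a widening, not a reformat: both pack major/minor the same way, but the huge variant yields a u64, which matches the 64-bit on-disk i_device_code field and leaves headroom beyond the 32-bit new-style encoding. For instance:

/* Illustration; <linux/kdev_t.h> of this era defines huge_encode_dev()
 * as the new-style 12-bit-major/20-bit-minor packing zero-extended to
 * 64 bits, so existing device numbers round-trip losslessly. */
static void dev_code_example(void)
{
	u64 code = huge_encode_dev(MKDEV(8, 1));	/* e.g. block dev 8:1 */
	dev_t dev = huge_decode_dev(code);		/* dev == MKDEV(8, 1) */
	(void)dev;
}
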
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba43146f3c30..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -105,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
 
 	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
 	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+
+	/* need to verify ->ss_bytes field if read ->ss_cno */
 }
 
 /**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 17851f77f739..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,35 +40,10 @@ struct nilfs_write_info {
 	sector_t		blocknr;
 };
 
-
 static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			      struct the_nilfs *nilfs);
 static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
 	struct nilfs_segment_buffer *segbuf;
@@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
@@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad = 0;
+	raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
  * CRC calculation routines
  */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
233 u32 seed) 211 u32 seed)
234{ 212{
235 struct buffer_head *bh; 213 struct buffer_head *bh;
236 struct nilfs_segment_summary *raw_sum; 214 struct nilfs_segment_summary *raw_sum;
@@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
256 raw_sum->ss_datasum = cpu_to_le32(crc); 234 raw_sum->ss_datasum = cpu_to_le32(crc);
257} 235}
258 236
237static void
238nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
239 u32 seed)
240{
241 struct nilfs_super_root *raw_sr;
242 u32 crc;
243
244 raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
245 crc = crc32_le(seed,
246 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
247 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
248 raw_sr->sr_sum = cpu_to_le32(crc);
249}
250
259static void nilfs_release_buffers(struct list_head *list) 251static void nilfs_release_buffers(struct list_head *list)
260{ 252{
261 struct buffer_head *bh, *n; 253 struct buffer_head *bh, *n;
@@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
282{ 274{
283 nilfs_release_buffers(&segbuf->sb_segsum_buffers); 275 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
284 nilfs_release_buffers(&segbuf->sb_payload_buffers); 276 nilfs_release_buffers(&segbuf->sb_payload_buffers);
277 segbuf->sb_super_root = NULL;
285} 278}
286 279
287/* 280/*
@@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs)
334 return ret; 327 return ret;
335} 328}
336 329
330/**
331 * nilfs_add_checksums_on_logs - add checksums on the logs
332 * @logs: list of segment buffers storing target logs
333 * @seed: checksum seed value
334 */
335void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
336{
337 struct nilfs_segment_buffer *segbuf;
338
339 list_for_each_entry(segbuf, logs, sb_list) {
340 if (segbuf->sb_super_root)
341 nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
342 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
343 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
344 }
345}
346
337/* 347/*
338 * BIO operations 348 * BIO operations
339 */ 349 */
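With the helper above, a writer checksums every queued log — segment summaries, payload data, and any super root block — in one pass. The call site added later in the segment.c hunks is simply:

	nilfs_add_checksums_on_logs(&sci->sc_segbufs, nilfs->ns_crc_seed);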
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..fdf1c3b6d673 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
37 * @sumbytes: Byte count of segment summary 37 * @sumbytes: Byte count of segment summary
38 * @nfileblk: Total number of file blocks 38 * @nfileblk: Total number of file blocks
39 * @seg_seq: Segment sequence number 39 * @seg_seq: Segment sequence number
40 * @cno: Checkpoint number
40 * @ctime: Creation time 41 * @ctime: Creation time
41 * @next: Block number of the next full segment 42 * @next: Block number of the next full segment
42 */ 43 */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
48 unsigned long sumbytes; 49 unsigned long sumbytes;
49 unsigned long nfileblk; 50 unsigned long nfileblk;
50 u64 seg_seq; 51 u64 seg_seq;
52 __u64 cno;
51 time_t ctime; 53 time_t ctime;
52 sector_t next; 54 sector_t next;
53}; 55};
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
76 * @sb_rest_blocks: Number of residual blocks in the current segment 78 * @sb_rest_blocks: Number of residual blocks in the current segment
77 * @sb_segsum_buffers: List of buffers for segment summaries 79 * @sb_segsum_buffers: List of buffers for segment summaries
78 * @sb_payload_buffers: List of buffers for segment payload 80 * @sb_payload_buffers: List of buffers for segment payload
81 * @sb_super_root: Pointer to buffer storing a super root block (if present)
79 * @sb_nbio: Number of flying bio requests 82 * @sb_nbio: Number of flying bio requests
80 * @sb_err: I/O error status 83 * @sb_err: I/O error status
81 * @sb_bio_event: Completion event of log writing 84 * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
95 /* Buffers */ 98 /* Buffers */
96 struct list_head sb_segsum_buffers; 99 struct list_head sb_segsum_buffers;
97 struct list_head sb_payload_buffers; /* including super root */ 100 struct list_head sb_payload_buffers; /* including super root */
101 struct buffer_head *sb_super_root;
98 102
99 /* io status */ 103 /* io status */
100 int sb_nbio; 104 int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
121 b_assoc_buffers)) 125 b_assoc_buffers))
122#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) 126#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
123 127
128extern struct kmem_cache *nilfs_segbuf_cachep;
124 129
125int __init nilfs_init_segbuf_cache(void); 130int __init nilfs_init_segbuf_cache(void);
126void nilfs_destroy_segbuf_cache(void); 131void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
132 struct nilfs_segment_buffer *prev); 137 struct nilfs_segment_buffer *prev);
133void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, 138void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
134 struct the_nilfs *); 139 struct the_nilfs *);
135int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); 140int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
136int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); 141int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
137int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, 142int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
138 struct buffer_head **); 143 struct buffer_head **);
139void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); 144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
140void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
141void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
142 145
143static inline void 146static inline void
144nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, 147nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
171 struct nilfs_segment_buffer *last); 174 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); 175int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
173int nilfs_wait_on_logs(struct list_head *logs); 176int nilfs_wait_on_logs(struct list_head *logs);
177void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
174 178
175static inline void nilfs_destroy_logs(struct list_head *logs) 179static inline void nilfs_destroy_logs(struct list_head *logs)
176{ 180{
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6a7dbd8451db..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
116#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) 116#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
117#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) 117#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
118 118
119/*
120 * Transaction
121 */
122static struct kmem_cache *nilfs_transaction_cachep;
123
124/**
125 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
126 *
127 * nilfs_init_transaction_cache() creates a slab cache for the struct
128 * nilfs_transaction_info.
129 *
130 * Return Value: On success, it returns 0. On error, one of the following
131 * negative error code is returned.
132 *
133 * %-ENOMEM - Insufficient memory available.
134 */
135int nilfs_init_transaction_cache(void)
136{
137 nilfs_transaction_cachep =
138 kmem_cache_create("nilfs2_transaction_cache",
139 sizeof(struct nilfs_transaction_info),
140 0, SLAB_RECLAIM_ACCOUNT, NULL);
141 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
142}
143
144/**
145 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
146 *
147 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
148 * nilfs_transaction_info.
149 */
150void nilfs_destroy_transaction_cache(void)
151{
152 kmem_cache_destroy(nilfs_transaction_cachep);
153}
154
155static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) 119static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
156{ 120{
157 struct nilfs_transaction_info *cur_ti = current->journal_info; 121 struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
402 366
403 if (nilfs_doing_gc()) 367 if (nilfs_doing_gc())
404 flags = NILFS_SS_GC; 368 flags = NILFS_SS_GC;
405 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); 369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
370 sci->sc_sbi->s_nilfs->ns_cno);
406 if (unlikely(err)) 371 if (unlikely(err))
407 return err; 372 return err;
408 373
@@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
435 return err; 400 return err;
436 segbuf = sci->sc_curseg; 401 segbuf = sci->sc_curseg;
437 } 402 }
438 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); 403 err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
439 if (likely(!err)) 404 if (likely(!err))
440 segbuf->sb_sum.flags |= NILFS_SS_SR; 405 segbuf->sb_sum.flags |= NILFS_SS_SR;
441 return err; 406 return err;
@@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
599 *vblocknr = binfo->bi_v.bi_vblocknr; 564 *vblocknr = binfo->bi_v.bi_vblocknr;
600} 565}
601 566
602struct nilfs_sc_operations nilfs_sc_file_ops = { 567static struct nilfs_sc_operations nilfs_sc_file_ops = {
603 .collect_data = nilfs_collect_file_data, 568 .collect_data = nilfs_collect_file_data,
604 .collect_node = nilfs_collect_file_node, 569 .collect_node = nilfs_collect_file_node,
605 .collect_bmap = nilfs_collect_file_bmap, 570 .collect_bmap = nilfs_collect_file_bmap,
@@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
649 *binfo_dat = binfo->bi_dat; 614 *binfo_dat = binfo->bi_dat;
650} 615}
651 616
652struct nilfs_sc_operations nilfs_sc_dat_ops = { 617static struct nilfs_sc_operations nilfs_sc_dat_ops = {
653 .collect_data = nilfs_collect_dat_data, 618 .collect_data = nilfs_collect_dat_data,
654 .collect_node = nilfs_collect_file_node, 619 .collect_node = nilfs_collect_file_node,
655 .collect_bmap = nilfs_collect_dat_bmap, 620 .collect_bmap = nilfs_collect_dat_bmap,
@@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
657 .write_node_binfo = nilfs_write_dat_node_binfo, 622 .write_node_binfo = nilfs_write_dat_node_binfo,
658}; 623};
659 624
660struct nilfs_sc_operations nilfs_sc_dsync_ops = { 625static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
661 .collect_data = nilfs_collect_file_data, 626 .collect_data = nilfs_collect_file_data,
662 .collect_node = NULL, 627 .collect_node = NULL,
663 .collect_bmap = NULL, 628 .collect_bmap = NULL,
@@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
932 } 897 }
933} 898}
934 899
935/*
936 * CRC calculation routines
937 */
938static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
939{
940 struct nilfs_super_root *raw_sr =
941 (struct nilfs_super_root *)bh_sr->b_data;
942 u32 crc;
943
944 crc = crc32_le(seed,
945 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
946 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
947 raw_sr->sr_sum = cpu_to_le32(crc);
948}
949
950static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
951 u32 seed)
952{
953 struct nilfs_segment_buffer *segbuf;
954
955 if (sci->sc_super_root)
956 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
957
958 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
959 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
960 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
961 }
962}
963
964static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, 900static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
965 struct the_nilfs *nilfs) 901 struct the_nilfs *nilfs)
966{ 902{
967 struct buffer_head *bh_sr = sci->sc_super_root; 903 struct buffer_head *bh_sr;
968 struct nilfs_super_root *raw_sr = 904 struct nilfs_super_root *raw_sr;
969 (struct nilfs_super_root *)bh_sr->b_data;
970 unsigned isz = nilfs->ns_inode_size; 905 unsigned isz = nilfs->ns_inode_size;
971 906
907 bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
908 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
909
972 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); 910 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
973 raw_sr->sr_nongc_ctime 911 raw_sr->sr_nongc_ctime
974 = cpu_to_le64(nilfs_doing_gc() ? 912 = cpu_to_le64(nilfs_doing_gc() ?
@@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1491 1429
1492 /* Collection retry loop */ 1430 /* Collection retry loop */
1493 for (;;) { 1431 for (;;) {
1494 sci->sc_super_root = NULL;
1495 sci->sc_nblk_this_inc = 0; 1432 sci->sc_nblk_this_inc = 0;
1496 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1433 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1497 1434
@@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1568 ssp.offset = sizeof(struct nilfs_segment_summary); 1505 ssp.offset = sizeof(struct nilfs_segment_summary);
1569 1506
1570 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { 1507 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1571 if (bh == sci->sc_super_root) 1508 if (bh == segbuf->sb_super_root)
1572 break; 1509 break;
1573 if (!finfo) { 1510 if (!finfo) {
1574 finfo = nilfs_segctor_map_segsum_entry( 1511 finfo = nilfs_segctor_map_segsum_entry(
@@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1729 1666
1730 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1667 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1731 b_assoc_buffers) { 1668 b_assoc_buffers) {
1732 if (bh == sci->sc_super_root) { 1669 if (bh == segbuf->sb_super_root) {
1733 if (bh->b_page != bd_page) { 1670 if (bh->b_page != bd_page) {
1734 lock_page(bd_page); 1671 lock_page(bd_page);
1735 clear_page_dirty_for_io(bd_page); 1672 clear_page_dirty_for_io(bd_page);
@@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1848} 1785}
1849 1786
1850static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, 1787static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1851 struct buffer_head *bh_sr, int err) 1788 int err)
1852{ 1789{
1853 struct nilfs_segment_buffer *segbuf; 1790 struct nilfs_segment_buffer *segbuf;
1854 struct page *bd_page = NULL, *fs_page = NULL; 1791 struct page *bd_page = NULL, *fs_page = NULL;
@@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1869 1806
1870 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1807 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1871 b_assoc_buffers) { 1808 b_assoc_buffers) {
1872 if (bh == bh_sr) { 1809 if (bh == segbuf->sb_super_root) {
1873 if (bh->b_page != bd_page) { 1810 if (bh->b_page != bd_page) {
1874 end_page_writeback(bd_page); 1811 end_page_writeback(bd_page);
1875 bd_page = bh->b_page; 1812 bd_page = bh->b_page;
@@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1898 1835
1899 list_splice_tail_init(&sci->sc_write_logs, &logs); 1836 list_splice_tail_init(&sci->sc_write_logs, &logs);
1900 ret = nilfs_wait_on_logs(&logs); 1837 ret = nilfs_wait_on_logs(&logs);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err); 1838 nilfs_abort_logs(&logs, NULL, ret ? : err);
1902 1839
1903 list_splice_tail_init(&sci->sc_segbufs, &logs); 1840 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1841 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1914 } 1851 }
1915 1852
1916 nilfs_destroy_logs(&logs); 1853 nilfs_destroy_logs(&logs);
1917 sci->sc_super_root = NULL;
1918} 1854}
1919 1855
1920static void nilfs_set_next_segment(struct the_nilfs *nilfs, 1856static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1933 struct nilfs_segment_buffer *segbuf; 1869 struct nilfs_segment_buffer *segbuf;
1934 struct page *bd_page = NULL, *fs_page = NULL; 1870 struct page *bd_page = NULL, *fs_page = NULL;
1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1871 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1936 int update_sr = (sci->sc_super_root != NULL); 1872 int update_sr = false;
1937 1873
1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1874 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
1939 struct buffer_head *bh; 1875 struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1964 set_buffer_uptodate(bh); 1900 set_buffer_uptodate(bh);
1965 clear_buffer_dirty(bh); 1901 clear_buffer_dirty(bh);
1966 clear_buffer_nilfs_volatile(bh); 1902 clear_buffer_nilfs_volatile(bh);
1967 if (bh == sci->sc_super_root) { 1903 if (bh == segbuf->sb_super_root) {
1968 if (bh->b_page != bd_page) { 1904 if (bh->b_page != bd_page) {
1969 end_page_writeback(bd_page); 1905 end_page_writeback(bd_page);
1970 bd_page = bh->b_page; 1906 bd_page = bh->b_page;
1971 } 1907 }
1908 update_sr = true;
1972 break; 1909 break;
1973 } 1910 }
1974 if (bh->b_page != fs_page) { 1911 if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 struct nilfs_sb_info *sbi = sci->sc_sbi; 2052 struct nilfs_sb_info *sbi = sci->sc_sbi;
2116 struct the_nilfs *nilfs = sbi->s_nilfs; 2053 struct the_nilfs *nilfs = sbi->s_nilfs;
2117 struct page *failed_page; 2054 struct page *failed_page;
2118 int err, has_sr = 0; 2055 int err;
2119 2056
2120 sci->sc_stage.scnt = NILFS_ST_INIT; 2057 sci->sc_stage.scnt = NILFS_ST_INIT;
2121 2058
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2143 if (unlikely(err)) 2080 if (unlikely(err))
2144 goto failed; 2081 goto failed;
2145 2082
2146 has_sr = (sci->sc_super_root != NULL);
2147
2148 /* Avoid empty segment */ 2083 /* Avoid empty segment */
2149 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2084 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2150 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2085 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2159 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2160 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2095 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2161 2096
2162 if (has_sr) { 2097 if (mode == SC_LSEG_SR &&
2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
2163 err = nilfs_segctor_fill_in_checkpoint(sci); 2099 err = nilfs_segctor_fill_in_checkpoint(sci);
2164 if (unlikely(err)) 2100 if (unlikely(err))
2165 goto failed_to_write; 2101 goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2171 /* Write partial segments */ 2107 /* Write partial segments */
2172 err = nilfs_segctor_prepare_write(sci, &failed_page); 2108 err = nilfs_segctor_prepare_write(sci, &failed_page);
2173 if (err) { 2109 if (err) {
2174 nilfs_abort_logs(&sci->sc_segbufs, failed_page, 2110 nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
2175 sci->sc_super_root, err);
2176 goto failed_to_write; 2111 goto failed_to_write;
2177 } 2112 }
2178 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); 2113
2114 nilfs_add_checksums_on_logs(&sci->sc_segbufs,
2115 nilfs->ns_crc_seed);
2179 2116
2180 err = nilfs_segctor_write(sci, nilfs); 2117 err = nilfs_segctor_write(sci, nilfs);
2181 if (unlikely(err)) 2118 if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2196 } 2133 }
2197 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2134 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2198 2135
2199 sci->sc_super_root = NULL;
2200
2201 out: 2136 out:
2202 nilfs_segctor_check_out_files(sci, sbi); 2137 nilfs_segctor_check_out_files(sci, sbi);
2203 return err; 2138 return err;
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2224static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) 2159static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2225{ 2160{
2226 spin_lock(&sci->sc_state_lock); 2161 spin_lock(&sci->sc_state_lock);
2227 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { 2162 if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2228 sci->sc_timer->expires = jiffies + sci->sc_interval; 2163 sci->sc_timer.expires = jiffies + sci->sc_interval;
2229 add_timer(sci->sc_timer); 2164 add_timer(&sci->sc_timer);
2230 sci->sc_state |= NILFS_SEGCTOR_COMMIT; 2165 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2231 } 2166 }
2232 spin_unlock(&sci->sc_state_lock); 2167 spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2431 spin_lock(&sci->sc_state_lock); 2366 spin_lock(&sci->sc_state_lock);
2432 sci->sc_seq_accepted = sci->sc_seq_request; 2367 sci->sc_seq_accepted = sci->sc_seq_request;
2433 spin_unlock(&sci->sc_state_lock); 2368 spin_unlock(&sci->sc_state_lock);
2434 2369 del_timer_sync(&sci->sc_timer);
2435 if (sci->sc_timer)
2436 del_timer_sync(sci->sc_timer);
2437} 2370}
2438 2371
2439/** 2372/**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2392 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2460 2393
2461 /* re-enable timer if checkpoint creation was not done */ 2394 /* re-enable timer if checkpoint creation was not done */
2462 if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2395 if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2463 time_before(jiffies, sci->sc_timer->expires)) 2396 time_before(jiffies, sci->sc_timer.expires))
2464 add_timer(sci->sc_timer); 2397 add_timer(&sci->sc_timer);
2465 } 2398 }
2466 spin_unlock(&sci->sc_state_lock); 2399 spin_unlock(&sci->sc_state_lock);
2467} 2400}
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
2640{ 2573{
2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2574 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2575 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2643 struct timer_list timer;
2644 int timeout = 0; 2576 int timeout = 0;
2645 2577
2646 init_timer(&timer); 2578 sci->sc_timer.data = (unsigned long)current;
2647 timer.data = (unsigned long)current; 2579 sci->sc_timer.function = nilfs_construction_timeout;
2648 timer.function = nilfs_construction_timeout;
2649 sci->sc_timer = &timer;
2650 2580
2651 /* start sync. */ 2581 /* start sync. */
2652 sci->sc_task = current; 2582 sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
2695 should_sleep = 0; 2625 should_sleep = 0;
2696 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) 2626 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2697 should_sleep = time_before(jiffies, 2627 should_sleep = time_before(jiffies,
2698 sci->sc_timer->expires); 2628 sci->sc_timer.expires);
2699 2629
2700 if (should_sleep) { 2630 if (should_sleep) {
2701 spin_unlock(&sci->sc_state_lock); 2631 spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
2704 } 2634 }
2705 finish_wait(&sci->sc_wait_daemon, &wait); 2635 finish_wait(&sci->sc_wait_daemon, &wait);
2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2636 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2707 time_after_eq(jiffies, sci->sc_timer->expires)); 2637 time_after_eq(jiffies, sci->sc_timer.expires));
2708 2638
2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) 2639 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2710 set_nilfs_discontinued(nilfs); 2640 set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
2713 2643
2714 end_thread: 2644 end_thread:
2715 spin_unlock(&sci->sc_state_lock); 2645 spin_unlock(&sci->sc_state_lock);
2716 del_timer_sync(sci->sc_timer);
2717 sci->sc_timer = NULL;
2718 2646
2719 /* end sync. */ 2647 /* end sync. */
2720 sci->sc_task = NULL; 2648 sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2750 } 2678 }
2751} 2679}
2752 2680
2753static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2754{
2755 sci->sc_seq_done = sci->sc_seq_request;
2756
2757 return nilfs_segctor_start_thread(sci);
2758}
2759
2760/* 2681/*
2761 * Setup & clean-up functions 2682 * Setup & clean-up functions
2762 */ 2683 */
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2780 INIT_LIST_HEAD(&sci->sc_write_logs); 2701 INIT_LIST_HEAD(&sci->sc_write_logs);
2781 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2702 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2782 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2703 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2704 init_timer(&sci->sc_timer);
2783 2705
2784 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2706 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2785 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2707 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2846 2768
2847 down_write(&sbi->s_nilfs->ns_segctor_sem); 2769 down_write(&sbi->s_nilfs->ns_segctor_sem);
2848 2770
2771 del_timer_sync(&sci->sc_timer);
2849 kfree(sci); 2772 kfree(sci);
2850} 2773}
2851 2774
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2880 return -ENOMEM; 2803 return -ENOMEM;
2881 2804
2882 nilfs_attach_writer(nilfs, sbi); 2805 nilfs_attach_writer(nilfs, sbi);
2883 err = nilfs_segctor_init(NILFS_SC(sbi)); 2806 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2884 if (err) { 2807 if (err) {
2885 nilfs_detach_writer(nilfs, sbi); 2808 nilfs_detach_writer(nilfs, sbi);
2886 kfree(sbi->s_sc_info); 2809 kfree(sbi->s_sc_info);
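Taken together, the timer hunks replace the on-stack struct timer_list (plus the sc_timer pointer) with a timer embedded in struct nilfs_sc_info, tying its lifetime to the segment constructor object instead of the thread. The resulting lifecycle, pieced together from the hunks above:

	init_timer(&sci->sc_timer);                    /* nilfs_segctor_new() */
	sci->sc_timer.data = (unsigned long)current;   /* segctor thread start */
	sci->sc_timer.function = nilfs_construction_timeout;
	/* ... add_timer()/del_timer_sync() while the thread runs ... */
	del_timer_sync(&sci->sc_timer);                /* nilfs_segctor_destroy() */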
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a686b9..dca142361ccf 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
100 * @sc_write_logs: List of segment buffers to hold logs under writing 100 * @sc_write_logs: List of segment buffers to hold logs under writing
101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
102 * @sc_curseg: Current segment buffer 102 * @sc_curseg: Current segment buffer
103 * @sc_super_root: Pointer to the super root buffer
104 * @sc_stage: Collection stage 103 * @sc_stage: Collection stage
105 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary 104 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
106 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary 105 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
148 struct list_head sc_write_logs; 147 struct list_head sc_write_logs;
149 unsigned long sc_segbuf_nblocks; 148 unsigned long sc_segbuf_nblocks;
150 struct nilfs_segment_buffer *sc_curseg; 149 struct nilfs_segment_buffer *sc_curseg;
151 struct buffer_head *sc_super_root;
152 150
153 struct nilfs_cstage sc_stage; 151 struct nilfs_cstage sc_stage;
154 152
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
179 unsigned long sc_lseg_stime; /* in 1/HZ seconds */ 177 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
180 unsigned long sc_watermark; 178 unsigned long sc_watermark;
181 179
182 struct timer_list *sc_timer; 180 struct timer_list sc_timer;
183 struct task_struct *sc_task; 181 struct task_struct *sc_task;
184}; 182};
185 183
@@ -219,6 +217,8 @@ enum {
219 */ 217 */
220#define NILFS_SC_DEFAULT_WATERMARK 3600 218#define NILFS_SC_DEFAULT_WATERMARK 3600
221 219
220/* super.c */
221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void); 224extern int nilfs_init_transaction_cache(void);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 48145f505a6a..03b34b738993 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 67 "(NILFS)");
68MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
69 69
70struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache;
74
70static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
71 76
72/** 77/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
129 va_end(args); 134 va_end(args);
130} 135}
131 136
132static struct kmem_cache *nilfs_inode_cachep;
133 137
134struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 138struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
135{ 139{
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
155 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 159 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
156} 160}
157 161
158static void init_once(void *obj)
159{
160 struct nilfs_inode_info *ii = obj;
161
162 INIT_LIST_HEAD(&ii->i_dirty);
163#ifdef CONFIG_NILFS_XATTR
164 init_rwsem(&ii->xattr_sem);
165#endif
166 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
167 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
168 inode_init_once(&ii->vfs_inode);
169}
170
171static int nilfs_init_inode_cache(void)
172{
173 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
174 sizeof(struct nilfs_inode_info),
175 0, SLAB_RECLAIM_ACCOUNT,
176 init_once);
177
178 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
179}
180
181static inline void nilfs_destroy_inode_cache(void)
182{
183 kmem_cache_destroy(nilfs_inode_cachep);
184}
185
186static void nilfs_clear_inode(struct inode *inode) 162static void nilfs_clear_inode(struct inode *inode)
187{ 163{
188 struct nilfs_inode_info *ii = NILFS_I(inode); 164 struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
266 int err; 242 int err;
267 243
268 /* nilfs->sem must be locked by the caller. */ 244 /* nilfs->sem must be locked by the caller. */
269 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { 245 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
270 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) 246 if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
271 nilfs_swap_super_block(nilfs); 247 nilfs_swap_super_block(nilfs);
272 else { 248 else {
273 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 249 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
470 if (nilfs_test_opt(sbi, SNAPSHOT)) 446 if (nilfs_test_opt(sbi, SNAPSHOT))
471 seq_printf(seq, ",cp=%llu", 447 seq_printf(seq, ",cp=%llu",
472 (unsigned long long int)sbi->s_snapshot_cno); 448 (unsigned long long int)sbi->s_snapshot_cno);
473 if (nilfs_test_opt(sbi, ERRORS_RO))
474 seq_printf(seq, ",errors=remount-ro");
475 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 449 if (nilfs_test_opt(sbi, ERRORS_PANIC))
476 seq_printf(seq, ",errors=panic"); 450 seq_printf(seq, ",errors=panic");
451 if (nilfs_test_opt(sbi, ERRORS_CONT))
452 seq_printf(seq, ",errors=continue");
477 if (nilfs_test_opt(sbi, STRICT_ORDER)) 453 if (nilfs_test_opt(sbi, STRICT_ORDER))
478 seq_printf(seq, ",order=strict"); 454 seq_printf(seq, ",order=strict");
479 if (nilfs_test_opt(sbi, NORECOVERY)) 455 if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
631 struct nilfs_super_block *sbp) 607 struct nilfs_super_block *sbp)
632{ 608{
633 sbi->s_mount_opt = 609 sbi->s_mount_opt =
634 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; 610 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
635} 611}
636 612
637static int nilfs_setup_super(struct nilfs_sb_info *sbi) 613static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -778,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
778 goto failed_sbi; 754 goto failed_sbi;
779 } 755 }
780 cno = sbi->s_snapshot_cno; 756 cno = sbi->s_snapshot_cno;
781 } else 757 }
782 /* Read-only mount */
783 sbi->s_snapshot_cno = cno;
784 } 758 }
785 759
786 err = nilfs_attach_checkpoint(sbi, cno); 760 err = nilfs_attach_checkpoint(sbi, cno);
@@ -849,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
849 struct the_nilfs *nilfs = sbi->s_nilfs; 823 struct the_nilfs *nilfs = sbi->s_nilfs;
850 unsigned long old_sb_flags; 824 unsigned long old_sb_flags;
851 struct nilfs_mount_options old_opts; 825 struct nilfs_mount_options old_opts;
852 int err; 826 int was_snapshot, err;
853 827
854 lock_kernel(); 828 lock_kernel();
855 829
@@ -857,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
857 old_sb_flags = sb->s_flags; 831 old_sb_flags = sb->s_flags;
858 old_opts.mount_opt = sbi->s_mount_opt; 832 old_opts.mount_opt = sbi->s_mount_opt;
859 old_opts.snapshot_cno = sbi->s_snapshot_cno; 833 old_opts.snapshot_cno = sbi->s_snapshot_cno;
834 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
860 835
861 if (!parse_options(data, sb)) { 836 if (!parse_options(data, sb)) {
862 err = -EINVAL; 837 err = -EINVAL;
@@ -864,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
864 } 839 }
865 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
866 841
867 if ((*flags & MS_RDONLY) && 842 err = -EINVAL;
868 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 843 if (was_snapshot) {
869 printk(KERN_WARNING "NILFS (device %s): couldn't " 844 if (!(*flags & MS_RDONLY)) {
870 "remount to a different snapshot.\n", 845 printk(KERN_ERR "NILFS (device %s): cannot remount "
871 sb->s_id); 846 "snapshot read/write.\n",
872 err = -EINVAL; 847 sb->s_id);
873 goto restore_opts; 848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
874 } 862 }
875 863
876 if (!nilfs_valid_fs(nilfs)) { 864 if (!nilfs_valid_fs(nilfs)) {
877 printk(KERN_WARNING "NILFS (device %s): couldn't " 865 printk(KERN_WARNING "NILFS (device %s): couldn't "
878 "remount because the filesystem is in an " 866 "remount because the filesystem is in an "
879 "incomplete recovery state.\n", sb->s_id); 867 "incomplete recovery state.\n", sb->s_id);
880 err = -EINVAL;
881 goto restore_opts; 868 goto restore_opts;
882 } 869 }
883 870
@@ -888,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
888 nilfs_detach_segment_constructor(sbi); 875 nilfs_detach_segment_constructor(sbi);
889 sb->s_flags |= MS_RDONLY; 876 sb->s_flags |= MS_RDONLY;
890 877
891 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
892 /* nilfs_set_opt(sbi, SNAPSHOT); */
893
894 /* 878 /*
895 * Remounting a valid RW partition RDONLY, so set 879 * Remounting a valid RW partition RDONLY, so set
896 * the RDONLY flag and then mark the partition as valid again. 880 * the RDONLY flag and then mark the partition as valid again.
@@ -909,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
909 * store the current valid flag. (It may have been changed 893 * store the current valid flag. (It may have been changed
910 * by fsck since we originally mounted the partition.) 894 * by fsck since we originally mounted the partition.)
911 */ 895 */
912 if (nilfs->ns_current && nilfs->ns_current != sbi) {
913 printk(KERN_WARNING "NILFS (device %s): couldn't "
914 "remount because an RW-mount exists.\n",
915 sb->s_id);
916 err = -EBUSY;
917 goto restore_opts;
918 }
919 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
920 printk(KERN_WARNING "NILFS (device %s): couldn't "
921 "remount because the current RO-mount is not "
922 "the latest one.\n",
923 sb->s_id);
924 err = -EINVAL;
925 goto restore_opts;
926 }
927 sb->s_flags &= ~MS_RDONLY; 896 sb->s_flags &= ~MS_RDONLY;
928 nilfs_clear_opt(sbi, SNAPSHOT);
929 sbi->s_snapshot_cno = 0;
930 897
931 err = nilfs_attach_segment_constructor(sbi); 898 err = nilfs_attach_segment_constructor(sbi);
932 if (err) 899 if (err)
@@ -935,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
935 down_write(&nilfs->ns_sem); 902 down_write(&nilfs->ns_sem);
936 nilfs_setup_super(sbi); 903 nilfs_setup_super(sbi);
937 up_write(&nilfs->ns_sem); 904 up_write(&nilfs->ns_sem);
938
939 nilfs->ns_current = sbi;
940 } 905 }
941 out: 906 out:
942 up_write(&nilfs->ns_super_sem); 907 up_write(&nilfs->ns_super_sem);
@@ -1022,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1022{ 987{
1023 struct nilfs_super_data sd; 988 struct nilfs_super_data sd;
1024 struct super_block *s; 989 struct super_block *s;
990 fmode_t mode = FMODE_READ;
1025 struct the_nilfs *nilfs; 991 struct the_nilfs *nilfs;
1026 int err, need_to_close = 1; 992 int err, need_to_close = 1;
1027 993
1028 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 994 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE;
996
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1029 if (IS_ERR(sd.bdev)) 998 if (IS_ERR(sd.bdev))
1030 return PTR_ERR(sd.bdev); 999 return PTR_ERR(sd.bdev);
1031 1000
@@ -1092,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1092 1061
1093 /* New superblock instance created */ 1062 /* New superblock instance created */
1094 s->s_flags = flags; 1063 s->s_flags = flags;
1064 s->s_mode = mode;
1095 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1096 sb_set_blocksize(s, block_size(sd.bdev)); 1066 sb_set_blocksize(s, block_size(sd.bdev));
1097 1067
1098 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); 1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
1069 nilfs);
1099 if (err) 1070 if (err)
1100 goto cancel_new; 1071 goto cancel_new;
1101 1072
@@ -1106,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1106 mutex_unlock(&nilfs->ns_mount_mutex); 1077 mutex_unlock(&nilfs->ns_mount_mutex);
1107 put_nilfs(nilfs); 1078 put_nilfs(nilfs);
1108 if (need_to_close) 1079 if (need_to_close)
1109 close_bdev_exclusive(sd.bdev, flags); 1080 close_bdev_exclusive(sd.bdev, mode);
1110 simple_set_mnt(mnt, s); 1081 simple_set_mnt(mnt, s);
1111 return 0; 1082 return 0;
1112 1083
@@ -1114,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1114 mutex_unlock(&nilfs->ns_mount_mutex); 1085 mutex_unlock(&nilfs->ns_mount_mutex);
1115 put_nilfs(nilfs); 1086 put_nilfs(nilfs);
1116 failed: 1087 failed:
1117 close_bdev_exclusive(sd.bdev, flags); 1088 close_bdev_exclusive(sd.bdev, mode);
1118 1089
1119 return err; 1090 return err;
1120 1091
@@ -1124,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1124 put_nilfs(nilfs); 1095 put_nilfs(nilfs);
1125 deactivate_locked_super(s); 1096 deactivate_locked_super(s);
1126 /* 1097 /*
1127 * deactivate_super() invokes close_bdev_exclusive(). 1098 * deactivate_locked_super() invokes close_bdev_exclusive().
1128 * We must finish all post-cleaning before this call; 1099 * We must finish all post-cleaning before this call;
1129 * put_nilfs() needs the block device. 1100 * put_nilfs() needs the block device.
1130 */ 1101 */
@@ -1139,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
1139 .fs_flags = FS_REQUIRES_DEV, 1110 .fs_flags = FS_REQUIRES_DEV,
1140}; 1111};
1141 1112
1142static int __init init_nilfs_fs(void) 1113static void nilfs_inode_init_once(void *obj)
1143{ 1114{
1144 int err; 1115 struct nilfs_inode_info *ii = obj;
1145
1146 err = nilfs_init_inode_cache();
1147 if (err)
1148 goto failed;
1149 1116
1150 err = nilfs_init_transaction_cache(); 1117 INIT_LIST_HEAD(&ii->i_dirty);
1151 if (err) 1118#ifdef CONFIG_NILFS_XATTR
1152 goto failed_inode_cache; 1119 init_rwsem(&ii->xattr_sem);
1120#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
1123 inode_init_once(&ii->vfs_inode);
1124}
1153 1125
1154 err = nilfs_init_segbuf_cache(); 1126static void nilfs_segbuf_init_once(void *obj)
1155 if (err) 1127{
1156 goto failed_transaction_cache; 1128 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
1129}
1157 1130
1158 err = nilfs_btree_path_cache_init(); 1131static void nilfs_destroy_cachep(void)
1159 if (err) 1132{
1160 goto failed_segbuf_cache; 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141}
1161 1142
1162 err = register_filesystem(&nilfs_fs_type); 1143static int __init nilfs_init_cachep(void)
1163 if (err) 1144{
1164 goto failed_btree_path_cache; 1145 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1146 sizeof(struct nilfs_inode_info), 0,
1147 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
1148 if (!nilfs_inode_cachep)
1149 goto fail;
1150
1151 nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
1152 sizeof(struct nilfs_transaction_info), 0,
1153 SLAB_RECLAIM_ACCOUNT, NULL);
1154 if (!nilfs_transaction_cachep)
1155 goto fail;
1156
1157 nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
1158 sizeof(struct nilfs_segment_buffer), 0,
1159 SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
1160 if (!nilfs_segbuf_cachep)
1161 goto fail;
1162
1163 nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
1164 sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
1165 0, 0, NULL);
1166 if (!nilfs_btree_path_cache)
1167 goto fail;
1165 1168
1166 return 0; 1169 return 0;
1167 1170
1168 failed_btree_path_cache: 1171fail:
1169 nilfs_btree_path_cache_destroy(); 1172 nilfs_destroy_cachep();
1173 return -ENOMEM;
1174}
1175
1176static int __init init_nilfs_fs(void)
1177{
1178 int err;
1170 1179
1171 failed_segbuf_cache: 1180 err = nilfs_init_cachep();
1172 nilfs_destroy_segbuf_cache(); 1181 if (err)
1182 goto fail;
1173 1183
1174 failed_transaction_cache: 1184 err = register_filesystem(&nilfs_fs_type);
1175 nilfs_destroy_transaction_cache(); 1185 if (err)
1186 goto free_cachep;
1176 1187
1177 failed_inode_cache: 1188 printk(KERN_INFO "NILFS version 2 loaded\n");
1178 nilfs_destroy_inode_cache(); 1189 return 0;
1179 1190
1180 failed: 1191free_cachep:
1192 nilfs_destroy_cachep();
1193fail:
1181 return err; 1194 return err;
1182} 1195}
1183 1196
1184static void __exit exit_nilfs_fs(void) 1197static void __exit exit_nilfs_fs(void)
1185{ 1198{
1186 nilfs_destroy_segbuf_cache(); 1199 nilfs_destroy_cachep();
1187 nilfs_destroy_transaction_cache();
1188 nilfs_destroy_inode_cache();
1189 nilfs_btree_path_cache_destroy();
1190 unregister_filesystem(&nilfs_fs_type); 1200 unregister_filesystem(&nilfs_fs_type);
1191} 1201}
1192 1202
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
486 printk(KERN_WARNING 486 printk(KERN_WARNING
487 "NILFS warning: unable to read secondary superblock\n"); 487 "NILFS warning: unable to read secondary superblock\n");
488 488
489 /*
 490 * Compare the two super blocks and set swp to 1 if the secondary
 491 * super block is valid and newer. Otherwise, set swp to 0.
492 */
489 valid[0] = nilfs_valid_sb(sbp[0]); 493 valid[0] = nilfs_valid_sb(sbp[0]);
490 valid[1] = nilfs_valid_sb(sbp[1]); 494 valid[1] = nilfs_valid_sb(sbp[1]);
491 swp = valid[1] && 495 swp = valid[1] && (!valid[0] ||
492 (!valid[0] || 496 le64_to_cpu(sbp[1]->s_last_cno) >
493 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); 497 le64_to_cpu(sbp[0]->s_last_cno));
494 498
495 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { 499 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
496 brelse(sbh[1]); 500 brelse(sbh[1]);
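Restated as a standalone predicate, the new selection rule prefers the secondary super block only when it is valid and records a strictly newer checkpoint; nilfs_prefer_secondary is a hypothetical name used purely for illustration:

	static int nilfs_prefer_secondary(const struct nilfs_super_block *sbp0,
					  const struct nilfs_super_block *sbp1,
					  int valid0, int valid1)
	{
		/* checkpoint numbers grow monotonically, so comparing
		   s_last_cno is a more robust recency test than s_wtime */
		return valid1 && (!valid0 ||
				  le64_to_cpu(sbp1->s_last_cno) >
				  le64_to_cpu(sbp0->s_last_cno));
	}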
@@ -670,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 674 start * sects_per_block,
671 nblocks * sects_per_block, 675 nblocks * sects_per_block,
672 GFP_NOFS, 676 GFP_NOFS,
673 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 678 if (ret < 0)
675 return ret; 679 return ret;
676 nblocks = 0; 680 nblocks = 0;
@@ -680,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 685 start * sects_per_block,
682 nblocks * sects_per_block, 686 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
684 return ret; 688 return ret;
685} 689}
686 690
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
110int pin_inotify_watch(struct inotify_watch *watch) 110int pin_inotify_watch(struct inotify_watch *watch)
111{ 111{
112 struct super_block *sb = watch->inode->i_sb; 112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock); 113 if (atomic_inc_not_zero(&sb->s_active)) {
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count); 114 atomic_inc(&watch->count);
118 return 1; 115 return 1;
119 } 116 }
120 spin_unlock(&sb_lock);
121 return 0; 117 return 0;
122} 118}
123 119
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy 511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to 512 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab 513 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut 514 * ->s_umount, which will wait until the superblock is shut down and the
519 * down and the watch in question is pining for fjords. That's fine, but 515 * watch in question is pining for fjords.
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_umount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 * 516 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire 517 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with. 518 * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
556 * Called with ih->mutex held, drops it. Possible return values: 526 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died 527 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super() 528 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */ 529 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) 530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{ 531{
565 struct super_block *sb = watch->inode->i_sb; 532 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567 533
568 spin_lock(&sb_lock); 534 if (atomic_inc_not_zero(&sb->s_active)) {
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch); 535 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex); 536 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */ 537 return 1; /* the best outcome */
575 } 538 }
539 spin_lock(&sb_lock);
576 sb->s_count++; 540 sb->s_count++;
577 spin_unlock(&sb_lock); 541 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ 542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount); 543 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) { 544 /* fs is already shut down; the watch is dead */
581 /* fs is already shut down; the watch is dead */ 545 drop_super(sb);
582 drop_super(sb); 546 return 0;
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597} 547}
598 548
599static void unpin_and_kill(struct inotify_watch *watch, int how) 549static void unpin_and_kill(struct inotify_watch *watch)
600{ 550{
601 struct super_block *sb = watch->inode->i_sb; 551 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch); 552 put_inotify_watch(watch);
603 switch (how) { 553 deactivate_super(sb);
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610} 554}
611 555
612/** 556/**
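The hunk above reduces the pinning protocol to a single primitive: atomic_inc_not_zero(&sb->s_active) either takes an active reference or fails because the superblock has already begun shutting down, which removes the old sb_lock/S_BIAS window entirely. A condensed sketch of the idiom, with hypothetical wrapper names:

	/* Try to pin sb against umount; nonzero means we hold an active ref. */
	static int pin_sb(struct super_block *sb)
	{
		return atomic_inc_not_zero(&sb->s_active);
	}

	static void unpin_sb(struct super_block *sb)
	{
		deactivate_super(sb);	/* drops the reference taken in pin_sb() */
	}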
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
628 struct list_head *watches; 572 struct list_head *watches;
629 struct super_block *sb; 573 struct super_block *sb;
630 struct inode *inode; 574 struct inode *inode;
631 int how;
632 575
633 mutex_lock(&ih->mutex); 576 mutex_lock(&ih->mutex);
634 watches = &ih->watches; 577 watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
638 } 581 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list); 582 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb; 583 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch); 584 if (!pin_to_kill(ih, watch))
642 if (!how)
643 continue; 585 continue;
644 586
645 inode = watch->inode; 587 inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
654 596
655 mutex_unlock(&ih->mutex); 597 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex); 598 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how); 599 unpin_and_kill(watch);
658 } 600 }
659 601
660 /* free this handle: the put matching the get in inotify_init() */ 602 /* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
857 struct inotify_watch *watch; 799 struct inotify_watch *watch;
858 struct super_block *sb; 800 struct super_block *sb;
859 struct inode *inode; 801 struct inode *inode;
860 int how;
861 802
862 mutex_lock(&ih->mutex); 803 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd); 804 watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
866 return -EINVAL; 807 return -EINVAL;
867 } 808 }
868 sb = watch->inode->i_sb; 809 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch); 810 if (!pin_to_kill(ih, watch))
870 if (!how)
871 return 0; 811 return 0;
872 812
873 inode = watch->inode; 813 inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
881 821
882 mutex_unlock(&ih->mutex); 822 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex); 823 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how); 824 unpin_and_kill(watch);
885 825
886 return 0; 826 return 0;
887} 827}
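
The rewrite above collapses pin_to_kill()'s old three-way return (0/1/2) into a boolean: atomic_inc_not_zero(&sb->s_active) either takes an active superblock reference, later dropped via deactivate_super() in unpin_and_kill(), or fails because the fs is dying, in which case a passive s_count reference plus down_read(&sb->s_umount) is enough to wait out shutdown and report the watch dead. A minimal userspace sketch of the inc-not-zero idiom, using C11 atomics — struct obj and get_active_ref() are illustrative stand-ins, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int active;	/* stand-in for sb->s_active */
};

/* Take a reference only if at least one is still held; like
 * atomic_inc_not_zero(), this never resurrects a dead object. */
static bool get_active_ref(struct obj *o)
{
	int v = atomic_load(&o->active);

	while (v != 0) {
		if (atomic_compare_exchange_weak(&o->active, &v, v + 1))
			return true;	/* pinned; drop it later */
	}
	return false;		/* already dead; take the slow path */
}
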
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..a1924a0d2ab0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
100 * 100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
111 * held by the caller. 108 * held by the caller.
112 */ 109 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to the LRU list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
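
The ntfs changes above are the caller side of switching to add_to_page_cache_lru(): the write path used to buffer each newly created page in a local struct pagevec and batch it onto the LRU by hand, and that is exactly the bookkeeping that disappears. Condensed from the hunks above, the shape of the conversion (error handling trimmed):

	/* Before: insert into the page cache, then LRU-buffer by hand. */
	err = add_to_page_cache(page, mapping, index, GFP_KERNEL);
	if (!err) {
		page_cache_get(page);
		if (!pagevec_add(&lru_pvec, page))
			__pagevec_lru_add_file(&lru_pvec);
	}

	/* After: one call inserts the page and queues it for the LRU. */
	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
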
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcdc..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -489,7 +489,7 @@ cleanup:
489 return ret; 489 return ret;
490} 490}
491 491
492struct xattr_handler ocfs2_xattr_acl_access_handler = { 492const struct xattr_handler ocfs2_xattr_acl_access_handler = {
493 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS, 494 .flags = ACL_TYPE_ACCESS,
495 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
@@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
497 .set = ocfs2_xattr_set_acl, 497 .set = ocfs2_xattr_set_acl,
498}; 498};
499 499
500struct xattr_handler ocfs2_xattr_acl_default_handler = { 500const struct xattr_handler ocfs2_xattr_acl_default_handler = {
501 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT, 502 .flags = ACL_TYPE_DEFAULT,
503 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
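
Constifying the handler tables lets them live in read-only data, and it ripples into every declaration and into the superblock's s_xattr table, which becomes an array of const pointers. A minimal sketch of the resulting shape — example_handler and its prefix are hypothetical; the real ocfs2 tables, with their .list/.get/.set callbacks, live in fs/ocfs2/xattr.c:

#include <linux/xattr.h>

static const struct xattr_handler example_handler = {
	.prefix	= "user.example.",
	/* .list/.get/.set callbacks omitted in this sketch */
};

static const struct xattr_handler *example_handlers[] = {
	&example_handler,
	NULL,
};
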
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
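
Most of the alloc.c churn below follows one mechanical pattern: ocfs2_journal_dirty() evidently no longer returns an error in this series, so every status-check-and-bail after it collapses into a bare call. The conversion, side by side:

	/* Before: */
	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* After: the call no longer reports failure to the caller. */
	ocfs2_journal_dirty(handle, bh);
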
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
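
Several hunks here drop handle->h_buffer_credits from ocfs2_extend_trans() calls: judging by the conversions, the function now takes the number of additional credits wanted rather than a new absolute total. The caller-side pattern, as ocfs2_extend_rotate_transaction() now reads ('wanted' standing in for the computed total):

	/* Ask only for the shortfall beyond what the handle has. */
	if (handle->h_buffer_credits < wanted)
		ret = ocfs2_extend_trans(handle,
					 wanted - handle->h_buffer_credits);
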
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() is basically the same as
5593 * ocfs2_lock_allocators(), except that it accepts a count of
5594 * extra blocks to reserve, and it only handles metadata
5595 * allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
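
ocfs2_remove_btree_range() now carries the extent flags and the refcount tree location, so truncate and hole punching can route refcounted extents through ocfs2_decrease_refcount() instead of the truncate log, with the extra journal credits and metadata blocks reserved up front. A condensed sketch of the prep sequence in the new body (error handling and unlocks elided):

	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
		ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
					 &ref_tree, NULL);
		ocfs2_prepare_refcount_change_for_del(inode, refcount_loc,
						      phys_blkno, len,
						      &credits, &extra_blocks);
	}
	ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
					   extra_blocks);
	handle = ocfs2_start_trans(osb,
			ocfs2_remove_extent_credits(osb->sb) + credits);
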
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
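
ocfs2_cached_block_free grows a free_bg field recording the exact suballocator group a block came from — presumably because the group can no longer always be derived from the block number — and the new signature has the caller pass it in; ocfs2_free_cached_blocks() above falls back to ocfs2_which_suballoc_group() only when it is zero. The matching kmalloc-to-kzalloc switches keep free_bg zero for entries queued without a group, such as the cluster dealloc path:

	item = kzalloc(sizeof(*item), GFP_NOFS);	/* zeroes free_bg */
	if (!item)
		return -ENOMEM;
	item->free_blk = blkno;	/* free_bg == 0 => derive the group */
	item->free_bit = bit;
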
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update his h_next_leaf_blk field, as well
6583 * as the dinodes i_last_eb_blk */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check that we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to begin the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
6970 * definition, there cannot be any leaves to the right of
6971 * him. */
6972 last_eb->h_next_leaf_blk = 0;
6973 status = ocfs2_journal_dirty(handle, last_eb_bh);
6974 if (status < 0) {
6975 mlog_errno(status);
6976 goto bail;
6977 }
6978 }
6979
6980 if (delete_blk) {
6981 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6982 status = ocfs2_decrease_refcount(inode, handle,
6983 ocfs2_blocks_to_clusters(osb->sb,
6984 delete_blk),
6985 clusters_to_del, meta_ac,
6986 &tc->tc_dealloc, 1);
6987 else
6988 status = ocfs2_truncate_log_append(osb, handle,
6989 delete_blk,
6990 clusters_to_del);
6991 if (status < 0) {
6992 mlog_errno(status);
6993 goto bail;
6994 }
6995 }
6996 status = 0;
6997bail:
6998 brelse(last_eb_bh);
6999 mlog_exit(status);
7000 return status;
7001}
7002
7003static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6584static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
7004{ 6585{
7005 set_buffer_uptodate(bh); 6586 set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7307 goto out_commit; 6888 goto out_commit;
7308 did_quota = 1; 6889 did_quota = 1;
7309 6890
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6891 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6892
6893 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7311 &num); 6894 &num);
7312 if (ret) { 6895 if (ret) {
7313 mlog_errno(ret); 6896 mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
7406 */ 6989 */
7407int ocfs2_commit_truncate(struct ocfs2_super *osb, 6990int ocfs2_commit_truncate(struct ocfs2_super *osb,
7408 struct inode *inode, 6991 struct inode *inode,
7409 struct buffer_head *fe_bh, 6992 struct buffer_head *di_bh)
7410 struct ocfs2_truncate_context *tc)
7411{ 6993{
7412 int status, i, credits, tl_sem = 0; 6994 int status = 0, i, flags = 0;
7413 u32 clusters_to_del, new_highest_cpos, range; 6995 u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7414 u64 blkno = 0; 6996 u64 blkno = 0;
7415 struct ocfs2_extent_list *el; 6997 struct ocfs2_extent_list *el;
7416 handle_t *handle = NULL; 6998 struct ocfs2_extent_rec *rec;
7417 struct inode *tl_inode = osb->osb_tl_inode;
7418 struct ocfs2_path *path = NULL; 6999 struct ocfs2_path *path = NULL;
7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7000 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL; 7001 struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7421 struct ocfs2_refcount_tree *ref_tree = NULL; 7002 u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7003 struct ocfs2_extent_tree et;
7004 struct ocfs2_cached_dealloc_ctxt dealloc;
7422 7005
7423 mlog_entry_void(); 7006 mlog_entry_void();
7424 7007
7008 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7009 ocfs2_init_dealloc_ctxt(&dealloc);
7010
7425 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7011 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7426 i_size_read(inode)); 7012 i_size_read(inode));
7427 7013
7428 path = ocfs2_new_path(fe_bh, &di->id2.i_list, 7014 path = ocfs2_new_path(di_bh, &di->id2.i_list,
7429 ocfs2_journal_access_di); 7015 ocfs2_journal_access_di);
7430 if (!path) { 7016 if (!path) {
7431 status = -ENOMEM; 7017 status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
7444 goto bail; 7030 goto bail;
7445 } 7031 }
7446 7032
7447 credits = 0;
7448
7449 /* 7033 /*
7450 * Truncate always works against the rightmost tree branch. 7034 * Truncate always works against the rightmost tree branch.
7451 */ 7035 */
@@ -7480,101 +7064,62 @@ start:
7480 } 7064 }
7481 7065
7482 i = le16_to_cpu(el->l_next_free_rec) - 1; 7066 i = le16_to_cpu(el->l_next_free_rec) - 1;
7483 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7067 rec = &el->l_recs[i];
7484 ocfs2_rec_clusters(el, &el->l_recs[i]); 7068 flags = rec->e_flags;
7485 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7069 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7486 clusters_to_del = 0; 7070
7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7071 if (i == 0 && ocfs2_is_empty_extent(rec)) {
7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7072 /*
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 7073 * Lower levels depend on this never happening, but it's best
7074 * to check it up here before changing the tree.
7075 */
7076 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7077 ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7078 "extent record, depth %u\n", inode->i_ino,
7079 le16_to_cpu(root_el->l_tree_depth));
7080 status = -EROFS;
7081 goto bail;
7082 }
7083 trunc_cpos = le32_to_cpu(rec->e_cpos);
7084 trunc_len = 0;
7085 blkno = 0;
7086 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7087 /*
7088 * Truncate entire record.
7089 */
7090 trunc_cpos = le32_to_cpu(rec->e_cpos);
7091 trunc_len = ocfs2_rec_clusters(el, rec);
7092 blkno = le64_to_cpu(rec->e_blkno);
7490 } else if (range > new_highest_cpos) { 7093 } else if (range > new_highest_cpos) {
7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7094 /*
7492 le32_to_cpu(el->l_recs[i].e_cpos)) - 7095 * Partial truncate. it also should be
7493 new_highest_cpos; 7096 * the last truncate we're doing.
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) + 7097 */
7495 ocfs2_clusters_to_blocks(inode->i_sb, 7098 trunc_cpos = new_highest_cpos;
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) - 7099 trunc_len = range - new_highest_cpos;
7497 clusters_to_del); 7100 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7101 blkno = le64_to_cpu(rec->e_blkno) +
7102 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7498 } else { 7103 } else {
7104 /*
7105 * Truncate completed, leave happily.
7106 */
7499 status = 0; 7107 status = 0;
7500 goto bail; 7108 goto bail;
7501 } 7109 }
7502 7110
7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7111 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7529 mutex_lock(&tl_inode->i_mutex);
7530 tl_sem = 1;
7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
7532 * record is free for use. If there isn't any, we flush to get
7533 * an empty truncate log. */
7534 if (ocfs2_truncate_log_needs_flush(osb)) {
7535 status = __ocfs2_flush_truncate_log(osb);
7536 if (status < 0) {
7537 mlog_errno(status);
7538 goto bail;
7539 }
7540 }
7541 7112
7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7113 status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7543 (struct ocfs2_dinode *)fe_bh->b_data, 7114 phys_cpos, trunc_len, flags, &dealloc,
7544 el); 7115 refcount_loc);
7545 handle = ocfs2_start_trans(osb, credits);
7546 if (IS_ERR(handle)) {
7547 status = PTR_ERR(handle);
7548 handle = NULL;
7549 mlog_errno(status);
7550 goto bail;
7551 }
7552
7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7554 tc, path, meta_ac);
7555 if (status < 0) { 7116 if (status < 0) {
7556 mlog_errno(status); 7117 mlog_errno(status);
7557 goto bail; 7118 goto bail;
7558 } 7119 }
7559 7120
7560 mutex_unlock(&tl_inode->i_mutex);
7561 tl_sem = 0;
7562
7563 ocfs2_commit_trans(osb, handle);
7564 handle = NULL;
7565
7566 ocfs2_reinit_path(path, 1); 7121 ocfs2_reinit_path(path, 1);
7567 7122
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7578 /* 7123 /*
7579 * The check above will catch the case where we've truncated 7124 * The check above will catch the case where we've truncated
7580 * away all allocation. 7125 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
7585 7130
7586 ocfs2_schedule_truncate_log_flush(osb, 1); 7131 ocfs2_schedule_truncate_log_flush(osb, 1);
7587 7132
7588 if (tl_sem) 7133 ocfs2_run_deallocs(osb, &dealloc);
7589 mutex_unlock(&tl_inode->i_mutex);
7590
7591 if (handle)
7592 ocfs2_commit_trans(osb, handle);
7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7601 7134
7602 ocfs2_free_path(path); 7135 ocfs2_free_path(path);
7603 7136
7604 /* This will drop the ext_alloc cluster lock for us */
7605 ocfs2_free_truncate_context(tc);
7606
7607 mlog_exit(status); 7137 mlog_exit(status);
7608 return status; 7138 return status;
7609} 7139}
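
The net effect in ocfs2_commit_truncate(): the hand-rolled credit math, truncate-log locking, refcount-tree handling and ocfs2_do_truncate() call are all gone, replaced by a loop that classifies the rightmost record into (trunc_cpos, trunc_len, blkno) and hands it to ocfs2_remove_btree_range(), which now does the transaction work itself. A condensed sketch of the resulting control flow (the real code uses a start label; error paths elided):

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	for (;;) {
		/* Walk to the rightmost leaf and classify its tail
		 * record into one of the three cases above; stop once
		 * the tree is trimmed down to new_highest_cpos. */
		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
		status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
						  phys_cpos, trunc_len,
						  flags, &dealloc,
						  refcount_loc);
		ocfs2_reinit_path(path, 1);
	}

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);
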
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
140 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
141int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
142 struct ocfs2_extent_tree *et, 142 struct ocfs2_extent_tree *et,
143 u32 cpos, u32 phys_cpos, u32 len, 143 u32 cpos, u32 phys_cpos, u32 len, int flags,
144 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc);
145 146
146int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_super *osb,
147 struct ocfs2_extent_tree *et); 148 struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 210int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
210 u64 blkno, unsigned int bit); 211 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 212int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno, 213 int type, int slot, u64 suballoc, u64 blkno,
213 unsigned int bit); 214 unsigned int bit);
214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 215static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
215{ 216{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
233 struct ocfs2_truncate_context **tc); 234 struct ocfs2_truncate_context **tc);
234int ocfs2_commit_truncate(struct ocfs2_super *osb, 235int ocfs2_commit_truncate(struct ocfs2_super *osb,
235 struct inode *inode, 236 struct inode *inode,
236 struct buffer_head *fe_bh, 237 struct buffer_head *di_bh);
237 struct ocfs2_truncate_context *tc);
238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
239 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
240 240
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos); 321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
323 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 324int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left, 325 struct ocfs2_path *left,
324 struct ocfs2_path *right); 326 struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..3623ca20cc18 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1735 goto out; 1735 goto out;
1736 } 1736 }
1737 1737
1738 if (data_ac)
1739 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1740
1738 credits = ocfs2_calc_extend_credits(inode->i_sb, 1741 credits = ocfs2_calc_extend_credits(inode->i_sb,
1739 &di->id2.i_list, 1742 &di->id2.i_list,
1740 clusters_to_alloc); 1743 clusters_to_alloc);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7d..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
116 define_mask(ERROR), 116 define_mask(ERROR),
117 define_mask(NOTICE), 117 define_mask(NOTICE),
118 define_mask(KTHREAD), 118 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
119}; 120};
120 121
121static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
122 123
123#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
124#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
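
Adding a log class is two lines: a new ULL bit in masklog.h and a define_mask() entry in masklog.c so it shows up as a toggleable attribute. A rough standalone illustration of how such a bit gates output (the real mlog() also applies a NOT mask and per-file defaults; everything below is simplified):

#include <stdio.h>
#include <stdint.h>

#define ML_KTHREAD       0x0000000400000000ULL
#define ML_RESERVATIONS  0x0000000800000000ULL	/* the bit added here */

static uint64_t mlog_and_bits = ML_RESERVATIONS;	/* enabled classes */

static void mlog(uint64_t mask, const char *msg)
{
	if (mask & mlog_and_bits)
		printf("%s\n", msg);
}

int main(void)
{
	mlog(ML_RESERVATIONS, "reservation window moved");	/* printed */
	mlog(ML_KTHREAD, "kthread woke");			/* filtered */
	return 0;
}
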
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c8..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
583 o2net_sc_queue_work(sc, &sc->sc_connect_work); 583 o2net_sc_queue_work(sc, &sc->sc_connect_work);
584 break; 584 break;
585 default: 585 default:
586 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
587 " shutdown, state %d\n",
588 SC_NODEF_ARGS(sc), sk->sk_state);
586 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 589 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
587 break; 590 break;
588 } 591 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1194 else 1194 else
1195 de->inode = 0; 1195 de->inode = 0;
1196 dir->i_version++; 1196 dir->i_version++;
1197 status = ocfs2_journal_dirty(handle, bh); 1197 ocfs2_journal_dirty(handle, bh);
1198 goto bail; 1198 goto bail;
1199 } 1199 }
1200 i += le16_to_cpu(de->rec_len); 1200 i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
1752 ocfs2_recalc_free_list(dir, handle, lookup); 1752 ocfs2_recalc_free_list(dir, handle, lookup);
1753 1753
1754 dir->i_version++; 1754 dir->i_version++;
1755 status = ocfs2_journal_dirty(handle, insert_bh); 1755 ocfs2_journal_dirty(handle, insert_bh);
1756 retval = 0; 1756 retval = 0;
1757 goto bail; 1757 goto bail;
1758 } 1758 }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2297 } 2297 }
2298 2298
2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2300
2301 ocfs2_journal_dirty(handle, di_bh); 2300 ocfs2_journal_dirty(handle, di_bh);
2302 if (ret) {
2303 mlog_errno(ret);
2304 goto out;
2305 }
2306 2301
2307 i_size_write(inode, size); 2302 i_size_write(inode, size);
2308 inode->i_nlink = 2; 2303 inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2366 ocfs2_init_dir_trailer(inode, new_bh, size); 2361 ocfs2_init_dir_trailer(inode, new_bh, size);
2367 } 2362 }
2368 2363
2369 status = ocfs2_journal_dirty(handle, new_bh); 2364 ocfs2_journal_dirty(handle, new_bh);
2370 if (status < 0) {
2371 mlog_errno(status);
2372 goto bail;
2373 }
2374 2365
2375 i_size_write(inode, inode->i_sb->s_blocksize); 2366 i_size_write(inode, inode->i_sb->s_blocksize);
2376 inode->i_nlink = 2; 2367 inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2404 int ret; 2395 int ret;
2405 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2406 u16 dr_suballoc_bit; 2397 u16 dr_suballoc_bit;
2407 u64 dr_blkno; 2398 u64 suballoc_loc, dr_blkno;
2408 unsigned int num_bits; 2399 unsigned int num_bits;
2409 struct buffer_head *dx_root_bh = NULL; 2400 struct buffer_head *dx_root_bh = NULL;
2410 struct ocfs2_dx_root_block *dx_root; 2401 struct ocfs2_dx_root_block *dx_root;
2411 struct ocfs2_dir_block_trailer *trailer = 2402 struct ocfs2_dir_block_trailer *trailer =
2412 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2413 2404
2414 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2405 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2415 &num_bits, &dr_blkno); 2406 &dr_suballoc_bit, &num_bits, &dr_blkno);
2416 if (ret) { 2407 if (ret) {
2417 mlog_errno(ret); 2408 mlog_errno(ret);
2418 goto out; 2409 goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2431 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2433 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2434 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2435 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2436 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2437 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2458 dx_root->dr_list.l_count = 2450 dx_root->dr_list.l_count =
2459 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2451 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2460 } 2452 }
2461 2453 ocfs2_journal_dirty(handle, dx_root_bh);
2462 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2463 if (ret)
2464 mlog_errno(ret);
2465 2454
2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2455 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2467 OCFS2_JOURNAL_ACCESS_CREATE); 2456 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2475 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2476 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2477 2466
2478 ret = ocfs2_journal_dirty(handle, di_bh); 2467 ocfs2_journal_dirty(handle, di_bh);
2479 if (ret)
2480 mlog_errno(ret);
2481 2468
2482 *ret_dx_root_bh = dx_root_bh; 2469 *ret_dx_root_bh = dx_root_bh;
2483 dx_root_bh = NULL; 2470 dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2558 * chance of contiguousness as the directory grows in number 2545 * chance of contiguousness as the directory grows in number
2559 * of entries. 2546 * of entries.
2560 */ 2547 */
2561 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); 2548 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2562 if (ret) { 2549 if (ret) {
2563 mlog_errno(ret); 2550 mlog_errno(ret);
2564 goto out; 2551 goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2991 * if we only get one now, that's enough to continue. The rest 2978 * if we only get one now, that's enough to continue. The rest
2992 * will be claimed after the conversion to extents. 2979 * will be claimed after the conversion to extents.
2993 */ 2980 */
2994 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 2981 if (ocfs2_dir_resv_allowed(osb))
2982 data_ac->ac_resv = &oi->ip_la_data_resv;
2983 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2995 if (ret) { 2984 if (ret) {
2996 mlog_errno(ret); 2985 mlog_errno(ret);
2997 goto out_commit; 2986 goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3034 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 3023 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3035 } 3024 }
3036 3025
3037 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3026 ocfs2_journal_dirty(handle, dirdata_bh);
3038 if (ret) {
3039 mlog_errno(ret);
3040 goto out_commit;
3041 }
3042 3027
3043 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 3028 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3044 /* 3029 /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3104 */ 3089 */
3105 dir->i_blocks = ocfs2_inode_sector_count(dir); 3090 dir->i_blocks = ocfs2_inode_sector_count(dir);
3106 3091
3107 ret = ocfs2_journal_dirty(handle, di_bh); 3092 ocfs2_journal_dirty(handle, di_bh);
3108 if (ret) {
3109 mlog_errno(ret);
3110 goto out_commit;
3111 }
3112 3093
3113 if (ocfs2_supports_indexed_dirs(osb)) { 3094 if (ocfs2_supports_indexed_dirs(osb)) {
3114 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3095 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 * pass. Claim the 2nd cluster as a separate extent. 3119 * pass. Claim the 2nd cluster as a separate extent.
3139 */ 3120 */
3140 if (alloc > len) { 3121 if (alloc > len) {
3141 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 3122 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3142 &len); 3123 &len);
3143 if (ret) { 3124 if (ret) {
3144 mlog_errno(ret); 3125 mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3369 goto bail; 3350 goto bail;
3370 } 3351 }
3371 3352
3353 if (ocfs2_dir_resv_allowed(osb))
3354 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3355
3372 credits = ocfs2_calc_extend_credits(sb, el, 1); 3356 credits = ocfs2_calc_extend_credits(sb, el, 1);
3373 } else { 3357 } else {
3374 spin_unlock(&OCFS2_I(dir)->ip_lock); 3358 spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
3423 } else { 3407 } else {
3424 de->rec_len = cpu_to_le16(sb->s_blocksize); 3408 de->rec_len = cpu_to_le16(sb->s_blocksize);
3425 } 3409 }
3426 status = ocfs2_journal_dirty(handle, new_bh); 3410 ocfs2_journal_dirty(handle, new_bh);
3427 if (status < 0) {
3428 mlog_errno(status);
3429 goto bail;
3430 }
3431 3411
3432 dir_i_size += dir->i_sb->s_blocksize; 3412 dir_i_size += dir->i_sb->s_blocksize;
3433 i_size_write(dir, dir_i_size); 3413 i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3906 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3886 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3907 dx_leaf_sort_swap); 3887 dx_leaf_sort_swap);
3908 3888
3909 ret = ocfs2_journal_dirty(handle, dx_leaf_bh); 3889 ocfs2_journal_dirty(handle, dx_leaf_bh);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out_commit;
3913 }
3914 3890
3915 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3891 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3916 &split_hash); 3892 &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4490 4466
4491 blk = le64_to_cpu(dx_root->dr_blkno); 4467 blk = le64_to_cpu(dx_root->dr_blkno);
4492 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4468 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4493 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4469 if (dx_root->dr_suballoc_loc)
4470 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4471 else
4472 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4494 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4473 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4495 bit, bg_blkno, 1); 4474 bit, bg_blkno, 1);
4496 if (ret) 4475 if (ret)
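
This hunk prefers the newly recorded dr_suballoc_loc and only falls back to deriving the group from the block/bit pair for metadata written before the field existed. A standalone sketch of that fallback, where which_suballoc_group() is a stand-in (the real computation depends on on-disk layout):

#include <stdio.h>
#include <stdint.h>

static uint64_t which_suballoc_group(uint64_t blk, unsigned bit)
{
	return blk - bit;	/* assumption: group base precedes the bit */
}

static uint64_t group_for(uint64_t suballoc_loc, uint64_t blk, unsigned bit)
{
	/* recorded location wins; compute only when it was never stored */
	return suballoc_loc ? suballoc_loc : which_suballoc_group(blk, bit);
}

int main(void)
{
	printf("recorded: %llu\n",
	       (unsigned long long)group_for(5000, 1234, 10));	/* 5000 */
	printf("computed: %llu\n",
	       (unsigned long long)group_for(0, 1234, 10));	/* 1224 */
	return 0;
}
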
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4551 4530
4552 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4531 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4553 4532
4554 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 4533 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4555 &dealloc); 4534 &dealloc, 0);
4556 if (ret) { 4535 if (ret) {
4557 mlog_errno(ret); 4536 mlog_errno(ret);
4558 goto out; 4537 goto out;
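
Every dir.c hunk above makes the same cleanup: ocfs2_journal_dirty() is now treated as unable to fail, so the status checks around each call disappear. A toy compileable sketch of the shape of that change (illustrative types only, not the kernel's):

#include <stdio.h>

typedef struct handle handle_t;
struct buffer_head { int dirty; };

/* after this patch the helper is effectively void -- it cannot fail,
 * so call sites drop their error plumbing */
static void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{
	(void)handle;
	bh->dirty = 1;
}

int main(void)
{
	struct buffer_head bh = { 0 };

	ocfs2_journal_dirty(NULL, &bh);	/* no status to check anymore */
	printf("dirty=%d\n", bh.dirty);
	return 0;
}
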
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 12d5eb78a11a..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
88 return 0; 88 return 0;
89} 89}
90 90
91static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 mlog_entry_void();
94 94
@@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
145} 145}
146 146
147 147
148static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 149{
150 mlog_entry_void(); 150 mlog_entry_void();
151 151
@@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 452 lock->ml.node, &status);
453 if (ret < 0) 453 if (ret < 0)
454 mlog_errno(ret); 454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
456 lock->ml.node);
455 else { 457 else {
456 if (status == DLM_RECOVERING) { 458 if (status == DLM_RECOVERING) {
457 mlog(ML_ERROR, "sent AST to node %u, it thinks this " 459 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
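
The same replacement of a bare mlog_errno() with an "Error %d when sending message %u (key 0x%x) to node %u" line recurs across the dlm files below. The patch open-codes the format at each site; a hypothetical helper (not in the patch) makes the reported fields explicit:

#include <stdio.h>

/* hypothetical wrapper, shown only to name the fields each call
 * site now logs: status, message type, domain key, target node */
static void dlm_log_send_error(int status, unsigned msg_type,
			       unsigned long key, unsigned node)
{
	fprintf(stderr,
		"Error %d when sending message %u (key 0x%lx) to node %u\n",
		status, msg_type, key, node);
}

int main(void)
{
	dlm_log_send_error(-107, 505, 0xdeadbeefUL, 3);
	return 0;
}
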
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_SIZE_DEFAULT (1 << 14) 40#define DLM_HASH_SIZE_DEFAULT (1 << 17)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE 41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1 42# define DLM_HASH_PAGES 1
43#else 43#else
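
Bumping DLM_HASH_SIZE_DEFAULT from 1 << 14 to 1 << 17 grows the default lock resource hash from 16 KiB to 128 KiB; with 4 KiB pages that is 4 pages versus 32, and the bucket count scales the same way. A quick arithmetic check (page size assumed):

#include <stdio.h>

int main(void)
{
	unsigned long old_sz = 1UL << 14, new_sz = 1UL << 17;
	unsigned long page = 4096;	/* assumed PAGE_SIZE */

	printf("old: %lu bytes = %lu pages\n", old_sz, old_sz / page);
	printf("new: %lu bytes = %lu pages\n", new_sz, new_sz / page);
	return 0;
}
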
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
904 904
905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
908void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void dlm_do_local_ast(struct dlm_ctxt *dlm, 909void dlm_do_local_ast(struct dlm_ctxt *dlm,
908 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res,
909 struct dlm_lock *lock); 911 struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 90803b47cd8c..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) 390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret); 391 dlm_error(ret);
392 } else { 392 } else {
393 mlog_errno(tmpret); 393 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
394 "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
395 res->owner);
394 if (dlm_is_host_down(tmpret)) { 396 if (dlm_is_host_down(tmpret)) {
395 /* instead of logging the same network error over 397 /* instead of logging the same network error over
396 * and over, sleep here and wait for the heartbeat 398 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..6b5a492e1749 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
511 511
512 assert_spin_locked(&dlm->spinlock); 512 assert_spin_locked(&dlm->spinlock);
513 513
514 printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); 514 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
515 515
516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
517 node + 1)) < O2NM_MAX_NODES) { 517 node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
534 534
535 node = exit_msg->node_idx; 535 node = exit_msg->node_idx;
536 536
537 printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); 537 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
538 538
539 spin_lock(&dlm->spinlock); 539 spin_lock(&dlm->spinlock);
540 clear_bit(node, dlm->domain_map); 540 clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
566 &leave_msg, sizeof(leave_msg), node, 566 &leave_msg, sizeof(leave_msg), node,
567 NULL); 567 NULL);
568 568 if (status < 0)
569 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
570 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
569 mlog(0, "status return %d from o2net_send_message\n", status); 571 mlog(0, "status return %d from o2net_send_message\n", status);
570 572
571 return status; 573 return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
904 set_bit(assert->node_idx, dlm->domain_map); 906 set_bit(assert->node_idx, dlm->domain_map);
905 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 907 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
906 908
907 printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", 909 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
908 assert->node_idx, dlm->name); 910 assert->node_idx, dlm->name);
909 __dlm_print_nodes(dlm); 911 __dlm_print_nodes(dlm);
910 912
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
962 &cancel_msg, sizeof(cancel_msg), node, 964 &cancel_msg, sizeof(cancel_msg), node,
963 NULL); 965 NULL);
964 if (status < 0) { 966 if (status < 0) {
965 mlog_errno(status); 967 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
968 "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
969 node);
966 goto bail; 970 goto bail;
967 } 971 }
968 972
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1029 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 1033 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
1030 1034
1031 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 1035 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
1032 sizeof(join_msg), node, 1036 sizeof(join_msg), node, &join_resp);
1033 &join_resp);
1034 if (status < 0 && status != -ENOPROTOOPT) { 1037 if (status < 0 && status != -ENOPROTOOPT) {
1035 mlog_errno(status); 1038 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1039 "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1040 node);
1036 goto bail; 1041 goto bail;
1037 } 1042 }
1038 dlm_query_join_wire_to_packet(join_resp, &packet); 1043 dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1103 &assert_msg, sizeof(assert_msg), node, 1108 &assert_msg, sizeof(assert_msg), node,
1104 NULL); 1109 NULL);
1105 if (status < 0) 1110 if (status < 0)
1106 mlog_errno(status); 1111 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1112 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1113 node);
1107 1114
1108 return status; 1115 return status;
1109} 1116}
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 goto leave; 1523 goto leave;
1517 } 1524 }
1518 1525
1519 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); 1526 dlm->name = kstrdup(domain, GFP_KERNEL);
1520 if (dlm->name == NULL) { 1527 if (dlm->name == NULL) {
1521 mlog_errno(-ENOMEM); 1528 mlog_errno(-ENOMEM);
1522 kfree(dlm); 1529 kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1557 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1551 INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); 1558 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1552 1559
1553 strcpy(dlm->name, domain);
1554 dlm->key = key; 1560 dlm->key = key;
1555 dlm->node_num = o2nm_this_node(); 1561 dlm->node_num = o2nm_this_node();
1556 1562
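
The dlm_alloc_ctxt() change folds a kmalloc(strlen(domain) + 1) plus a strcpy() that lived some thirty lines apart into one kstrdup(), removing the chance of the two drifting out of sync. The userspace equivalent:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	const char *domain = "webcluster";

	/* before: name = malloc(strlen(domain) + 1); ... strcpy(name, domain); */
	char *name = strdup(domain);	/* kstrdup(domain, GFP_KERNEL) in-kernel */
	if (!name)
		return 1;
	printf("%s\n", name);
	free(name);
	return 0;
}
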
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
329 BUG(); 329 BUG();
330 } 330 }
331 } else { 331 } else {
332 mlog_errno(tmpret); 332 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
333 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
334 res->owner);
333 if (dlm_is_host_down(tmpret)) { 335 if (dlm_is_host_down(tmpret)) {
334 ret = DLM_RECOVERING; 336 ret = DLM_RECOVERING;
335 mlog(0, "node %u died so returning DLM_RECOVERING " 337 mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
429 struct dlm_lock *lock; 431 struct dlm_lock *lock;
430 int kernel_allocated = 0; 432 int kernel_allocated = 0;
431 433
432 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); 434 lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
433 if (!lock) 435 if (!lock)
434 return NULL; 436 return NULL;
435 437
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9289b4357d27..4a7506a4e314 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
617{ 617{
618 struct dlm_lock_resource *res = NULL; 618 struct dlm_lock_resource *res = NULL;
619 619
620 res = (struct dlm_lock_resource *) 620 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
621 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
622 if (!res) 621 if (!res)
623 goto error; 622 goto error;
624 623
625 res->lockname.name = (char *) 624 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
626 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
627 if (!res->lockname.name) 625 if (!res->lockname.name)
628 goto error; 626 goto error;
629 627
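
These dlmmaster.c hunks drop the explicit casts on kmem_cache_zalloc()/kmem_cache_alloc(): both return void *, so the cast is noise and can hide a missing prototype. The userspace analogue:

#include <stdlib.h>

struct dlm_lock_resource { int dummy; };

int main(void)
{
	/* before: res = (struct dlm_lock_resource *)calloc(1, sizeof(*res)); */
	struct dlm_lock_resource *res = calloc(1, sizeof(*res));	/* after */

	free(res);
	return 0;
}
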
@@ -757,8 +755,7 @@ lookup:
757 spin_unlock(&dlm->spinlock); 755 spin_unlock(&dlm->spinlock);
758 mlog(0, "allocating a new resource\n"); 756 mlog(0, "allocating a new resource\n");
759 /* nothing found and we need to allocate one. */ 757 /* nothing found and we need to allocate one. */
760 alloc_mle = (struct dlm_master_list_entry *) 758 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
761 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
762 if (!alloc_mle) 759 if (!alloc_mle)
763 goto leave; 760 goto leave;
764 res = dlm_new_lockres(dlm, lockid, namelen); 761 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
1542 spin_unlock(&dlm->master_lock); 1539 spin_unlock(&dlm->master_lock);
1543 spin_unlock(&dlm->spinlock); 1540 spin_unlock(&dlm->spinlock);
1544 1541
1545 mle = (struct dlm_master_list_entry *) 1542 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1546 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1547 if (!mle) { 1543 if (!mle) {
1548 response = DLM_MASTER_RESP_ERROR; 1544 response = DLM_MASTER_RESP_ERROR;
1549 mlog_errno(-ENOMEM); 1545 mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
1666 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1662 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1667 &assert, sizeof(assert), to, &r); 1663 &assert, sizeof(assert), to, &r);
1668 if (tmpret < 0) { 1664 if (tmpret < 0) {
1669 mlog(0, "assert_master returned %d!\n", tmpret); 1665 mlog(ML_ERROR, "Error %d when sending message %u (key "
1666 "0x%x) to node %u\n", tmpret,
1667 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1670 if (!dlm_is_host_down(tmpret)) { 1668 if (!dlm_is_host_down(tmpret)) {
1671 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1669 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1672 BUG(); 1670 BUG();
@@ -2205,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2205 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2206 &deref, sizeof(deref), res->owner, &r); 2204 &deref, sizeof(deref), res->owner, &r);
2207 if (ret < 0) 2205 if (ret < 0)
2208 mlog_errno(ret); 2206 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
2207 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
2208 res->owner);
2209 else if (r < 0) { 2209 else if (r < 0) {
2210 /* BAD. other node says I did not have a ref. */ 2210 /* BAD. other node says I did not have a ref. */
2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2452,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2452 goto leave; 2452 goto leave;
2453 } 2453 }
2454 2454
2455 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2455 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2456 GFP_NOFS);
2457 if (!mle) { 2456 if (!mle) {
2458 mlog_errno(ret); 2457 mlog_errno(ret);
2459 goto leave; 2458 goto leave;
@@ -2975,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2975 &migrate, sizeof(migrate), nodenum, 2974 &migrate, sizeof(migrate), nodenum,
2976 &status); 2975 &status);
2977 if (ret < 0) { 2976 if (ret < 0) {
2978 mlog(0, "migrate_request returned %d!\n", ret); 2977 mlog(ML_ERROR, "Error %d when sending message %u (key "
2978 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
2979 dlm->key, nodenum);
2979 if (!dlm_is_host_down(ret)) { 2980 if (!dlm_is_host_down(ret)) {
2980 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2981 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2981 BUG(); 2982 BUG();
@@ -3033,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3033 hash = dlm_lockid_hash(name, namelen); 3034 hash = dlm_lockid_hash(name, namelen);
3034 3035
3035 /* preallocate.. if this fails, abort */ 3036 /* preallocate.. if this fails, abort */
3036 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 3037 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3037 GFP_NOFS);
3038 3038
3039 if (!mle) { 3039 if (!mle) {
3040 ret = -ENOMEM; 3040 ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..f8b75ce4be70 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
803 803
804 /* negative status is handled by caller */ 804 /* negative status is handled by caller */
805 if (ret < 0) 805 if (ret < 0)
806 mlog_errno(ret); 806 mlog(ML_ERROR, "Error %d when sending message %u (key "
807 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
808 dlm->key, request_from);
807 809
808 // return from here, then 810 // return from here, then
809 // sleep until all received or error 811 // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
955 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 957 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
956 sizeof(done_msg), send_to, &tmpret); 958 sizeof(done_msg), send_to, &tmpret);
957 if (ret < 0) { 959 if (ret < 0) {
960 mlog(ML_ERROR, "Error %d when sending message %u (key "
961 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
962 dlm->key, send_to);
958 if (!dlm_is_host_down(ret)) { 963 if (!dlm_is_host_down(ret)) {
959 mlog_errno(ret);
960 mlog(ML_ERROR, "%s: unknown error sending data-done "
961 "to %u\n", dlm->name, send_to);
962 BUG(); 964 BUG();
963 } 965 }
964 } else 966 } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1126 if (ret < 0) { 1128 if (ret < 0) {
1127 /* XXX: negative status is not handled. 1129 /* XXX: negative status is not handled.
1128 * this will end up killing this node. */ 1130 * this will end up killing this node. */
1129 mlog_errno(ret); 1131 mlog(ML_ERROR, "Error %d when sending message %u (key "
1132 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
1133 dlm->key, send_to);
1130 } else { 1134 } else {
1131 /* might get an -ENOMEM back here */ 1135 /* might get an -ENOMEM back here */
1132 ret = status; 1136 ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1642 &req, sizeof(req), nodenum, &status); 1646 &req, sizeof(req), nodenum, &status);
1643 /* XXX: negative status not handled properly here. */ 1647 /* XXX: negative status not handled properly here. */
1644 if (ret < 0) 1648 if (ret < 0)
1645 mlog_errno(ret); 1649 mlog(ML_ERROR, "Error %d when sending message %u (key "
1650 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1651 dlm->key, nodenum);
1646 else { 1652 else {
1647 BUG_ON(status < 0); 1653 BUG_ON(status < 0);
1648 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1654 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
2640 if (dlm_is_host_down(ret)) { 2646 if (dlm_is_host_down(ret)) {
2641 /* node is down. not involved in recovery 2647 /* node is down. not involved in recovery
2642 * so just keep going */ 2648 * so just keep going */
2643 mlog(0, "%s: node %u was down when sending " 2649 mlog(ML_NOTICE, "%s: node %u was down when sending "
2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2650 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2645 ret = 0; 2651 ret = 0;
2646 } 2652 }
@@ -2660,11 +2666,12 @@ retry:
2660 } 2666 }
2661 if (ret < 0) { 2667 if (ret < 0) {
2662 struct dlm_lock_resource *res; 2668 struct dlm_lock_resource *res;
2669
2663 /* this is now a serious problem, possibly ENOMEM 2670 /* this is now a serious problem, possibly ENOMEM
2664 * in the network stack. must retry */ 2671 * in the network stack. must retry */
2665 mlog_errno(ret); 2672 mlog_errno(ret);
2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2673 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2667 " returned %d\n", dlm->name, nodenum, ret); 2674 "returned %d\n", dlm->name, nodenum, ret);
2668 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, 2675 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2669 DLM_RECOVERY_LOCK_NAME_LEN); 2676 DLM_RECOVERY_LOCK_NAME_LEN);
2670 if (res) { 2677 if (res) {
@@ -2789,7 +2796,9 @@ stage2:
2789 if (ret >= 0) 2796 if (ret >= 0)
2790 ret = status; 2797 ret = status;
2791 if (ret < 0) { 2798 if (ret < 0) {
2792 mlog_errno(ret); 2799 mlog(ML_ERROR, "Error %d when sending message %u (key "
2800 "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2801 dlm->key, nodenum);
2793 if (dlm_is_host_down(ret)) { 2802 if (dlm_is_host_down(ret)) {
2794 /* this has no effect on this recovery 2803 /* this has no effect on this recovery
2795 * session, so set the status to zero to 2804 * session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 11a6d1fd1d35..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -309,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
309 * spinlock, and because we know that it is not migrating/ 309 * spinlock, and because we know that it is not migrating/
310 * recovering/in-progress, it is fine to reserve asts and 310 * recovering/in-progress, it is fine to reserve asts and
311 * basts right before queueing them all throughout */ 311 * basts right before queueing them all throughout */
312 assert_spin_locked(&dlm->ast_lock);
312 assert_spin_locked(&res->spinlock); 313 assert_spin_locked(&res->spinlock);
313 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 314 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
314 DLM_LOCK_RES_RECOVERING| 315 DLM_LOCK_RES_RECOVERING|
@@ -337,7 +338,7 @@ converting:
337 /* queue the BAST if not already */ 338 /* queue the BAST if not already */
338 if (lock->ml.highest_blocked == LKM_IVMODE) { 339 if (lock->ml.highest_blocked == LKM_IVMODE) {
339 __dlm_lockres_reserve_ast(res); 340 __dlm_lockres_reserve_ast(res);
340 dlm_queue_bast(dlm, lock); 341 __dlm_queue_bast(dlm, lock);
341 } 342 }
342 /* update the highest_blocked if needed */ 343 /* update the highest_blocked if needed */
343 if (lock->ml.highest_blocked < target->ml.convert_type) 344 if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -355,7 +356,7 @@ converting:
355 can_grant = 0; 356 can_grant = 0;
356 if (lock->ml.highest_blocked == LKM_IVMODE) { 357 if (lock->ml.highest_blocked == LKM_IVMODE) {
357 __dlm_lockres_reserve_ast(res); 358 __dlm_lockres_reserve_ast(res);
358 dlm_queue_bast(dlm, lock); 359 __dlm_queue_bast(dlm, lock);
359 } 360 }
360 if (lock->ml.highest_blocked < target->ml.convert_type) 361 if (lock->ml.highest_blocked < target->ml.convert_type)
361 lock->ml.highest_blocked = 362 lock->ml.highest_blocked =
@@ -383,7 +384,7 @@ converting:
383 spin_unlock(&target->spinlock); 384 spin_unlock(&target->spinlock);
384 385
385 __dlm_lockres_reserve_ast(res); 386 __dlm_lockres_reserve_ast(res);
386 dlm_queue_ast(dlm, target); 387 __dlm_queue_ast(dlm, target);
387 /* go back and check for more */ 388 /* go back and check for more */
388 goto converting; 389 goto converting;
389 } 390 }
@@ -402,7 +403,7 @@ blocked:
402 can_grant = 0; 403 can_grant = 0;
403 if (lock->ml.highest_blocked == LKM_IVMODE) { 404 if (lock->ml.highest_blocked == LKM_IVMODE) {
404 __dlm_lockres_reserve_ast(res); 405 __dlm_lockres_reserve_ast(res);
405 dlm_queue_bast(dlm, lock); 406 __dlm_queue_bast(dlm, lock);
406 } 407 }
407 if (lock->ml.highest_blocked < target->ml.type) 408 if (lock->ml.highest_blocked < target->ml.type)
408 lock->ml.highest_blocked = target->ml.type; 409 lock->ml.highest_blocked = target->ml.type;
@@ -418,7 +419,7 @@ blocked:
418 can_grant = 0; 419 can_grant = 0;
419 if (lock->ml.highest_blocked == LKM_IVMODE) { 420 if (lock->ml.highest_blocked == LKM_IVMODE) {
420 __dlm_lockres_reserve_ast(res); 421 __dlm_lockres_reserve_ast(res);
421 dlm_queue_bast(dlm, lock); 422 __dlm_queue_bast(dlm, lock);
422 } 423 }
423 if (lock->ml.highest_blocked < target->ml.type) 424 if (lock->ml.highest_blocked < target->ml.type)
424 lock->ml.highest_blocked = target->ml.type; 425 lock->ml.highest_blocked = target->ml.type;
@@ -444,7 +445,7 @@ blocked:
444 spin_unlock(&target->spinlock); 445 spin_unlock(&target->spinlock);
445 446
446 __dlm_lockres_reserve_ast(res); 447 __dlm_lockres_reserve_ast(res);
447 dlm_queue_ast(dlm, target); 448 __dlm_queue_ast(dlm, target);
448 /* go back and check for more */ 449 /* go back and check for more */
449 goto converting; 450 goto converting;
450 } 451 }
@@ -674,6 +675,7 @@ static int dlm_thread(void *data)
674 /* lockres can be re-dirtied/re-added to the 675 /* lockres can be re-dirtied/re-added to the
675 * dirty_list in this gap, but that is ok */ 676 * dirty_list in this gap, but that is ok */
676 677
678 spin_lock(&dlm->ast_lock);
677 spin_lock(&res->spinlock); 679 spin_lock(&res->spinlock);
678 if (res->owner != dlm->node_num) { 680 if (res->owner != dlm->node_num) {
679 __dlm_print_one_lock_resource(res); 681 __dlm_print_one_lock_resource(res);
@@ -694,6 +696,7 @@ static int dlm_thread(void *data)
694 /* move it to the tail and keep going */ 696 /* move it to the tail and keep going */
695 res->state &= ~DLM_LOCK_RES_DIRTY; 697 res->state &= ~DLM_LOCK_RES_DIRTY;
696 spin_unlock(&res->spinlock); 698 spin_unlock(&res->spinlock);
699 spin_unlock(&dlm->ast_lock);
697 mlog(0, "delaying list shuffling for in-" 700 mlog(0, "delaying list shuffling for in-"
698 "progress lockres %.*s, state=%d\n", 701 "progress lockres %.*s, state=%d\n",
699 res->lockname.len, res->lockname.name, 702 res->lockname.len, res->lockname.name,
@@ -715,6 +718,7 @@ static int dlm_thread(void *data)
715 dlm_shuffle_lists(dlm, res); 718 dlm_shuffle_lists(dlm, res);
716 res->state &= ~DLM_LOCK_RES_DIRTY; 719 res->state &= ~DLM_LOCK_RES_DIRTY;
717 spin_unlock(&res->spinlock); 720 spin_unlock(&res->spinlock);
721 spin_unlock(&dlm->ast_lock);
718 722
719 dlm_lockres_calc_usage(dlm, res); 723 dlm_lockres_calc_usage(dlm, res);
720 724
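
The dlmthread.c changes move the ast_lock acquisition out of the queueing helpers and into dlm_thread() itself: the thread now takes dlm->ast_lock before res->spinlock and calls the newly exported __dlm_queue_ast()/__dlm_queue_bast(), which assume the lock is already held, so asts and basts are reserved and queued under one consistent lock order. A pthread sketch of that caller-holds-the-outer-lock pattern (stand-in names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ast_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t res_lock = PTHREAD_MUTEX_INITIALIZER;

/* double-underscore variant: caller must already hold ast_lock,
 * mirroring __dlm_queue_ast()/__dlm_queue_bast() */
static void __queue_ast(int lock_id)
{
	/* the kernel version asserts: assert_spin_locked(&dlm->ast_lock) */
	printf("queue ast for lock %d\n", lock_id);
}

static void shuffle_lists(void)
{
	/* called with both ast_lock and res_lock held */
	__queue_ast(1);
}

int main(void)
{
	/* consistent order: ast_lock first, then the resource lock */
	pthread_mutex_lock(&ast_lock);
	pthread_mutex_lock(&res_lock);
	shuffle_lists();
	pthread_mutex_unlock(&res_lock);
	pthread_mutex_unlock(&ast_lock);
	return 0;
}
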
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index b47c1b92b82b..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
354 mlog(0, "master was in-progress. retry\n"); 354 mlog(0, "master was in-progress. retry\n");
355 ret = status; 355 ret = status;
356 } else { 356 } else {
357 mlog_errno(tmpret); 357 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
358 "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
358 if (dlm_is_host_down(tmpret)) { 359 if (dlm_is_host_down(tmpret)) {
359 /* NOTE: this seems strange, but it is what we want. 360 /* NOTE: this seems strange, but it is what we want.
360 * when the master goes down during a cancel or 361 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..39eb16ac5f98 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5fbd9cea968..97e54b9e654b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
278 inode->i_atime = CURRENT_TIME; 278 inode->i_atime = CURRENT_TIME;
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 281 ocfs2_journal_dirty(handle, bh);
282 ret = ocfs2_journal_dirty(handle, bh);
283 if (ret < 0)
284 mlog_errno(ret);
285 282
286out_commit: 283out_commit:
287 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 284 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
430 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 427 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
431 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 428 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
432 429
433 status = ocfs2_journal_dirty(handle, fe_bh); 430 ocfs2_journal_dirty(handle, fe_bh);
434 if (status < 0)
435 mlog_errno(status);
436 431
437out_commit: 432out_commit:
438 ocfs2_commit_trans(osb, handle); 433 ocfs2_commit_trans(osb, handle);
@@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode,
449 int status = 0; 444 int status = 0;
450 struct ocfs2_dinode *fe = NULL; 445 struct ocfs2_dinode *fe = NULL;
451 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
452 struct ocfs2_truncate_context *tc = NULL;
453 447
454 mlog_entry("(inode = %llu, new_i_size = %llu\n", 448 mlog_entry("(inode = %llu, new_i_size = %llu\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno, 449 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode,
488 482
489 down_write(&OCFS2_I(inode)->ip_alloc_sem); 483 down_write(&OCFS2_I(inode)->ip_alloc_sem);
490 484
485 ocfs2_resv_discard(&osb->osb_la_resmap,
486 &OCFS2_I(inode)->ip_la_data_resv);
487
491 /* 488 /*
492 * The inode lock forced other nodes to sync and drop their 489 * The inode lock forced other nodes to sync and drop their
493 * pages, which (correctly) happens even if we have a truncate 490 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode,
517 goto bail_unlock_sem; 514 goto bail_unlock_sem;
518 } 515 }
519 516
520 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 517 status = ocfs2_commit_truncate(osb, inode, di_bh);
521 if (status < 0) {
522 mlog_errno(status);
523 goto bail_unlock_sem;
524 }
525
526 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
527 if (status < 0) { 518 if (status < 0) {
528 mlog_errno(status); 519 mlog_errno(status);
529 goto bail_unlock_sem; 520 goto bail_unlock_sem;
@@ -666,11 +657,7 @@ restarted_transaction:
666 goto leave; 657 goto leave;
667 } 658 }
668 659
669 status = ocfs2_journal_dirty(handle, bh); 660 ocfs2_journal_dirty(handle, bh);
670 if (status < 0) {
671 mlog_errno(status);
672 goto leave;
673 }
674 661
675 spin_lock(&OCFS2_I(inode)->ip_lock); 662 spin_lock(&OCFS2_I(inode)->ip_lock);
676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 663 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -946,9 +933,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
946 struct ocfs2_super *osb = OCFS2_SB(sb); 933 struct ocfs2_super *osb = OCFS2_SB(sb);
947 struct buffer_head *bh = NULL; 934 struct buffer_head *bh = NULL;
948 handle_t *handle = NULL; 935 handle_t *handle = NULL;
949 int qtype;
950 struct dquot *transfer_from[MAXQUOTAS] = { };
951 struct dquot *transfer_to[MAXQUOTAS] = { }; 936 struct dquot *transfer_to[MAXQUOTAS] = { };
937 int qtype;
952 938
953 mlog_entry("(0x%p, '%.*s')\n", dentry, 939 mlog_entry("(0x%p, '%.*s')\n", dentry,
954 dentry->d_name.len, dentry->d_name.name); 940 dentry->d_name.len, dentry->d_name.name);
@@ -979,10 +965,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
979 if (status) 965 if (status)
980 return status; 966 return status;
981 967
968 if (is_quota_modification(inode, attr))
969 dquot_initialize(inode);
982 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 970 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
983 if (size_change) { 971 if (size_change) {
984 dquot_initialize(inode);
985
986 status = ocfs2_rw_lock(inode, 1); 972 status = ocfs2_rw_lock(inode, 1);
987 if (status < 0) { 973 if (status < 0) {
988 mlog_errno(status); 974 mlog_errno(status);
@@ -1032,9 +1018,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1032 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1018 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1033 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1019 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1034 USRQUOTA); 1020 USRQUOTA);
1035 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1021 if (!transfer_to[USRQUOTA]) {
1036 USRQUOTA);
1037 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1038 status = -ESRCH; 1022 status = -ESRCH;
1039 goto bail_unlock; 1023 goto bail_unlock;
1040 } 1024 }
@@ -1044,9 +1028,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1044 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1028 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1045 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1029 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1046 GRPQUOTA); 1030 GRPQUOTA);
1047 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1031 if (!transfer_to[GRPQUOTA]) {
1048 GRPQUOTA);
1049 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1050 status = -ESRCH; 1032 status = -ESRCH;
1051 goto bail_unlock; 1033 goto bail_unlock;
1052 } 1034 }
@@ -1058,7 +1040,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1058 mlog_errno(status); 1040 mlog_errno(status);
1059 goto bail_unlock; 1041 goto bail_unlock;
1060 } 1042 }
1061 status = dquot_transfer(inode, attr); 1043 status = __dquot_transfer(inode, transfer_to);
1062 if (status < 0) 1044 if (status < 0)
1063 goto bail_commit; 1045 goto bail_commit;
1064 } else { 1046 } else {
@@ -1098,10 +1080,8 @@ bail:
1098 brelse(bh); 1080 brelse(bh);
1099 1081
1100 /* Release quota pointers in case we acquired them */ 1082 /* Release quota pointers in case we acquired them */
1101 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1083 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1102 dqput(transfer_to[qtype]); 1084 dqput(transfer_to[qtype]);
1103 dqput(transfer_from[qtype]);
1104 }
1105 1085
1106 if (!status && attr->ia_valid & ATTR_MODE) { 1086 if (!status && attr->ia_valid & ATTR_MODE) {
1107 status = ocfs2_acl_chmod(inode); 1087 status = ocfs2_acl_chmod(inode);
@@ -1195,9 +1175,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1195 di = (struct ocfs2_dinode *) bh->b_data; 1175 di = (struct ocfs2_dinode *) bh->b_data;
1196 di->i_mode = cpu_to_le16(inode->i_mode); 1176 di->i_mode = cpu_to_le16(inode->i_mode);
1197 1177
1198 ret = ocfs2_journal_dirty(handle, bh); 1178 ocfs2_journal_dirty(handle, bh);
1199 if (ret < 0)
1200 mlog_errno(ret);
1201 1179
1202out_trans: 1180out_trans:
1203 ocfs2_commit_trans(osb, handle); 1181 ocfs2_commit_trans(osb, handle);
@@ -1434,16 +1412,90 @@ out:
1434 return ret; 1412 return ret;
1435} 1413}
1436 1414
1415static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1416{
1417 int i;
1418 struct ocfs2_extent_rec *rec = NULL;
1419
1420 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1421
1422 rec = &el->l_recs[i];
1423
1424 if (le32_to_cpu(rec->e_cpos) < pos)
1425 break;
1426 }
1427
1428 return i;
1429}
1430
1431/*
1432 * Helper to calculate the punching pos and length in one run; we handle the
1433 * following three cases in order:
1434 *
1435 * - remove the entire record
1436 * - remove a partial record
1437 * - no record needs to be removed (hole-punching completed)
1438 */
1439static void ocfs2_calc_trunc_pos(struct inode *inode,
1440 struct ocfs2_extent_list *el,
1441 struct ocfs2_extent_rec *rec,
1442 u32 trunc_start, u32 *trunc_cpos,
1443 u32 *trunc_len, u32 *trunc_end,
1444 u64 *blkno, int *done)
1445{
1446 int ret = 0;
1447 u32 coff, range;
1448
1449 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1450
1451 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1452 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1453 /*
1454 * Skip holes if any.
1455 */
1456 if (range < *trunc_end)
1457 *trunc_end = range;
1458 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1459 *blkno = le64_to_cpu(rec->e_blkno);
1460 *trunc_end = le32_to_cpu(rec->e_cpos);
1461 } else if (range > trunc_start) {
1462 *trunc_cpos = trunc_start;
1463 *trunc_len = *trunc_end - trunc_start;
1464 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1465 *blkno = le64_to_cpu(rec->e_blkno) +
1466 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1467 *trunc_end = trunc_start;
1468 } else {
1469 /*
1470 * There are two possibilities here:
1471 *
1472 * - the last record has been removed
1473 * - trunc_start was within a hole
1474 *
1475 * Either case means hole punching is complete.
1476 */
1477 ret = 1;
1478 }
1479
1480 *done = ret;
1481}
1482
1437static int ocfs2_remove_inode_range(struct inode *inode, 1483static int ocfs2_remove_inode_range(struct inode *inode,
1438 struct buffer_head *di_bh, u64 byte_start, 1484 struct buffer_head *di_bh, u64 byte_start,
1439 u64 byte_len) 1485 u64 byte_len)
1440{ 1486{
1441 int ret = 0; 1487 int ret = 0, flags = 0, done = 0, i;
1442 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1488 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1489 u32 cluster_in_el;
1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1490 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1444 struct ocfs2_cached_dealloc_ctxt dealloc; 1491 struct ocfs2_cached_dealloc_ctxt dealloc;
1445 struct address_space *mapping = inode->i_mapping; 1492 struct address_space *mapping = inode->i_mapping;
1446 struct ocfs2_extent_tree et; 1493 struct ocfs2_extent_tree et;
1494 struct ocfs2_path *path = NULL;
1495 struct ocfs2_extent_list *el = NULL;
1496 struct ocfs2_extent_rec *rec = NULL;
1497 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1498 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1447 1499
1448 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1500 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1449 ocfs2_init_dealloc_ctxt(&dealloc); 1501 ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1469,17 +1521,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1469 goto out; 1521 goto out;
1470 } 1522 }
1471 1523
1524 /*
1525 * For reflinks, we may need to CoW 2 clusters which might be
1526 * partially zero'd later, if the hole's start and end offsets fall
1527 * within one cluster (i.e. are not exactly aligned to the cluster size).
1528 */
1529
1530 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1531
1532 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1533 if (ret) {
1534 mlog_errno(ret);
1535 goto out;
1536 }
1537
1538 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1539 if (ret) {
1540 mlog_errno(ret);
1541 goto out;
1542 }
1543 }
1544
1472 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1545 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1473 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1546 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1474 if (trunc_len >= trunc_start) 1547 cluster_in_el = trunc_end;
1475 trunc_len -= trunc_start;
1476 else
1477 trunc_len = 0;
1478 1548
1479 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1549 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1480 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1550 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1481 (unsigned long long)byte_start, 1551 (unsigned long long)byte_start,
1482 (unsigned long long)byte_len, trunc_start, trunc_len); 1552 (unsigned long long)byte_len, trunc_start, trunc_end);
1483 1553
1484 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1554 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1485 if (ret) { 1555 if (ret) {
@@ -1487,31 +1557,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1487 goto out; 1557 goto out;
1488 } 1558 }
1489 1559
1490 cpos = trunc_start; 1560 path = ocfs2_new_path_from_et(&et);
1491 while (trunc_len) { 1561 if (!path) {
1492 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1562 ret = -ENOMEM;
1493 &alloc_size, NULL); 1563 mlog_errno(ret);
1564 goto out;
1565 }
1566
1567 while (trunc_end > trunc_start) {
1568
1569 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1570 cluster_in_el);
1494 if (ret) { 1571 if (ret) {
1495 mlog_errno(ret); 1572 mlog_errno(ret);
1496 goto out; 1573 goto out;
1497 } 1574 }
1498 1575
1499 if (alloc_size > trunc_len) 1576 el = path_leaf_el(path);
1500 alloc_size = trunc_len; 1577
1578 i = ocfs2_find_rec(el, trunc_end);
1579 /*
1580 * Need to go to previous extent block.
1581 */
1582 if (i < 0) {
1583 if (path->p_tree_depth == 0)
1584 break;
1501 1585
1502 /* Only do work for non-holes */ 1586 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1503 if (phys_cpos != 0) { 1587 path,
1504 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1588 &cluster_in_el);
1505 phys_cpos, alloc_size,
1506 &dealloc);
1507 if (ret) { 1589 if (ret) {
1508 mlog_errno(ret); 1590 mlog_errno(ret);
1509 goto out; 1591 goto out;
1510 } 1592 }
1593
1594 /*
1595 * We've reached the leftmost extent block,
1596 * so it's safe to leave the loop.
1597 */
1598 if (cluster_in_el == 0)
1599 break;
1600
1601 /*
1602 * The 'pos' found for the previous extent block is
1603 * always one cluster less than the actual trunc_end.
1604 */
1605 trunc_end = cluster_in_el + 1;
1606
1607 ocfs2_reinit_path(path, 1);
1608
1609 continue;
1610
1611 } else
1612 rec = &el->l_recs[i];
1613
1614 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1615 &trunc_len, &trunc_end, &blkno, &done);
1616 if (done)
1617 break;
1618
1619 flags = rec->e_flags;
1620 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1621
1622 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1623 phys_cpos, trunc_len, flags,
1624 &dealloc, refcount_loc);
1625 if (ret < 0) {
1626 mlog_errno(ret);
1627 goto out;
1511 } 1628 }
1512 1629
1513 cpos += alloc_size; 1630 cluster_in_el = trunc_end;
1514 trunc_len -= alloc_size; 1631
1632 ocfs2_reinit_path(path, 1);
1515 } 1633 }
1516 1634
1517 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1635 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
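For reference, a minimal standalone sketch (not part of this patch) of the cluster arithmetic the rewritten hole punch starts from: trunc_start rounds the hole's start up to a cluster boundary, trunc_end rounds its end down, and only the whole clusters in between are removed from the btree. A 4K cluster size is assumed and all values are illustrative.

#include <stdio.h>

int main(void)
{
	unsigned int cbits = 12;		/* assume 4K clusters */
	unsigned long long byte_start = 5000, byte_len = 10000;

	/* ocfs2_clusters_for_bytes() rounds up; the end is rounded down */
	unsigned int trunc_start = (byte_start + (1ULL << cbits) - 1) >> cbits;
	unsigned int trunc_end = (byte_start + byte_len) >> cbits;

	/* The partial head and tail are handled separately by
	 * ocfs2_zero_partial_clusters(); the loop above only removes
	 * whole clusters in [trunc_start, trunc_end). */
	printf("cstart: %u cend: %u\n", trunc_start, trunc_end);	/* 2 3 */
	return 0;
}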
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index af189887201c..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
376 376
377 OCFS2_I(inode)->ip_last_used_slot = 0; 377 OCFS2_I(inode)->ip_last_used_slot = 0;
378 OCFS2_I(inode)->ip_last_used_group = 0; 378 OCFS2_I(inode)->ip_last_used_group = 0;
379
380 if (S_ISDIR(inode->i_mode))
381 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
382 OCFS2_RESV_FLAG_DIR);
379 mlog_exit_void(); 383 mlog_exit_void();
380} 384}
381 385
@@ -539,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
539 struct buffer_head *fe_bh) 543 struct buffer_head *fe_bh)
540{ 544{
541 int status = 0; 545 int status = 0;
542 struct ocfs2_truncate_context *tc = NULL;
543 struct ocfs2_dinode *fe; 546 struct ocfs2_dinode *fe;
544 handle_t *handle = NULL; 547 handle_t *handle = NULL;
545 548
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
582 ocfs2_commit_trans(osb, handle); 585 ocfs2_commit_trans(osb, handle);
583 handle = NULL; 586 handle = NULL;
584 587
585 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 588 status = ocfs2_commit_truncate(osb, inode, fe_bh);
586 if (status < 0) {
587 mlog_errno(status);
588 goto out;
589 }
590
591 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
592 if (status < 0) { 589 if (status < 0) {
593 mlog_errno(status); 590 mlog_errno(status);
594 goto out; 591 goto out;
@@ -659,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
659 656
660 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 657 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
661 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 658 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
662 659 ocfs2_journal_dirty(handle, di_bh);
663 status = ocfs2_journal_dirty(handle, di_bh);
664 if (status < 0) {
665 mlog_errno(status);
666 goto bail_commit;
667 }
668 660
669 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 661 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
670 dquot_free_inode(inode); 662 dquot_free_inode(inode);
@@ -980,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
980void ocfs2_delete_inode(struct inode *inode) 972void ocfs2_delete_inode(struct inode *inode)
981{ 973{
982 int wipe, status; 974 int wipe, status;
983 sigset_t blocked, oldset; 975 sigset_t oldset;
984 struct buffer_head *di_bh = NULL; 976 struct buffer_head *di_bh = NULL;
985 977
986 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 978 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1007,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
1007 * messaging paths may return us -ERESTARTSYS. Which would 999 * messaging paths may return us -ERESTARTSYS. Which would
1008 * cause us to exit early, resulting in inodes being orphaned 1000 * cause us to exit early, resulting in inodes being orphaned
1009 * forever. */ 1001 * forever. */
1010 sigfillset(&blocked); 1002 ocfs2_block_signals(&oldset);
1011 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 ocfs2_cleanup_delete_inode(inode, 1);
1015 goto bail;
1016 }
1017 1003
1018 /* 1004 /*
1019 * Synchronize us against ocfs2_get_dentry. We take this in 1005 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1087,9 +1073,7 @@ bail_unlock_nfs_sync:
1087 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1073 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1088 1074
1089bail_unblock: 1075bail_unblock:
1090 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1076 ocfs2_unblock_signals(&oldset);
1091 if (status < 0)
1092 mlog_errno(status);
1093bail: 1077bail:
1094 clear_inode(inode); 1078 clear_inode(inode);
1095 mlog_exit_void(); 1079 mlog_exit_void();
@@ -1123,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
1123 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1107 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1124 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1108 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1125 1109
1110 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1111 &oi->ip_la_data_resv);
1112 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1113
1126 /* We very well may get a clear_inode before all an inodes 1114 /* We very well may get a clear_inode before all an inodes
1127 * metadata has hit disk. Of course, we can't drop any cluster 1115 * metadata has hit disk. Of course, we can't drop any cluster
1128 * locks until the journal has finished with it. The only 1116 * locks until the journal has finished with it. The only
@@ -1298,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1298 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1286 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1299 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1287 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1300 1288
1301 status = ocfs2_journal_dirty(handle, bh); 1289 ocfs2_journal_dirty(handle, bh);
1302 if (status < 0)
1303 mlog_errno(status);
1304
1305 status = 0;
1306leave: 1290leave:
1307
1308 mlog_exit(status); 1291 mlog_exit(status);
1309 return status; 1292 return status;
1310} 1293}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 0b28e1921a39..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
70 /* Only valid if the inode is the dir. */ 70 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 71 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 72 u64 ip_last_used_group;
73
74 struct ocfs2_alloc_reservation ip_la_data_resv;
73}; 75};
74 76
75/* 77/*
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..47878cf16418 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
402} 402}
403 403
404/* 404/*
405 * 'nblocks' is what you want to add to the current 405 * 'nblocks' is what you want to add to the current transaction.
406 * transaction. extend_trans will either extend the current handle by
407 * nblocks, or commit it and start a new one with nblocks credits.
408 * 406 *
409 * This might call jbd2_journal_restart() which will commit dirty buffers 407 * This might call jbd2_journal_restart() which will commit dirty buffers
410 * and then restart the transaction. Before calling 408 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
422 */ 420 */
423int ocfs2_extend_trans(handle_t *handle, int nblocks) 421int ocfs2_extend_trans(handle_t *handle, int nblocks)
424{ 422{
425 int status; 423 int status, old_nblocks;
426 424
427 BUG_ON(!handle); 425 BUG_ON(!handle);
428 BUG_ON(!nblocks); 426 BUG_ON(nblocks < 0);
427
428 if (!nblocks)
429 return 0;
429 430
431 old_nblocks = handle->h_buffer_credits;
430 mlog_entry_void(); 432 mlog_entry_void();
431 433
432 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 434 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
445 mlog(0, 447 mlog(0,
446 "jbd2_journal_extend failed, trying " 448 "jbd2_journal_extend failed, trying "
447 "jbd2_journal_restart\n"); 449 "jbd2_journal_restart\n");
448 status = jbd2_journal_restart(handle, nblocks); 450 status = jbd2_journal_restart(handle,
451 old_nblocks + nblocks);
449 if (status < 0) { 452 if (status < 0) {
450 mlog_errno(status); 453 mlog_errno(status);
451 goto bail; 454 goto bail;
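A hedged sketch of the caller pattern the comment above describes (not code from this patch; extra_credits and di_bh are illustrative names). ocfs2_extend_trans() may restart the handle, so previously journaled buffers must be re-accessed afterwards; with this change a restart keeps the credits the handle already held rather than shrinking the transaction to just nblocks.

	ret = ocfs2_extend_trans(handle, extra_credits);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}
	/* The handle may have been restarted above; re-declare intent
	 * to modify the buffer before touching it again. */
	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);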
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
734 return __ocfs2_journal_access(handle, ci, bh, NULL, type); 737 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
735} 738}
736 739
737int ocfs2_journal_dirty(handle_t *handle, 740void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
738 struct buffer_head *bh)
739{ 741{
740 int status; 742 int status;
741 743
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
743 (unsigned long long)bh->b_blocknr); 745 (unsigned long long)bh->b_blocknr);
744 746
745 status = jbd2_journal_dirty_metadata(handle, bh); 747 status = jbd2_journal_dirty_metadata(handle, bh);
746 if (status < 0) 748 BUG_ON(status);
747 mlog(ML_ERROR, "Could not dirty metadata buffer. "
748 "(bh->b_blocknr=%llu)\n",
749 (unsigned long long)bh->b_blocknr);
750 749
751 mlog_exit(status); 750 mlog_exit_void();
752 return status;
753} 751}
754 752
755#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 753#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
325 * <modify the bh> 325 * <modify the bh>
326 * ocfs2_journal_dirty(handle, bh); 326 * ocfs2_journal_dirty(handle, bh);
327 */ 327 */
328int ocfs2_journal_dirty(handle_t *handle, 328void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
329 struct buffer_head *bh);
330 329
331/* 330/*
332 * Credit Macros: 331 * Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
562 return blocks; 561 return blocks;
563} 562}
564 563
564/*
565 * Allocating a discontiguous block group requires the credits from
566 * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
567 * the group descriptor's extent list. The caller already has started
568 * the transaction with ocfs2_calc_group_alloc_credits(). They extend
569 * it with these credits.
570 */
571static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
572{
573 return ocfs2_extent_recs_per_gd(sb);
574}
575
565static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 576static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
566 unsigned int clusters_to_del, 577 unsigned int clusters_to_del,
567 struct ocfs2_dinode *fe, 578 struct ocfs2_dinode *fe,
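A hedged sketch of the two-step crediting the comment above describes (cpg, the error labels, and the second parameter of ocfs2_calc_group_alloc_credits() are assumptions, not from this hunk):

	handle = ocfs2_start_trans(osb,
			ocfs2_calc_group_alloc_credits(osb->sb, cpg));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* Discontiguous groups additionally need room for bg_list. */
	if (ocfs2_supports_discontig_bg(osb)) {
		ret = ocfs2_extend_trans(handle,
				ocfs2_calc_bg_discontig_credits(osb->sb));
		if (ret < 0)
			goto out_commit;
	}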
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index c983715d8d8c..3d7419682dc0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
52 52
53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
54 struct ocfs2_dinode *alloc, 54 struct ocfs2_dinode *alloc,
55 u32 numbits); 55 u32 *numbits,
56 struct ocfs2_alloc_reservation *resv);
56 57
57static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); 58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
58 59
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
75 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
76 77
78/*
79 * ocfs2_la_default_mb() - determine a default size, in megabytes of
80 * the local alloc.
81 *
82 * Generally, we'd like to pick as large a local alloc as
83 * possible. Performance on large workloads tends to scale
84 * proportionally to la size. In addition to that, the reservations
85 * code functions more efficiently as it can reserve more windows for
86 * write.
87 *
88 * Some things work against us when trying to choose a large local alloc:
89 *
90 * - We need to ensure our sizing is picked to leave enough space in
91 * group descriptors for other allocations (such as block groups,
92 * etc). Picking default sizes which are a multiple of 4 could help,
93 * as block groups are allocated in 2MB and 4MB chunks.
94 *
95 * - Likewise, we don't want to starve other nodes of bits on small
96 * file systems. This can easily be taken care of by limiting our
97 * default to a reasonable size (256M) on larger cluster sizes.
98 *
99 * - Some file systems can't support very large sizes - 4k and 8k in
100 * particular are limited to less than 128 and 256 megabytes respectively.
101 *
102 * The following reference table shows group descriptor and local
103 * alloc maximums at various cluster sizes (4k blocksize)
104 *
105 * csize: 4K group: 126M la: 121M
106 * csize: 8K group: 252M la: 243M
107 * csize: 16K group: 504M la: 486M
108 * csize: 32K group: 1008M la: 972M
109 * csize: 64K group: 2016M la: 1944M
110 * csize: 128K group: 4032M la: 3888M
111 * csize: 256K group: 8064M la: 7776M
112 * csize: 512K group: 16128M la: 15552M
113 * csize: 1024K group: 32256M la: 31104M
114 */
115#define OCFS2_LA_MAX_DEFAULT_MB 256
116#define OCFS2_LA_OLD_DEFAULT 8
117unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{
119 unsigned int la_mb;
120 unsigned int gd_mb;
121 unsigned int megs_per_slot;
122 struct super_block *sb = osb->sb;
123
124 gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
125 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
126
127 /*
128 * This takes care of file systems with very small group
129 * descriptors - 512 byte blocksize at cluster sizes lower
130 * than 16K and also 1k blocksize with 4k cluster size.
131 */
132 if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
133 || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
134 return OCFS2_LA_OLD_DEFAULT;
135
136 /*
137 * Leave enough room for some block groups and make the final
138 * value we work from a multiple of 4.
139 */
140 gd_mb -= 16;
141 gd_mb &= 0xFFFFFFFC;
142
143 la_mb = gd_mb;
144
145 /*
146 * Keep window sizes down to a reasonable default
147 */
148 if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
149 /*
150 * Some clustersize / blocksize combinations will have
151 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
152 * default size, but get poor distribution when
153 * limited to exactly 256 megabytes.
154 *
155 * As an example, 16K clustersize at 4K blocksize
156 * gives us a cluster group size of 504M. Paring the
157 * local alloc size down to 256, however, would give us
158 * only one window and around 200MB left in the
159 * cluster group. Instead, find the first size below
160 * 256 which would give us an even distribution.
161 *
162 * Larger cluster group sizes actually work out pretty
163 * well when pared to 256, so we don't have to do this
164 * for any group that fits more than two
165 * OCFS2_LA_MAX_DEFAULT_MB windows.
166 */
167 if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
168 la_mb = 256;
169 else {
170 unsigned int gd_mult = gd_mb;
171
172 while (gd_mult > 256)
173 gd_mult = gd_mult >> 1;
174
175 la_mb = gd_mult;
176 }
177 }
178
179 megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
180 megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
181 /* Too many nodes, too few disk clusters. */
182 if (megs_per_slot < la_mb)
183 la_mb = megs_per_slot;
184
185 return la_mb;
186}
187
188void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
189{
190 struct super_block *sb = osb->sb;
191 unsigned int la_default_mb = ocfs2_la_default_mb(osb);
192 unsigned int la_max_mb;
193
194 la_max_mb = ocfs2_clusters_to_megabytes(sb,
195 ocfs2_local_alloc_size(sb) * 8);
196
197 mlog(0, "requested: %dM, max: %uM, default: %uM\n",
198 requested_mb, la_max_mb, la_default_mb);
199
200 if (requested_mb == -1) {
201 /* No user request - use defaults */
202 osb->local_alloc_default_bits =
203 ocfs2_megabytes_to_clusters(sb, la_default_mb);
204 } else if (requested_mb > la_max_mb) {
205 /* Request is too big, we give the maximum available */
206 osb->local_alloc_default_bits =
207 ocfs2_megabytes_to_clusters(sb, la_max_mb);
208 } else {
209 osb->local_alloc_default_bits =
210 ocfs2_megabytes_to_clusters(sb, requested_mb);
211 }
212
213 osb->local_alloc_bits = osb->local_alloc_default_bits;
214}
215
77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 216static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
78{ 217{
79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 218 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
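Working the comment's 16K-clustersize / 4K-blocksize case through the function above, as a standalone sketch (the 504M group span comes from the reference table; this is illustrative arithmetic, not ocfs2 code):

#include <stdio.h>

int main(void)
{
	unsigned int gd_mb = 504;	/* group span from the table above */
	unsigned int la_mb;

	gd_mb -= 16;			/* leave room for block groups */
	gd_mb &= 0xFFFFFFFC;		/* multiple of 4: still 488 */

	la_mb = gd_mb;
	if (la_mb > 256) {
		if (gd_mb > 2 * 256) {
			la_mb = 256;
		} else {
			unsigned int gd_mult = gd_mb;

			while (gd_mult > 256)
				gd_mult >>= 1;	/* 488 -> 244 */
			la_mb = gd_mult;
		}
	}
	printf("default la: %uM\n", la_mb);	/* 244M, an even split */
	return 0;
}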
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
156 osb->local_alloc_bits, (osb->bitmap_cpg - 1)); 295 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
157 osb->local_alloc_bits = 296 osb->local_alloc_bits =
158 ocfs2_megabytes_to_clusters(osb->sb, 297 ocfs2_megabytes_to_clusters(osb->sb,
159 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); 298 ocfs2_la_default_mb(osb));
160 } 299 }
161 300
162 /* read the alloc off disk */ 301 /* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
262 401
263 osb->local_alloc_state = OCFS2_LA_DISABLED; 402 osb->local_alloc_state = OCFS2_LA_DISABLED;
264 403
404 ocfs2_resmap_uninit(&osb->osb_la_resmap);
405
265 main_bm_inode = ocfs2_get_system_file_inode(osb, 406 main_bm_inode = ocfs2_get_system_file_inode(osb,
266 GLOBAL_BITMAP_SYSTEM_INODE, 407 GLOBAL_BITMAP_SYSTEM_INODE,
267 OCFS2_INVALID_SLOT); 408 OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
305 } 446 }
306 447
307 ocfs2_clear_local_alloc(alloc); 448 ocfs2_clear_local_alloc(alloc);
308 449 ocfs2_journal_dirty(handle, bh);
309 status = ocfs2_journal_dirty(handle, bh);
310 if (status < 0) {
311 mlog_errno(status);
312 goto out_commit;
313 }
314 450
315 brelse(bh); 451 brelse(bh);
316 osb->local_alloc_bh = NULL; 452 osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
481 return status; 617 return status;
482} 618}
483 619
484/* Check to see if the local alloc window is within ac->ac_max_block */
485static int ocfs2_local_alloc_in_range(struct inode *inode,
486 struct ocfs2_alloc_context *ac,
487 u32 bits_wanted)
488{
489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
490 struct ocfs2_dinode *alloc;
491 struct ocfs2_local_alloc *la;
492 int start;
493 u64 block_off;
494
495 if (!ac->ac_max_block)
496 return 1;
497
498 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
499 la = OCFS2_LOCAL_ALLOC(alloc);
500
501 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
502 if (start == -1) {
503 mlog_errno(-ENOSPC);
504 return 0;
505 }
506
507 /*
508 * Converting (bm_off + start + bits_wanted) to blocks gives us
509 * the blkno just past our actual allocation. This is perfect
510 * to compare with ac_max_block.
511 */
512 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
513 le32_to_cpu(la->la_bm_off) +
514 start + bits_wanted);
515 mlog(0, "Checking %llu against %llu\n",
516 (unsigned long long)block_off,
517 (unsigned long long)ac->ac_max_block);
518 if (block_off > ac->ac_max_block)
519 return 0;
520
521 return 1;
522}
523
524/* 620/*
525 * make sure we've got at least bits_wanted contiguous bits in the 621 * make sure we've got at least bits_wanted contiguous bits in the
526 * local alloc. You lose them when you drop i_mutex. 622 * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
613 mlog(0, "Calling in_range for max block %llu\n", 709 mlog(0, "Calling in_range for max block %llu\n",
614 (unsigned long long)ac->ac_max_block); 710 (unsigned long long)ac->ac_max_block);
615 711
616 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
617 bits_wanted)) {
618 /*
619 * The window is outside ac->ac_max_block.
620 * This errno tells the caller to keep localalloc enabled
621 * but to get the allocation from the main bitmap.
622 */
623 status = -EFBIG;
624 goto bail;
625 }
626
627 ac->ac_inode = local_alloc_inode; 712 ac->ac_inode = local_alloc_inode;
628 /* We should never use localalloc from another slot */ 713 /* We should never use localalloc from another slot */
629 ac->ac_alloc_slot = osb->slot_num; 714 ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
664 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 749 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
665 la = OCFS2_LOCAL_ALLOC(alloc); 750 la = OCFS2_LOCAL_ALLOC(alloc);
666 751
667 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); 752 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
753 ac->ac_resv);
668 if (start == -1) { 754 if (start == -1) {
669 /* TODO: Shouldn't we just BUG here? */ 755 /* TODO: Shouldn't we just BUG here? */
670 status = -ENOSPC; 756 status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
674 760
675 bitmap = la->la_bitmap; 761 bitmap = la->la_bitmap;
676 *bit_off = le32_to_cpu(la->la_bm_off) + start; 762 *bit_off = le32_to_cpu(la->la_bm_off) + start;
677 /* local alloc is always contiguous by nature -- we never
678 * delete bits from it! */
679 *num_bits = bits_wanted; 763 *num_bits = bits_wanted;
680 764
681 status = ocfs2_journal_access_di(handle, 765 status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
687 goto bail; 771 goto bail;
688 } 772 }
689 773
774 ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
775 bits_wanted);
776
690 while(bits_wanted--) 777 while(bits_wanted--)
691 ocfs2_set_bit(start++, bitmap); 778 ocfs2_set_bit(start++, bitmap);
692 779
693 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); 780 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
781 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
694 782
695 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
696 if (status < 0) {
697 mlog_errno(status);
698 goto bail;
699 }
700
701 status = 0;
702bail: 783bail:
703 mlog_exit(status); 784 mlog_exit(status);
704 return status; 785 return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
722} 803}
723 804
724static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 805static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
725 struct ocfs2_dinode *alloc, 806 struct ocfs2_dinode *alloc,
726 u32 numbits) 807 u32 *numbits,
808 struct ocfs2_alloc_reservation *resv)
727{ 809{
728 int numfound, bitoff, left, startoff, lastzero; 810 int numfound, bitoff, left, startoff, lastzero;
811 int local_resv = 0;
812 struct ocfs2_alloc_reservation r;
729 void *bitmap = NULL; 813 void *bitmap = NULL;
814 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
730 815
731 mlog_entry("(numbits wanted = %u)\n", numbits); 816 mlog_entry("(numbits wanted = %u)\n", *numbits);
732 817
733 if (!alloc->id1.bitmap1.i_total) { 818 if (!alloc->id1.bitmap1.i_total) {
734 mlog(0, "No bits in my window!\n"); 819 mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
736 goto bail; 821 goto bail;
737 } 822 }
738 823
824 if (!resv) {
825 local_resv = 1;
826 ocfs2_resv_init_once(&r);
827 ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
828 resv = &r;
829 }
830
831 numfound = *numbits;
832 if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
833 if (numfound < *numbits)
834 *numbits = numfound;
835 goto bail;
836 }
837
838 /*
839 * Code error. While reservations are enabled, local
840 * allocation should _always_ go through them.
841 */
842 BUG_ON(osb->osb_resv_level != 0);
843
844 /*
845 * Reservations are disabled. Handle this the old way.
846 */
847
739 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; 848 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
740 849
741 numfound = bitoff = startoff = 0; 850 numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
761 startoff = bitoff+1; 870 startoff = bitoff+1;
762 } 871 }
763 /* we got everything we needed */ 872 /* we got everything we needed */
764 if (numfound == numbits) { 873 if (numfound == *numbits) {
765 /* mlog(0, "Found it all!\n"); */ 874 /* mlog(0, "Found it all!\n"); */
766 break; 875 break;
767 } 876 }
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
770 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 879 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
771 numfound); 880 numfound);
772 881
773 if (numfound == numbits) 882 if (numfound == *numbits)
774 bitoff = startoff - numfound; 883 bitoff = startoff - numfound;
775 else 884 else
776 bitoff = -1; 885 bitoff = -1;
777 886
778bail: 887bail:
888 if (local_resv)
889 ocfs2_resv_discard(resmap, resv);
890
779 mlog_exit(bitoff); 891 mlog_exit(bitoff);
780 return bitoff; 892 return bitoff;
781} 893}
@@ -1049,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1049 /* we used the generic suballoc reserve function, but we set 1161 /* we used the generic suballoc reserve function, but we set
1050 * everything up nicely, so there's no reason why we can't use 1162 * everything up nicely, so there's no reason why we can't use
1051 * the more specific cluster api to claim bits. */ 1163 * the more specific cluster api to claim bits. */
1052 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, 1164 status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
1053 &cluster_off, &cluster_count); 1165 &cluster_off, &cluster_count);
1054 if (status == -ENOSPC) { 1166 if (status == -ENOSPC) {
1055retry_enospc: 1167retry_enospc:
@@ -1063,7 +1175,7 @@ retry_enospc:
1063 goto bail; 1175 goto bail;
1064 1176
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits; 1177 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1066 status = ocfs2_claim_clusters(osb, handle, ac, 1178 status = ocfs2_claim_clusters(handle, ac,
1067 osb->local_alloc_bits, 1179 osb->local_alloc_bits,
1068 &cluster_off, 1180 &cluster_off,
1069 &cluster_count); 1181 &cluster_count);
@@ -1098,6 +1210,9 @@ retry_enospc:
1098 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, 1210 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1099 le16_to_cpu(la->la_size)); 1211 le16_to_cpu(la->la_size));
1100 1212
1213 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1214 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1215
1101 mlog(0, "New window allocated:\n"); 1216 mlog(0, "New window allocated:\n");
1102 mlog(0, "window la_bm_off = %u\n", 1217 mlog(0, "window la_bm_off = %u\n",
1103 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1218 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1169,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1169 } 1284 }
1170 1285
1171 ocfs2_clear_local_alloc(alloc); 1286 ocfs2_clear_local_alloc(alloc);
1172 1287 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1173 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1174 if (status < 0) {
1175 mlog_errno(status);
1176 goto bail;
1177 }
1178 1288
1179 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, 1289 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1180 main_bm_inode, main_bm_bh); 1290 main_bm_inode, main_bm_bh);
@@ -1192,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1192 1302
1193 atomic_inc(&osb->alloc_stats.moves); 1303 atomic_inc(&osb->alloc_stats.moves);
1194 1304
1195 status = 0;
1196bail: 1305bail:
1197 if (handle) 1306 if (handle)
1198 ocfs2_commit_trans(osb, handle); 1307 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30 30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); 31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32 32
33void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
34unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
35
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, 36int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num, 37 int node_num,
35 struct ocfs2_dinode **alloc_copy); 38 struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7898bd3a99f5..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -41,44 +41,20 @@
41#include "file.h" 41#include "file.h"
42#include "inode.h" 42#include "inode.h"
43#include "mmap.h" 43#include "mmap.h"
44#include "super.h"
44 45
45static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
46{
47 /* The best way to deal with signals in the vm path is
48 * to block them upfront, rather than allowing the
49 * locking paths to return -ERESTARTSYS. */
50 sigfillset(blocked);
51
52 /* We should technically never get a bad return value
53 * from sigprocmask */
54 return sigprocmask(SIG_BLOCK, blocked, oldset);
55}
56
57static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
58{
59 return sigprocmask(SIG_SETMASK, oldset, NULL);
60}
61 46
62static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
63{ 48{
64 sigset_t blocked, oldset; 49 sigset_t oldset;
65 int error, ret; 50 int ret;
66 51
67 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); 52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
68 53
69 error = ocfs2_vm_op_block_sigs(&blocked, &oldset); 54 ocfs2_block_signals(&oldset);
70 if (error < 0) {
71 mlog_errno(error);
72 ret = VM_FAULT_SIGBUS;
73 goto out;
74 }
75
76 ret = filemap_fault(area, vmf); 55 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset);
77 57
78 error = ocfs2_vm_op_unblock_sigs(&oldset);
79 if (error < 0)
80 mlog_errno(error);
81out:
82 mlog_exit_ptr(vmf->page); 58 mlog_exit_ptr(vmf->page);
83 return ret; 59 return ret;
84} 60}
@@ -158,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158 struct page *page = vmf->page; 134 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 136 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 137 sigset_t oldset;
162 int ret, ret2; 138 int ret;
163 139
164 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); 140 ocfs2_block_signals(&oldset);
165 if (ret < 0) {
166 mlog_errno(ret);
167 return ret;
168 }
169 141
170 /* 142 /*
171 * The cluster locks taken will block a truncate from another 143 * The cluster locks taken will block a truncate from another
@@ -193,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
193 ocfs2_inode_unlock(inode, 1); 165 ocfs2_inode_unlock(inode, 1);
194 166
195out: 167out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 168 ocfs2_unblock_signals(&oldset);
197 if (ret2 < 0)
198 mlog_errno(ret2);
199 if (ret) 169 if (ret)
200 ret = VM_FAULT_SIGBUS; 170 ret = VM_FAULT_SIGBUS;
201 return ret; 171 return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4cbb18f26c5f..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
@@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 232 };
240 int did_quota_inode = 0; 233 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 234 struct ocfs2_dir_lookup_result lookup = { NULL, };
235 sigset_t oldset;
236 int did_block_signals = 0;
242 237
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 239 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 345 goto leave;
351 } 346 }
352 347
348 /* Starting to change things, restart is no longer possible. */
349 ocfs2_block_signals(&oldset);
350 did_block_signals = 1;
351
353 status = dquot_alloc_inode(inode); 352 status = dquot_alloc_inode(inode);
354 if (status) 353 if (status)
355 goto leave; 354 goto leave;
@@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 383 goto leave;
385 } 384 }
386 ocfs2_add_links_count(dirfe, 1); 385 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 386 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 387 inc_nlink(dir);
393 } 388 }
394 389
@@ -439,6 +434,8 @@ leave:
439 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
440 435
441 ocfs2_inode_unlock(dir, 1); 436 ocfs2_inode_unlock(dir, 1);
437 if (did_block_signals)
438 ocfs2_unblock_signals(&oldset);
442 439
443 if (status == -ENOSPC) 440 if (status == -ENOSPC)
444 mlog(0, "Disk is full\n"); 441 mlog(0, "Disk is full\n");
@@ -487,14 +484,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
487 int status = 0; 484 int status = 0;
488 struct ocfs2_dinode *fe = NULL; 485 struct ocfs2_dinode *fe = NULL;
489 struct ocfs2_extent_list *fel; 486 struct ocfs2_extent_list *fel;
490 u64 fe_blkno = 0; 487 u64 suballoc_loc, fe_blkno = 0;
491 u16 suballoc_bit; 488 u16 suballoc_bit;
492 u16 feat; 489 u16 feat;
493 490
494 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
495 492
496 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
497 inode_ac, &suballoc_bit, &fe_blkno); 494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
498 if (status < 0) { 496 if (status < 0) {
499 mlog_errno(status); 497 mlog_errno(status);
500 goto leave; 498 goto leave;
@@ -531,6 +529,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
531 fe->i_generation = cpu_to_le32(inode->i_generation); 529 fe->i_generation = cpu_to_le32(inode->i_generation);
532 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 530 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
533 fe->i_blkno = cpu_to_le64(fe_blkno); 531 fe->i_blkno = cpu_to_le64(fe_blkno);
532 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
534 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 533 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
535 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 534 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
536 fe->i_uid = cpu_to_le32(inode->i_uid); 535 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -567,11 +566,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
567 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 566 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
568 } 567 }
569 568
570 status = ocfs2_journal_dirty(handle, *new_fe_bh); 569 ocfs2_journal_dirty(handle, *new_fe_bh);
571 if (status < 0) {
572 mlog_errno(status);
573 goto leave;
574 }
575 570
576 ocfs2_populate_inode(inode, fe, 1); 571 ocfs2_populate_inode(inode, fe, 1);
577 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 572 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -637,6 +632,7 @@ static int ocfs2_link(struct dentry *old_dentry,
637 struct ocfs2_dinode *fe = NULL; 632 struct ocfs2_dinode *fe = NULL;
638 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 633 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
639 struct ocfs2_dir_lookup_result lookup = { NULL, }; 634 struct ocfs2_dir_lookup_result lookup = { NULL, };
635 sigset_t oldset;
640 636
641 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 637 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
642 old_dentry->d_name.len, old_dentry->d_name.name, 638 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -693,6 +689,9 @@ static int ocfs2_link(struct dentry *old_dentry,
693 goto out_unlock_inode; 689 goto out_unlock_inode;
694 } 690 }
695 691
692 /* Starting to change things, restart is no longer possible. */
693 ocfs2_block_signals(&oldset);
694
696 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 695 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
697 OCFS2_JOURNAL_ACCESS_WRITE); 696 OCFS2_JOURNAL_ACCESS_WRITE);
698 if (err < 0) { 697 if (err < 0) {
@@ -705,14 +704,7 @@ static int ocfs2_link(struct dentry *old_dentry,
705 ocfs2_set_links_count(fe, inode->i_nlink); 704 ocfs2_set_links_count(fe, inode->i_nlink);
706 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 705 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
707 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 706 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
708 707 ocfs2_journal_dirty(handle, fe_bh);
709 err = ocfs2_journal_dirty(handle, fe_bh);
710 if (err < 0) {
711 ocfs2_add_links_count(fe, -1);
712 drop_nlink(inode);
713 mlog_errno(err);
714 goto out_commit;
715 }
716 708
717 err = ocfs2_add_entry(handle, dentry, inode, 709 err = ocfs2_add_entry(handle, dentry, inode,
718 OCFS2_I(inode)->ip_blkno, 710 OCFS2_I(inode)->ip_blkno,
@@ -736,6 +728,7 @@ static int ocfs2_link(struct dentry *old_dentry,
736 728
737out_commit: 729out_commit:
738 ocfs2_commit_trans(osb, handle); 730 ocfs2_commit_trans(osb, handle);
731 ocfs2_unblock_signals(&oldset);
739out_unlock_inode: 732out_unlock_inode:
740 ocfs2_inode_unlock(inode, 1); 733 ocfs2_inode_unlock(inode, 1);
741 734
@@ -909,12 +902,7 @@ static int ocfs2_unlink(struct inode *dir,
909 drop_nlink(inode); 902 drop_nlink(inode);
910 drop_nlink(inode); 903 drop_nlink(inode);
911 ocfs2_set_links_count(fe, inode->i_nlink); 904 ocfs2_set_links_count(fe, inode->i_nlink);
912 905 ocfs2_journal_dirty(handle, fe_bh);
913 status = ocfs2_journal_dirty(handle, fe_bh);
914 if (status < 0) {
915 mlog_errno(status);
916 goto leave;
917 }
918 906
919 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 907 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
920 if (S_ISDIR(inode->i_mode)) 908 if (S_ISDIR(inode->i_mode))
@@ -1332,12 +1320,7 @@ static int ocfs2_rename(struct inode *old_dir,
1332 ocfs2_set_links_count(newfe, 0); 1320 ocfs2_set_links_count(newfe, 0);
1333 else 1321 else
1334 ocfs2_add_links_count(newfe, -1); 1322 ocfs2_add_links_count(newfe, -1);
1335 1323 ocfs2_journal_dirty(handle, newfe_bh);
1336 status = ocfs2_journal_dirty(handle, newfe_bh);
1337 if (status < 0) {
1338 mlog_errno(status);
1339 goto bail;
1340 }
1341 } else { 1324 } else {
1342 /* if the name was not found in new_dir, add it now */ 1325 /* if the name was not found in new_dir, add it now */
1343 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1326 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1356,10 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
1356 1339
1357 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1340 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1358 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1341 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1359 1342 ocfs2_journal_dirty(handle, old_inode_bh);
1360 status = ocfs2_journal_dirty(handle, old_inode_bh);
1361 if (status < 0)
1362 mlog_errno(status);
1363 } else 1343 } else
1364 mlog_errno(status); 1344 mlog_errno(status);
1365 1345
@@ -1431,7 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir,
1431 OCFS2_JOURNAL_ACCESS_WRITE); 1411 OCFS2_JOURNAL_ACCESS_WRITE);
1432 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1412 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1433 ocfs2_set_links_count(fe, old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1434 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 ocfs2_journal_dirty(handle, old_dir_bh);
1435 } 1415 }
1436 } 1416 }
1437 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1563,11 +1543,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1563 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1543 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1564 bytes_left); 1544 bytes_left);
1565 1545
1566 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1546 ocfs2_journal_dirty(handle, bhs[virtual]);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 1547
1572 virtual++; 1548 virtual++;
1573 p_blkno++; 1549 p_blkno++;
@@ -1611,6 +1587,8 @@ static int ocfs2_symlink(struct inode *dir,
1611 }; 1587 };
1612 int did_quota = 0, did_quota_inode = 0; 1588 int did_quota = 0, did_quota_inode = 0;
1613 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1589 struct ocfs2_dir_lookup_result lookup = { NULL, };
1590 sigset_t oldset;
1591 int did_block_signals = 0;
1614 1592
1615 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1593 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1616 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1594 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1706,6 +1684,10 @@ static int ocfs2_symlink(struct inode *dir,
1706 goto bail; 1684 goto bail;
1707 } 1685 }
1708 1686
1687 /* Starting to change things, restart is no longer possible. */
1688 ocfs2_block_signals(&oldset);
1689 did_block_signals = 1;
1690
1709 status = dquot_alloc_inode(inode); 1691 status = dquot_alloc_inode(inode);
1710 if (status) 1692 if (status)
1711 goto bail; 1693 goto bail;
@@ -1814,6 +1796,8 @@ bail:
1814 ocfs2_commit_trans(osb, handle); 1796 ocfs2_commit_trans(osb, handle);
1815 1797
1816 ocfs2_inode_unlock(dir, 1); 1798 ocfs2_inode_unlock(dir, 1);
1799 if (did_block_signals)
1800 ocfs2_unblock_signals(&oldset);
1817 1801
1818 brelse(new_fe_bh); 1802 brelse(new_fe_bh);
1819 brelse(parent_fe_bh); 1803 brelse(parent_fe_bh);
@@ -1961,12 +1945,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1961 if (S_ISDIR(inode->i_mode)) 1945 if (S_ISDIR(inode->i_mode))
1962 ocfs2_add_links_count(orphan_fe, 1); 1946 ocfs2_add_links_count(orphan_fe, 1);
1963 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 1947 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1964 1948 ocfs2_journal_dirty(handle, orphan_dir_bh);
1965 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1966 if (status < 0) {
1967 mlog_errno(status);
1968 goto leave;
1969 }
1970 1949
1971 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1950 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1972 OCFS2_ORPHAN_NAMELEN, inode, 1951 OCFS2_ORPHAN_NAMELEN, inode,
@@ -2065,12 +2044,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2065 if (S_ISDIR(inode->i_mode)) 2044 if (S_ISDIR(inode->i_mode))
2066 ocfs2_add_links_count(orphan_fe, -1); 2045 ocfs2_add_links_count(orphan_fe, -1);
2067 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2046 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2068 2047 ocfs2_journal_dirty(handle, orphan_dir_bh);
2069 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2070 if (status < 0) {
2071 mlog_errno(status);
2072 goto leave;
2073 }
2074 2048
2075leave: 2049leave:
2076 ocfs2_free_dir_lookup_result(&lookup); 2050 ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index adf5e2ebc2c4..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
767{ 789{
768 ext2_set_bit(bit, bitmap); 790 ext2_set_bit(bit, bitmap);
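A standalone sketch (not ocfs2 code) of the shift-based conversions above. Note they truncate, so a round trip is only exact for whole multiples; a 16K cluster size (s_clustersize_bits = 14) is assumed.

#include <stdio.h>

int main(void)
{
	unsigned int cbits = 14;			/* 16K clusters */
	unsigned int megs = 256;
	unsigned int clusters = megs << (20 - cbits);	/* 16384 */
	unsigned int back = clusters >> (20 - cbits);	/* 256 again */

	printf("%uM -> %u clusters -> %uM\n", megs, clusters, back);
	return 0;
}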
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -283,14 +287,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 288
285/* 289/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 290 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 291 * The value chosen should be aligned to 16 byte boundaries.
296 */ 292 */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
512 block group */ 508 block group */
513 __le32 h_fs_generation; /* Must match super block */ 509 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 510 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 511/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
512 eb belongs to. Only valid
513 if allocated from a
514 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 515 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 516 of next leaf header pointing
518 to data */ 517 to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 678/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 679/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 680/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 681 __le64 i_suballoc_loc; /* Suballocator block group this
682 inode belongs to. Only valid
683 if allocated from a
684 discontiguous block group */
685/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 686/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 687 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 688 64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 817 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 818 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 819 * unindexed block list. */
817 __le64 dr_reserved3[15]; 820 __le64 dr_suballoc_loc; /* Suballocator block group
821 this root belongs to.
822 Only valid if allocated
823 from a discontiguous
824 block group */
825 __le64 dr_reserved3[14];
818 union { 826 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 827 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 828 * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
840}; 848};
841 849
842/* 850/*
851 * Largest bitmap for a block (suballocator) group in bytes. This limit
852 * does not affect cluster groups (global allocator). Cluster group
853 * bitmaps run to the end of the block.
854 */
855#define OCFS2_MAX_BG_BITMAP_SIZE 256
856
857/*
843 * On disk allocator group structure for OCFS2 858 * On disk allocator group structure for OCFS2
844 */ 859 */
845struct ocfs2_group_desc 860struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 875 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 876/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 877 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 878/*40*/ union {
879 __u8 bg_bitmap[0];
880 struct {
881 /*
882 * Block groups may be discontiguous when
883 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
884 * The extents of a discontiguous block group are
885 * stored in bg_list. It is a flat list.
886 * l_tree_depth must always be zero. A
887 * discontiguous group is signified by a non-zero
888 * bg_list->l_next_free_rec. Only block groups
889 * can be discontiguous; cluster groups cannot.
890 * We've never made a block group with more than
891 * 2048 blocks (256 bytes of bg_bitmap). This
892 * codifies that limit so that we can fit bg_list.
893 * bg_size of a discontiguous block group will
894 * be 256 to match bg_bitmap_filler.
895 */
896 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
897/*140*/ struct ocfs2_extent_list bg_list;
898 };
899 };
900/* Actual on-disk size is one block */
864}; 901};
865 902
866struct ocfs2_refcount_rec { 903struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 942/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 943 * for the same refcount tree. */
907 __le32 rf_reserved0; 944 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 945 __le64 rf_suballoc_loc; /* Suballocator block group this
946 refcount block belongs to. Only
947 valid if allocated from a
948 discontiguous block group */
949/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 950/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 951 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 952 records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1058 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1059 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1060 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1061 __le64 xb_suballoc_loc; /* Suballocator block group this
1062 xattr block belongs to. Only
1063 valid if allocated from a
1064 discontiguous block group */
1021/*30*/ union { 1065/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1066 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1067 block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1298 return size / sizeof(struct ocfs2_extent_rec);
1255} 1299}
1256 1300
1301static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1302{
1303 int size;
1304
1305 size = sb->s_blocksize -
1306 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1307
1308 return size / sizeof(struct ocfs2_extent_rec);
1309}
1310
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1311static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1312{
1259 int size; 1313 int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1338 return size;
1285} 1339}
1286 1340
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1341static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1342 int suballocator,
1343 u32 feature_incompat)
1288{ 1344{
1289 int size; 1345 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1346 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1347
1348 /*
1349 * The cluster allocator uses the entire block. Suballocators have
1350 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1351 * code expects bg_size set to the maximum. Thus we must keep
1352 * bg_size as-is unless discontig_bg is enabled.
1353 */
1354 if (suballocator &&
1355 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1356 size = OCFS2_MAX_BG_BITMAP_SIZE;
1357
1294 return size; 1358 return size;
1295} 1359}
1296 1360
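A standalone sketch of the resulting bitmap sizes for a 4K block, with the group descriptor offsets from the layout above (illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int bg_bitmap_off = 0x40;	/* start of bg_bitmap */
	unsigned int cluster_bytes = blocksize - bg_bitmap_off;
	unsigned int suballoc_bytes = 256;	/* OCFS2_MAX_BG_BITMAP_SIZE */

	/* 32256 bits for cluster groups, 2048 for discontig suballocators */
	printf("cluster group: %u bits, suballocator: %u bits\n",
	       cluster_bytes * 8, suballoc_bytes * 8);
	return 0;
}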
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1466 return size / sizeof(struct ocfs2_extent_rec);
1403} 1467}
1404 1468
1405static inline int ocfs2_local_alloc_size(int blocksize) 1469static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1470{
1407 int size; 1471 int size;
1408 1472
1409 size = blocksize - 1473 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1474 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1475
1412 return size; 1476 return size / sizeof(struct ocfs2_extent_rec);
1413} 1477}
1414 1478
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1479static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1480{
1417 int size; 1481 int size;
1418 1482
1419 size = blocksize - 1483 size = blocksize -
1484 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1485
1486 return size;
1487}
1488
1489static inline int ocfs2_group_bitmap_size(int blocksize,
1490 int suballocator,
1491 uint32_t feature_incompat)
1492{
1493 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1494 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1495
1496 /*
1497 * The cluster allocator uses the entire block. Suballocators have
1498 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1499 * code expects bg_size set to the maximum. Thus we must keep
1500 * bg_size as-is unless discontig_bg is enabled.
1501 */
1502 if (suballocator &&
1503 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1504 size = OCFS2_MAX_BG_BITMAP_SIZE;
1505
1422 return size; 1506 return size;
1423} 1507}
1424 1508
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1575 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1576}
1493 1577
1578static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1579{
1580 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1581 le16_to_cpu(gd->bg_size)) !=
1582 offsetof(struct ocfs2_group_desc, bg_list))
1583 return 0;
1584 /*
1585 * Only valid to check l_next_free_rec if
1586 * bg_bitmap + bg_size == bg_list.
1587 */
1588 if (!gd->bg_list.l_next_free_rec)
1589 return 0;
1590 return 1;
1591}
1494#endif /* _OCFS2_FS_H */ 1592#endif /* _OCFS2_FS_H */
1495 1593
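
For context, ocfs2_gd_is_discontig() keys off the layout change above: in a discontiguous group descriptor the bitmap is capped so that bg_bitmap + bg_size lands exactly at bg_list, and the extent records in bg_list then describe the group's fragments. A sketch of how a reader might walk such a group (ocfs2_gd_is_discontig() is from the patch; the helper around it is illustrative only):

	static void example_walk_group_fragments(struct ocfs2_group_desc *gd)
	{
		int i;

		if (!ocfs2_gd_is_discontig(gd))
			return;	/* contiguous: clusters simply follow bg_blkno */

		for (i = 0; i < le16_to_cpu(gd->bg_list.l_next_free_rec); i++) {
			struct ocfs2_extent_rec *rec = &gd->bg_list.l_recs[i];
			/* Each record covers one contiguous fragment of the group. */
			(void)rec;
		}
	}
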
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Block number of the global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index ab42a74c7539..2bb35fe00511 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -25,8 +25,44 @@
25#include "dlmglue.h" 25#include "dlmglue.h"
26#include "uptodate.h" 26#include "uptodate.h"
27#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
28#include "quota.h" 29#include "quota.h"
29 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of quota structure (either to local or global file) is protected
35 * by dqio_mutex or dquot->dq_lock.
36 * - any modification of global quota file holds inode cluster lock, i_mutex,
37 * and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for local quota file is protected by
40 * its ip_alloc_sem
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
30static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
31 67
32static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
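
The first-acquire path of the new ocfs2_acquire_dquot() (later in this patch) is the clearest instance of the ordering documented above. A condensed sketch, error handling elided (illustrative, not verbatim):

	mutex_lock(&dquot->dq_lock);			/* dq_lock first */
	ocfs2_lock_global_qf(info, 1);			/* gf cluster lock, i_mutex,
							   ip_alloc_sem */
	ocfs2_qinfo_lock(info, 0);			/* qinfo_lock */
	qtree_read_dquot(&info->dqi_gi, dquot);		/* read from gf */
	ocfs2_qinfo_unlock(info, 0);
	/* ... alloc space for gf, start_trans, qinfo_lock, write to gf ... */
	ocfs2_unlock_global_qf(info, 1);
	mutex_unlock(&dquot->dq_lock);
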
@@ -91,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
91 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
92}; 128};
93 129
94static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
95 struct buffer_head *bh)
96{ 131{
97 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
98 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -110,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
110 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
111} 146}
112 147
113int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
114 struct buffer_head **bh) 149 struct buffer_head **bhp)
115{ 150{
116 int rc = 0; 151 int rc;
117 struct buffer_head *tmp = *bh; 152
118 153 *bhp = NULL;
119 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
120 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
121 "Quota file %llu is probably corrupted! Requested "
122 "to read block %Lu but file has size only %Lu\n",
123 (unsigned long long)OCFS2_I(inode)->ip_blkno,
124 (unsigned long long)v_block,
125 (unsigned long long)i_size_read(inode));
126 return -EIO;
127 }
128 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
129 ocfs2_validate_quota_block);
130 if (rc) 156 if (rc)
131 mlog_errno(rc); 157 mlog_errno(rc);
132
133 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
134 if (!rc && !*bh)
135 *bh = tmp;
136
137 return rc; 158 return rc;
138} 159}
139 160
140static int ocfs2_get_quota_block(struct inode *inode, int block,
141 struct buffer_head **bh)
142{
143 u64 pblock, pcount;
144 int err;
145
146 down_read(&OCFS2_I(inode)->ip_alloc_sem);
147 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
148 up_read(&OCFS2_I(inode)->ip_alloc_sem);
149 if (err) {
150 mlog_errno(err);
151 return err;
152 }
153 *bh = sb_getblk(inode->i_sb, pblock);
154 if (!*bh) {
155 err = -EIO;
156 mlog_errno(err);
157 }
158 return err;
159}
160
161/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
162 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
163 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -172,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
172 int err = 0; 172 int err = 0;
173 struct buffer_head *bh; 173 struct buffer_head *bh;
174 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
175 176
176 if (off > i_size) 177 if (off > i_size)
177 return 0; 178 return 0;
@@ -180,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
180 toread = len; 181 toread = len;
181 while (toread > 0) { 182 while (toread > 0) {
182 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
183 bh = NULL; 195 bh = NULL;
184 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
185 if (err) { 197 if (err) {
186 mlog_errno(err); 198 mlog_errno(err);
187 return err; 199 return err;
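
With the switch to physical reads, the loop caches the extent-map lookup: one ocfs2_extent_map_get_blocks() call returns a run of pcount contiguous physical blocks starting at pblock, and later iterations just step through the run instead of re-mapping every block. The idiom, reduced to its core (sketch only):

	if (!pcount) {		/* run exhausted: map the next extent */
		err = ocfs2_extent_map_get_blocks(inode, blk, &pblock,
						  &pcount, NULL);
	} else {		/* still inside the contiguous run */
		pcount--;
		pblock++;
	}
	/* read physical block pblock, then blk++ for the next iteration */
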
@@ -209,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
209 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
210 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
211 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
212 225
213 if (!handle) { 226 if (!handle) {
214 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -221,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
221 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
222 } 235 }
223 236
224 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
225 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
226 loff_t rounded_end = 238 loff_t rounded_end =
227 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
228 240
229 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
230 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
231 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
232 rounded_end); 244 rounded_end);
@@ -234,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 goto out; 246 goto out;
235 new = 1; 247 new = 1;
236 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
237 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
238 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
239 !new) { 256 !new) {
240 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
241 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
242 } else { 259 } else {
243 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
244 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
245 } 264 }
246 if (err) { 265 if (err) {
@@ -261,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
261 brelse(bh); 280 brelse(bh);
262 goto out; 281 goto out;
263 } 282 }
264 err = ocfs2_journal_dirty(handle, bh); 283 ocfs2_journal_dirty(handle, bh);
265 brelse(bh); 284 brelse(bh);
266 if (err < 0)
267 goto out;
268out: 285out:
269 if (err) { 286 if (err) {
270 mutex_unlock(&gqinode->i_mutex);
271 mlog_errno(err); 287 mlog_errno(err);
272 return err; 288 return err;
273 } 289 }
274 gqinode->i_version++; 290 gqinode->i_version++;
275 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
276 mutex_unlock(&gqinode->i_mutex);
277 return len; 292 return len;
278} 293}
279 294
@@ -291,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
291 else 306 else
292 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
293 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
294 return 0; 315 return 0;
295} 316}
296 317
297void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
298{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
299 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
300 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
301 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
@@ -313,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
313 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
314 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
315 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
316 int status; 344 int status;
317 345
318 mlog_entry_void(); 346 mlog_entry_void();
@@ -339,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
339 mlog_errno(status); 367 mlog_errno(status);
340 goto out_err; 368 goto out_err;
341 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
342 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
343 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
344 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
345 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
346 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
347 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -368,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
368out_err: 406out_err:
369 mlog_exit(status); 407 mlog_exit(status);
370 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
371} 413}
372 414
373/* Write information to global quota file. Expects exclusive lock on quota 415
@@ -426,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
426 468
427static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
428{ 470{
429 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
430 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
431 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
432}
433
434/* Read in information from global quota file and acquire a reference to it.
435 * dquot_acquire() has already started the transaction and locked quota file */
436int ocfs2_global_read_dquot(struct dquot *dquot)
437{
438 int err, err2, ex = 0;
439 struct super_block *sb = dquot->dq_sb;
440 int type = dquot->dq_type;
441 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
442 struct ocfs2_super *osb = OCFS2_SB(sb);
443 struct inode *gqinode = info->dqi_gqinode;
444 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
445 handle_t *handle = NULL;
446
447 err = ocfs2_qinfo_lock(info, 0);
448 if (err < 0)
449 goto out;
450 err = qtree_read_dquot(&info->dqi_gi, dquot);
451 if (err < 0)
452 goto out_qlock;
453 OCFS2_DQUOT(dquot)->dq_use_count++;
454 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
455 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
456 ocfs2_qinfo_unlock(info, 0);
457
458 if (!dquot->dq_off) { /* No real quota entry? */
459 ex = 1;
460 /*
461 * Add blocks to quota file before we start a transaction since
462 * locking allocators ranks above a transaction start
463 */
464 WARN_ON(journal_current_handle());
465 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
466 err = ocfs2_extend_no_holes(gqinode,
467 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
468 gqinode->i_size);
469 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
470 if (err < 0)
471 goto out;
472 }
473
474 handle = ocfs2_start_trans(osb,
475 ocfs2_calc_global_qinit_credits(sb, type));
476 if (IS_ERR(handle)) {
477 err = PTR_ERR(handle);
478 goto out;
479 }
480 err = ocfs2_qinfo_lock(info, ex);
481 if (err < 0)
482 goto out_trans;
483 err = qtree_write_dquot(&info->dqi_gi, dquot);
484 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
485 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
486 if (!err)
487 err = err2;
488 }
489out_qlock:
490 if (ex)
491 ocfs2_qinfo_unlock(info, 1);
492 else
493 ocfs2_qinfo_unlock(info, 0);
494out_trans:
495 if (handle)
496 ocfs2_commit_trans(osb, handle);
497out:
498 if (err < 0)
499 mlog_errno(err);
500 return err;
501} 475}
502 476
503/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
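
Unpacking the new credit count (symbolic, since OCFS2_QUOTA_BLOCK_UPDATE_CREDITS itself is defined elsewhere): if ocfs2_global_qinit_alloc() returns N, the handle reserves

	(N + 2) * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS	/* N new blocks plus the
							   tree root and info block */
	+ 1						/* the inode itself */

which is exactly what the updated comment now counts.
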
@@ -638,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
638 } 612 }
639 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
640 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
641 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
642 if (status < 0) 615 if (status < 0)
643 mlog_errno(status); 616 mlog_errno(status);
644 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
645 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
646 status = dquot_commit(dquot);
647 if (status < 0) 619 if (status < 0)
648 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
649 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
650out_ilock: 623out_ilock:
651 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -684,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
684 mlog_errno(status); 657 mlog_errno(status);
685 goto out; 658 goto out;
686 } 659 }
687 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
688 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
689out: 664out:
690 mlog_exit(status); 665 mlog_exit(status);
@@ -715,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
715 690
716 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
717 692
693 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
718 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
719 if (status < 0) 698 if (status < 0)
720 goto out; 699 goto out;
@@ -725,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
725 mlog_errno(status); 704 mlog_errno(status);
726 goto out_ilock; 705 goto out_ilock;
727 } 706 }
728 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
729 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
730out_ilock: 723out_ilock:
731 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
732out: 725out:
726 mutex_unlock(&dquot->dq_lock);
733 mlog_exit(status); 727 mlog_exit(status);
734 return status; 728 return status;
735} 729}
736 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
737static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
738{ 737{
739 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
740 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
741 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
742 747
743 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
744 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
745 * and instantiate possibly new dquot structure */ 750 /*
746 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
747 if (status < 0) 755 if (status < 0)
748 goto out; 756 goto out;
749 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
750 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
751out: 812out:
813 mutex_unlock(&dquot->dq_lock);
752 mlog_exit(status); 814 mlog_exit(status);
753 return status; 815 return status;
754} 816}
@@ -770,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
770 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
771 833
772 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
773 dquot_mark_dquot_dirty(dquot);
774 835
775 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
776 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -793,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
793 mlog_errno(status); 854 mlog_errno(status);
794 goto out_ilock; 855 goto out_ilock;
795 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
796 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
797 if (status < 0) { 859 if (status < 0) {
798 mlog_errno(status); 860 mlog_errno(status);
799 goto out_trans; 861 goto out_dlock;
800 } 862 }
801 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
802 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
803out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
804 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
805out_ilock: 868out_ilock:
806 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -852,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
852} 915}
853 916
854const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
855 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
856 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
857 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
858 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9ad49305f450..8bd70d4d184d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -22,6 +22,7 @@
22#include "dlmglue.h" 22#include "dlmglue.h"
23#include "quota.h" 23#include "quota.h"
24#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
25 26
26/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
27static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -119,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
119 lock_buffer(bh); 120 lock_buffer(bh);
120 modify(bh, private); 121 modify(bh, private);
121 unlock_buffer(bh); 122 unlock_buffer(bh);
122 status = ocfs2_journal_dirty(handle, bh); 123 ocfs2_journal_dirty(handle, bh);
123 if (status < 0) { 124
124 mlog_errno(status);
125 ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 return status;
127 }
128 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
129 if (status < 0) { 126 if (status < 0) {
130 mlog_errno(status); 127 mlog_errno(status);
@@ -133,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
133 return 0; 130 return 0;
134} 131}
135 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
165
136/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
137static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
138{ 168{
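
The constraint in the new comment matters for ordering: ip_alloc_sem ranks above a transaction start, so callers must finish any reads through this helper before opening a handle. The safe ordering, sketched (osb and credits are placeholders):

	status = ocfs2_read_quota_block(lqinode, v_block, &bh);	/* may take ip_alloc_sem */
	if (status)
		goto out;
	handle = ocfs2_start_trans(osb, credits);		/* only afterwards */
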
@@ -523,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
523 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
524 le32_add_cpu(&dchunk->dqc_free, 1); 554 le32_add_cpu(&dchunk->dqc_free, 1);
525 unlock_buffer(qbh); 555 unlock_buffer(qbh);
526 status = ocfs2_journal_dirty(handle, qbh); 556 ocfs2_journal_dirty(handle, qbh);
527 if (status < 0)
528 mlog_errno(status);
529out_commit: 557out_commit:
530 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 558 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
531 ocfs2_commit_trans(OCFS2_SB(sb), handle); 559 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -631,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
631 lock_buffer(bh); 659 lock_buffer(bh);
632 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 660 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
633 unlock_buffer(bh); 661 unlock_buffer(bh);
634 status = ocfs2_journal_dirty(handle, bh); 662 ocfs2_journal_dirty(handle, bh);
635 if (status < 0)
636 mlog_errno(status);
637out_trans: 663out_trans:
638 ocfs2_commit_trans(osb, handle); 664 ocfs2_commit_trans(osb, handle);
639out_bh: 665out_bh:
@@ -679,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
679 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
680 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
681 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
682 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
683 709
684 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
685 if (status < 0) 711 if (status < 0)
@@ -705,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
705 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
706 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
707 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
708 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
709 735
710 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
711 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -767,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
767{ 793{
768 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
769 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
770 ->dqi_ibh; 796 ->dqi_libh;
771 int status; 797 int status;
772 798
773 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -790,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
790 int mark_clean = 1, len; 816 int mark_clean = 1, len;
791 int status; 817 int status;
792 818
793 /* At this point we know there are no more dquots and thus
794 * even if there's some sync in the pdflush queue, it won't
795 * find any dquots and return without doing anything */
796 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
797 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
798 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
799 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -828,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
828 /* Mark local file as clean */ 850 /* Mark local file as clean */
829 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
830 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
831 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
832 olq_update_info, 854 olq_update_info,
833 info); 855 info);
834 if (status < 0) { 856 if (status < 0) {
@@ -838,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
838 860
839out: 861out:
840 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
841 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
842 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
843 kfree(oinfo); 865 kfree(oinfo);
844 return 0; 866 return 0;
@@ -866,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
866} 888}
867 889
868/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
869static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
870{ 892{
871 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
872 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
873 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
874 int status; 897 int status;
875 898
876 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
877 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
878 &bh);
879 if (status) { 901 if (status) {
880 mlog_errno(status); 902 mlog_errno(status);
881 goto out; 903 goto out;
882 } 904 }
883 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
884 olq_set_dquot, od);
885 if (status < 0) { 906 if (status < 0) {
886 mlog_errno(status); 907 mlog_errno(status);
887 goto out; 908 goto out;
@@ -981,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
981 } 1002 }
982 1003
983 /* Initialize chunk header */ 1004 /* Initialize chunk header */
984 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
985 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
986 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
987 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
988 if (status < 0) { 1007 if (status < 0) {
989 mlog_errno(status); 1008 mlog_errno(status);
990 goto out_trans; 1009 goto out_trans;
@@ -1009,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1009 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1028 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1010 OCFS2_QBLK_RESERVED_SPACE); 1029 OCFS2_QBLK_RESERVED_SPACE);
1011 unlock_buffer(bh); 1030 unlock_buffer(bh);
1012 status = ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto out_trans;
1016 }
1017 1032
1018 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1019 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1020 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1021 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1022 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1023 if (status < 0) { 1036 if (status < 0) {
1024 mlog_errno(status); 1037 mlog_errno(status);
1025 goto out_trans; 1038 goto out_trans;
@@ -1040,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1040 lock_buffer(dbh); 1053 lock_buffer(dbh);
1041 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1054 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1042 unlock_buffer(dbh); 1055 unlock_buffer(dbh);
1043 status = ocfs2_journal_dirty(handle, dbh); 1056 ocfs2_journal_dirty(handle, dbh);
1044 if (status < 0) {
1045 mlog_errno(status);
1046 goto out_trans;
1047 }
1048 1057
1049 /* Update local quotafile info */ 1058 /* Update local quotafile info */
1050 oinfo->dqi_blocks += 2; 1059 oinfo->dqi_blocks += 2;
@@ -1120,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1120 } 1129 }
1121 1130
1122 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1123 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1124 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1125 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1126 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1127 if (status < 0) { 1134 if (status < 0) {
1128 mlog_errno(status); 1135 mlog_errno(status);
1129 goto out; 1136 goto out;
@@ -1155,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1155 lock_buffer(bh); 1162 lock_buffer(bh);
1156 memset(bh->b_data, 0, sb->s_blocksize); 1163 memset(bh->b_data, 0, sb->s_blocksize);
1157 unlock_buffer(bh); 1164 unlock_buffer(bh);
1158 status = ocfs2_journal_dirty(handle, bh); 1165 ocfs2_journal_dirty(handle, bh);
1159 if (status < 0) { 1166
1160 mlog_errno(status);
1161 goto out_trans;
1162 }
1163 /* Update chunk header */ 1167 /* Update chunk header */
1164 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1168 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1165 chunk->qc_headerbh, 1169 chunk->qc_headerbh,
@@ -1173,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1173 lock_buffer(chunk->qc_headerbh); 1177 lock_buffer(chunk->qc_headerbh);
1174 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1178 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1175 unlock_buffer(chunk->qc_headerbh); 1179 unlock_buffer(chunk->qc_headerbh);
1176 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1180 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1177 if (status < 0) { 1181
1178 mlog_errno(status);
1179 goto out_trans;
1180 }
1181 /* Update file header */ 1182 /* Update file header */
1182 oinfo->dqi_blocks++; 1183 oinfo->dqi_blocks++;
1183 status = ocfs2_local_write_info(sb, type); 1184 status = ocfs2_local_write_info(sb, type);
@@ -1210,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1210} 1211}
1211 1212
1212/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1213static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1214{ 1215{
1215 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1216 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1219,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1219 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1220 int offset; 1221 int offset;
1221 int status; 1222 int status;
1223 u64 pcount;
1222 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1223 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1224 if (!chunk) { 1227 if (!chunk) {
1225 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1226 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1227 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1228 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1229 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1230 } 1236 }
1231 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1232 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1233 1244
1234 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1235 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1246,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1246 goto out; 1257 goto out;
1247 } 1258 }
1248out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1249 return status; 1261 return status;
1250} 1262}
1251 1263
1252/* Create entry in local file for dquot, load data from the global file */ 1264/*
1253static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1254{ 1266 * already started a transaction and written all changes to global quota file
1255 int status; 1267 */
1256 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1257 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1258
1259 status = ocfs2_global_read_dquot(dquot);
1260 if (status < 0) {
1261 mlog_errno(status);
1262 goto out_err;
1263 }
1264
1265 /* Now create entry in the local quota file */
1266 status = ocfs2_create_local_dquot(dquot);
1267 if (status < 0) {
1268 mlog_errno(status);
1269 goto out_err;
1270 }
1271 mlog_exit(0);
1272 return 0;
1273out_err:
1274 mlog_exit(status);
1275 return status;
1276}
1277
1278/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1279 * already started a transaction and obtained exclusive lock for global
1280 * quota file. */
1281static int ocfs2_local_release_dquot(struct dquot *dquot)
1282{ 1269{
1283 int status; 1270 int status;
1284 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1286,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1286 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1287 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1288 int offset; 1275 int offset;
1289 handle_t *handle = journal_current_handle();
1290
1291 BUG_ON(!handle);
1292 /* First write all local changes to global file */
1293 status = ocfs2_global_release_dquot(dquot);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto out;
1297 }
1298 1276
1299 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1300 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1312 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1313 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1314 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1315 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1316 if (status < 0) { 1294
1317 mlog_errno(status);
1318 goto out;
1319 }
1320 status = 0;
1321out: 1295out:
1322 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1323 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
@@ -1331,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1331 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1332 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1333 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1334 .read_dqblk = ocfs2_local_read_dquot,
1335 .commit_dqblk = ocfs2_local_write_dquot,
1336 .release_dqblk = ocfs2_local_release_dquot,
1337}; 1308};
1338 1309
1339struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5cbcd0f008fc..4793f36f6518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
572 u32 num_got; 572 u32 num_got;
573 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
574 574
575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576 576
@@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
596 goto out_commit; 596 goto out_commit;
597 } 597 }
598 598
599 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
600 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
601 &first_blkno); 601 &first_blkno);
602 if (ret) { 602 if (ret) {
@@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
631 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
790 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
791 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
792 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
793 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
794 798
795 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
796 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
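
The fallback keeps old metadata working: blocks written before the feature existed still carry zero in the (formerly reserved) rf_suballoc_loc field, so the group is computed from the block and bit as before. As a standalone helper the pattern would look roughly like this (illustrative only):

	static u64 example_suballoc_group(__le64 suballoc_loc, u64 blkno, u16 bit)
	{
		/* Pre-discontig metadata left this field zero (reserved). */
		if (suballoc_loc)
			return le64_to_cpu(suballoc_loc);
		return ocfs2_which_suballoc_group(blkno, bit);
	}
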
@@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1268 } else if (merge) 1272 } else if (merge)
1269 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1270 1274
1271 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1272 if (ret)
1273 mlog_errno(ret);
1274out: 1276out:
1275 return ret; 1277 return ret;
1276} 1278}
@@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1284 int ret; 1286 int ret;
1285 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1286 u32 num_got; 1288 u32 num_got;
1287 u64 blkno; 1289 u64 suballoc_loc, blkno;
1288 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1289 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1290 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1298 goto out; 1300 goto out;
1299 } 1301 }
1300 1302
1301 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1302 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1303 &blkno); 1305 &blkno);
1304 if (ret) { 1306 if (ret) {
@@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 1332
1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1334 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1335 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1524 int ret; 1527 int ret;
1525 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1526 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1527 u64 blkno; 1530 u64 suballoc_loc, blkno;
1528 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1529 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1530 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1548 goto out; 1551 goto out;
1549 } 1552 }
1550 1553
1551 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1552 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1553 &blkno); 1556 &blkno);
1554 if (ret) { 1557 if (ret) {
@@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1581 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1694 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
1695 * the extent block contains the extent rec. 1699 * the extent block contains the extent rec.
1696 */ 1700 */
1697 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1698 if (ret < 0) { 1702 if (ret < 0) {
1699 mlog_errno(ret); 1703 mlog_errno(ret);
1700 goto out; 1704 goto out;
@@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1802 if (merge) 1806 if (merge)
1803 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1804 1808
1805 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810 1810
1811 if (index == 0) { 1811 if (index == 0) {
1812 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1977 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1978 } 1978 }
1979 1979
1980 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1981 if (ret)
1982 mlog_errno(ret);
1983 1981
1984out: 1982out:
1985 brelse(new_bh); 1983 brelse(new_bh);
@@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2112 */ 2110 */
2113 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2114 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2115 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2116 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2117 if (ret) { 2116 if (ret) {
@@ -2516,20 +2515,19 @@ out:
2516 * 2515 *
2517 * Normally the refcount blocks store these refcount should be 2516 * Normally the refcount blocks store these refcount should be
2518 * contiguous also, so that we can get the number easily. 2517 * contiguous also, so that we can get the number easily.
2519 * As for meta_ac, we will at most add split 2 refcount record and 2518 * We will at most split 2 refcount records and add 2 more
2520 * 2 more refcount block, so just check it in a rough way. 2519 * refcount blocks, so just check it in a rough way.
2521 * 2520 *
2522 * Caller must hold refcount tree lock. 2521 * Caller must hold refcount tree lock.
2523 */ 2522 */
2524int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2523int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2525 struct buffer_head *di_bh, 2524 u64 refcount_loc,
2526 u64 phys_blkno, 2525 u64 phys_blkno,
2527 u32 clusters, 2526 u32 clusters,
2528 int *credits, 2527 int *credits,
2529 struct ocfs2_alloc_context **meta_ac) 2528 int *ref_blocks)
2530{ 2529{
2531 int ret, ref_blocks = 0; 2530 int ret;
2532 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2533 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2531 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2534 struct buffer_head *ref_root_bh = NULL; 2532 struct buffer_head *ref_root_bh = NULL;
2535 struct ocfs2_refcount_tree *tree; 2533 struct ocfs2_refcount_tree *tree;
@@ -2546,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2546 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2544 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2547 2545
2548 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2546 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2549 le64_to_cpu(di->i_refcount_loc), &tree); 2547 refcount_loc, &tree);
2550 if (ret) { 2548 if (ret) {
2551 mlog_errno(ret); 2549 mlog_errno(ret);
2552 goto out; 2550 goto out;
2553 } 2551 }
2554 2552
2555 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2553 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2556 le64_to_cpu(di->i_refcount_loc),
2557 &ref_root_bh); 2554 &ref_root_bh);
2558 if (ret) { 2555 if (ret) {
2559 mlog_errno(ret); 2556 mlog_errno(ret);
@@ -2564,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2564 &tree->rf_ci, 2561 &tree->rf_ci,
2565 ref_root_bh, 2562 ref_root_bh,
2566 start_cpos, clusters, 2563 start_cpos, clusters,
2567 &ref_blocks, credits); 2564 ref_blocks, credits);
2568 if (ret) { 2565 if (ret) {
2569 mlog_errno(ret); 2566 mlog_errno(ret);
2570 goto out; 2567 goto out;
2571 } 2568 }
2572 2569
2573 mlog(0, "reserve new metadata %d, credits = %d\n", 2570 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2574 ref_blocks, *credits); 2571 *ref_blocks, *credits);
2575
2576 if (ref_blocks) {
2577 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2578 ref_blocks, meta_ac);
2579 if (ret)
2580 mlog_errno(ret);
2581 }
2582 2572
2583out: 2573out:
2584 brelse(ref_root_bh); 2574 brelse(ref_root_bh);
@@ -3040,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3040 } 3030 }
3041 3031
3042 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3032 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3043 ret = ocfs2_journal_dirty(handle, new_bh); 3033 ocfs2_journal_dirty(handle, new_bh);
3044 if (ret) {
3045 mlog_errno(ret);
3046 break;
3047 }
3048 3034
3049 brelse(new_bh); 3035 brelse(new_bh);
3050 brelse(old_bh); 3036 brelse(old_bh);
@@ -3282,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3282 } else { 3268 } else {
3283 delete = 1; 3269 delete = 1;
3284 3270
3285 ret = __ocfs2_claim_clusters(osb, handle, 3271 ret = __ocfs2_claim_clusters(handle,
3286 context->data_ac, 3272 context->data_ac,
3287 1, set_len, 3273 1, set_len,
3288 &new_bit, &new_len); 3274 &new_bit, &new_len);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..40650021fc24
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/bitops.h>
32#include <linux/list.h>
33
34#define MLOG_MASK_PREFIX ML_RESERVATIONS
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#ifdef CONFIG_OCFS2_DEBUG_FS
40#define OCFS2_CHECK_RESERVATIONS
41#endif
42
43DEFINE_SPINLOCK(resv_lock);
44
45#define OCFS2_MIN_RESV_WINDOW_BITS 8
46#define OCFS2_MAX_RESV_WINDOW_BITS 1024
47
48int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
49{
50 return (osb->osb_resv_level && osb->osb_dir_resv_level);
51}
52
53static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
54 struct ocfs2_alloc_reservation *resv)
55{
56 struct ocfs2_super *osb = resmap->m_osb;
57 unsigned int bits;
58
59 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
60 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
61 bits = 4 << osb->osb_resv_level;
62 } else {
63 bits = 4 << osb->osb_dir_resv_level;
64 }
65 return bits;
66}
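A small standalone sketch of the sizing rule above, assuming levels behave as the comment says: the window is (4 << resv_level) bits, level 0 disables reservations entirely (see ocfs2_resmap_disabled() below), and levels 1..8 give the 8..1024 progression noted in the comment.

#include <stdio.h>

/* Window size per reservation level: each level doubles the window. */
int main(void)
{
	for (int level = 1; level <= 8; level++)
		printf("resv_level %d -> %d-bit window\n", level, 4 << level);
	return 0;
}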
67
68static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
69{
70 if (resv->r_len)
71 return resv->r_start + resv->r_len - 1;
72 return resv->r_start;
73}
74
75static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
76{
77 return !!(resv->r_len == 0);
78}
79
80static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
81{
82 if (resmap->m_osb->osb_resv_level == 0)
83 return 1;
84 return 0;
85}
86
87static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
88{
89 struct ocfs2_super *osb = resmap->m_osb;
90 struct rb_node *node;
91 struct ocfs2_alloc_reservation *resv;
92 int i = 0;
93
94 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
95 osb->dev_str, resmap->m_bitmap_len);
96
97 node = rb_first(&resmap->m_reservations);
98 while (node) {
99 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
100
101 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
102 "\tlast_len: %u\n", resv->r_start,
103 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
104 resv->r_last_len);
105
106 node = rb_next(node);
107 i++;
108 }
109
110 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
111
112 i = 0;
113 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
114 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
115 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
116 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
117 resv->r_last_len);
118
119 i++;
120 }
121}
122
123#ifdef OCFS2_CHECK_RESERVATIONS
124static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
125 int i,
126 struct ocfs2_alloc_reservation *resv)
127{
128 char *disk_bitmap = resmap->m_disk_bitmap;
129 unsigned int start = resv->r_start;
130 unsigned int end = ocfs2_resv_end(resv);
131
132 while (start <= end) {
133 if (ocfs2_test_bit(start, disk_bitmap)) {
134 mlog(ML_ERROR,
135 "reservation %d covers an allocated area "
136 "starting at bit %u!\n", i, start);
137 return 1;
138 }
139
140 start++;
141 }
142 return 0;
143}
144
145static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
146{
147 unsigned int off = 0;
148 int i = 0;
149 struct rb_node *node;
150 struct ocfs2_alloc_reservation *resv;
151
152 node = rb_first(&resmap->m_reservations);
153 while (node) {
154 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
155
156 if (i > 0 && resv->r_start <= off) {
157 mlog(ML_ERROR, "reservation %d has bad start off!\n",
158 i);
159 goto bad;
160 }
161
162 if (resv->r_len == 0) {
163 mlog(ML_ERROR, "reservation %d has no length!\n",
164 i);
165 goto bad;
166 }
167
168 if (resv->r_start > ocfs2_resv_end(resv)) {
169 mlog(ML_ERROR, "reservation %d has invalid range!\n",
170 i);
171 goto bad;
172 }
173
174 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
175 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
176 i);
177 goto bad;
178 }
179
180 if (ocfs2_validate_resmap_bits(resmap, i, resv))
181 goto bad;
182
183 off = ocfs2_resv_end(resv);
184 node = rb_next(node);
185
186 i++;
187 }
188 return;
189
190bad:
191 ocfs2_dump_resv(resmap);
192 BUG();
193}
194#else
195static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
196{
197
198}
199#endif
200
201void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
202{
203 memset(resv, 0, sizeof(*resv));
204 INIT_LIST_HEAD(&resv->r_lru);
205}
206
207void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
208 unsigned int flags)
209{
210 BUG_ON(flags & ~OCFS2_RESV_TYPES);
211
212 resv->r_flags |= flags;
213}
214
215int ocfs2_resmap_init(struct ocfs2_super *osb,
216 struct ocfs2_reservation_map *resmap)
217{
218 memset(resmap, 0, sizeof(*resmap));
219
220 resmap->m_osb = osb;
221 resmap->m_reservations = RB_ROOT;
222 /* m_bitmap_len is initialized to zero by the above memset. */
223 INIT_LIST_HEAD(&resmap->m_lru);
224
225 return 0;
226}
227
228static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
229 struct ocfs2_alloc_reservation *resv)
230{
231 assert_spin_locked(&resv_lock);
232
233 if (!list_empty(&resv->r_lru))
234 list_del_init(&resv->r_lru);
235
236 list_add_tail(&resv->r_lru, &resmap->m_lru);
237}
238
239static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
240{
241 resv->r_len = 0;
242 resv->r_start = 0;
243}
244
245static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
246 struct ocfs2_alloc_reservation *resv)
247{
248 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
249 list_del_init(&resv->r_lru);
250 rb_erase(&resv->r_node, &resmap->m_reservations);
251 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
252 }
253}
254
255static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
256 struct ocfs2_alloc_reservation *resv)
257{
258 assert_spin_locked(&resv_lock);
259
260 __ocfs2_resv_trunc(resv);
261 /*
262 * last_len and last_start no longer make sense if
263 * we're changing the range of our allocations.
264 */
265 resv->r_last_len = resv->r_last_start = 0;
266
267 ocfs2_resv_remove(resmap, resv);
268}
269
270/* does nothing if 'resv' is null */
271void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
272 struct ocfs2_alloc_reservation *resv)
273{
274 if (resv) {
275 spin_lock(&resv_lock);
276 __ocfs2_resv_discard(resmap, resv);
277 spin_unlock(&resv_lock);
278 }
279}
280
281static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
282{
283 struct rb_node *node;
284 struct ocfs2_alloc_reservation *resv;
285
286 assert_spin_locked(&resv_lock);
287
288 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
289 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
290
291 __ocfs2_resv_discard(resmap, resv);
292 }
293}
294
295void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
296 unsigned int clen, char *disk_bitmap)
297{
298 if (ocfs2_resmap_disabled(resmap))
299 return;
300
301 spin_lock(&resv_lock);
302
303 ocfs2_resmap_clear_all_resv(resmap);
304 resmap->m_bitmap_len = clen;
305 resmap->m_disk_bitmap = disk_bitmap;
306
307 spin_unlock(&resv_lock);
308}
309
310void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
311{
312 /* Does nothing for now. Keep this around for API symmetry */
313}
314
315static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
316 struct ocfs2_alloc_reservation *new)
317{
318 struct rb_root *root = &resmap->m_reservations;
319 struct rb_node *parent = NULL;
320 struct rb_node **p = &root->rb_node;
321 struct ocfs2_alloc_reservation *tmp;
322
323 assert_spin_locked(&resv_lock);
324
325 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
326 new->r_len);
327
328 while (*p) {
329 parent = *p;
330
331 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
332
333 if (new->r_start < tmp->r_start) {
334 p = &(*p)->rb_left;
335
336 /*
337 * This is a good place to check for
338 * overlapping reservations.
339 */
340 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
341 } else if (new->r_start > ocfs2_resv_end(tmp)) {
342 p = &(*p)->rb_right;
343 } else {
344 /* This should never happen! */
345 mlog(ML_ERROR, "Duplicate reservation window!\n");
346 BUG();
347 }
348 }
349
350 rb_link_node(&new->r_node, parent, p);
351 rb_insert_color(&new->r_node, root);
352 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
353
354 ocfs2_resv_mark_lru(resmap, new);
355
356 ocfs2_check_resmap(resmap);
357}
358
359/**
360 * ocfs2_find_resv_lhs() - find the window which contains goal
361 * @resmap: reservation map to search
362 * @goal: which bit to search for
363 *
364 * If a window containing that goal is not found, we return the window
365 * which comes before goal. Returns NULL on empty rbtree or no window
366 * before goal.
367 */
368static struct ocfs2_alloc_reservation *
369ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
370{
371 struct ocfs2_alloc_reservation *resv = NULL;
372 struct ocfs2_alloc_reservation *prev_resv = NULL;
373 struct rb_node *node = resmap->m_reservations.rb_node;
374
375 assert_spin_locked(&resv_lock);
376
377 if (!node)
378 return NULL;
379
380 node = rb_first(&resmap->m_reservations);
381 while (node) {
382 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
383
384 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
385 break;
386
 387 /* Did we overshoot the reservation just before goal? */
388 if (resv->r_start > goal) {
389 resv = prev_resv;
390 break;
391 }
392
393 prev_resv = resv;
394 node = rb_next(node);
395 }
396
397 return resv;
398}
399
400/*
401 * We are given a range within the bitmap, which corresponds to a gap
402 * inside the reservations tree (search_start, search_len). The range
403 * can be anything from the whole bitmap, to a gap between
404 * reservations.
405 *
406 * The start value of *rstart is insignificant.
407 *
408 * This function searches the bitmap range starting at search_start
409 * with length search_len for a set of contiguous free bits. We try
410 * to find up to 'wanted' bits, but can sometimes return less.
411 *
412 * Returns the length of allocation, 0 if no free bits are found.
413 *
 414 * *rstart and *rlen will also be populated with the result.
415 */
416static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
417 unsigned int wanted,
418 unsigned int search_start,
419 unsigned int search_len,
420 unsigned int *rstart,
421 unsigned int *rlen)
422{
423 void *bitmap = resmap->m_disk_bitmap;
424 unsigned int best_start, best_len = 0;
425 int offset, start, found;
426
427 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
428 wanted, search_start, search_len, resmap->m_bitmap_len);
429
430 found = best_start = best_len = 0;
431
432 start = search_start;
433 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
434 start)) != -1) {
435 /* Search reached end of the region */
436 if (offset >= (search_start + search_len))
437 break;
438
439 if (offset == start) {
440 /* we found a zero */
441 found++;
442 /* move start to the next bit to test */
443 start++;
444 } else {
445 /* got a zero after some ones */
446 found = 1;
447 start = offset + 1;
448 }
449 if (found > best_len) {
450 best_len = found;
451 best_start = start - found;
452 }
453
454 if (found >= wanted)
455 break;
456 }
457
458 if (best_len == 0)
459 return 0;
460
461 if (best_len >= wanted)
462 best_len = wanted;
463
464 *rlen = best_len;
465 *rstart = best_start;
466
467 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
468
469 return *rlen;
470}
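The scan above is easy to check in isolation. A userspace sketch of the same best-fit logic over a byte-per-bit array; find_free_bits() and the test map are invented for the example, where the kernel walks a real bitmap with ocfs2_find_next_zero_bit():

#include <stdio.h>

/* Remember the longest run of zero bits in [start, len), stopping early
 * once 'wanted' contiguous bits are found. Returns the run length, 0 if
 * no free bits; *rstart receives the run's offset. */
static unsigned find_free_bits(const unsigned char *bitmap, unsigned len,
			       unsigned start, unsigned wanted,
			       unsigned *rstart)
{
	unsigned best_start = 0, best_len = 0, run = 0;

	for (unsigned i = start; i < len; i++) {
		if (bitmap[i]) {		/* bit set: run broken */
			run = 0;
			continue;
		}
		if (++run > best_len) {
			best_len = run;
			best_start = i - run + 1;
		}
		if (best_len >= wanted)
			break;			/* good enough, stop early */
	}
	if (best_len > wanted)
		best_len = wanted;		/* defensive clamp */
	*rstart = best_start;
	return best_len;
}

int main(void)
{
	unsigned char map[16] = { 1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1 };
	unsigned start, got = find_free_bits(map, 16, 0, 4, &start);

	printf("found %u bits at offset %u\n", got, start);	/* 4 at 10 */
	return 0;
}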
471
472static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
473 struct ocfs2_alloc_reservation *resv,
474 unsigned int goal, unsigned int wanted)
475{
476 struct rb_root *root = &resmap->m_reservations;
477 unsigned int gap_start, gap_end, gap_len;
478 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
479 struct rb_node *prev, *next;
480 unsigned int cstart, clen;
481 unsigned int best_start = 0, best_len = 0;
482
483 /*
484 * Nasty cases to consider:
485 *
486 * - rbtree is empty
487 * - our window should be first in all reservations
488 * - our window should be last in all reservations
489 * - need to make sure we don't go past end of bitmap
490 */
491
492 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
493 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
494
495 assert_spin_locked(&resv_lock);
496
497 if (RB_EMPTY_ROOT(root)) {
498 /*
499 * Easiest case - empty tree. We can just take
500 * whatever window of free bits we want.
501 */
502
503 mlog(0, "Empty root\n");
504
505 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
506 resmap->m_bitmap_len - goal,
507 &cstart, &clen);
508
509 /*
510 * This should never happen - the local alloc window
511 * will always have free bits when we're called.
512 */
513 BUG_ON(goal == 0 && clen == 0);
514
515 if (clen == 0)
516 return;
517
518 resv->r_start = cstart;
519 resv->r_len = clen;
520
521 ocfs2_resv_insert(resmap, resv);
522 return;
523 }
524
525 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
526
527 if (prev_resv == NULL) {
528 mlog(0, "Goal on LHS of leftmost window\n");
529
530 /*
531 * A NULL here means that the search code couldn't
532 * find a window that starts before goal.
533 *
534 * However, we can take the first window after goal,
535 * which is also by definition, the leftmost window in
536 * the entire tree. If we can find free bits in the
537 * gap between goal and the LHS window, then the
538 * reservation can safely be placed there.
539 *
540 * Otherwise we fall back to a linear search, checking
541 * the gaps in between windows for a place to
542 * allocate.
543 */
544
545 next = rb_first(root);
546 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
547 r_node);
548
549 /*
 550 * The search should never return such a window (see the
 551 * comment above).
552 */
553 if (next_resv->r_start <= goal) {
554 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
555 goal, next_resv->r_start, next_resv->r_len);
556 ocfs2_dump_resv(resmap);
557 BUG();
558 }
559
560 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
561 next_resv->r_start - goal,
562 &cstart, &clen);
563 if (clen) {
564 best_len = clen;
565 best_start = cstart;
566 if (best_len == wanted)
567 goto out_insert;
568 }
569
570 prev_resv = next_resv;
571 next_resv = NULL;
572 }
573
574 prev = &prev_resv->r_node;
575
 576 /* Now we do a linear search for a window, starting at 'prev_resv' */
577 while (1) {
578 next = rb_next(prev);
579 if (next) {
580 mlog(0, "One more resv found in linear search\n");
581 next_resv = rb_entry(next,
582 struct ocfs2_alloc_reservation,
583 r_node);
584
585 gap_start = ocfs2_resv_end(prev_resv) + 1;
586 gap_end = next_resv->r_start - 1;
587 gap_len = gap_end - gap_start + 1;
588 } else {
589 mlog(0, "No next node\n");
590 /*
591 * We're at the rightmost edge of the
592 * tree. See if a reservation between this
593 * window and the end of the bitmap will work.
594 */
595 gap_start = ocfs2_resv_end(prev_resv) + 1;
596 gap_len = resmap->m_bitmap_len - gap_start;
597 gap_end = resmap->m_bitmap_len - 1;
598 }
599
600 /*
601 * No need to check this gap if we have already found
602 * a larger region of free bits.
603 */
604 if (gap_len <= best_len)
605 goto next_resv;
606
607 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
608 gap_len, &cstart, &clen);
609 if (clen == wanted) {
610 best_len = clen;
611 best_start = cstart;
612 goto out_insert;
613 } else if (clen > best_len) {
614 best_len = clen;
615 best_start = cstart;
616 }
617
618next_resv:
619 if (!next)
620 break;
621
622 prev = next;
623 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
624 r_node);
625 }
626
627out_insert:
628 if (best_len) {
629 resv->r_start = best_start;
630 resv->r_len = best_len;
631 ocfs2_resv_insert(resmap, resv);
632 }
633}
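The gap arithmetic in the walk above can be shown on its own. A sketch over a sorted array of windows instead of the rbtree (values invented): each candidate gap runs from one window's end + 1 to the next window's start - 1, and the final gap runs to the end of the bitmap. The region before the leftmost window is handled separately in the kernel via ocfs2_find_resv_lhs().

#include <stdio.h>

struct window { unsigned start, len; };

int main(void)
{
	/* Three reservations in a 64-bit bitmap (values invented). */
	struct window w[] = { { 4, 4 }, { 16, 8 }, { 40, 10 } };
	unsigned n = 3, bitmap_len = 64;

	for (unsigned i = 0; i < n; i++) {
		unsigned end = w[i].start + w[i].len - 1;
		unsigned gap_start = end + 1;
		unsigned gap_end = (i + 1 < n) ? w[i + 1].start - 1
					       : bitmap_len - 1;

		if (gap_start <= gap_end)
			printf("gap [%u, %u] (%u bits)\n", gap_start,
			       gap_end, gap_end - gap_start + 1);
	}
	return 0;
}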
634
635static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
636 struct ocfs2_alloc_reservation *resv,
637 unsigned int wanted)
638{
639 struct ocfs2_alloc_reservation *lru_resv;
640 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
641 unsigned int min_bits;
642
643 if (!tmpwindow)
644 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
645 else
 646 min_bits = wanted; /* We at least know the temp window
 647 * will use all of these bits */
648
649 /*
650 * Take the first reservation off the LRU as our 'target'. We
651 * don't try to be smart about it. There might be a case for
652 * searching based on size but I don't have enough data to be
653 * sure. --Mark (3/16/2010)
654 */
655 lru_resv = list_first_entry(&resmap->m_lru,
656 struct ocfs2_alloc_reservation, r_lru);
657
658 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
659 lru_resv->r_len, ocfs2_resv_end(lru_resv));
660
661 /*
662 * Cannibalize (some or all) of the target reservation and
663 * feed it to the current window.
664 */
665 if (lru_resv->r_len <= min_bits) {
666 /*
667 * Discard completely if size is less than or equal to a
668 * reasonable threshold - 50% of window bits for non temporary
669 * windows.
670 */
671 resv->r_start = lru_resv->r_start;
672 resv->r_len = lru_resv->r_len;
673
674 __ocfs2_resv_discard(resmap, lru_resv);
675 } else {
676 unsigned int shrink;
677 if (tmpwindow)
678 shrink = min_bits;
679 else
680 shrink = lru_resv->r_len / 2;
681
682 lru_resv->r_len -= shrink;
683
684 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
685 resv->r_len = shrink;
686 }
687
688 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
689 "r_len: %u r_last_start: %u r_last_len: %u\n",
690 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
691 resv->r_last_start, resv->r_last_len);
692
693 ocfs2_resv_insert(resmap, resv);
694}
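A worked example of the non-temporary split above, with invented numbers: the LRU victim keeps the lower half of its window and the new reservation takes the upper half.

#include <stdio.h>

/* Victim window of 100 bits at offset 200; non-temporary case, so the
 * new reservation takes the top half (values invented). */
int main(void)
{
	unsigned lru_start = 200, lru_len = 100;
	unsigned shrink = lru_len / 2;

	lru_len -= shrink;
	printf("victim keeps  [%u, %u]\n", lru_start, lru_start + lru_len - 1);
	printf("new window is [%u, %u]\n", lru_start + lru_len,
	       lru_start + lru_len + shrink - 1);
	return 0;
}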
695
696static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
697 struct ocfs2_alloc_reservation *resv,
698 unsigned int wanted)
699{
700 unsigned int goal = 0;
701
702 BUG_ON(!ocfs2_resv_empty(resv));
703
704 /*
705 * Begin by trying to get a window as close to the previous
706 * one as possible. Using the most recent allocation as a
707 * start goal makes sense.
708 */
709 if (resv->r_last_len) {
710 goal = resv->r_last_start + resv->r_last_len;
711 if (goal >= resmap->m_bitmap_len)
712 goal = 0;
713 }
714
715 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
716
717 /* Search from last alloc didn't work, try once more from beginning. */
718 if (ocfs2_resv_empty(resv) && goal != 0)
719 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
720
721 if (ocfs2_resv_empty(resv)) {
722 /*
723 * Still empty? Pull oldest one off the LRU, remove it from
 724 * tree, and put this one in its place.
725 */
726 ocfs2_cannibalize_resv(resmap, resv, wanted);
727 }
728
729 BUG_ON(ocfs2_resv_empty(resv));
730}
731
732int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
733 struct ocfs2_alloc_reservation *resv,
734 int *cstart, int *clen)
735{
736 unsigned int wanted = *clen;
737
738 if (resv == NULL || ocfs2_resmap_disabled(resmap))
739 return -ENOSPC;
740
741 spin_lock(&resv_lock);
742
743 /*
744 * We don't want to over-allocate for temporary
745 * windows. Otherwise, we run the risk of fragmenting the
746 * allocation space.
747 */
748 wanted = ocfs2_resv_window_bits(resmap, resv);
749 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
750 wanted = *clen;
751
752 if (ocfs2_resv_empty(resv)) {
753 mlog(0, "empty reservation, find new window\n");
754
755 /*
756 * Try to get a window here. If it works, we must fall
 757 * through and test the bitmap. This avoids some
 758 * ping-ponging of windows due to non-reserved space
 759 * being allocated before we initialize a window for
760 * that inode.
761 */
762 ocfs2_resv_find_window(resmap, resv, wanted);
763 }
764
765 BUG_ON(ocfs2_resv_empty(resv));
766
767 *cstart = resv->r_start;
768 *clen = resv->r_len;
769
770 spin_unlock(&resv_lock);
771 return 0;
772}
773
774static void
 775ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
776 struct ocfs2_alloc_reservation *resv,
777 unsigned int start, unsigned int end)
778{
779 unsigned int rhs = 0;
780 unsigned int old_end = ocfs2_resv_end(resv);
781
782 BUG_ON(start != resv->r_start || old_end < end);
783
784 /*
785 * Completely used? We can remove it then.
786 */
787 if (old_end == end) {
788 __ocfs2_resv_discard(resmap, resv);
789 return;
790 }
791
792 rhs = old_end - end;
793
794 /*
795 * This should have been trapped above.
796 */
797 BUG_ON(rhs == 0);
798
799 resv->r_start = end + 1;
800 resv->r_len = old_end - resv->r_start + 1;
801}
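A worked example of the adjustment above (numbers invented): allocations always start at r_start, so the window either disappears entirely or loses its head.

#include <stdio.h>

int main(void)
{
	unsigned r_start = 10, r_len = 20;	/* window [10, 29] */
	unsigned a_start = 10, a_end = 14;	/* claimed [10, 14] */
	unsigned old_end = r_start + r_len - 1;

	(void)a_start;	/* must equal r_start, as the BUG_ON enforces */
	if (old_end == a_end) {
		printf("window fully used, discard\n");
	} else {
		r_start = a_end + 1;
		r_len = old_end - r_start + 1;
		printf("window now [%u, %u]\n", r_start,
		       r_start + r_len - 1);	/* [15, 29] */
	}
	return 0;
}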
802
803void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
804 struct ocfs2_alloc_reservation *resv,
805 u32 cstart, u32 clen)
806{
807 unsigned int cend = cstart + clen - 1;
808
809 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
810 return;
811
812 if (resv == NULL)
813 return;
814
815 BUG_ON(cstart != resv->r_start);
816
817 spin_lock(&resv_lock);
818
819 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
820 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
821 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
822 resv->r_len, resv->r_last_start, resv->r_last_len);
823
824 BUG_ON(cstart < resv->r_start);
825 BUG_ON(cstart > ocfs2_resv_end(resv));
826 BUG_ON(cend > ocfs2_resv_end(resv));
827
828 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
829 resv->r_last_start = cstart;
830 resv->r_last_len = clen;
831
832 /*
833 * May have been discarded above from
834 * ocfs2_adjust_resv_from_alloc().
835 */
836 if (!ocfs2_resv_empty(resv))
837 ocfs2_resv_mark_lru(resmap, resv);
838
839 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
840 "r_len: %u r_last_start: %u r_last_len: %u\n",
841 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
842 resv->r_last_start, resv->r_last_len);
843
844 ocfs2_check_resmap(resmap);
845
846 spin_unlock(&resv_lock);
847}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
 32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
42#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
 44 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
 74 * @resmap: reservation map holding the window
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
 86 * @osb: ocfs2 superblock owning this reservation map
 87 * @resmap: struct ocfs2_reservation_map to initialize
 88 *
 89 * The disk bitmap itself is supplied later via ocfs2_resmap_restart().
90 *
91 * Only possible return value other than '0' is -ENOMEM for failure to
 92 * allocate the mirror bitmap.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
 106 * This function will truncate and discard all existing
107 * reservations. A future version will recalculate existing
108 * reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: start of proposed allocation
124 * @clen: length (in clusters) of proposed allocation
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
 143 * @resv: optional reservation to recalculate based on new bitmap
144 * @cstart: start of allocation in clusters
 145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
151 * reservations bitmap. If resv is passed, it's next allocation window will be
152 * calculated. It also expects that 'cstart' is the same as we passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
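Taken together, the intended calling sequence is: ask ocfs2_resmap_resv_bits() for a window, allocate from it against the real disk bitmap, then report what was taken with ocfs2_resmap_claimed_bits(). A hypothetical caller, sketched under that assumption with error handling trimmed; alloc_from_resv() and the bitmap step are invented for illustration, not real ocfs2 functions:

/* Assumes <linux/errno.h> and reservations.h. */
static int alloc_from_resv(struct ocfs2_reservation_map *resmap,
			   struct ocfs2_alloc_reservation *resv,
			   unsigned int wanted)
{
	int cstart, clen = wanted;

	/* Ask for a window; fails only when reservations are disabled. */
	if (ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen))
		return -ENOSPC;

	/* ... verify [cstart, cstart + clen) against the disk bitmap and
	 * set the bits actually taken ... */

	/* Report the claim so the window can shrink or be discarded. */
	ocfs2_resmap_claimed_bits(resmap, resv, cstart, clen);
	return 0;
}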
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 19ba00f28547..f4c2a9eb8c4d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */
63};
64
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 69 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 70 struct buffer_head *bg_bh,
62 u64 group_blkno, 71 u64 group_blkno,
72 unsigned int group_clusters,
63 u16 my_chain, 73 u16 my_chain,
64 struct ocfs2_chain_list *cl); 74 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 75static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 83 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 84 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 85 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 86 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 87static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 88 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 89 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 90 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 91 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 92static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 93 handle_t *handle,
85 u32 bits_wanted, 94 u32 bits_wanted,
86 u32 min_bits, 95 u32 min_bits,
87 u16 *bit_off, 96 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 97static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 98 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 99static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -130,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
130 } 137 }
131 brelse(ac->ac_bh); 138 brelse(ac->ac_bh);
132 ac->ac_bh = NULL; 139 ac->ac_bh = NULL;
140 ac->ac_resv = NULL;
133} 141}
134 142
135void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -325,14 +333,38 @@ out:
325 return rc; 333 return rc;
326} 334}
327 335
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters)
340{
341 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec;
343
344 BUG_ON(!ocfs2_supports_discontig_bg(osb));
345 if (!el->l_next_free_rec)
346 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc));
355 le16_add_cpu(&el->l_next_free_rec, 1);
356}
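A worked example of the bookkeeping above, with invented numbers: at cl_bpc = 4 bits per cluster, building a group from chunks of 8 then 4 clusters yields extent records at cpos 0 and 8, while bg_bits grows by clusters * bpc each time.

#include <stdio.h>

int main(void)
{
	unsigned bpc = 4, bg_bits = 0, chunks[] = { 8, 4 };

	for (int i = 0; i < 2; i++) {
		printf("rec %d: e_cpos=%u e_leaf_clusters=%u\n",
		       i, bg_bits / bpc, chunks[i]);
		bg_bits += chunks[i] * bpc;	/* 32, then 48 */
	}
	printf("bg_bits=%u\n", bg_bits);
	return 0;
}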
357
328static int ocfs2_block_group_fill(handle_t *handle, 358static int ocfs2_block_group_fill(handle_t *handle,
329 struct inode *alloc_inode, 359 struct inode *alloc_inode,
330 struct buffer_head *bg_bh, 360 struct buffer_head *bg_bh,
331 u64 group_blkno, 361 u64 group_blkno,
362 unsigned int group_clusters,
332 u16 my_chain, 363 u16 my_chain,
333 struct ocfs2_chain_list *cl) 364 struct ocfs2_chain_list *cl)
334{ 365{
335 int status = 0; 366 int status = 0;
367 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
336 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 368 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
338 370
@@ -359,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
359 memset(bg, 0, sb->s_blocksize); 391 memset(bg, 0, sb->s_blocksize);
360 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 392 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 393 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 394 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
363 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 395 osb->s_feature_incompat));
364 bg->bg_chain = cpu_to_le16(my_chain); 396 bg->bg_chain = cpu_to_le16(my_chain);
365 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 397 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 398 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367 bg->bg_blkno = cpu_to_le64(group_blkno); 399 bg->bg_blkno = cpu_to_le64(group_blkno);
400 if (group_clusters == le16_to_cpu(cl->cl_cpg))
401 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402 else
403 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404 group_clusters);
405
368 /* set the 1st bit in the bitmap to account for the descriptor block */ 406 /* set the 1st bit in the bitmap to account for the descriptor block */
369 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 407 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 408 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371 409
372 status = ocfs2_journal_dirty(handle, bg_bh); 410 ocfs2_journal_dirty(handle, bg_bh);
373 if (status < 0)
374 mlog_errno(status);
375 411
376 /* There is no need to zero out or otherwise initialize the 412 /* There is no need to zero out or otherwise initialize the
377 * other blocks in a group - All valid FS metadata in a block 413 * other blocks in a group - All valid FS metadata in a block
@@ -397,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
397 return best; 433 return best;
398} 434}
399 435
436static struct buffer_head *
437ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438 struct inode *alloc_inode,
439 struct ocfs2_alloc_context *ac,
440 struct ocfs2_chain_list *cl)
441{
442 int status;
443 u32 bit_off, num_bits;
444 u64 bg_blkno;
445 struct buffer_head *bg_bh;
446 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448 status = ocfs2_claim_clusters(handle, ac,
449 le16_to_cpu(cl->cl_cpg), &bit_off,
450 &num_bits);
451 if (status < 0) {
452 if (status != -ENOSPC)
453 mlog_errno(status);
454 goto bail;
455 }
456
457 /* setup the group */
458 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 mlog(0, "new descriptor, record %u, at block %llu\n",
460 alloc_rec, (unsigned long long)bg_blkno);
461
462 bg_bh = sb_getblk(osb->sb, bg_blkno);
463 if (!bg_bh) {
464 status = -EIO;
465 mlog_errno(status);
466 goto bail;
467 }
468 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471 bg_blkno, num_bits, alloc_rec, cl);
472 if (status < 0) {
473 brelse(bg_bh);
474 mlog_errno(status);
475 }
476
477bail:
478 return status ? ERR_PTR(status) : bg_bh;
479}
480
481static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482 handle_t *handle,
483 struct ocfs2_alloc_context *ac,
484 unsigned int min_bits,
485 u32 *bit_off, u32 *num_bits)
486{
487 int status = 0;
488
489 while (min_bits) {
490 status = ocfs2_claim_clusters(handle, ac, min_bits,
491 bit_off, num_bits);
492 if (status != -ENOSPC)
493 break;
494
495 min_bits >>= 1;
496 }
497
498 return status;
499}
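The loop above is a simple halving back-off: each -ENOSPC halves the request until a claim fits or min_bits reaches zero. A standalone sketch, where try_claim() is an invented stand-in for ocfs2_claim_clusters():

#include <stdio.h>

/* Invented stand-in: only runs of at most 'avail' clusters exist, so
 * larger requests fail. */
static int try_claim(unsigned want, unsigned avail, unsigned *got)
{
	if (want > avail)
		return -1;	/* stands in for -ENOSPC */
	*got = want;
	return 0;
}

int main(void)
{
	unsigned min_bits = 32, got = 0, avail = 5;
	int status = -1;

	/* Same shape as the kernel loop: halve the request on each miss. */
	while (min_bits) {
		status = try_claim(min_bits, avail, &got);
		if (status != -1)
			break;
		min_bits >>= 1;
	}
	printf("status=%d claimed=%u\n", status, got);	/* claims 4 */
	return 0;
}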
500
501static int ocfs2_block_group_grow_discontig(handle_t *handle,
502 struct inode *alloc_inode,
503 struct buffer_head *bg_bh,
504 struct ocfs2_alloc_context *ac,
505 struct ocfs2_chain_list *cl,
506 unsigned int min_bits)
507{
508 int status;
509 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510 struct ocfs2_group_desc *bg =
511 (struct ocfs2_group_desc *)bg_bh->b_data;
512 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514 u32 p_cpos, clusters;
515 u64 p_blkno;
516 struct ocfs2_extent_list *el = &bg->bg_list;
517
518 status = ocfs2_journal_access_gd(handle,
519 INODE_CACHE(alloc_inode),
520 bg_bh,
521 OCFS2_JOURNAL_ACCESS_CREATE);
522 if (status < 0) {
523 mlog_errno(status);
524 goto bail;
525 }
526
527 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528 le16_to_cpu(el->l_count))) {
529 if (min_bits > needed)
530 min_bits = needed;
531 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532 min_bits, &p_cpos,
533 &clusters);
534 if (status < 0) {
535 if (status != -ENOSPC)
536 mlog_errno(status);
537 goto bail;
538 }
539 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541 clusters);
542
543 min_bits = clusters;
544 needed = le16_to_cpu(cl->cl_cpg) -
545 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546 }
547
548 if (needed > 0) {
549 /*
 550 * We have used up all the extent records but still can't
 551 * fill up the cpg. So bail out.
552 */
553 status = -ENOSPC;
554 goto bail;
555 }
556
557 ocfs2_journal_dirty(handle, bg_bh);
558
559bail:
560 return status;
561}
562
563static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564 struct ocfs2_alloc_context *cluster_ac,
565 struct inode *alloc_inode,
566 struct buffer_head *bg_bh)
567{
568 int i, ret;
569 struct ocfs2_group_desc *bg;
570 struct ocfs2_extent_list *el;
571 struct ocfs2_extent_rec *rec;
572
573 if (!bg_bh)
574 return;
575
576 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577 el = &bg->bg_list;
578 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579 rec = &el->l_recs[i];
580 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581 cluster_ac->ac_bh,
582 le64_to_cpu(rec->e_blkno),
583 le32_to_cpu(rec->e_leaf_clusters));
584 if (ret)
585 mlog_errno(ret);
 586 /* Try to free all the clusters even if one fails */
587 }
588
589 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590 brelse(bg_bh);
591}
592
593static struct buffer_head *
594ocfs2_block_group_alloc_discontig(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_alloc_context *ac,
597 struct ocfs2_chain_list *cl)
598{
599 int status;
600 u32 bit_off, num_bits;
601 u64 bg_blkno;
602 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603 struct buffer_head *bg_bh = NULL;
604 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607 if (!ocfs2_supports_discontig_bg(osb)) {
608 status = -ENOSPC;
609 goto bail;
610 }
611
612 status = ocfs2_extend_trans(handle,
613 ocfs2_calc_bg_discontig_credits(osb->sb));
614 if (status) {
615 mlog_errno(status);
616 goto bail;
617 }
618
619 /*
620 * We're going to be grabbing from multiple cluster groups.
621 * We don't have enough credits to relink them all, and the
622 * cluster groups will be staying in cache for the duration of
623 * this operation.
624 */
625 ac->ac_allow_chain_relink = 0;
626
627 /* Claim the first region */
628 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629 &bit_off, &num_bits);
630 if (status < 0) {
631 if (status != -ENOSPC)
632 mlog_errno(status);
633 goto bail;
634 }
635 min_bits = num_bits;
636
637 /* setup the group */
638 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639 mlog(0, "new descriptor, record %u, at block %llu\n",
640 alloc_rec, (unsigned long long)bg_blkno);
641
642 bg_bh = sb_getblk(osb->sb, bg_blkno);
643 if (!bg_bh) {
644 status = -EIO;
645 mlog_errno(status);
646 goto bail;
647 }
648 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651 bg_blkno, num_bits, alloc_rec, cl);
652 if (status < 0) {
653 mlog_errno(status);
654 goto bail;
655 }
656
657 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658 bg_bh, ac, cl, min_bits);
659 if (status)
660 mlog_errno(status);
661
662bail:
663 if (status)
664 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665 return status ? ERR_PTR(status) : bg_bh;
666}
667
400/* 668/*
401 * We expect the block group allocator to already be locked. 669 * We expect the block group allocator to already be locked.
402 */ 670 */
@@ -412,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
412 struct ocfs2_chain_list *cl; 680 struct ocfs2_chain_list *cl;
413 struct ocfs2_alloc_context *ac = NULL; 681 struct ocfs2_alloc_context *ac = NULL;
414 handle_t *handle = NULL; 682 handle_t *handle = NULL;
415 u32 bit_off, num_bits;
416 u16 alloc_rec; 683 u16 alloc_rec;
417 u64 bg_blkno;
418 struct buffer_head *bg_bh = NULL; 684 struct buffer_head *bg_bh = NULL;
419 struct ocfs2_group_desc *bg; 685 struct ocfs2_group_desc *bg;
420 686
@@ -447,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
447 (unsigned long long)*last_alloc_group); 713 (unsigned long long)*last_alloc_group);
448 ac->ac_last_group = *last_alloc_group; 714 ac->ac_last_group = *last_alloc_group;
449 } 715 }
450 status = ocfs2_claim_clusters(osb, 716
451 handle, 717 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
452 ac, 718 ac, cl);
453 le16_to_cpu(cl->cl_cpg), 719 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
454 &bit_off, 720 bg_bh = ocfs2_block_group_alloc_discontig(handle,
455 &num_bits); 721 alloc_inode,
456 if (status < 0) { 722 ac, cl);
723 if (IS_ERR(bg_bh)) {
724 status = PTR_ERR(bg_bh);
725 bg_bh = NULL;
457 if (status != -ENOSPC) 726 if (status != -ENOSPC)
458 mlog_errno(status); 727 mlog_errno(status);
459 goto bail; 728 goto bail;
460 } 729 }
461
462 alloc_rec = ocfs2_find_smallest_chain(cl);
463
464 /* setup the group */
465 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
466 mlog(0, "new descriptor, record %u, at block %llu\n",
467 alloc_rec, (unsigned long long)bg_blkno);
468
469 bg_bh = sb_getblk(osb->sb, bg_blkno);
470 if (!bg_bh) {
471 status = -EIO;
472 mlog_errno(status);
473 goto bail;
474 }
475 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
476
477 status = ocfs2_block_group_fill(handle,
478 alloc_inode,
479 bg_bh,
480 bg_blkno,
481 alloc_rec,
482 cl);
483 if (status < 0) {
484 mlog_errno(status);
485 goto bail;
486 }
487
488 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 730 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
489 731
490 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 732 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -494,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
494 goto bail; 736 goto bail;
495 } 737 }
496 738
739 alloc_rec = le16_to_cpu(bg->bg_chain);
497 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 740 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
498 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
499 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
500 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
501 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
502 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
503 747
@@ -506,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
506 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 750 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
507 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 751 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
508 752
509 status = ocfs2_journal_dirty(handle, bh); 753 ocfs2_journal_dirty(handle, bh);
510 if (status < 0) {
511 mlog_errno(status);
512 goto bail;
513 }
514 754
515 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 755 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
516 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 756 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -760,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
760 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1000 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
761 EXTENT_ALLOC_SYSTEM_INODE, 1001 EXTENT_ALLOC_SYSTEM_INODE,
762 (u32)osb->slot_num, NULL, 1002 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP); 1003 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
764 1004
765 1005
766 if (status >= 0) { 1006 if (status >= 0) {
@@ -946,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
946 status = ocfs2_reserve_local_alloc_bits(osb, 1186 status = ocfs2_reserve_local_alloc_bits(osb,
947 bits_wanted, 1187 bits_wanted,
948 *ac); 1188 *ac);
949 if (status == -EFBIG) { 1189 if ((status < 0) && (status != -ENOSPC)) {
950 /* The local alloc window is outside ac_max_block.
951 * use the main bitmap. */
952 status = -ENOSPC;
953 } else if ((status < 0) && (status != -ENOSPC)) {
954 mlog_errno(status); 1190 mlog_errno(status);
955 goto bail; 1191 goto bail;
956 } 1192 }
@@ -1033,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1033 struct buffer_head *bg_bh, 1269 struct buffer_head *bg_bh,
1034 unsigned int bits_wanted, 1270 unsigned int bits_wanted,
1035 unsigned int total_bits, 1271 unsigned int total_bits,
1036 u16 *bit_off, 1272 struct ocfs2_suballoc_result *res)
1037 u16 *bits_found)
1038{ 1273{
1039 void *bitmap; 1274 void *bitmap;
1040 u16 best_offset, best_size; 1275 u16 best_offset, best_size;
@@ -1078,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1078 } 1313 }
1079 } 1314 }
1080 1315
1081 /* XXX: I think the first clause is equivalent to the second 1316 if (best_size) {
1082 * - jlbec */ 1317 res->sr_bit_offset = best_offset;
1083 if (found == bits_wanted) { 1318 res->sr_bits = best_size;
1084 *bit_off = start - found;
1085 *bits_found = found;
1086 } else if (best_size) {
1087 *bit_off = best_offset;
1088 *bits_found = best_size;
1089 } else { 1319 } else {
1090 status = -ENOSPC; 1320 status = -ENOSPC;
1091 /* No error log here -- see the comment above 1321 /* No error log here -- see the comment above
@@ -1129,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1129 } 1359 }
1130 1360
1131 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1132
1133 while(num_bits--) 1362 while(num_bits--)
1134 ocfs2_set_bit(bit_off++, bitmap); 1363 ocfs2_set_bit(bit_off++, bitmap);
1135 1364
1136 status = ocfs2_journal_dirty(handle, 1365 ocfs2_journal_dirty(handle, group_bh);
1137 group_bh);
1138 if (status < 0) {
1139 mlog_errno(status);
1140 goto bail;
1141 }
1142 1366
1143bail: 1367bail:
1144 mlog_exit(status); 1368 mlog_exit(status);
@@ -1202,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1202 } 1426 }
1203 1427
1204 prev_bg->bg_next_group = bg->bg_next_group; 1428 prev_bg->bg_next_group = bg->bg_next_group;
1205 1429 ocfs2_journal_dirty(handle, prev_bg_bh);
1206 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1207 if (status < 0) {
1208 mlog_errno(status);
1209 goto out_rollback;
1210 }
1211 1430
1212 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1431 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1213 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1432 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1217,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1217 } 1436 }
1218 1437
1219 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1438 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1220 1439 ocfs2_journal_dirty(handle, bg_bh);
1221 status = ocfs2_journal_dirty(handle, bg_bh);
1222 if (status < 0) {
1223 mlog_errno(status);
1224 goto out_rollback;
1225 }
1226 1440
1227 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1441 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1228 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1442 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1232,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1232 } 1446 }
1233 1447
1234 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1448 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449 ocfs2_journal_dirty(handle, fe_bh);
1235 1450
1236 status = ocfs2_journal_dirty(handle, fe_bh);
1237 if (status < 0) {
1238 mlog_errno(status);
1239 goto out_rollback;
1240 }
1241
1242 status = 0;
1243out_rollback: 1451out_rollback:
1244 if (status < 0) { 1452 if (status < 0) {
1245 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1453 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1263,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1263 struct buffer_head *group_bh, 1471 struct buffer_head *group_bh,
1264 u32 bits_wanted, u32 min_bits, 1472 u32 bits_wanted, u32 min_bits,
1265 u64 max_block, 1473 u64 max_block,
1266 u16 *bit_off, u16 *bits_found) 1474 struct ocfs2_suballoc_result *res)
1267{ 1475{
1268 int search = -ENOSPC; 1476 int search = -ENOSPC;
1269 int ret; 1477 int ret;
1270 u64 blkoff; 1478 u64 blkoff;
1271 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1479 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273 u16 tmp_off, tmp_found;
1274 unsigned int max_bits, gd_cluster_off; 1481 unsigned int max_bits, gd_cluster_off;
1275 1482
1276 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1483 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1297,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1297 1504
1298 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1505 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1299 group_bh, bits_wanted, 1506 group_bh, bits_wanted,
1300 max_bits, 1507 max_bits, res);
1301 &tmp_off, &tmp_found);
1302 if (ret) 1508 if (ret)
1303 return ret; 1509 return ret;
1304 1510
1305 if (max_block) { 1511 if (max_block) {
1306 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1512 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1307 gd_cluster_off + 1513 gd_cluster_off +
1308 tmp_off + tmp_found); 1514 res->sr_bit_offset +
1515 res->sr_bits);
1309 mlog(0, "Checking %llu against %llu\n", 1516 mlog(0, "Checking %llu against %llu\n",
1310 (unsigned long long)blkoff, 1517 (unsigned long long)blkoff,
1311 (unsigned long long)max_block); 1518 (unsigned long long)max_block);
@@ -1317,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1317 * return success, but we still want to return 1524 * return success, but we still want to return
1318 * -ENOSPC unless it found the minimum number 1525 * -ENOSPC unless it found the minimum number
1319 * of bits. */ 1526 * of bits. */
1320 if (min_bits <= tmp_found) { 1527 if (min_bits <= res->sr_bits)
1321 *bit_off = tmp_off;
1322 *bits_found = tmp_found;
1323 search = 0; /* success */ 1528 search = 0; /* success */
1324 } else if (tmp_found) { 1529 else if (res->sr_bits) {
1325 /* 1530 /*
1326 * Don't show bits which we'll be returning 1531 * Don't show bits which we'll be returning
1327 * for allocation to the local alloc bitmap. 1532 * for allocation to the local alloc bitmap.
1328 */ 1533 */
1329 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1534 ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1330 } 1535 }
1331 } 1536 }
1332 1537
@@ -1337,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1337 struct buffer_head *group_bh, 1542 struct buffer_head *group_bh,
1338 u32 bits_wanted, u32 min_bits, 1543 u32 bits_wanted, u32 min_bits,
1339 u64 max_block, 1544 u64 max_block,
1340 u16 *bit_off, u16 *bits_found) 1545 struct ocfs2_suballoc_result *res)
1341{ 1546{
1342 int ret = -ENOSPC; 1547 int ret = -ENOSPC;
1343 u64 blkoff; 1548 u64 blkoff;
@@ -1350,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1350 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1555 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1351 group_bh, bits_wanted, 1556 group_bh, bits_wanted,
1352 le16_to_cpu(bg->bg_bits), 1557 le16_to_cpu(bg->bg_bits),
1353 bit_off, bits_found); 1558 res);
1354 if (!ret && max_block) { 1559 if (!ret && max_block) {
1355 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1560 blkoff = le64_to_cpu(bg->bg_blkno) +
1356 *bits_found; 1561 res->sr_bit_offset + res->sr_bits;
1357 mlog(0, "Checking %llu against %llu\n", 1562 mlog(0, "Checking %llu against %llu\n",
1358 (unsigned long long)blkoff, 1563 (unsigned long long)blkoff,
1359 (unsigned long long)max_block); 1564 (unsigned long long)max_block);
@@ -1386,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1386 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1591 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1387 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1592 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1388 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1593 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1389 1594 ocfs2_journal_dirty(handle, di_bh);
1390 ret = ocfs2_journal_dirty(handle, di_bh);
1391 if (ret < 0)
1392 mlog_errno(ret);
1393 1595
1394out: 1596out:
1395 return ret; 1597 return ret;
1396} 1598}
1397 1599
1600static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601 struct ocfs2_extent_rec *rec,
1602 struct ocfs2_chain_list *cl)
1603{
1604 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608 if (res->sr_bit_offset < bitoff)
1609 return 0;
1610 if (res->sr_bit_offset >= (bitoff + bitcount))
1611 return 0;
1612 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613 (res->sr_bit_offset - bitoff);
1614 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616 return 1;
1617}
1618
1619static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620 struct ocfs2_group_desc *bg,
1621 struct ocfs2_suballoc_result *res)
1622{
1623 int i;
1624 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1625 struct ocfs2_extent_rec *rec;
1626 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630 res->sr_blkno = 0;
1631 return;
1632 }
1633
1634 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1636 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637 !bg->bg_list.l_next_free_rec)
1638 return;
1639
1640 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641 rec = &bg->bg_list.l_recs[i];
1642 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643 res->sr_bg_blkno = bg_blkno; /* Restore */
1644 break;
1645 }
1646 }
1647}
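The two helpers above carry the core of the discontiguous-group support. A sketch of the mapping ocfs2_bg_discontig_fix_by_rec() performs, using the names from the hunk (illustrative restatement, not a verbatim excerpt):

    /*
     * A hit at sr_bit_offset inside an extent record covering bits
     * [bitoff, bitoff + bitcount) of the group maps to disk as:
     *
     *     sr_blkno = le64_to_cpu(rec->e_blkno) + (sr_bit_offset - bitoff);
     *
     * and the claim is clipped so it never crosses the extent boundary:
     *
     *     if (sr_bit_offset + sr_bits > bitoff + bitcount)
     *             sr_bits = (bitoff + bitcount) - sr_bit_offset;
     */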
1648
1398static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1649static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1399 handle_t *handle, 1650 handle_t *handle,
1400 u32 bits_wanted, 1651 u32 bits_wanted,
1401 u32 min_bits, 1652 u32 min_bits,
1402 u16 *bit_off, 1653 struct ocfs2_suballoc_result *res,
1403 unsigned int *num_bits,
1404 u64 gd_blkno,
1405 u16 *bits_left) 1654 u16 *bits_left)
1406{ 1655{
1407 int ret; 1656 int ret;
1408 u16 found;
1409 struct buffer_head *group_bh = NULL; 1657 struct buffer_head *group_bh = NULL;
1410 struct ocfs2_group_desc *gd; 1658 struct ocfs2_group_desc *gd;
1411 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1659 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1412 struct inode *alloc_inode = ac->ac_inode; 1660 struct inode *alloc_inode = ac->ac_inode;
1413 1661
1414 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1662 ret = ocfs2_read_group_descriptor(alloc_inode, di,
1415 &group_bh); 1663 res->sr_bg_blkno, &group_bh);
1416 if (ret < 0) { 1664 if (ret < 0) {
1417 mlog_errno(ret); 1665 mlog_errno(ret);
1418 return ret; 1666 return ret;
@@ -1420,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1420 1668
1421 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1669 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1422 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1670 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1423 ac->ac_max_block, bit_off, &found); 1671 ac->ac_max_block, res);
1424 if (ret < 0) { 1672 if (ret < 0) {
1425 if (ret != -ENOSPC) 1673 if (ret != -ENOSPC)
1426 mlog_errno(ret); 1674 mlog_errno(ret);
1427 goto out; 1675 goto out;
1428 } 1676 }
1429 1677
1430 *num_bits = found; 1678 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res);
1431 1680
1432 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1433 *num_bits, 1682 res->sr_bits,
1434 le16_to_cpu(gd->bg_chain)); 1683 le16_to_cpu(gd->bg_chain));
1435 if (ret < 0) { 1684 if (ret < 0) {
1436 mlog_errno(ret); 1685 mlog_errno(ret);
@@ -1438,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1438 } 1687 }
1439 1688
1440 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1689 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1441 *bit_off, *num_bits); 1690 res->sr_bit_offset, res->sr_bits);
1442 if (ret < 0) 1691 if (ret < 0)
1443 mlog_errno(ret); 1692 mlog_errno(ret);
1444 1693
@@ -1454,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1454 handle_t *handle, 1703 handle_t *handle,
1455 u32 bits_wanted, 1704 u32 bits_wanted,
1456 u32 min_bits, 1705 u32 min_bits,
1457 u16 *bit_off, 1706 struct ocfs2_suballoc_result *res,
1458 unsigned int *num_bits,
1459 u64 *bg_blkno,
1460 u16 *bits_left) 1707 u16 *bits_left)
1461{ 1708{
1462 int status; 1709 int status;
1463 u16 chain, tmp_bits; 1710 u16 chain;
1464 u32 tmp_used; 1711 u32 tmp_used;
1465 u64 next_group; 1712 u64 next_group;
1466 struct inode *alloc_inode = ac->ac_inode; 1713 struct inode *alloc_inode = ac->ac_inode;
@@ -1489,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1489 * the 1st group with any empty bits. */ 1736 * the 1st group with any empty bits. */
1490 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1737 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1491 bits_wanted, min_bits, 1738 bits_wanted, min_bits,
1492 ac->ac_max_block, bit_off, 1739 ac->ac_max_block,
1493 &tmp_bits)) == -ENOSPC) { 1740 res)) == -ENOSPC) {
1494 if (!bg->bg_next_group) 1741 if (!bg->bg_next_group)
1495 break; 1742 break;
1496 1743
@@ -1515,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1515 } 1762 }
1516 1763
1517 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1764 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1518 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1765 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1519 1766
1520 *num_bits = tmp_bits; 1767 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768
1769 BUG_ON(res->sr_bits == 0);
1770 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res);
1521 1772
1522 BUG_ON(*num_bits == 0);
1523 1773
1524 /* 1774 /*
1525 * Keep track of previous block descriptor read. When 1775 * Keep track of previous block descriptor read. When
@@ -1536,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1536 */ 1786 */
1537 if (ac->ac_allow_chain_relink && 1787 if (ac->ac_allow_chain_relink &&
1538 (prev_group_bh) && 1788 (prev_group_bh) &&
1539 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1789 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1540 status = ocfs2_relink_block_group(handle, alloc_inode, 1790 status = ocfs2_relink_block_group(handle, alloc_inode,
1541 ac->ac_bh, group_bh, 1791 ac->ac_bh, group_bh,
1542 prev_group_bh, chain); 1792 prev_group_bh, chain);
@@ -1558,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1558 } 1808 }
1559 1809
1560 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1561 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1562 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1563 1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1564 status = ocfs2_journal_dirty(handle,
1565 ac->ac_bh);
1566 if (status < 0) {
1567 mlog_errno(status);
1568 goto bail;
1569 }
1570 1814
1571 status = ocfs2_block_group_set_bits(handle, 1815 status = ocfs2_block_group_set_bits(handle,
1572 alloc_inode, 1816 alloc_inode,
1573 bg, 1817 bg,
1574 group_bh, 1818 group_bh,
1575 *bit_off, 1819 res->sr_bit_offset,
1576 *num_bits); 1820 res->sr_bits);
1577 if (status < 0) { 1821 if (status < 0) {
1578 mlog_errno(status); 1822 mlog_errno(status);
1579 goto bail; 1823 goto bail;
1580 } 1824 }
1581 1825
1582 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1583 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1827 (unsigned long long)le64_to_cpu(fe->i_blkno));
1584 1828
1585 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1586 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1587bail: 1830bail:
1588 brelse(group_bh); 1831 brelse(group_bh);
@@ -1593,19 +1836,15 @@ bail:
1593} 1836}
1594 1837
1595/* will give out up to bits_wanted contiguous bits. */ 1838/* will give out up to bits_wanted contiguous bits. */
1596static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1839static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1597 struct ocfs2_alloc_context *ac,
1598 handle_t *handle, 1840 handle_t *handle,
1599 u32 bits_wanted, 1841 u32 bits_wanted,
1600 u32 min_bits, 1842 u32 min_bits,
1601 u16 *bit_off, 1843 struct ocfs2_suballoc_result *res)
1602 unsigned int *num_bits,
1603 u64 *bg_blkno)
1604{ 1844{
1605 int status; 1845 int status;
1606 u16 victim, i; 1846 u16 victim, i;
1607 u16 bits_left = 0; 1847 u16 bits_left = 0;
1608 u64 hint_blkno = ac->ac_last_group;
1609 struct ocfs2_chain_list *cl; 1848 struct ocfs2_chain_list *cl;
1610 struct ocfs2_dinode *fe; 1849 struct ocfs2_dinode *fe;
1611 1850
@@ -1623,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1623 1862
1624 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1863 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1625 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1864 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1626 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1865 ocfs2_error(ac->ac_inode->i_sb,
1866 "Chain allocator dinode %llu has %u used "
1627 "bits but only %u total.", 1867 "bits but only %u total.",
1628 (unsigned long long)le64_to_cpu(fe->i_blkno), 1868 (unsigned long long)le64_to_cpu(fe->i_blkno),
1629 le32_to_cpu(fe->id1.bitmap1.i_used), 1869 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1632,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1632 goto bail; 1872 goto bail;
1633 } 1873 }
1634 1874
1635 if (hint_blkno) { 1875 res->sr_bg_blkno = ac->ac_last_group;
1876 if (res->sr_bg_blkno) {
1636 /* Attempt to short-circuit the usual search mechanism 1877 /* Attempt to short-circuit the usual search mechanism
1637 * by jumping straight to the most recently used 1878 * by jumping straight to the most recently used
1638 * allocation group. This helps us maintain some 1879 * allocation group. This helps us maintain some
1639 * contiguousness across allocations. */ 1880 * contiguousness across allocations. */
1640 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1881 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1641 min_bits, bit_off, num_bits, 1882 min_bits, res, &bits_left);
1642 hint_blkno, &bits_left); 1883 if (!status)
1643 if (!status) {
1644 /* Be careful to update *bg_blkno here as the
1645 * caller is expecting it to be filled in, and
1646 * ocfs2_search_one_group() won't do that for
1647 * us. */
1648 *bg_blkno = hint_blkno;
1649 goto set_hint; 1884 goto set_hint;
1650 }
1651 if (status < 0 && status != -ENOSPC) { 1885 if (status < 0 && status != -ENOSPC) {
1652 mlog_errno(status); 1886 mlog_errno(status);
1653 goto bail; 1887 goto bail;
@@ -1660,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1660 ac->ac_chain = victim; 1894 ac->ac_chain = victim;
1661 ac->ac_allow_chain_relink = 1; 1895 ac->ac_allow_chain_relink = 1;
1662 1896
1663 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1664 num_bits, bg_blkno, &bits_left); 1898 res, &bits_left);
1665 if (!status) 1899 if (!status)
1666 goto set_hint; 1900 goto set_hint;
1667 if (status < 0 && status != -ENOSPC) { 1901 if (status < 0 && status != -ENOSPC) {
@@ -1685,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1685 1919
1686 ac->ac_chain = i; 1920 ac->ac_chain = i;
1687 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1688 bit_off, num_bits, bg_blkno, 1922 res, &bits_left);
1689 &bits_left);
1690 if (!status) 1923 if (!status)
1691 break; 1924 break;
1692 if (status < 0 && status != -ENOSPC) { 1925 if (status < 0 && status != -ENOSPC) {
@@ -1703,7 +1936,7 @@ set_hint:
1703 if (bits_left < min_bits) 1936 if (bits_left < min_bits)
1704 ac->ac_last_group = 0; 1937 ac->ac_last_group = 0;
1705 else 1938 else
1706 ac->ac_last_group = *bg_blkno; 1939 ac->ac_last_group = res->sr_bg_blkno;
1707 } 1940 }
1708 1941
1709bail: 1942bail:
@@ -1711,37 +1944,37 @@ bail:
1711 return status; 1944 return status;
1712} 1945}
1713 1946
1714int ocfs2_claim_metadata(struct ocfs2_super *osb, 1947int ocfs2_claim_metadata(handle_t *handle,
1715 handle_t *handle,
1716 struct ocfs2_alloc_context *ac, 1948 struct ocfs2_alloc_context *ac,
1717 u32 bits_wanted, 1949 u32 bits_wanted,
1950 u64 *suballoc_loc,
1718 u16 *suballoc_bit_start, 1951 u16 *suballoc_bit_start,
1719 unsigned int *num_bits, 1952 unsigned int *num_bits,
1720 u64 *blkno_start) 1953 u64 *blkno_start)
1721{ 1954{
1722 int status; 1955 int status;
1723 u64 bg_blkno; 1956 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1724 1957
1725 BUG_ON(!ac); 1958 BUG_ON(!ac);
1726 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1959 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1727 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1960 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1728 1961
1729 status = ocfs2_claim_suballoc_bits(osb, 1962 status = ocfs2_claim_suballoc_bits(ac,
1730 ac,
1731 handle, 1963 handle,
1732 bits_wanted, 1964 bits_wanted,
1733 1, 1965 1,
1734 suballoc_bit_start, 1966 &res);
1735 num_bits,
1736 &bg_blkno);
1737 if (status < 0) { 1967 if (status < 0) {
1738 mlog_errno(status); 1968 mlog_errno(status);
1739 goto bail; 1969 goto bail;
1740 } 1970 }
1741 atomic_inc(&osb->alloc_stats.bg_allocs); 1971 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1742 1972
1743 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 1973 *suballoc_loc = res.sr_bg_blkno;
1744 ac->ac_bits_given += (*num_bits); 1974 *suballoc_bit_start = res.sr_bit_offset;
1975 *blkno_start = res.sr_blkno;
1976 ac->ac_bits_given += res.sr_bits;
1977 *num_bits = res.sr_bits;
1745 status = 0; 1978 status = 0;
1746bail: 1979bail:
1747 mlog_exit(status); 1980 mlog_exit(status);
@@ -1749,10 +1982,10 @@ bail:
1749} 1982}
1750 1983
1751static void ocfs2_init_inode_ac_group(struct inode *dir, 1984static void ocfs2_init_inode_ac_group(struct inode *dir,
1752 struct buffer_head *parent_fe_bh, 1985 struct buffer_head *parent_di_bh,
1753 struct ocfs2_alloc_context *ac) 1986 struct ocfs2_alloc_context *ac)
1754{ 1987{
1755 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1756 /* 1989 /*
1757 * Try to allocate inodes from some specific group. 1990 * Try to allocate inodes from some specific group.
1758 * 1991 *
@@ -1766,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1766 if (OCFS2_I(dir)->ip_last_used_group && 1999 if (OCFS2_I(dir)->ip_last_used_group &&
1767 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2000 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1768 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2001 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1769 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 2002 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
1770 ac->ac_last_group = ocfs2_which_suballoc_group( 2003 if (di->i_suballoc_loc)
1771 le64_to_cpu(fe->i_blkno), 2004 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
1772 le16_to_cpu(fe->i_suballoc_bit)); 2005 else
2006 ac->ac_last_group = ocfs2_which_suballoc_group(
2007 le64_to_cpu(di->i_blkno),
2008 le16_to_cpu(di->i_suballoc_bit));
2009 }
1773} 2010}
1774 2011
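The hint logic above introduces a lookup pattern that recurs in ocfs2_free_dinode(), ocfs2_test_suballoc_bit() and ocfs2_xattr_free_block() later in this patch: when the new i_suballoc_loc field is set, it names the block group directly; otherwise the group is derived from the block number as before. Sketch:

    /* Recurring lookup pattern (sketch): */
    if (di->i_suballoc_loc)
            bg_blkno = le64_to_cpu(di->i_suballoc_loc);
    else
            bg_blkno = ocfs2_which_suballoc_group(
                            le64_to_cpu(di->i_blkno),
                            le16_to_cpu(di->i_suballoc_bit));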
1775static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2012static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1779,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1779 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1780} 2017}
1781 2018
1782int ocfs2_claim_new_inode(struct ocfs2_super *osb, 2019int ocfs2_claim_new_inode(handle_t *handle,
1783 handle_t *handle,
1784 struct inode *dir, 2020 struct inode *dir,
1785 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_fe_bh,
1786 struct ocfs2_alloc_context *ac, 2022 struct ocfs2_alloc_context *ac,
2023 u64 *suballoc_loc,
1787 u16 *suballoc_bit, 2024 u16 *suballoc_bit,
1788 u64 *fe_blkno) 2025 u64 *fe_blkno)
1789{ 2026{
1790 int status; 2027 int status;
1791 unsigned int num_bits; 2028 struct ocfs2_suballoc_result res;
1792 u64 bg_blkno;
1793 2029
1794 mlog_entry_void(); 2030 mlog_entry_void();
1795 2031
@@ -1800,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1800 2036
1801 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2037 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1802 2038
1803 status = ocfs2_claim_suballoc_bits(osb, 2039 status = ocfs2_claim_suballoc_bits(ac,
1804 ac,
1805 handle, 2040 handle,
1806 1, 2041 1,
1807 1, 2042 1,
1808 suballoc_bit, 2043 &res);
1809 &num_bits,
1810 &bg_blkno);
1811 if (status < 0) { 2044 if (status < 0) {
1812 mlog_errno(status); 2045 mlog_errno(status);
1813 goto bail; 2046 goto bail;
1814 } 2047 }
1815 atomic_inc(&osb->alloc_stats.bg_allocs); 2048 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1816 2049
1817 BUG_ON(num_bits != 1); 2050 BUG_ON(res.sr_bits != 1);
1818 2051
1819 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 2052 *suballoc_loc = res.sr_bg_blkno;
2053 *suballoc_bit = res.sr_bit_offset;
2054 *fe_blkno = res.sr_blkno;
1820 ac->ac_bits_given++; 2055 ac->ac_bits_given++;
1821 ocfs2_save_inode_ac_group(dir, ac); 2056 ocfs2_save_inode_ac_group(dir, ac);
1822 status = 0; 2057 status = 0;
@@ -1886,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1886 * contig. allocation, set to '1' to indicate we can deal with extents 2121 * contig. allocation, set to '1' to indicate we can deal with extents
1887 * of any size. 2122 * of any size.
1888 */ 2123 */
1889int __ocfs2_claim_clusters(struct ocfs2_super *osb, 2124int __ocfs2_claim_clusters(handle_t *handle,
1890 handle_t *handle,
1891 struct ocfs2_alloc_context *ac, 2125 struct ocfs2_alloc_context *ac,
1892 u32 min_clusters, 2126 u32 min_clusters,
1893 u32 max_clusters, 2127 u32 max_clusters,
@@ -1896,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1896{ 2130{
1897 int status; 2131 int status;
1898 unsigned int bits_wanted = max_clusters; 2132 unsigned int bits_wanted = max_clusters;
1899 u64 bg_blkno = 0; 2133 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1900 u16 bg_bit_off; 2134 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1901 2135
1902 mlog_entry_void(); 2136 mlog_entry_void();
1903 2137
@@ -1907,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1907 && ac->ac_which != OCFS2_AC_USE_MAIN); 2141 && ac->ac_which != OCFS2_AC_USE_MAIN);
1908 2142
1909 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2143 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144 WARN_ON(min_clusters > 1);
2145
1910 status = ocfs2_claim_local_alloc_bits(osb, 2146 status = ocfs2_claim_local_alloc_bits(osb,
1911 handle, 2147 handle,
1912 ac, 2148 ac,
@@ -1929,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1929 if (bits_wanted > (osb->bitmap_cpg - 1)) 2165 if (bits_wanted > (osb->bitmap_cpg - 1))
1930 bits_wanted = osb->bitmap_cpg - 1; 2166 bits_wanted = osb->bitmap_cpg - 1;
1931 2167
1932 status = ocfs2_claim_suballoc_bits(osb, 2168 status = ocfs2_claim_suballoc_bits(ac,
1933 ac,
1934 handle, 2169 handle,
1935 bits_wanted, 2170 bits_wanted,
1936 min_clusters, 2171 min_clusters,
1937 &bg_bit_off, 2172 &res);
1938 num_clusters,
1939 &bg_blkno);
1940 if (!status) { 2173 if (!status) {
2174 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1941 *cluster_start = 2175 *cluster_start =
1942 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2176 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1943 bg_blkno, 2177 res.sr_bg_blkno,
1944 bg_bit_off); 2178 res.sr_bit_offset);
1945 atomic_inc(&osb->alloc_stats.bitmap_data); 2179 atomic_inc(&osb->alloc_stats.bitmap_data);
2180 *num_clusters = res.sr_bits;
1946 } 2181 }
1947 } 2182 }
1948 if (status < 0) { 2183 if (status < 0) {
@@ -1958,8 +2193,7 @@ bail:
1958 return status; 2193 return status;
1959} 2194}
1960 2195
1961int ocfs2_claim_clusters(struct ocfs2_super *osb, 2196int ocfs2_claim_clusters(handle_t *handle,
1962 handle_t *handle,
1963 struct ocfs2_alloc_context *ac, 2197 struct ocfs2_alloc_context *ac,
1964 u32 min_clusters, 2198 u32 min_clusters,
1965 u32 *cluster_start, 2199 u32 *cluster_start,
@@ -1967,7 +2201,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1967{ 2201{
1968 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2202 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1969 2203
1970 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 2204 return __ocfs2_claim_clusters(handle, ac, min_clusters,
1971 bits_wanted, cluster_start, num_clusters); 2205 bits_wanted, cluster_start, num_clusters);
1972} 2206}
1973 2207
@@ -2023,9 +2257,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2023 if (undo_fn) 2257 if (undo_fn)
2024 jbd_unlock_bh_state(group_bh); 2258 jbd_unlock_bh_state(group_bh);
2025 2259
2026 status = ocfs2_journal_dirty(handle, group_bh); 2260 ocfs2_journal_dirty(handle, group_bh);
2027 if (status < 0)
2028 mlog_errno(status);
2029bail: 2261bail:
2030 return status; 2262 return status;
2031} 2263}
@@ -2092,12 +2324,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2092 count); 2324 count);
2093 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2325 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2094 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2326 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2095 2327 ocfs2_journal_dirty(handle, alloc_bh);
2096 status = ocfs2_journal_dirty(handle, alloc_bh);
2097 if (status < 0) {
2098 mlog_errno(status);
2099 goto bail;
2100 }
2101 2328
2102bail: 2329bail:
2103 brelse(group_bh); 2330 brelse(group_bh);
@@ -2126,6 +2353,8 @@ int ocfs2_free_dinode(handle_t *handle,
2126 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2353 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2127 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2354 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2128 2355
2356 if (di->i_suballoc_loc)
2357 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2129 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2358 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2130 inode_alloc_bh, bit, bg_blkno, 1); 2359 inode_alloc_bh, bit, bg_blkno, 1);
2131} 2360}
@@ -2395,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2395 struct buffer_head *alloc_bh, u64 blkno, 2624 struct buffer_head *alloc_bh, u64 blkno,
2396 u16 bit, int *res) 2625 u16 bit, int *res)
2397{ 2626{
2398 struct ocfs2_dinode *alloc_fe; 2627 struct ocfs2_dinode *alloc_di;
2399 struct ocfs2_group_desc *group; 2628 struct ocfs2_group_desc *group;
2400 struct buffer_head *group_bh = NULL; 2629 struct buffer_head *group_bh = NULL;
2401 u64 bg_blkno; 2630 u64 bg_blkno;
@@ -2404,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2404 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2633 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2405 (unsigned int)bit); 2634 (unsigned int)bit);
2406 2635
2407 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2636 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2408 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2637 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2409 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2638 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2410 (unsigned int)bit, 2639 (unsigned int)bit,
2411 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2640 ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2412 status = -EINVAL; 2641 status = -EINVAL;
2413 goto bail; 2642 goto bail;
2414 } 2643 }
2415 2644
2416 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2645 if (alloc_di->i_suballoc_loc)
2417 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2418 &group_bh); 2650 &group_bh);
2419 if (status < 0) { 2651 if (status < 0) {
2420 mlog(ML_ERROR, "read group %llu failed %d\n", 2652 mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e0f46df357e6..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 u16 *, /* *bit_off */ 35 struct ocfs2_suballoc_result *);
35 u16 *); /* *bits_found */ 36 /* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58
59 struct ocfs2_alloc_reservation *ac_resv;
57}; 60};
58 61
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 62void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 83 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 84 struct ocfs2_alloc_context **ac);
82 85
83int ocfs2_claim_metadata(struct ocfs2_super *osb, 86int ocfs2_claim_metadata(handle_t *handle,
84 handle_t *handle,
85 struct ocfs2_alloc_context *ac, 87 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 88 u32 bits_wanted,
89 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 90 u16 *suballoc_bit_start,
88 u32 *num_bits, 91 u32 *num_bits,
89 u64 *blkno_start); 92 u64 *blkno_start);
90int ocfs2_claim_new_inode(struct ocfs2_super *osb, 93int ocfs2_claim_new_inode(handle_t *handle,
91 handle_t *handle,
92 struct inode *dir, 94 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 96 struct ocfs2_alloc_context *ac,
97 u64 *suballoc_loc,
95 u16 *suballoc_bit, 98 u16 *suballoc_bit,
96 u64 *fe_blkno); 99 u64 *fe_blkno);
97int ocfs2_claim_clusters(struct ocfs2_super *osb, 100int ocfs2_claim_clusters(handle_t *handle,
98 handle_t *handle,
99 struct ocfs2_alloc_context *ac, 101 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 102 u32 min_clusters,
101 u32 *cluster_start, 103 u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 106 * Use this variant of ocfs2_claim_clusters to specify a maximum
105 * number of clusters smaller than the allocation reserved. 107 * number of clusters smaller than the allocation reserved.
106 */ 108 */
107int __ocfs2_claim_clusters(struct ocfs2_super *osb, 109int __ocfs2_claim_clusters(handle_t *handle,
108 handle_t *handle,
109 struct ocfs2_alloc_context *ac, 110 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 111 u32 min_clusters,
111 u32 max_clusters, 112 u32 max_clusters,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..2c26ce251cb3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 unsigned int localalloc_opt; 97 int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -176,6 +178,8 @@ enum {
176 Opt_noacl, 178 Opt_noacl,
177 Opt_usrquota, 179 Opt_usrquota,
178 Opt_grpquota, 180 Opt_grpquota,
181 Opt_resv_level,
182 Opt_dir_resv_level,
179 Opt_err, 183 Opt_err,
180}; 184};
181 185
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 206 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 207 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 208 {Opt_grpquota, "grpquota"},
209 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 211 {Opt_err, NULL}
206}; 212};
207 213
@@ -932,12 +938,16 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
932 int type; 938 int type;
933 struct inode *inode; 939 struct inode *inode;
934 struct super_block *sb = osb->sb; 940 struct super_block *sb = osb->sb;
941 struct ocfs2_mem_dqinfo *oinfo;
935 942
936 /* We mostly ignore errors in this function because there's not much 943 /* We mostly ignore errors in this function because there's not much
937 * we can do when we see them */ 944 * we can do when we see them */
938 for (type = 0; type < MAXQUOTAS; type++) { 945 for (type = 0; type < MAXQUOTAS; type++) {
939 if (!sb_has_quota_loaded(sb, type)) 946 if (!sb_has_quota_loaded(sb, type))
940 continue; 947 continue;
948 /* Cancel periodic syncing before we grab dqonoff_mutex */
949 oinfo = sb_dqinfo(sb, type)->dqi_priv;
950 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
941 inode = igrab(sb->s_dquot.files[type]); 951 inode = igrab(sb->s_dquot.files[type]);
942 /* Turn off quotas. This will remove all dquot structures from 952 /* Turn off quotas. This will remove all dquot structures from
943 * memory and so they will be automatically synced to global 953 * memory and so they will be automatically synced to global
@@ -1028,8 +1038,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1038 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1039 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1040 osb->osb_commit_interval = parsed_options.commit_interval;
1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1041
1032 osb->local_alloc_bits = osb->local_alloc_default_bits; 1042 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1043 osb->osb_resv_level = parsed_options.resv_level;
1044 osb->osb_dir_resv_level = parsed_options.resv_level;
1045 if (parsed_options.dir_resv_level == -1)
1046 osb->osb_dir_resv_level = parsed_options.resv_level;
1047 else
1048 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1049
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1050 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1051 if (status)
@@ -1285,11 +1301,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1301 options ? options : "(none)");
1286 1302
1287 mopt->commit_interval = 0; 1303 mopt->commit_interval = 0;
1288 mopt->mount_opt = 0; 1304 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1305 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1306 mopt->slot = OCFS2_INVALID_SLOT;
1291 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 1307 mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1308 mopt->cluster_stack[0] = '\0';
1309 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1310 mopt->dir_resv_level = -1;
1293 1311
1294 if (!options) { 1312 if (!options) {
1295 status = 1; 1313 status = 1;
@@ -1380,7 +1398,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1380 status = 0; 1398 status = 0;
1381 goto bail; 1399 goto bail;
1382 } 1400 }
1383 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) 1401 if (option >= 0)
1384 mopt->localalloc_opt = option; 1402 mopt->localalloc_opt = option;
1385 break; 1403 break;
1386 case Opt_localflocks: 1404 case Opt_localflocks:
@@ -1433,6 +1451,28 @@ static int ocfs2_parse_options(struct super_block *sb,
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1451 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1452 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1435 break; 1453 break;
1454 case Opt_resv_level:
1455 if (is_remount)
1456 break;
1457 if (match_int(&args[0], &option)) {
1458 status = 0;
1459 goto bail;
1460 }
1461 if (option >= OCFS2_MIN_RESV_LEVEL &&
1462 option < OCFS2_MAX_RESV_LEVEL)
1463 mopt->resv_level = option;
1464 break;
1465 case Opt_dir_resv_level:
1466 if (is_remount)
1467 break;
1468 if (match_int(&args[0], &option)) {
1469 status = 0;
1470 goto bail;
1471 }
1472 if (option >= OCFS2_MIN_RESV_LEVEL &&
1473 option < OCFS2_MAX_RESV_LEVEL)
1474 mopt->dir_resv_level = option;
1475 break;
1436 default: 1476 default:
1437 mlog(ML_ERROR, 1477 mlog(ML_ERROR,
1438 "Unrecognized mount option \"%s\" " 1478 "Unrecognized mount option \"%s\" "
@@ -1487,7 +1527,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 (unsigned) (osb->osb_commit_interval / HZ)); 1527 (unsigned) (osb->osb_commit_interval / HZ));
1488 1528
1489 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1529 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1490 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) 1530 if (local_alloc_megs != ocfs2_la_default_mb(osb))
1491 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1531 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1492 1532
1493 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1533 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1554,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1514 else 1554 else
1515 seq_printf(s, ",noacl"); 1555 seq_printf(s, ",noacl");
1516 1556
1557 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1558 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1559
1560 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1561 seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1562
1517 return 0; 1563 return 0;
1518} 1564}
1519 1565
@@ -1688,6 +1734,8 @@ static void ocfs2_inode_init_once(void *data)
1688 oi->ip_blkno = 0ULL; 1734 oi->ip_blkno = 0ULL;
1689 oi->ip_clusters = 0; 1735 oi->ip_clusters = 0;
1690 1736
1737 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1738
1691 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1739 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1692 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1740 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1693 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1741 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2090,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2042 2090
2043 init_waitqueue_head(&osb->osb_mount_event); 2091 init_waitqueue_head(&osb->osb_mount_event);
2044 2092
2093 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2094 if (status) {
2095 mlog_errno(status);
2096 goto bail;
2097 }
2098
2045 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2099 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2046 if (!osb->vol_label) { 2100 if (!osb->vol_label) {
2047 mlog(ML_ERROR, "unable to alloc vol label\n"); 2101 mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2278,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
2224 } 2278 }
2225 2279
2226 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2280 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2281 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2227 iput(inode); 2282 iput(inode);
2228 2283
2229 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; 2284 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2285 osb->s_feature_incompat) * 8;
2230 2286
2231 status = ocfs2_init_slot_info(osb); 2287 status = ocfs2_init_slot_info(osb);
2232 if (status < 0) { 2288 if (status < 0) {
@@ -2509,5 +2565,25 @@ void __ocfs2_abort(struct super_block* sb,
2509 ocfs2_handle_error(sb); 2565 ocfs2_handle_error(sb);
2510} 2566}
2511 2567
2568/*
2569 * Void signal blockers, because in-kernel sigprocmask() only fails
2570 * when SIG_* is wrong.
2571 */
2572void ocfs2_block_signals(sigset_t *oldset)
2573{
2574 int rc;
2575 sigset_t blocked;
2576
2577 sigfillset(&blocked);
2578 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2579 BUG_ON(rc);
2580}
2581
2582void ocfs2_unblock_signals(sigset_t *oldset)
2583{
2584 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2585 BUG_ON(rc);
2586}
2587
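A sketch of the calling pattern these two helpers appear designed for (the caller name is a placeholder, not taken from the patch):

    static void some_ocfs2_operation(void)
    {
            sigset_t oldset;

            ocfs2_block_signals(&oldset);   /* mask all signals */
            /* ... work that must not be interrupted ... */
            ocfs2_unblock_signals(&oldset); /* restore saved mask */
    }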
2512module_init(ocfs2_init); 2588module_init(ocfs2_init);
2513module_exit(ocfs2_exit); 2589module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48/*
49 * Void signal blockers, because in-kernel sigprocmask() only fails
50 * when SIG_* is wrong.
51 */
52void ocfs2_block_signals(sigset_t *oldset);
53void ocfs2_unblock_signals(sigset_t *oldset);
54
48#endif /* OCFS2_SUPER_H */ 55#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3e7773089b96..e97b34842cfe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
79 struct ocfs2_alloc_context *meta_ac; 79 struct ocfs2_alloc_context *meta_ac;
80 struct ocfs2_alloc_context *data_ac; 80 struct ocfs2_alloc_context *data_ac;
81 struct ocfs2_cached_dealloc_ctxt dealloc; 81 struct ocfs2_cached_dealloc_ctxt dealloc;
82 int set_abort;
82}; 83};
83 84
84#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 85#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
96 .xv.xr_list.l_count = cpu_to_le16(1), 97 .xv.xr_list.l_count = cpu_to_le16(1),
97}; 98};
98 99
99struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
101 &ocfs2_xattr_acl_access_handler, 102 &ocfs2_xattr_acl_access_handler,
102 &ocfs2_xattr_acl_default_handler, 103 &ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
105 NULL 106 NULL
106}; 107};
107 108
108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
111 = &ocfs2_xattr_acl_access_handler, 112 = &ocfs2_xattr_acl_access_handler,
@@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
539 540
540static inline const char *ocfs2_xattr_prefix(int name_index) 541static inline const char *ocfs2_xattr_prefix(int name_index)
541{ 542{
542 struct xattr_handler *handler = NULL; 543 const struct xattr_handler *handler = NULL;
543 544
544 if (name_index > 0 && name_index < OCFS2_XATTR_MAX) 545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
545 handler = ocfs2_xattr_handler_map[name_index]; 546 handler = ocfs2_xattr_handler_map[name_index];
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
739 goto leave; 740 goto leave;
740 } 741 }
741 742
742 status = ocfs2_journal_dirty(handle, vb->vb_bh); 743 ocfs2_journal_dirty(handle, vb->vb_bh);
743 if (status < 0) {
744 mlog_errno(status);
745 goto leave;
746 }
747 744
748 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
749 746
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
786 } 783 }
787 784
788 le32_add_cpu(&vb->vb_xv->xr_clusters, -len); 785 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
789 786 ocfs2_journal_dirty(handle, vb->vb_bh);
790 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
791 if (ret) {
792 mlog_errno(ret);
793 goto out;
794 }
795 787
796 if (ext_flags & OCFS2_EXT_REFCOUNTED) 788 if (ext_flags & OCFS2_EXT_REFCOUNTED)
797 ret = ocfs2_decrease_refcount(inode, handle, 789 ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1374 memset(bh->b_data + cp_len, 0, 1366 memset(bh->b_data + cp_len, 0,
1375 blocksize - cp_len); 1367 blocksize - cp_len);
1376 1368
1377 ret = ocfs2_journal_dirty(handle, bh); 1369 ocfs2_journal_dirty(handle, bh);
1378 if (ret < 0) {
1379 mlog_errno(ret);
1380 goto out;
1381 }
1382 brelse(bh); 1370 brelse(bh);
1383 bh = NULL; 1371 bh = NULL;
1384 1372
@@ -2148,15 +2136,19 @@ alloc_value:
2148 orig_clusters = ocfs2_xa_value_clusters(loc); 2136 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); 2137 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) { 2138 if (rc < 0) {
2151 /* 2139 ctxt->set_abort = 1;
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing", 2140 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters); 2141 orig_clusters);
2142 /*
2143 * If we were growing an existing value,
2144 * ocfs2_xa_cleanup_value_truncate() won't remove
2145 * the entry. We need to restore the original value
2146 * size.
2147 */
2148 if (loc->xl_entry) {
2149 BUG_ON(!orig_value_size);
2150 loc->xl_entry->xe_value_size = orig_value_size;
2151 }
2160 mlog_errno(rc); 2152 mlog_errno(rc);
2161 } 2153 }
2162 } 2154 }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
2479 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 2471 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
2480 blk = le64_to_cpu(xb->xb_blkno); 2472 blk = le64_to_cpu(xb->xb_blkno);
2481 bit = le16_to_cpu(xb->xb_suballoc_bit); 2473 bit = le16_to_cpu(xb->xb_suballoc_bit);
2482 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2474 if (xb->xb_suballoc_loc)
2475 bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
2476 else
2477 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2483 2478
2484 xb_alloc_inode = ocfs2_get_system_file_inode(osb, 2479 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
2485 EXTENT_ALLOC_SYSTEM_INODE, 2480 EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2594 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2589 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2595 spin_unlock(&oi->ip_lock); 2590 spin_unlock(&oi->ip_lock);
2596 2591
2597 ret = ocfs2_journal_dirty(handle, di_bh); 2592 ocfs2_journal_dirty(handle, di_bh);
2598 if (ret < 0)
2599 mlog_errno(ret);
2600out_commit: 2593out_commit:
2601 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2594 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2602out: 2595out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2717 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock); 2718 spin_unlock(&oi->ip_lock);
2726 2719
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh); 2720 ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730 2721
2731out: 2722out:
2732 return ret; 2723 return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2846 int ret; 2837 int ret;
2847 u16 suballoc_bit_start; 2838 u16 suballoc_bit_start;
2848 u32 num_got; 2839 u32 num_got;
2849 u64 first_blkno; 2840 u64 suballoc_loc, first_blkno;
2850 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; 2841 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2851 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2852 struct buffer_head *new_bh = NULL; 2842 struct buffer_head *new_bh = NULL;
2853 struct ocfs2_xattr_block *xblk; 2843 struct ocfs2_xattr_block *xblk;
2854 2844
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2859 goto end; 2849 goto end;
2860 } 2850 }
2861 2851
2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, 2852 ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
2863 &suballoc_bit_start, &num_got, 2853 &suballoc_loc, &suballoc_bit_start,
2864 &first_blkno); 2854 &num_got, &first_blkno);
2865 if (ret < 0) { 2855 if (ret < 0) {
2866 mlog_errno(ret); 2856 mlog_errno(ret);
2867 goto end; 2857 goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2883 memset(xblk, 0, inode->i_sb->s_blocksize); 2873 memset(xblk, 0, inode->i_sb->s_blocksize);
2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2874 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); 2875 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2876 xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2877 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2878 xblk->xb_fs_generation =
2879 cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
2888 xblk->xb_blkno = cpu_to_le64(first_blkno); 2880 xblk->xb_blkno = cpu_to_le64(first_blkno);
2889 if (indexed) { 2881 if (indexed) {
2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2882 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2956 ret = ocfs2_xa_set(&loc, xi, ctxt); 2948 ret = ocfs2_xa_set(&loc, xi, ctxt);
2957 if (!ret) 2949 if (!ret)
2958 xs->here = loc.xl_entry; 2950 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC) 2951 else if ((ret != -ENOSPC) || ctxt->set_abort)
2960 goto end; 2952 goto end;
2961 else { 2953 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2954 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3312 goto out; 3304 goto out;
3313 } 3305 }
3314 3306
3315 ret = ocfs2_extend_trans(ctxt->handle, credits + 3307 ret = ocfs2_extend_trans(ctxt->handle, credits);
3316 ctxt->handle->h_buffer_credits);
3317 if (ret) { 3308 if (ret) {
3318 mlog_errno(ret); 3309 mlog_errno(ret);
3319 goto out; 3310 goto out;
3320 } 3311 }
3321 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); 3312 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
3322 } else if (ret == -ENOSPC) { 3313 } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
3323 if (di->i_xattr_loc && !xbs->xattr_bh) { 3314 if (di->i_xattr_loc && !xbs->xattr_bh) {
3324 ret = ocfs2_xattr_block_find(inode, 3315 ret = ocfs2_xattr_block_find(inode,
3325 xi->xi_name_index, 3316 xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3343 goto out; 3334 goto out;
3344 } 3335 }
3345 3336
3346 ret = ocfs2_extend_trans(ctxt->handle, credits + 3337 ret = ocfs2_extend_trans(ctxt->handle, credits);
3347 ctxt->handle->h_buffer_credits);
3348 if (ret) { 3338 if (ret) {
3349 mlog_errno(ret); 3339 mlog_errno(ret);
 		goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_extend_trans(ctxt->handle, credits +
-				 ctxt->handle->h_buffer_credits);
+	ret = ocfs2_extend_trans(ctxt->handle, credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	u32 bit_off, len;
 	u64 blkno;
 	handle_t *handle = ctxt->handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
 				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 	 * We need to update the first bucket of the old extent and all
 	 * the buckets going to the new extent.
 	 */
-	credits = ((num_buckets + 1) * blks_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = ((num_buckets + 1) * blks_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				  u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+	int ret, credits = 2 * blk_per_bucket;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, root_bh);
 
 leave:
 	return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	 * existing bucket. Then we add the last existing bucket, the
 	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, root_bh);
 
 	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
 	if (ret)
@@ -6935,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac,
+	ret = ocfs2_claim_clusters(handle, data_ac,
 				   len, &p_cluster, &num_clusters);
 	if (ret) {
 		mlog_errno(ret);
@@ -7234,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
 				     xattr_ac, data_ac);
 }
 
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ocfs2_xattr_security_list,
 	.get	= ocfs2_xattr_security_get,
@@ -7278,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
 				   name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ocfs2_xattr_trusted_list,
 	.get	= ocfs2_xattr_trusted_get,
@@ -7334,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
 				name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ocfs2_xattr_user_list,
 	.get	= ocfs2_xattr_user_get,
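The h_buffer_credits arithmetic disappears from every ocfs2_extend_trans()
call site above, which only makes sense if ocfs2_extend_trans() itself now
treats its argument as credits wanted *on top of* what the handle already
holds. A minimal sketch of that assumed convention (not the verbatim ocfs2
routine) over jbd2:

	/* Sketch: extend the running handle by "nblocks" more credits;
	 * if the journal cannot extend in place, restart the handle
	 * with the old credit count plus the new one. */
	static int extend_trans_sketch(handle_t *handle, int nblocks)
	{
		int old_nblocks = handle->h_buffer_credits;
		int status;

		if (!nblocks)
			return 0;

		status = jbd2_journal_extend(handle, nblocks);
		if (status > 0)	/* no room left: restart instead */
			status = jbd2_journal_restart(handle,
						      old_nblocks + nblocks);
		return status;
	}

The ocfs2_journal_dirty() hunks follow a second pattern: the helper's return
value is gone, so the error handling around each call collapses to a bare call.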
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
 	size_t value_len;
 };
 
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
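The const-ification above lets the handler tables live in read-only data.
For reference, a trimmed, hypothetical version of the table these externs
describe (the real ocfs2 table also carries the ACL handlers):

	const struct xattr_handler *ocfs2_xattr_handlers[] = {
		&ocfs2_xattr_user_handler,
		&ocfs2_xattr_trusted_handler,
		&ocfs2_xattr_security_handler,
		NULL	/* sentinel: the VFS walks until NULL */
	};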
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index b44bb835e8ea..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
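inode_init_owner() replaces the open-coded owner setup here and in many other
filesystems in this series. Roughly, the helper centralizes fsuid/fsgid
assignment plus setgid-directory inheritance; a sketch of its behaviour (the
real helper lives in fs/inode.c):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      mode_t mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && (dir->i_mode & S_ISGID)) {
			inode->i_gid = dir->i_gid;	/* inherit group */
			if (S_ISDIR(mode))
				mode |= S_ISGID;	/* propagate setgid */
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}

omfs passes a NULL dir, so its group assignment still comes from
current_fsgid() exactly as before.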
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718e..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
 #include <linux/securebits.h>
 #include <linux/security.h>
 #include <linux/mount.h>
-#include <linux/vfs.h>
 #include <linux/fcntl.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -33,171 +32,6 @@
 
 #include "internal.h"
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
-	return retval;
-}
-
-EXPORT_SYMBOL(vfs_statfs);
-
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
-			    0xffffffff00000000ULL)
-				return -EOVERFLOW;
-			/*
-			 * f_files and f_ffree may be -1; it's okay to stuff
-			 * that into 32 bits
-			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-		}
-
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
-{
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct path path;
-	long error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-{
-	struct file * file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
-SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct file * file;
-	struct statfs64 tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
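None of this is lost functionality: the same series reintroduces vfs_statfs()
and the four syscalls in a new fs/statfs.c (not visible in this excerpt),
which is also why the <linux/vfs.h> include goes away above. The user-visible
contract is unchanged; a quick userspace check:

	/* Compile and run against the patched kernel; behaviour should
	 * be identical to before the code move. */
	#include <stdio.h>
	#include <sys/vfs.h>		/* statfs(2) */

	int main(void)
	{
		struct statfs st;

		if (statfs("/", &st) != 0) {
			perror("statfs");
			return 1;
		}
		printf("bsize=%ld blocks=%llu bfree=%llu\n",
		       (long)st.f_bsize,
		       (unsigned long long)st.f_blocks,
		       (unsigned long long)st.f_bfree);
		return 0;
	}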
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-		 unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+			    unsigned long first_sect, int slot,
+			    unsigned long nr_sects)
 {
 	Sector sect;
 	struct riscix_record *rr;
 
-	rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+	rr = read_part_sector(state, first_sect, &sect);
 	if (!rr)
 		return -1;
 
@@ -123,9 +123,9 @@ struct linux_part {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+			   unsigned long first_sect, int slot,
+			   unsigned long nr_sects)
 {
 	Sector sect;
 	struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 
 	put_partition(state, slot++, first_sect, size);
 
-	linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+	linuxp = read_part_sector(state, first_sect, &sect);
 	if (!linuxp)
 		return -1;
 
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
 	unsigned long first_sector = 0;
 	unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 		struct adfs_discrecord *dr;
 		unsigned int nr_sects;
 
-		data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+		data = read_part_sector(state, start_blk * 2 + 6, &sect);
 		if (!data)
 			return -1;
 
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 			/* RISCiX - we don't know how to find the next one. */
-			slot = riscix_partition(state, bdev, first_sector,
-						slot, nr_sects);
+			slot = riscix_partition(state, first_sector, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, first_sector,
-					       slot, nr_sects);
+			slot = linux_partition(state, first_sector, slot,
					       nr_sects);
 			break;
 		}
 		put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  * hda1 = ADFS partition on first drive.
  * hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
 	unsigned long start_sect, nr_sects, sectscyl, heads;
 	Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char id;
 	int slot = 1;
 
-	data = read_dev_sector(bdev, 6, &sect);
+	data = read_part_sector(state, 6, &sect);
 	if (!data)
 		return -1;
 
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Work out start of non-adfs partition.
 	 */
-	nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+	nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
 	if (start_sect) {
 		switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 		case PARTITION_RISCIX_MFM:
-			slot = riscix_partition(state, bdev, start_sect,
-						slot, nr_sects);
+			slot = riscix_partition(state, start_sect, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, start_sect,
-					       slot, nr_sects);
+			slot = linux_partition(state, start_sect, slot,
+					       nr_sects);
 			break;
 		}
 	}
@@ -308,10 +306,11 @@ struct ics_part {
 	__le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+				   unsigned long block)
 {
 	Sector sect;
-	unsigned char *data = read_dev_sector(bdev, block, &sect);
+	unsigned char *data = read_part_sector(state, block, &sect);
 	int result = 0;
 
 	if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
  * ..etc..
 */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
 	const unsigned char *data;
 	const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Try ICS style partitions - sector 0 contains partition info.
 	 */
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 			 * partition is. We must not make this visible
 			 * to the filesystem.
 			 */
-			if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+			if (size > 1 && adfspart_check_ICSLinux(state, start)) {
 				start += 1;
 				size -= 1;
 			}
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
 * ..etc..
 */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	int slot = 1;
 	int i;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
 * 1. The individual ADFS boot block entries that are placed on the disk.
 * 2. The start address of the next entry.
 */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	sector_t start = 0;
 	int i, slot = 1;
 
-	data = read_dev_sector(bdev, 7, &sect);
+	data = read_part_sector(state, 7, &sect);
 	if (!data)
 		return -1;
 
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	if (i != 0) {
 		sector_t size;
 
-		size = get_capacity(bdev->bd_disk);
+		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
 		printk("\n");
 	}
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
  * format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
 	return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 0; ; blk++, put_dev_sector(sect)) {
 		if (blk == RDB_ALLOCATION_LIMIT)
 			goto rdb_done;
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		}
 
 		printk("Dev %s: RDB in block %d has bad checksum\n",
-		       bdevname(bdev, b), blk);
+		       bdevname(state->bdev, b), blk);
 	}
 
 	/* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
 		blk *= blksize;	/* Read in terms partition table understands */
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
  * fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
 		memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-	rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
 
 	/* Verify this is an Atari rootsector: */
-	hd_size = bdev->bd_inode->i_size >> 9;
+	hd_size = state->bdev->bd_inode->i_size >> 9;
 	if (!VALID_PARTITION(&rs->part[0], hd_size) &&
 	    !VALID_PARTITION(&rs->part[1], hd_size) &&
 	    !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk(" XGM<");
 		partsect = extensect = be32_to_cpu(pi->st);
 		while (1) {
-			xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+			xrs = read_part_sector(state, partsect, &sect2);
 			if (!xrs) {
 				printk (" block %ld read failed\n", partsect);
 				put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
 	u16 checksum; /* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
 	/*
 	 * Probe partition formats with tables at disk address 0
 	 * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	struct parsed_partitions *state;
 	int i, res, err;
 
-	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
 
+	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
 	printk(KERN_INFO " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
-		res = check_part[i++](state, bdev);
+		res = check_part[i++](state);
 		if (res < 0) {
 			/* We have hit an I/O error which we don't report now.
 			 * But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	}
 	if (res > 0)
 		return state;
+	if (state->access_beyond_eod)
+		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
@@ -538,12 +541,33 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+	const struct block_device_operations *bdops = disk->fops;
+
+	if (bdops->unlock_native_capacity &&
+	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+		printk(KERN_CONT "enabling native capacity\n");
+		bdops->unlock_native_capacity(disk);
+		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+		return true;
+	} else {
+		printk(KERN_CONT "truncated\n");
+		return false;
+	}
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	struct parsed_partitions *state;
 	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
-	if (IS_ERR(state))	/* I/O error reading the partition table */
+	if (IS_ERR(state)) {
+		/*
+		 * I/O error reading the partition table.  If any
+		 * partition code tried to read beyond EOD, retry
+		 * after unlocking native capacity.
+		 */
+		if (PTR_ERR(state) == -ENOSPC) {
+			printk(KERN_WARNING "%s: partition table beyond EOD, ",
+			       disk->disk_name);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
+		}
 		return -EIO;
+	}
+	/*
+	 * If any partition code tried to read beyond EOD, try
+	 * unlocking native capacity even if partition table is
+	 * successfully read as we could be missing some partitions.
+	 */
+	if (state->access_beyond_eod) {
+		printk(KERN_WARNING
+		       "%s: partition table partially beyond EOD, ",
+		       disk->disk_name);
+		if (disk_unlock_native_capacity(disk))
+			goto rescan;
+	}
 
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-try_scan:
+
 		size = state->parts[p].size;
 		if (!size)
 			continue;
@@ -589,30 +637,21 @@ try_scan:
 		from = state->parts[p].from;
 		if (from >= get_capacity(disk)) {
 			printk(KERN_WARNING
-			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       "%s: p%d start %llu is beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) from);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
 			continue;
 		}
 
 		if (from + size > get_capacity(disk)) {
-			const struct block_device_operations *bdops = disk->fops;
-			unsigned long long capacity;
-
 			printk(KERN_WARNING
-			       "%s: p%d size %llu exceeds device capacity, ",
+			       "%s: p%d size %llu extends beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
-			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-				printk(KERN_CONT "enabling native capacity\n");
-				capacity = bdops->set_capacity(disk, ~0ULL);
-				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-				if (capacity > get_capacity(disk)) {
-					set_capacity(disk, capacity);
-					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
-				}
-				goto try_scan;
+			if (disk_unlock_native_capacity(disk)) {
+				/* free state and restart */
+				goto rescan;
 			} else {
@@ -620,7 +659,6 @@ try_scan:
 				 * we limit them to the end of the disk to avoid
 				 * creating invalid block devices
 				 */
-				printk(KERN_CONT "limited to end of disk\n");
 				size = get_capacity(disk) - from;
 			}
 		}
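disk_unlock_native_capacity() exists because some drives ship with a Host
Protected Area hiding part of the disk; when a partition entry points past
the reported end of device, the kernel now asks the driver to unhide the
sectors and rescans, instead of silently truncating. Drivers opt in through
the new ->unlock_native_capacity() hook; a hypothetical driver-side sketch
(all mydrv_* names are made up):

	static void mydrv_unlock_native_capacity(struct gendisk *disk)
	{
		struct mydrv_dev *dev = disk->private_data;	/* hypothetical */

		/* Ask the hardware to drop its HPA, then publish the size. */
		if (mydrv_disable_hpa(dev) == 0)		/* hypothetical */
			set_capacity(disk, dev->native_sectors);
	}

	static const struct block_device_operations mydrv_fops = {
		.owner			= THIS_MODULE,
		.unlock_native_capacity	= mydrv_unlock_native_capacity,
	};

GENHD_FL_NATIVE_CAPACITY prevents looping: once the flag is set, a second
beyond-EOD hit falls through to the truncation path.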
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+	struct block_device *bdev;
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
+	bool access_beyond_eod;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+				     sector_t n, Sector *p)
+{
+	if (n >= get_capacity(state->bdev->bd_disk)) {
+		state->access_beyond_eod = true;
+		return NULL;
+	}
+	return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  * the part[0] entry for this disk, and is the number of
  * physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description: Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+		       u64 lba, u8 *buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	struct block_device *bdev = state->bdev;
 	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-	if (!bdev || !buffer || lba > last_lba(bdev))
+	if (!buffer || lba > last_lba(bdev))
 		return 0;
 
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, n++, &sect);
+		unsigned char *data = read_part_sector(state, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
 *
 * Description: Returns ptes on success, NULL on error.
 * Allocates space for PTEs based on information found in @gpt.
 * Notes: remember to free pte when you're done!
 */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+					 gpt_header *gpt)
 {
 	size_t count;
 	gpt_entry *pte;
-	if (!bdev || !gpt)
+
+	if (!gpt)
 		return NULL;
 
 	count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 	if (!pte)
 		return NULL;
 
-	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+	if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
 			(u8 *) pte,
 			count) < count) {
 		kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
 *
 * Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
 * Note: remember to free gpt when finished with it.
 */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+					 u64 lba)
 {
 	gpt_header *gpt;
-	unsigned ssz = bdev_logical_block_size(bdev);
-
-	if (!bdev)
-		return NULL;
+	unsigned ssz = bdev_logical_block_size(state->bdev);
 
 	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+	if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
 		gpt=NULL;
 		return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid, 0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
 */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-	     gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+			gpt_header **gpt, gpt_entry **ptes)
 {
 	u32 crc, origcrc;
 	u64 lastlba;
 
-	if (!bdev || !gpt || !ptes)
+	if (!ptes)
 		return 0;
-	if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+	if (!(*gpt = alloc_read_gpt_header(state, lba)))
 		return 0;
 
 	/* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check the first_usable_lba and last_usable_lba are
 	 * within the disk.
 	 */
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
 		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
 			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 		goto fail;
 	}
 
-	if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+	if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
 		goto fail;
 
 	/* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
  * @gpt is a GPT header ptr, filled on return.
 * @ptes is a PTEs ptr, filled on return.
 * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 * This protects against devices which misreport their size, and forces
 * the user to decide to use the Alternate GPT.
 */
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+			  gpt_entry **ptes)
 {
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
 	legacy_mbr *legacymbr;
 	u64 lastlba;
-	if (!bdev || !gpt || !ptes)
+
+	if (!ptes)
 		return 0;
 
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (!force_gpt) {
 		/* This will be added to the EFI Spec. per Intel after v1.02. */
 		legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
 		if (legacymbr) {
-			read_lba(bdev, 0, (u8 *) legacymbr,
+			read_lba(state, 0, (u8 *) legacymbr,
 				sizeof (*legacymbr));
 			good_pmbr = is_pmbr_valid(legacymbr);
 			kfree(legacymbr);
 		}
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 		goto fail;
 	}
 
-	good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+	good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
 				 &pgpt, &pptes);
 	if (good_pgpt)
-		good_agpt = is_gpt_valid(bdev,
+		good_agpt = is_gpt_valid(state,
 					 le64_to_cpu(pgpt->alternate_lba),
 					 &agpt, &aptes);
 	if (!good_agpt && force_gpt)
-		good_agpt = is_gpt_valid(bdev, lastlba,
-					 &agpt, &aptes);
+		good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
 
 	/* The obviously unsuccessful case */
 	if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 }
 
 /**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
  * @state
- * @bdev
 *
 * Description: called from check.c, if the disk contains GPT
 * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 * 1 if successful
 *
 */
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
 {
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
-	unsigned ssz = bdev_logical_block_size(bdev) / 512;
+	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
 
-	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
 		kfree(ptes);
 		return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
 
-		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+		if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
 			continue;
 
 		put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/* If this is a RAID volume, tell md */
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
-			state->parts[i+1].flags = 1;
+			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
 	}
 	kfree(ptes);
 	kfree(gpt);
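Swapping the bare "1" for ADDPART_FLAG_RAID is cosmetic today (the flag is
defined as 1) but documents what the consumer actually tests; roughly how
check.c uses it when registering the partition:

	#ifdef CONFIG_BLK_DEV_MD
		if (state->parts[p].flags & ADDPART_FLAG_RAID)
			md_autodetect_dev(part_to_dev(part)->devt);
	#endif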
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
 } __attribute__ ((packed)) legacy_mbr;
 
 /* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
 
 #endif
 
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 
 /*
  */
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
 {
+	struct block_device *bdev = state->bdev;
 	int blocksize, res;
 	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+	data = read_part_sector(state, info->label_block*(blocksize/512),
+				&sect);
 	if (data == NULL)
 		goto out_readerr;
 
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		 */
 		blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 		counter = 0;
-		data = read_dev_sector(bdev, blk * (blocksize/512),
-				       &sect);
+		data = read_part_sector(state, blk * (blocksize/512),
+					&sect);
 		while (data != NULL) {
 			struct vtoc_format1_label f1;
 
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			    || f1.DS1FMTID == _ascebc['7']
 			    || f1.DS1FMTID == _ascebc['9']) {
 				blk++;
-				data = read_dev_sector(bdev, blk *
-						       (blocksize/512),
-						       &sect);
+				data = read_part_sector(state,
+					blk * (blocksize/512), &sect);
 				continue;
 			}
 
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 				      size * (blocksize >> 9));
 			counter++;
 			blk++;
-			data = read_dev_sector(bdev,
-					       blk * (blocksize/512),
-					       &sect);
+			data = read_part_sector(state,
				blk * (blocksize/512), &sect);
 		}
 
 		if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
-int ibm_partition(struct parsed_partitions *, struct block_device *);
+int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
 
 #define KARMA_LABEL_MAGIC	0xAB56
 
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
+int karma_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/stringify.h>
+#include <linux/kernel.h>
 #include "ldm.h"
 #include "check.h"
 #include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
 	int h;
 
 	/* high part */
-	if ((x = src[0] - '0') <= '9'-'0') h = x;
-	else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10;
-	else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10;
-	else return -1;
-	h <<= 4;
+	x = h = hex_to_bin(src[0]);
+	if (h < 0)
+		return -1;
 
 	/* low part */
-	if ((x = src[1] - '0') <= '9'-'0') return h | x;
-	if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10);
-	if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10);
-	return -1;
+	h = hex_to_bin(src[1]);
+	if (h < 0)
+		return -1;
+
+	return (x << 4) + h;
 }
 
 /**
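The rewritten ldm_parse_hexbyte() above is easy to misread because h is
reused for both nibbles while x keeps the high nibble across the second
hex_to_bin() call. An equivalent, more explicit sketch of the same parse:

	static int parse_hexbyte_sketch(const u8 *src)
	{
		int hi, lo;

		hi = hex_to_bin(src[0]);	/* high nibble, -1 if not hex */
		if (hi < 0)
			return -1;

		lo = hex_to_bin(src[1]);	/* low nibble */
		if (lo < 0)
			return -1;

		return (hi << 4) | lo;
	}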
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
 
 /**
  * ldm_validate_privheads - Compare the primary privhead with its backups
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @ph1:   Memory struct to fill with ph contents
  *
  * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
  * Return:  'true'   Success
  *          'false'  Error
  */
-static bool ldm_validate_privheads (struct block_device *bdev,
-				    struct privhead *ph1)
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+				   struct privhead *ph1)
 {
 	static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
 	struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 	long num_sects;
 	int i;
 
-	BUG_ON (!bdev || !ph1);
+	BUG_ON (!state || !ph1);
 
 	ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
 	ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads(struct parsed_partitions *state,
 
 	/* Read and parse privheads */
 	for (i = 0; i < 3; i++) {
-		data = read_dev_sector (bdev,
-					ph[0]->config_start + off[i], &sect);
+		data = read_part_sector(state, ph[0]->config_start + off[i],
+					&sect);
 		if (!data) {
 			ldm_crit ("Disk read failed.");
 			goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state,
 		}
 	}
 
-	num_sects = bdev->bd_inode->i_size >> 9;
+	num_sects = state->bdev->bd_inode->i_size >> 9;
 
 	if ((ph[0]->config_start > num_sects) ||
 	    ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
 
 /**
  * ldm_validate_tocblocks - Validate the table of contents and its backups
- * @bdev:  Device holding the LDM Database
- * @base:  Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base:  Offset, into @state->bdev, of the database
  * @ldb:   Cache of the database structures
 *
 * Find and compare the four tables of contents of the LDM Database stored on
- * @bdev and return the parsed information into @toc1.
+ * @state->bdev and return the parsed information into @toc1.
 *
 * The offsets and sizes of the configs are range-checked against a privhead.
 *
 * Return:  'true'   @toc1 contains validated TOCBLOCK info
 *          'false'  @toc1 contents are undefined
 */
-static bool ldm_validate_tocblocks(struct block_device *bdev,
-				   unsigned long base, struct ldmdb *ldb)
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+				   unsigned long base, struct ldmdb *ldb)
 {
 	static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
 	struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct parsed_partitions *state,
 	int i, nr_tbs;
 	bool result = false;
 
-	BUG_ON(!bdev || !ldb);
+	BUG_ON(!state || !ldb);
 	ph = &ldb->ph;
 	tb[0] = &ldb->toc;
 	tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct parsed_partitions *state,
 	 * skip any that fail as long as we get at least one valid TOCBLOCK.
 	 */
 	for (nr_tbs = i = 0; i < 4; i++) {
-		data = read_dev_sector(bdev, base + off[i], &sect);
+		data = read_part_sector(state, base + off[i], &sect);
 		if (!data) {
 			ldm_error("Disk read failed for TOCBLOCK %d.", i);
 			continue;
@@ -473,7 +473,7 @@ err:
 
 /**
  * ldm_validate_vmdb - Read the VMDB and validate it
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @base:  Offset, into @bdev, of the database
  * @ldb:   Cache of the database structures
 *
@@ -483,8 +483,8 @@ err:
 * Return:  'true'   @ldb contains validated VBDB info
 *          'false'  @ldb contents are undefined
 */
-static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
-			       struct ldmdb *ldb)
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+			      unsigned long base, struct ldmdb *ldb)
 {
 	Sector sect;
 	u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb(struct parsed_partitions *state,
 	struct vmdb *vm;
 	struct tocblock *toc;
 
-	BUG_ON (!bdev || !ldb);
+	BUG_ON (!state || !ldb);
 
 	vm = &ldb->vm;
 	toc = &ldb->toc;
 
-	data = read_dev_sector (bdev, base + OFF_VMDB, &sect);
+	data = read_part_sector(state, base + OFF_VMDB, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
 		return false;
@@ -534,21 +534,21 @@ out:
 
 /**
  * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @state->bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @state->bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
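The conversion throughout ldm.c above is mechanical: every read_dev_sector(bdev, ...) becomes read_part_sector(state, ...), so the parser only touches the device through the shared partition-check state. The wrapper itself is never shown in these hunks; at this point in the series it is little more than the sketch below (it lives in fs/partitions/check.h; the exact body is assumed here, and later patches in the series extend it to range-check reads against the disk capacity).

	static inline unsigned char *read_part_sector(struct parsed_partitions *state,
						      sector_t n, Sector *p)
	{
		/* All label-parser sector I/O now funnels through the check state. */
		return read_dev_sector(state->bdev, n, p);
	}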
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
247 */ 245 */
248static void 246static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 248 int max_partitions)
251 int max_partitions)
252{ 249{
253 Sector sect; 250 Sector sect;
254 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
255 struct bsd_partition *p; 252 struct bsd_partition *p;
256 253
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 255 if (!l)
259 return; 256 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
291} 288}
292#endif 289#endif
293 290
294static void 291static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 293{
298#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 296#endif
302} 297}
303 298
304static void 299static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 301{
308#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 304#endif
312} 305}
313 306
314static void 307static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 309{
318#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
321#endif 313#endif
322} 314}
323 315
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
327 */ 319 */
328static void 320static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 322{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 324 Sector sect;
334 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
335 struct unixware_slice *p; 326 struct unixware_slice *p;
336 327
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 329 if (!l)
339 return; 330 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 358 */
368static void 359static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 361{
372#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 363 Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 365 struct partition *p;
376 int i; 366 int i;
377 367
378 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
379 if (!data) 369 if (!data)
380 return; 370 return;
381 371
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 394
405static struct { 395static struct {
406 unsigned char id; 396 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 398} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
417 {0, NULL}, 406 {0, NULL},
418}; 407};
419 408
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
421{ 410{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 412 Sector sect;
424 unsigned char *data; 413 unsigned char *data;
425 struct partition *p; 414 struct partition *p;
426 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
427 int slot; 416 int slot;
428 417
429 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
430 if (!data) 419 if (!data)
431 return -1; 420 return -1;
432 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 423 return 0;
435 } 424 }
436 425
437 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 427 put_dev_sector(sect);
439 printk( " [AIX]"); 428 printk( " [AIX]");
440 return 0; 429 return 0;
@@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
503 put_partition(state, slot, start, n); 492 put_partition(state, slot, start, n);
504 493
505 printk(" <"); 494 printk(" <");
506 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
507 printk(" >"); 496 printk(" >");
508 continue; 497 continue;
509 } 498 }
510 put_partition(state, slot, start, size); 499 put_partition(state, slot, start, size);
511 if (SYS_IND(p) == LINUX_RAID_PARTITION) 500 if (SYS_IND(p) == LINUX_RAID_PARTITION)
512 state->parts[slot].flags = 1; 501 state->parts[slot].flags = ADDPART_FLAG_RAID;
513 if (SYS_IND(p) == DM6_PARTITION) 502 if (SYS_IND(p) == DM6_PARTITION)
514 printk("[DM]"); 503 printk("[DM]");
515 if (SYS_IND(p) == EZD_PARTITION) 504 if (SYS_IND(p) == EZD_PARTITION)
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 521
533 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
534 continue; 523 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
537 } 526 }
538 put_dev_sector(sect); 527 put_dev_sector(sect);
539 return 1; 528 return 1;
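With bdev dropped from the sub-parser signature, the subtypes[] dispatch above reduces to a table keyed by partition id whose entries need only the shared state plus geometry. A sketch of that dispatch, with the table contents abbreviated for illustration:

	typedef void (*subtype_parse_t)(struct parsed_partitions *,
					sector_t, sector_t, int);

	static const struct {
		unsigned char id;
		subtype_parse_t parse;
	} subtypes[] = {
		{FREEBSD_PARTITION, parse_freebsd},
		{MINIX_PARTITION, parse_minix},
		{0, NULL},
	};

	static void parse_subtype(struct parsed_partitions *state, unsigned char id,
				  sector_t start, sector_t size, int slot)
	{
		int n;

		for (n = 0; subtypes[n].parse; n++) {
			if (subtypes[n].id != id)
				continue;
			subtypes[n].parse(state, start, size, slot);
			break;
		}
	}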
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
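The checksum walk in the hunk above is the usual Sun label scheme: XOR-ing all big-endian 16-bit words of the label, the stored checksum word included, must come out zero on a consistent label. A standalone sketch:

	#include <stdint.h>

	/* Returns nonzero when the label XOR-checksums to zero, i.e. is consistent. */
	static int sun_label_ok(const uint16_t *words, unsigned int nwords)
	{
		uint16_t csum = 0;

		while (nwords--)
			csum ^= *words++;	/* the stored csum word cancels itself */
		return csum == 0;
	}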
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
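That completes the conversion: every label parser now has the identical prototype, int xxx_partition(struct parsed_partitions *). The payoff is in fs/partitions/check.c, which can keep a plain table of parsers and try each in turn; a simplified sketch of that dispatch (table contents abbreviated, names as in the headers above):

	static int (*check_part[])(struct parsed_partitions *) = {
		msdos_partition,
		mac_partition,
		sun_partition,
		sgi_partition,
		NULL
	};

	static int scan_partitions(struct parsed_partitions *state)
	{
		int i, res = 0;

		/* 1 = claimed and parsed, 0 = not this format, -1 = I/O error */
		for (i = 0; check_part[i] && !res; i++)
			res = check_part[i](state);
		return res;
	}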
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..d79872eba09a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,18 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe to. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages
30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
32
33/*
26 * We use a start+len construction, which provides full use of the 34 * We use a start+len construction, which provides full use of the
27 * allocated memory. 35 * allocated memory.
28 * -- Florian Coosmann (FGC) 36 * -- Florian Coosmann (FGC)
@@ -390,7 +398,7 @@ redo:
390 if (!buf->len) { 398 if (!buf->len) {
391 buf->ops = NULL; 399 buf->ops = NULL;
392 ops->release(pipe, buf); 400 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 401 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 402 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 403 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 404 do_wakeup = 1;
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 480 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 481 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 482 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 483 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 484 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 485 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 486 int offset = buf->offset + buf->len;
@@ -518,8 +526,8 @@ redo1:
518 break; 526 break;
519 } 527 }
520 bufs = pipe->nrbufs; 528 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 529 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 530 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 531 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 532 struct page *page = pipe->tmp_page;
525 char *src; 533 char *src;
@@ -580,7 +588,7 @@ redo2:
580 if (!total_len) 588 if (!total_len)
581 break; 589 break;
582 } 590 }
583 if (bufs < PIPE_BUFFERS) 591 if (bufs < pipe->buffers)
584 continue; 592 continue;
585 if (filp->f_flags & O_NONBLOCK) { 593 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 594 if (!ret)
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 648 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 649 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 650 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 651 buf = (buf+1) & (pipe->buffers - 1);
644 } 652 }
645 mutex_unlock(&inode->i_mutex); 653 mutex_unlock(&inode->i_mutex);
646 654
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 679 }
672 680
673 if (filp->f_mode & FMODE_WRITE) { 681 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 682 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 683 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 684 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 685 * behave exactly like pipes for poll().
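All of the slot arithmetic above relies on pipe->buffers being a power of two, so a mask can replace a modulo on every wrap; this is also why pipe_set_size() further down rejects any other value. A standalone illustration of the idiom:

	#include <assert.h>

	static unsigned int next_slot(unsigned int cur, unsigned int buffers)
	{
		assert((buffers & (buffers - 1)) == 0);	/* must be a power of two */
		return (cur + 1) & (buffers - 1);	/* cheap "(cur + 1) % buffers" */
	}

	int main(void)
	{
		assert(next_slot(3, 16) == 4);
		assert(next_slot(15, 16) == 0);	/* wraps with no division */
		return 0;
	}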
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 885
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 886 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 887 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 888 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 889 if (pipe->bufs) {
882 pipe->inode = inode; 890 init_waitqueue_head(&pipe->wait);
891 pipe->r_counter = pipe->w_counter = 1;
892 pipe->inode = inode;
893 pipe->buffers = PIPE_DEF_BUFFERS;
894 return pipe;
895 }
896 kfree(pipe);
883 } 897 }
884 898
885 return pipe; 899 return NULL;
886} 900}
887 901
888void __free_pipe_info(struct pipe_inode_info *pipe) 902void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 903{
890 int i; 904 int i;
891 905
892 for (i = 0; i < PIPE_BUFFERS; i++) { 906 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 907 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 908 if (buf->ops)
895 buf->ops->release(pipe, buf); 909 buf->ops->release(pipe, buf);
896 } 910 }
897 if (pipe->tmp_page) 911 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 912 __free_page(pipe->tmp_page);
913 kfree(pipe->bufs);
899 kfree(pipe); 914 kfree(pipe);
900} 915}
901 916
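alloc_pipe_info() now makes two allocations, the info struct and the buffer array it owns, and has to unwind the first when the second fails. The shape of that pattern, reduced to a generic sketch:

	struct ring {
		struct ring_slot {
			void *page;
		} *slots;
		unsigned int nr;
	};

	static struct ring *ring_alloc(unsigned int nr)
	{
		struct ring *r = kzalloc(sizeof(*r), GFP_KERNEL);

		if (r) {
			r->slots = kzalloc(sizeof(*r->slots) * nr, GFP_KERNEL);
			if (r->slots) {
				r->nr = nr;
				return r;
			}
			kfree(r);	/* inner allocation failed: roll back the outer one */
		}
		return NULL;
	}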
@@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1109}
1095 1110
1096/* 1111/*
1112 * Allocate a new array of pipe buffers and copy the info over. Returns the
1113 * pipe size if successful, or -ERROR on error.
1114 */
1115static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1116{
1117 struct pipe_buffer *bufs;
1118
1119 /*
1120 * Must be a power-of-2 currently
1121 */
1122 if (!is_power_of_2(arg))
1123 return -EINVAL;
1124
1125 /*
1126 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1127 * expect a lot of shrink+grow operations, just free and allocate
1128 * again like we would do for growing. If the pipe currently
1129 * contains more buffers than arg, then return busy.
1130 */
1131 if (arg < pipe->nrbufs)
1132 return -EBUSY;
1133
1134 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
1135 if (unlikely(!bufs))
1136 return -ENOMEM;
1137
1138 /*
1139 * The pipe array wraps around, so just start the new one at zero
1140 * and adjust the indexes.
1141 */
1142 if (pipe->nrbufs) {
1143 const unsigned int head = min(pipe->nrbufs, pipe->buffers - pipe->curbuf);
1144 const unsigned int tail = pipe->nrbufs - head;
1145
1146 if (head)
1147 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1148 if (tail)
1149 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1150 }
1151
1152 pipe->curbuf = 0;
1153 kfree(pipe->bufs);
1154 pipe->bufs = bufs;
1155 pipe->buffers = arg;
1156 return arg;
1157}
1158
1159long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1160{
1161 struct pipe_inode_info *pipe;
1162 long ret;
1163
1164 pipe = file->f_path.dentry->d_inode->i_pipe;
1165 if (!pipe)
1166 return -EBADF;
1167
1168 mutex_lock(&pipe->inode->i_mutex);
1169
1170 switch (cmd) {
1171 case F_SETPIPE_SZ:
1172 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
1173 return -EINVAL;
1174 /*
1175 * The pipe needs to be at least 2 pages large to
1176 * guarantee POSIX behaviour.
1177 */
1178 if (arg < 2)
1179 return -EINVAL;
1180 ret = pipe_set_size(pipe, arg);
1181 break;
1182 case F_GETPIPE_SZ:
1183 ret = pipe->buffers;
1184 break;
1185 default:
1186 ret = -EINVAL;
1187 break;
1188 }
1189
1190 mutex_unlock(&pipe->inode->i_mutex);
1191 return ret;
1192}
1193
1194/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1195 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1196 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1197 * any operations on the root directory. However, we need a non-trivial
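From userspace the new fcntls are straightforward: F_SETPIPE_SZ asks for a slot count (a power of two, at least 2, capped at pipe-max-pages for non-root) and F_GETPIPE_SZ reads the current count back. A hedged usage sketch; the constants come from linux/fcntl.h once this series is merged. (Note the F_SETPIPE_SZ sanity checks in the hunk above return with i_mutex still held, so the error paths want a follow-up cleanup.)

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		long sz;

		if (pipe(fds))
			return 1;
		if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)	/* grow to 64 buffers */
			perror("F_SETPIPE_SZ");
		sz = fcntl(fds[1], F_GETPIPE_SZ);		/* read back pipe->buffers */
		printf("pipe slots: %ld\n", sz);
		close(fds[0]);
		close(fds[1]);
		return 0;
	}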
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
634 return err; 634 return err;
635} 635}
636 636
637#ifdef CONFIG_HUGETLB_PAGE
637static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
638{ 639{
639 u64 pme = 0; 640 u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
664 665
665 return err; 666 return err;
666} 667}
668#endif /* HUGETLB_PAGE */
667 669
668/* 670/*
669 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
733 735
734 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
735 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
736 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
737 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
738 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
739 743
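The fix above is the standard shape for config-dependent callbacks: the callback definition and the place that wires it up must sit under the same guard, or !CONFIG_HUGETLB_PAGE builds break or warn about unused code. A generic sketch of the pattern (names illustrative, not kernel code):

	struct walker {
		int (*pmd_entry)(void *data);
	#ifdef CONFIG_HUGETLB_PAGE
		int (*hugetlb_entry)(void *data);
	#endif
	};

	#ifdef CONFIG_HUGETLB_PAGE
	static int handle_hugetlb(void *data) { return 0; }
	#endif

	static void setup_walk(struct walker *w, int (*pmd_cb)(void *))
	{
		w->pmd_entry = pmd_cb;
	#ifdef CONFIG_HUGETLB_PAGE
		w->hugetlb_entry = handle_hugetlb;	/* compiled out together */
	#endif
	}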
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 788b5802a7ce..655a4c52b8c3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -82,7 +82,7 @@
82 82
83/* 83/*
84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas 84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
85 * and quota formats, dqstats structure containing statistics about the lists 85 * and quota formats.
86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and 86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -132,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
135static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
137#endif
136static struct quota_format_type *quota_formats; /* List of registered formats */ 138static struct quota_format_type *quota_formats; /* List of registered formats */
137static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; 139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
138 140
@@ -226,6 +228,10 @@ static struct hlist_head *dquot_hash;
226 228
227struct dqstats dqstats; 229struct dqstats dqstats;
228EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231#ifdef CONFIG_SMP
232struct dqstats *dqstats_pcpu;
233EXPORT_SYMBOL(dqstats_pcpu);
234#endif
229 235
230static qsize_t inode_get_rsv_space(struct inode *inode); 236static qsize_t inode_get_rsv_space(struct inode *inode);
231static void __dquot_initialize(struct inode *inode, int type); 237static void __dquot_initialize(struct inode *inode, int type);
@@ -273,7 +279,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
273static inline void put_dquot_last(struct dquot *dquot) 279static inline void put_dquot_last(struct dquot *dquot)
274{ 280{
275 list_add_tail(&dquot->dq_free, &free_dquots); 281 list_add_tail(&dquot->dq_free, &free_dquots);
276 dqstats.free_dquots++; 282 dqstats_inc(DQST_FREE_DQUOTS);
277} 283}
278 284
279static inline void remove_free_dquot(struct dquot *dquot) 285static inline void remove_free_dquot(struct dquot *dquot)
@@ -281,7 +287,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
281 if (list_empty(&dquot->dq_free)) 287 if (list_empty(&dquot->dq_free))
282 return; 288 return;
283 list_del_init(&dquot->dq_free); 289 list_del_init(&dquot->dq_free);
284 dqstats.free_dquots--; 290 dqstats_dec(DQST_FREE_DQUOTS);
285} 291}
286 292
287static inline void put_inuse(struct dquot *dquot) 293static inline void put_inuse(struct dquot *dquot)
@@ -289,12 +295,12 @@ static inline void put_inuse(struct dquot *dquot)
289 /* We add to the back of inuse list so we don't have to restart 295 /* We add to the back of inuse list so we don't have to restart
290 * when traversing this list and we block */ 296 * when traversing this list and we block */
291 list_add_tail(&dquot->dq_inuse, &inuse_list); 297 list_add_tail(&dquot->dq_inuse, &inuse_list);
292 dqstats.allocated_dquots++; 298 dqstats_inc(DQST_ALLOC_DQUOTS);
293} 299}
294 300
295static inline void remove_inuse(struct dquot *dquot) 301static inline void remove_inuse(struct dquot *dquot)
296{ 302{
297 dqstats.allocated_dquots--; 303 dqstats_dec(DQST_ALLOC_DQUOTS);
298 list_del(&dquot->dq_inuse); 304 list_del(&dquot->dq_inuse);
299} 305}
300/* 306/*
@@ -317,14 +323,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
317 return dquot->dq_sb->dq_op->mark_dirty(dquot); 323 return dquot->dq_sb->dq_op->mark_dirty(dquot);
318} 324}
319 325
326/* Mark dquot dirty in atomic manner, and return its old dirty flag state */
320int dquot_mark_dquot_dirty(struct dquot *dquot) 327int dquot_mark_dquot_dirty(struct dquot *dquot)
321{ 328{
329 int ret = 1;
330
331 /* If quota is dirty already, we don't have to acquire dq_list_lock */
332 if (test_bit(DQ_MOD_B, &dquot->dq_flags))
333 return 1;
334
322 spin_lock(&dq_list_lock); 335 spin_lock(&dq_list_lock);
323 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) 336 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
324 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> 337 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
325 info[dquot->dq_type].dqi_dirty_list); 338 info[dquot->dq_type].dqi_dirty_list);
339 ret = 0;
340 }
326 spin_unlock(&dq_list_lock); 341 spin_unlock(&dq_list_lock);
327 return 0; 342 return ret;
328} 343}
329EXPORT_SYMBOL(dquot_mark_dquot_dirty); 344EXPORT_SYMBOL(dquot_mark_dquot_dirty);
330 345
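The rewritten dquot_mark_dquot_dirty() above is a double-checked flag: an unlocked test_bit() skips dq_list_lock entirely once the dquot is already dirty, while the test_and_set_bit() under the lock keeps the flag transition and the list_add() atomic with respect to concurrent markers. The pattern in isolation, as a sketch:

	/* Returns the previous dirty state; queues the item on the 0 -> 1 edge. */
	static int mark_dirty(unsigned long *flags, spinlock_t *lock,
			      struct list_head *item, struct list_head *dirty_list)
	{
		int was_dirty = 1;

		if (test_bit(DQ_MOD_B, flags))		/* lock-free fast path */
			return 1;

		spin_lock(lock);
		if (!test_and_set_bit(DQ_MOD_B, flags)) {
			list_add(item, dirty_list);	/* first marker queues it */
			was_dirty = 0;
		}
		spin_unlock(lock);
		return was_dirty;
	}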
@@ -550,8 +565,8 @@ int dquot_scan_active(struct super_block *sb,
550 continue; 565 continue;
551 /* Now we have active dquot so we can just increase use count */ 566 /* Now we have active dquot so we can just increase use count */
552 atomic_inc(&dquot->dq_count); 567 atomic_inc(&dquot->dq_count);
553 dqstats.lookups++;
554 spin_unlock(&dq_list_lock); 568 spin_unlock(&dq_list_lock);
569 dqstats_inc(DQST_LOOKUPS);
555 dqput(old_dquot); 570 dqput(old_dquot);
556 old_dquot = dquot; 571 old_dquot = dquot;
557 ret = fn(dquot, priv); 572 ret = fn(dquot, priv);
@@ -596,8 +611,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
596 * holding reference so we can safely just increase 611 * holding reference so we can safely just increase
597 * use count */ 612 * use count */
598 atomic_inc(&dquot->dq_count); 613 atomic_inc(&dquot->dq_count);
599 dqstats.lookups++;
600 spin_unlock(&dq_list_lock); 614 spin_unlock(&dq_list_lock);
615 dqstats_inc(DQST_LOOKUPS);
601 sb->dq_op->write_dquot(dquot); 616 sb->dq_op->write_dquot(dquot);
602 dqput(dquot); 617 dqput(dquot);
603 spin_lock(&dq_list_lock); 618 spin_lock(&dq_list_lock);
@@ -609,9 +624,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
609 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) 624 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
610 && info_dirty(&dqopt->info[cnt])) 625 && info_dirty(&dqopt->info[cnt]))
611 sb->dq_op->write_info(sb, cnt); 626 sb->dq_op->write_info(sb, cnt);
612 spin_lock(&dq_list_lock); 627 dqstats_inc(DQST_SYNCS);
613 dqstats.syncs++;
614 spin_unlock(&dq_list_lock);
615 mutex_unlock(&dqopt->dqonoff_mutex); 628 mutex_unlock(&dqopt->dqonoff_mutex);
616 629
617 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) 630 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -663,6 +676,22 @@ static void prune_dqcache(int count)
663 } 676 }
664} 677}
665 678
679static int dqstats_read(unsigned int type)
680{
681 int count = 0;
682#ifdef CONFIG_SMP
683 int cpu;
684 for_each_possible_cpu(cpu)
685 count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
686 /* Statistics reading is racy, but absolute accuracy isn't required */
687 if (count < 0)
688 count = 0;
689#else
690 count = dqstats.stat[type];
691#endif
692 return count;
693}
694
666/* 695/*
667 * This is called from kswapd when we think we need some 696 * This is called from kswapd when we think we need some
668 * more memory 697 * more memory
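dqstats_read() above is the read side of per-CPU counters: each CPU bumps only its own slot, and readers sum across all possible CPUs, tolerating the documented race. The increment side this patch uses everywhere (dqstats_inc/dqstats_dec) is not shown in these hunks; a sketch of the assumed shape, the real definitions living in include/linux/quota.h:

	#ifdef CONFIG_SMP
	static inline void dqstats_inc(unsigned int type)
	{
		/* Each CPU touches only its own copy: no shared cacheline, no lock. */
		per_cpu_ptr(dqstats_pcpu, smp_processor_id())->stat[type]++;
	}
	#else
	static inline void dqstats_inc(unsigned int type)
	{
		dqstats.stat[type]++;
	}
	#endif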
@@ -675,7 +704,7 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
675 prune_dqcache(nr); 704 prune_dqcache(nr);
676 spin_unlock(&dq_list_lock); 705 spin_unlock(&dq_list_lock);
677 } 706 }
678 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 707 return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
679} 708}
680 709
681static struct shrinker dqcache_shrinker = { 710static struct shrinker dqcache_shrinker = {
@@ -703,10 +732,7 @@ void dqput(struct dquot *dquot)
703 BUG(); 732 BUG();
704 } 733 }
705#endif 734#endif
706 735 dqstats_inc(DQST_DROPS);
707 spin_lock(&dq_list_lock);
708 dqstats.drops++;
709 spin_unlock(&dq_list_lock);
710we_slept: 736we_slept:
711 spin_lock(&dq_list_lock); 737 spin_lock(&dq_list_lock);
712 if (atomic_read(&dquot->dq_count) > 1) { 738 if (atomic_read(&dquot->dq_count) > 1) {
@@ -823,15 +849,15 @@ we_slept:
823 put_inuse(dquot); 849 put_inuse(dquot);
824 /* hash it first so it can be found */ 850 /* hash it first so it can be found */
825 insert_dquot_hash(dquot); 851 insert_dquot_hash(dquot);
826 dqstats.lookups++;
827 spin_unlock(&dq_list_lock); 852 spin_unlock(&dq_list_lock);
853 dqstats_inc(DQST_LOOKUPS);
828 } else { 854 } else {
829 if (!atomic_read(&dquot->dq_count)) 855 if (!atomic_read(&dquot->dq_count))
830 remove_free_dquot(dquot); 856 remove_free_dquot(dquot);
831 atomic_inc(&dquot->dq_count); 857 atomic_inc(&dquot->dq_count);
832 dqstats.cache_hits++;
833 dqstats.lookups++;
834 spin_unlock(&dq_list_lock); 858 spin_unlock(&dq_list_lock);
859 dqstats_inc(DQST_CACHE_HITS);
860 dqstats_inc(DQST_LOOKUPS);
835 } 861 }
836 /* Wait for dq_lock - after this we know that either dquot_release() is 862 /* Wait for dq_lock - after this we know that either dquot_release() is
837 * already finished or it will be canceled due to dq_count > 1 test */ 863 * already finished or it will be canceled due to dq_count > 1 test */
@@ -1677,16 +1703,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1677 1703
1678/* 1704/*
1679 * Transfer the number of inodes and blocks from one diskquota to another. 1705 * Transfer the number of inodes and blocks from one diskquota to another.
1706 * On success, dquot references in transfer_to are consumed and references
1707 * to original dquots that need to be released are placed there. On failure,
1708 * references are kept untouched.
1680 * 1709 *
1681 * This operation can block, but only after everything is updated 1710 * This operation can block, but only after everything is updated
1682 * A transaction must be started when entering this function. 1711 * A transaction must be started when entering this function.
1712 *
1683 */ 1713 */
1684static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1714int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1685{ 1715{
1686 qsize_t space, cur_space; 1716 qsize_t space, cur_space;
1687 qsize_t rsv_space = 0; 1717 qsize_t rsv_space = 0;
1688 struct dquot *transfer_from[MAXQUOTAS]; 1718 struct dquot *transfer_from[MAXQUOTAS] = {};
1689 struct dquot *transfer_to[MAXQUOTAS];
1690 int cnt, ret = 0; 1719 int cnt, ret = 0;
1691 char warntype_to[MAXQUOTAS]; 1720 char warntype_to[MAXQUOTAS];
1692 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1721 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1696,19 +1725,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1696 if (IS_NOQUOTA(inode)) 1725 if (IS_NOQUOTA(inode))
1697 return 0; 1726 return 0;
1698 /* Initialize the arrays */ 1727 /* Initialize the arrays */
1699 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1700 transfer_from[cnt] = NULL;
1701 transfer_to[cnt] = NULL;
1702 warntype_to[cnt] = QUOTA_NL_NOWARN; 1729 warntype_to[cnt] = QUOTA_NL_NOWARN;
1703 }
1704 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1705 if (mask & (1 << cnt))
1706 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1707 }
1708 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1730 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1709 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1731 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1710 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1732 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1711 goto put_all; 1733 return 0;
1712 } 1734 }
1713 spin_lock(&dq_data_lock); 1735 spin_lock(&dq_data_lock);
1714 cur_space = inode_get_bytes(inode); 1736 cur_space = inode_get_bytes(inode);
@@ -1760,47 +1782,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1760 1782
1761 mark_all_dquot_dirty(transfer_from); 1783 mark_all_dquot_dirty(transfer_from);
1762 mark_all_dquot_dirty(transfer_to); 1784 mark_all_dquot_dirty(transfer_to);
1763 /* The reference we got is transferred to the inode */ 1785 /* Pass back references to put */
1764 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1786 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1765 transfer_to[cnt] = NULL; 1787 transfer_to[cnt] = transfer_from[cnt];
1766warn_put_all: 1788warn:
1767 flush_warnings(transfer_to, warntype_to); 1789 flush_warnings(transfer_to, warntype_to);
1768 flush_warnings(transfer_from, warntype_from_inodes); 1790 flush_warnings(transfer_from, warntype_from_inodes);
1769 flush_warnings(transfer_from, warntype_from_space); 1791 flush_warnings(transfer_from, warntype_from_space);
1770put_all:
1771 dqput_all(transfer_from);
1772 dqput_all(transfer_to);
1773 return ret; 1792 return ret;
1774over_quota: 1793over_quota:
1775 spin_unlock(&dq_data_lock); 1794 spin_unlock(&dq_data_lock);
1776 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1795 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1777 /* Clear dquot pointers we don't want to dqput() */ 1796 goto warn;
1778 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1779 transfer_from[cnt] = NULL;
1780 goto warn_put_all;
1781} 1797}
1798EXPORT_SYMBOL(__dquot_transfer);
1782 1799
1783/* Wrapper for transferring ownership of an inode for uid/gid only 1800/* Wrapper for transferring ownership of an inode for uid/gid only
1784 * Called from FSXXX_setattr() 1801 * Called from FSXXX_setattr()
1785 */ 1802 */
1786int dquot_transfer(struct inode *inode, struct iattr *iattr) 1803int dquot_transfer(struct inode *inode, struct iattr *iattr)
1787{ 1804{
1788 qid_t chid[MAXQUOTAS]; 1805 struct dquot *transfer_to[MAXQUOTAS] = {};
1789 unsigned long mask = 0; 1806 struct super_block *sb = inode->i_sb;
1807 int ret;
1790 1808
1791 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1809 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
1792 mask |= 1 << USRQUOTA; 1810 return 0;
1793 chid[USRQUOTA] = iattr->ia_uid; 1811
1794 } 1812 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1795 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1813 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1796 mask |= 1 << GRPQUOTA; 1814 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1797 chid[GRPQUOTA] = iattr->ia_gid; 1815 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1798 } 1816
1799 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1817 ret = __dquot_transfer(inode, transfer_to);
1800 dquot_initialize(inode); 1818 dqput_all(transfer_to);
1801 return __dquot_transfer(inode, chid, mask); 1819 return ret;
1802 }
1803 return 0;
1804} 1820}
1805EXPORT_SYMBOL(dquot_transfer); 1821EXPORT_SYMBOL(dquot_transfer);
1806 1822
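The new calling convention is easiest to see from a caller's side. A minimal sketch of a filesystem setattr path under this API (examplefs_setattr is a hypothetical name; is_quota_modification() and dquot_initialize() are the same helpers the reiserfs hunk below switches to):

	static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		/* quota must be attached before ownership can change hands */
		if (is_quota_modification(inode, attr))
			dquot_initialize(inode);

		/* dquot_transfer() now reads the new ids from the iattr and
		 * handles the dqget()/dqput_all() pairing internally */
		error = dquot_transfer(inode, attr);
		if (error)
			return error;

		return inode_setattr(inode, attr);
	}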
@@ -2275,25 +2291,30 @@ static inline qsize_t stoqb(qsize_t space)
2275} 2291}
2276 2292
2277/* Generic routine for getting common part of quota structure */ 2293/* Generic routine for getting common part of quota structure */
2278static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2294static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2279{ 2295{
2280 struct mem_dqblk *dm = &dquot->dq_dqb; 2296 struct mem_dqblk *dm = &dquot->dq_dqb;
2281 2297
2298 memset(di, 0, sizeof(*di));
2299 di->d_version = FS_DQUOT_VERSION;
2300 di->d_flags = dquot->dq_type == USRQUOTA ?
2301 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
2302 di->d_id = dquot->dq_id;
2303
2282 spin_lock(&dq_data_lock); 2304 spin_lock(&dq_data_lock);
2283 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2305 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2284 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2306 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2285 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2307 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2286 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2308 di->d_ino_softlimit = dm->dqb_isoftlimit;
2287 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2309 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2288 di->dqb_curinodes = dm->dqb_curinodes; 2310 di->d_icount = dm->dqb_curinodes;
2289 di->dqb_btime = dm->dqb_btime; 2311 di->d_btimer = dm->dqb_btime;
2290 di->dqb_itime = dm->dqb_itime; 2312 di->d_itimer = dm->dqb_itime;
2291 di->dqb_valid = QIF_ALL;
2292 spin_unlock(&dq_data_lock); 2313 spin_unlock(&dq_data_lock);
2293} 2314}
2294 2315
2295int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2316int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2296 struct if_dqblk *di) 2317 struct fs_disk_quota *di)
2297{ 2318{
2298 struct dquot *dquot; 2319 struct dquot *dquot;
2299 2320
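Limits in struct fs_disk_quota are carried in quota blocks rather than bytes, which is why the getter wraps the limit fields in stoqb(). A sketch of the conversion pair assumed here (the real helpers sit near the top of this file):

	#define QUOTABLOCK_BITS 10
	#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)

	/* bytes -> quota blocks, rounded up so a partial block still counts */
	static inline qsize_t stoqb(qsize_t space)
	{
		return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
	}

	/* quota blocks -> bytes */
	static inline qsize_t qbtos(qsize_t blocks)
	{
		return blocks << QUOTABLOCK_BITS;
	}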
@@ -2307,51 +2328,70 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2307} 2328}
2308EXPORT_SYMBOL(vfs_get_dqblk); 2329EXPORT_SYMBOL(vfs_get_dqblk);
2309 2330
2331#define VFS_FS_DQ_MASK \
2332 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2333 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2334 FS_DQ_BTIMER | FS_DQ_ITIMER)
2335
2310/* Generic routine for setting common part of quota structure */ 2336/* Generic routine for setting common part of quota structure */
2311static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2337static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2312{ 2338{
2313 struct mem_dqblk *dm = &dquot->dq_dqb; 2339 struct mem_dqblk *dm = &dquot->dq_dqb;
2314 int check_blim = 0, check_ilim = 0; 2340 int check_blim = 0, check_ilim = 0;
2315 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2341 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2316 2342
2317 if ((di->dqb_valid & QIF_BLIMITS && 2343 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2318 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2344 return -EINVAL;
2319 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2345
2320 (di->dqb_valid & QIF_ILIMITS && 2346 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2321 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2347 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2322 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2348 ((di->d_fieldmask & FS_DQ_BHARD) &&
2349 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2350 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2351 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2352 ((di->d_fieldmask & FS_DQ_IHARD) &&
2353 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2323 return -ERANGE; 2354 return -ERANGE;
2324 2355
2325 spin_lock(&dq_data_lock); 2356 spin_lock(&dq_data_lock);
2326 if (di->dqb_valid & QIF_SPACE) { 2357 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2327 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2358 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2328 check_blim = 1; 2359 check_blim = 1;
2329 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2360 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2330 } 2361 }
2331 if (di->dqb_valid & QIF_BLIMITS) { 2362
2332 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2363 if (di->d_fieldmask & FS_DQ_BSOFT)
2333 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2364 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2365 if (di->d_fieldmask & FS_DQ_BHARD)
2366 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2367 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2334 check_blim = 1; 2368 check_blim = 1;
2335 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2369 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2336 } 2370 }
2337 if (di->dqb_valid & QIF_INODES) { 2371
2338 dm->dqb_curinodes = di->dqb_curinodes; 2372 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2373 dm->dqb_curinodes = di->d_icount;
2339 check_ilim = 1; 2374 check_ilim = 1;
2340 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2375 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2341 } 2376 }
2342 if (di->dqb_valid & QIF_ILIMITS) { 2377
2343 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2378 if (di->d_fieldmask & FS_DQ_ISOFT)
2344 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2379 dm->dqb_isoftlimit = di->d_ino_softlimit;
2380 if (di->d_fieldmask & FS_DQ_IHARD)
2381 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2382 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2345 check_ilim = 1; 2383 check_ilim = 1;
2346 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2384 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2347 } 2385 }
2348 if (di->dqb_valid & QIF_BTIME) { 2386
2349 dm->dqb_btime = di->dqb_btime; 2387 if (di->d_fieldmask & FS_DQ_BTIMER) {
2388 dm->dqb_btime = di->d_btimer;
2350 check_blim = 1; 2389 check_blim = 1;
2351 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2390 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2352 } 2391 }
2353 if (di->dqb_valid & QIF_ITIME) { 2392
2354 dm->dqb_itime = di->dqb_itime; 2393 if (di->d_fieldmask & FS_DQ_ITIMER) {
2394 dm->dqb_itime = di->d_itimer;
2355 check_ilim = 1; 2395 check_ilim = 1;
2356 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2396 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2357 } 2397 }
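Because each FS_DQ_* bit is applied independently, a caller can now adjust a single limit without first fetching the whole structure. A hedged fragment (assumes sb and uid are in scope) that raises only the block limits:

	struct fs_disk_quota fdq = {
		.d_version       = FS_DQUOT_VERSION,
		.d_fieldmask     = FS_DQ_BSOFT | FS_DQ_BHARD,
		.d_blk_softlimit = 1000000,	/* quota blocks, not bytes */
		.d_blk_hardlimit = 1100000,
	};
	int ret;

	/* fields outside d_fieldmask are ignored by do_set_dqblk() */
	ret = sb->s_qcop->set_dqblk(sb, USRQUOTA, uid, &fdq);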
@@ -2361,7 +2401,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2361 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2401 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2362 dm->dqb_btime = 0; 2402 dm->dqb_btime = 0;
2363 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2403 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2364 } else if (!(di->dqb_valid & QIF_BTIME)) 2404 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2365 /* Set grace only if user hasn't provided his own... */ 2405 /* Set grace only if user hasn't provided his own... */
2366 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2406 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2367 } 2407 }
@@ -2370,7 +2410,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2370 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2410 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2371 dm->dqb_itime = 0; 2411 dm->dqb_itime = 0;
2372 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2412 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2373 } else if (!(di->dqb_valid & QIF_ITIME)) 2413 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2374 /* Set grace only if user hasn't provided his own... */ 2414 /* Set grace only if user hasn't provided his own... */
2375 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2415 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2376 } 2416 }
@@ -2386,7 +2426,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2386} 2426}
2387 2427
2388int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2428int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2389 struct if_dqblk *di) 2429 struct fs_disk_quota *di)
2390{ 2430{
2391 struct dquot *dquot; 2431 struct dquot *dquot;
2392 int rc; 2432 int rc;
@@ -2465,62 +2505,74 @@ const struct quotactl_ops vfs_quotactl_ops = {
2465 .set_dqblk = vfs_set_dqblk 2505 .set_dqblk = vfs_set_dqblk
2466}; 2506};
2467 2507
2508
2509static int do_proc_dqstats(struct ctl_table *table, int write,
2510 void __user *buffer, size_t *lenp, loff_t *ppos)
2511{
2512#ifdef CONFIG_SMP
2513 /* Update global table */
2514 unsigned int type = (int *)table->data - dqstats.stat;
2515 dqstats.stat[type] = dqstats_read(type);
2516#endif
2517 return proc_dointvec(table, write, buffer, lenp, ppos);
2518}
2519
2468static ctl_table fs_dqstats_table[] = { 2520static ctl_table fs_dqstats_table[] = {
2469 { 2521 {
2470 .procname = "lookups", 2522 .procname = "lookups",
2471 .data = &dqstats.lookups, 2523 .data = &dqstats.stat[DQST_LOOKUPS],
2472 .maxlen = sizeof(int), 2524 .maxlen = sizeof(int),
2473 .mode = 0444, 2525 .mode = 0444,
2474 .proc_handler = proc_dointvec, 2526 .proc_handler = do_proc_dqstats,
2475 }, 2527 },
2476 { 2528 {
2477 .procname = "drops", 2529 .procname = "drops",
2478 .data = &dqstats.drops, 2530 .data = &dqstats.stat[DQST_DROPS],
2479 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2480 .mode = 0444, 2532 .mode = 0444,
2481 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2482 }, 2534 },
2483 { 2535 {
2484 .procname = "reads", 2536 .procname = "reads",
2485 .data = &dqstats.reads, 2537 .data = &dqstats.stat[DQST_READS],
2486 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2487 .mode = 0444, 2539 .mode = 0444,
2488 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2489 }, 2541 },
2490 { 2542 {
2491 .procname = "writes", 2543 .procname = "writes",
2492 .data = &dqstats.writes, 2544 .data = &dqstats.stat[DQST_WRITES],
2493 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2494 .mode = 0444, 2546 .mode = 0444,
2495 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2496 }, 2548 },
2497 { 2549 {
2498 .procname = "cache_hits", 2550 .procname = "cache_hits",
2499 .data = &dqstats.cache_hits, 2551 .data = &dqstats.stat[DQST_CACHE_HITS],
2500 .maxlen = sizeof(int), 2552 .maxlen = sizeof(int),
2501 .mode = 0444, 2553 .mode = 0444,
2502 .proc_handler = proc_dointvec, 2554 .proc_handler = do_proc_dqstats,
2503 }, 2555 },
2504 { 2556 {
2505 .procname = "allocated_dquots", 2557 .procname = "allocated_dquots",
2506 .data = &dqstats.allocated_dquots, 2558 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2507 .maxlen = sizeof(int), 2559 .maxlen = sizeof(int),
2508 .mode = 0444, 2560 .mode = 0444,
2509 .proc_handler = proc_dointvec, 2561 .proc_handler = do_proc_dqstats,
2510 }, 2562 },
2511 { 2563 {
2512 .procname = "free_dquots", 2564 .procname = "free_dquots",
2513 .data = &dqstats.free_dquots, 2565 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2514 .maxlen = sizeof(int), 2566 .maxlen = sizeof(int),
2515 .mode = 0444, 2567 .mode = 0444,
2516 .proc_handler = proc_dointvec, 2568 .proc_handler = do_proc_dqstats,
2517 }, 2569 },
2518 { 2570 {
2519 .procname = "syncs", 2571 .procname = "syncs",
2520 .data = &dqstats.syncs, 2572 .data = &dqstats.stat[DQST_SYNCS],
2521 .maxlen = sizeof(int), 2573 .maxlen = sizeof(int),
2522 .mode = 0444, 2574 .mode = 0444,
2523 .proc_handler = proc_dointvec, 2575 .proc_handler = do_proc_dqstats,
2524 }, 2576 },
2525#ifdef CONFIG_PRINT_QUOTA_WARNING 2577#ifdef CONFIG_PRINT_QUOTA_WARNING
2526 { 2578 {
@@ -2572,6 +2624,13 @@ static int __init dquot_init(void)
2572 if (!dquot_hash) 2624 if (!dquot_hash)
2573 panic("Cannot create dquot hash table"); 2625 panic("Cannot create dquot hash table");
2574 2626
2627#ifdef CONFIG_SMP
2628 dqstats_pcpu = alloc_percpu(struct dqstats);
2629 if (!dqstats_pcpu)
2630 panic("Cannot create dquot stats table");
2631#endif
2632 memset(&dqstats, 0, sizeof(struct dqstats));
2633
2575 /* Find power-of-two hlist_heads which can fit into allocation */ 2634 /* Find power-of-two hlist_heads which can fit into allocation */
2576 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2635 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2577 dq_hash_bits = 0; 2636 dq_hash_bits = 0;
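dqstats_inc() and dqstats_read() come from the quota headers; a sketch of their likely shape on SMP (details may differ from the real header), which explains why the callers above no longer take dq_list_lock around counter updates:

	#ifdef CONFIG_SMP
	extern struct dqstats *dqstats_pcpu;	/* allocated in dquot_init() */

	static inline void dqstats_inc(unsigned int type)
	{
		/* each CPU bumps its own copy - no shared lock needed */
		this_cpu_inc(dqstats_pcpu->stat[type]);
	}

	static inline unsigned int dqstats_read(unsigned int type)
	{
		/* fold the per-CPU copies; a slightly stale sum is fine */
		unsigned int sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
		return sum;
	}
	#else
	#define dqstats_inc(type)	(dqstats.stat[type]++)
	#define dqstats_read(type)	(dqstats.stat[type])
	#endif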
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..ce3dfd066f59 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
79 65
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
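iterate_supers() absorbs the sb_lock/s_count/s_umount choreography the deleted loop open-coded, invoking the callback with s_umount held shared and only for superblocks that still have a root. A hypothetical reuse of the same pattern:

	/* count mounted filesystems that implement quotactl */
	static void count_quota_capable(struct super_block *sb, void *arg)
	{
		if (sb->s_qcop)
			(*(int *)arg)++;
	}

	static int quota_capable_supers(void)
	{
		int count = 0;

		iterate_supers(count_quota_capable, &count);
		return count;
	}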
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
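With get_dqblk/set_dqblk speaking struct fs_disk_quota natively, one callback pair now serves both the VFS (Q_GETQUOTA/Q_SETQUOTA) and XFS-style (Q_XGETQUOTA/Q_XSETQLIM) paths. A sketch of the resulting ops table (the examplefs_* names are hypothetical):

	static const struct quotactl_ops examplefs_qctl_ops = {
		.quota_sync = examplefs_quota_sync,
		.get_dqblk  = examplefs_get_dqblk,	/* struct fs_disk_quota */
		.set_dqblk  = examplefs_set_dqblk,
	};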
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..24f03407eeb5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 q_warn(KERN_WARNING "VFS: dquota write failed on "
69 "dev %s\n", sb->s_id);
70 if (ret >= 0)
71 ret = -EIO;
72 }
73 return ret;
66} 74}
67 75
68/* Remove empty block from list and return it */ 76/* Remove empty block from list and return it */
@@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 160 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 161 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 162 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 163 q_warn(KERN_ERR
156 "VFS: Can't write block (%u) with free entries.\n", 164 "VFS: Can't write block (%u) with free entries.\n",
157 blk); 165 blk);
158 return 0; 166 return 0;
@@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 252 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 253 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 254 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 255 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
248 "remove block (%u) from entry free list.\n", 256 "remove block (%u) from entry free list.\n",
249 blk); 257 blk);
250 goto out_buf; 258 goto out_buf;
@@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
268#endif 276#endif
269 *err = write_blk(info, blk, buf); 277 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 278 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 279 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
272 "data block %u.\n", blk); 280 "data block %u.\n", blk);
273 goto out_buf; 281 goto out_buf;
274 } 282 }
@@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 311 } else {
304 ret = read_blk(info, *treeblk, buf); 312 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 313 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 314 q_warn(KERN_ERR "VFS: Can't read tree quota block "
307 "%u.\n", *treeblk); 315 "%u.\n", *treeblk);
308 goto out_buf; 316 goto out_buf;
309 } 317 }
@@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 373 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 374 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 375 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 376 q_warn(KERN_ERR "VFS: Error %zd occurred while "
369 "creating quota.\n", ret); 377 "creating quota.\n", ret);
370 kfree(ddquot); 378 kfree(ddquot);
371 return ret; 379 return ret;
@@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 385 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 386 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 387 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 388 q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
381 sb->s_id); 389 sb->s_id);
382 if (ret >= 0) 390 if (ret >= 0)
383 ret = -ENOSPC; 391 ret = -ENOSPC;
384 } else { 392 } else {
385 ret = 0; 393 ret = 0;
386 } 394 }
387 dqstats.writes++; 395 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 396 kfree(ddquot);
389 397
390 return ret; 398 return ret;
@@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 410 if (!buf)
403 return -ENOMEM; 411 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 412 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 413 q_warn(KERN_ERR "VFS: Quota structure has offset to other "
406 "block (%u) than it should (%u).\n", blk, 414 "block (%u) than it should (%u).\n", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 415 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 416 goto out_buf;
409 } 417 }
410 ret = read_blk(info, blk, buf); 418 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 419 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 420 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
413 goto out_buf; 421 goto out_buf;
414 } 422 }
415 dh = (struct qt_disk_dqdbheader *)buf; 423 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 427 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 428 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 429 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 430 q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
423 "to free list.\n", blk); 431 "to free list.\n", blk);
424 goto out_buf; 432 goto out_buf;
425 } 433 }
@@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 440 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 441 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 442 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 443 q_warn(KERN_ERR "VFS: Can't insert quota data "
436 "block (%u) to free entry list.\n", blk); 444 "block (%u) to free entry list.\n", blk);
437 goto out_buf; 445 goto out_buf;
438 } 446 }
439 } else { 447 } else {
440 ret = write_blk(info, blk, buf); 448 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 449 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 450 q_warn(KERN_ERR "VFS: Can't write quota data "
443 "block %u\n", blk); 451 "block %u\n", blk);
444 goto out_buf; 452 goto out_buf;
445 } 453 }
@@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 472 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 473 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 474 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 475 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
468 goto out_buf; 476 goto out_buf;
469 } 477 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 478 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 496 } else {
489 ret = write_blk(info, *blk, buf); 497 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 498 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 499 q_warn(KERN_ERR "VFS: Can't write quota tree "
492 "block %u.\n", *blk); 500 "block %u.\n", *blk);
493 } 501 }
494 } 502 }
@@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 529 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 530 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 531 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 532 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
525 goto out_buf; 533 goto out_buf;
526 } 534 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 535 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 539 ddquot += info->dqi_entry_size;
532 } 540 }
533 if (i == qtree_dqstr_in_blk(info)) { 541 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 542 q_warn(KERN_ERR "VFS: Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 543 "but not present.\n", dquot->dq_id);
536 ret = -EIO; 544 ret = -EIO;
537 goto out_buf; 545 goto out_buf;
@@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 564 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 565 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 566 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 567 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
560 goto out_buf; 568 goto out_buf;
561 } 569 }
562 ret = 0; 570 ret = 0;
@@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 607 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 608 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 609 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 610 q_warn(KERN_ERR "VFS: Can't read quota "
603 "structure for id %u.\n", dquot->dq_id); 611 "structure for id %u.\n", dquot->dq_id);
604 dquot->dq_off = 0; 612 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 613 set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 625 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 626 if (ret >= 0)
619 ret = -EIO; 627 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 628 q_warn(KERN_ERR "VFS: Error while reading quota "
621 "structure for id %u.\n", dquot->dq_id); 629 "structure for id %u.\n", dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 630 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 642 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 643 kfree(ddquot);
636out: 644out:
637 dqstats.reads++; 645 dqstats_inc(DQST_READS);
638 return ret; 646 return ret;
639} 647}
640EXPORT_SYMBOL(qtree_read_dquot); 648EXPORT_SYMBOL(qtree_read_dquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..ccc3e71fb1d8 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,4 +22,10 @@ struct qt_disk_dqdbheader {
22 22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ 23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24 24
25#define q_warn(fmt, args...) \
26do { \
27 if (printk_ratelimit()) \
28 printk(fmt, ## args); \
29} while(0)
30
25#endif /* _LINUX_QUOTAIO_TREE_H */ 31#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..4af344c5852a 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 104 ret = 0;
105 105
106out: 106out:
107 dqstats.writes++; 107 dqstats_inc(DQST_WRITES);
108 108
109 return ret; 109 return ret;
110} 110}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..135206af1458 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 q_warn(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n", 67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size); 68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 69 return 0;
@@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 109 q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
110 sb->s_id); 110 sb->s_id);
111 return -1; 111 return -1;
112 } 112 }
@@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 169 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 170 q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
171 sb->s_id); 171 sb->s_id);
172 return -1; 172 return -1;
173 } 173 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c94853473ca9..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
53}; 53};
54 54
55struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
56{ 57{
57 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
58 59
59 if (inode) { 60 if (inode) {
60 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
61 inode->i_uid = current_fsuid();
62 inode->i_gid = current_fsgid();
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
65 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
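inode_init_owner() folds in exactly the ownership logic being deleted here and in the reiserfs hunk below; reconstructed from that removed code, it behaves roughly like this sketch (the authoritative helper lives in fs/inode.c):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      int mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && dir->i_mode & S_ISGID) {
			/* setgid directory: new inode inherits the group */
			inode->i_gid = dir->i_gid;
			if (S_ISDIR(mode))
				mode |= S_ISGID;
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}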
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
95static int 94static int
96ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
97{ 96{
98 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
99 int error = -ENOSPC; 98 int error = -ENOSPC;
100 99
101 if (inode) { 100 if (inode) {
102 if (dir->i_mode & S_ISGID) {
103 inode->i_gid = dir->i_gid;
104 if (S_ISDIR(mode))
105 inode->i_mode |= S_ISGID;
106 }
107 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
108 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
109 error = 0; 103 error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
130 struct inode *inode; 124 struct inode *inode;
131 int error = -ENOSPC; 125 int error = -ENOSPC;
132 126
133 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
134 if (inode) { 128 if (inode) {
135 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
136 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
137 if (!error) { 131 if (!error) {
138 if (dir->i_mode & S_ISGID)
139 inode->i_gid = dir->i_gid;
140 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
141 dget(dentry); 133 dget(dentry);
142 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -214,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
214 return 0; 206 return 0;
215} 207}
216 208
217static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
218{ 210{
219 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
220 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -241,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
241 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
242 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
243 235
244 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
245 if (!inode) { 237 if (!inode) {
246 err = -ENOMEM; 238 err = -ENOMEM;
247 goto fail; 239 goto fail;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
151 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 152 if (barrier_done < 0)
152 return barrier_done; 153 return barrier_done;
153 return (err < 0) ? -EIO : 0; 154 return (err < 0) ? -EIO : 0;
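The extra arguments reflect the blkdev_issue_flush() signature change in this release; as assumed by this call site it is:

	int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
			       sector_t *error_sector, unsigned long flags);

	/* BLKDEV_IFL_WAIT: do not return until the flush has completed */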
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index dc2c65e04853..0f22fdaf54ac 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3076,9 +3076,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3077 3077
3078 depth = reiserfs_write_lock_once(inode->i_sb); 3078 depth = reiserfs_write_lock_once(inode->i_sb);
3079 if (attr->ia_valid & ATTR_SIZE) { 3079 if (is_quota_modification(inode, attr))
3080 dquot_initialize(inode); 3080 dquot_initialize(inode);
3081 3081
3082 if (attr->ia_valid & ATTR_SIZE) {
3082 /* version 2 items will be caught by the s_maxbytes check 3083 /* version 2 items will be caught by the s_maxbytes check
3083 ** done for us in vmtruncate 3084 ** done for us in vmtruncate
3084 */ 3085 */
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ffc..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
561*/ 561*/
562static int new_inode_init(struct inode *inode, struct inode *dir, int mode) 562static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
563{ 563{
564
565 /* the quota init calls have to know who to charge the quota to, so
566 ** we have to set uid and gid here
567 */
568 inode->i_uid = current_fsuid();
569 inode->i_mode = mode;
570 /* Make inode invalid - just in case we are going to drop it before 564 /* Make inode invalid - just in case we are going to drop it before
571 * the initialization happens */ 565 * the initialization happens */
572 INODE_PKEY(inode)->k_objectid = 0; 566 INODE_PKEY(inode)->k_objectid = 0;
573 567 /* the quota init calls have to know who to charge the quota to, so
574 if (dir->i_mode & S_ISGID) { 568 ** we have to set uid and gid here
575 inode->i_gid = dir->i_gid; 569 */
576 if (S_ISDIR(mode)) 570 inode_init_owner(inode, dir, mode);
577 inode->i_mode |= S_ISGID;
578 } else {
579 inode->i_gid = current_fsgid();
580 }
581 dquot_initialize(inode); 571 dquot_initialize(inode);
582 return 0; 572 return 0;
583} 573}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636dc..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
723 (handler) = *(handlers)++) 723 (handler) = *(handlers)++)
724 724
725/* This is the implementation for the xattr plugin infrastructure */ 725/* This is the implementation for the xattr plugin infrastructure */
726static inline struct xattr_handler * 726static inline const struct xattr_handler *
727find_xattr_handler_prefix(struct xattr_handler **handlers, 727find_xattr_handler_prefix(const struct xattr_handler **handlers,
728 const char *name) 728 const char *name)
729{ 729{
730 struct xattr_handler *xah; 730 const struct xattr_handler *xah;
731 731
732 if (!handlers) 732 if (!handlers)
733 return NULL; 733 return NULL;
@@ -748,7 +748,7 @@ ssize_t
748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
749 size_t size) 749 size_t size)
750{ 750{
751 struct xattr_handler *handler; 751 const struct xattr_handler *handler;
752 752
753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
754 754
@@ -767,7 +767,7 @@ int
767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
768 size_t size, int flags) 768 size_t size, int flags)
769{ 769{
770 struct xattr_handler *handler; 770 const struct xattr_handler *handler;
771 771
772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
773 773
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
784 */ 784 */
785int reiserfs_removexattr(struct dentry *dentry, const char *name) 785int reiserfs_removexattr(struct dentry *dentry, const char *name)
786{ 786{
787 struct xattr_handler *handler; 787 const struct xattr_handler *handler;
788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
789 789
790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
807 size_t size; 807 size_t size;
808 if (name[0] != '.' || 808 if (name[0] != '.' ||
809 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 809 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
810 struct xattr_handler *handler; 810 const struct xattr_handler *handler;
811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
812 name); 812 name);
813 if (!handler) /* Unsupported xattr name */ 813 if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
920#endif 920#endif
921 921
922/* Actual operations that are exported to VFS-land */ 922/* Actual operations that are exported to VFS-land */
923struct xattr_handler *reiserfs_xattr_handlers[] = { 923const struct xattr_handler *reiserfs_xattr_handlers[] = {
924#ifdef CONFIG_REISERFS_FS_XATTR 924#ifdef CONFIG_REISERFS_FS_XATTR
925 &reiserfs_xattr_user_handler, 925 &reiserfs_xattr_user_handler,
926 &reiserfs_xattr_trusted_handler, 926 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a9..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
500 return size; 500 return size;
501} 501}
502 502
503struct xattr_handler reiserfs_posix_acl_access_handler = { 503const struct xattr_handler reiserfs_posix_acl_access_handler = {
504 .prefix = POSIX_ACL_XATTR_ACCESS, 504 .prefix = POSIX_ACL_XATTR_ACCESS,
505 .flags = ACL_TYPE_ACCESS, 505 .flags = ACL_TYPE_ACCESS,
506 .get = posix_acl_get, 506 .get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
520 return size; 520 return size;
521} 521}
522 522
523struct xattr_handler reiserfs_posix_acl_default_handler = { 523const struct xattr_handler reiserfs_posix_acl_default_handler = {
524 .prefix = POSIX_ACL_XATTR_DEFAULT, 524 .prefix = POSIX_ACL_XATTR_DEFAULT,
525 .flags = ACL_TYPE_DEFAULT, 525 .flags = ACL_TYPE_DEFAULT,
526 .get = posix_acl_get, 526 .get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c041..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
111 sec->value = NULL; 111 sec->value = NULL;
112} 112}
113 113
114struct xattr_handler reiserfs_xattr_security_handler = { 114const struct xattr_handler reiserfs_xattr_security_handler = {
115 .prefix = XATTR_SECURITY_PREFIX, 115 .prefix = XATTR_SECURITY_PREFIX,
116 .get = security_get, 116 .get = security_get,
117 .set = security_set, 117 .set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
48 return len; 48 return len;
49} 49}
50 50
51struct xattr_handler reiserfs_xattr_trusted_handler = { 51const struct xattr_handler reiserfs_xattr_trusted_handler = {
52 .prefix = XATTR_TRUSTED_PREFIX, 52 .prefix = XATTR_TRUSTED_PREFIX,
53 .get = trusted_get, 53 .get = trusted_get,
54 .set = trusted_set, 54 .set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
44 return len; 44 return len;
45} 45}
46 46
47struct xattr_handler reiserfs_xattr_user_handler = { 47const struct xattr_handler reiserfs_xattr_user_handler = {
48 .prefix = XATTR_USER_PREFIX, 48 .prefix = XATTR_USER_PREFIX,
49 .get = user_get, 49 .get = user_get,
50 .set = user_set, 50 .set = user_set,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..6c978428892d 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -39,7 +39,7 @@ const struct file_operations smb_dir_operations =
39{ 39{
40 .read = generic_read_dir, 40 .read = generic_read_dir,
41 .readdir = smb_readdir, 41 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 42 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 43 .open = smb_dir_open,
44}; 44};
45 45
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..84ecf0e43f91 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -437,7 +437,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 437 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 438 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 439 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 440 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 441 .mmap = smb_file_mmap,
442 .open = smb_file_open, 442 .open = smb_file_open,
443 .release = smb_file_release, 443 .release = smb_file_release,
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
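This is the standard BKL push-down: the VFS no longer wraps ->unlocked_ioctl in the big kernel lock, so a driver not yet audited for lockless operation takes the lock itself. The general shape of the conversion (a sketch, with hypothetical examplefs names):

	static long examplefs_ioctl(struct file *filp, unsigned int cmd,
				    unsigned long arg)
	{
		long result = -ENOTTY;

		lock_kernel();	/* preserves the old .ioctl serialization */
		switch (cmd) {
		/* ... per-command handling; the inode is now reached via
		 *     filp->f_path.dentry->d_inode ... */
		}
		unlock_kernel();
		return result;
	}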
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
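splice_grow_spd()/splice_shrink_spd() bracket every struct splice_pipe_desc user from here on, since the on-stack arrays only cover the default pipe size. The usage pattern, as a fragment:

	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages   = pages,
		.partial = partial,
	};

	if (splice_grow_spd(pipe, &spd))	/* kmallocs if pipe was grown */
		return -ENOMEM;
	/* ... fill spd.pages[] / spd.partial[] through spd, not the arrays ... */
	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);		/* frees only what grow allocated */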
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
 	/*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 		}
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 	}
 
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 					error = -ENOMEM;
 					break;
 				}
-				page_cache_release(pages[page_nr]);
-				pages[page_nr] = page;
+				page_cache_release(spd.pages[page_nr]);
+				spd.pages[page_nr] = page;
 			}
 			/*
 			 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
 			len = this_len;
 		}
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		loff = 0;
 		spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 }
 
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 
 		page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		len -= this_len;
 		offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 		}
 		res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 		*ppos += res;
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 
 err:
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 EXPORT_SYMBOL(default_file_splice_read);
 
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 			buf->ops = NULL;
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			if (pipe->inode)
 				sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 		if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
 
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 
 		nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 	};
+	long ret;
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 		return -EBADF;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1831,7 +1890,7 @@ retry:
 		}
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 
 		if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			input_wakeup = true;
 		} else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 		/*
 		 * Get a reference to this pipe buffer,
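A detail worth noting in the conversions above: every "% PIPE_BUFFERS" becomes a mask with "pipe->buffers - 1" rather than a modulo by pipe->buffers. That is only equivalent when the buffer count is a power of two, which the resizing path (F_SETPIPE_SZ in fs/pipe.c) enforces. A sketch of the invariant:

	/*
	 * Ring-slot arithmetic assumed throughout splice.c: valid only
	 * because pipe->buffers is always kept a power of two.
	 */
	static unsigned int pipe_next_slot(unsigned int curbuf,
					   unsigned int nrbufs,
					   unsigned int buffers)
	{
		return (curbuf + nrbufs) & (buffers - 1);	/* == % buffers for 2^n */
	}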
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
+#include <linux/syscalls.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval = -ENODEV;
+
+	if (dentry) {
+		retval = -ENOSYS;
+		if (dentry->d_sb->s_op->statfs) {
+			memset(buf, 0, sizeof(*buf));
+			retval = security_sb_statfs(dentry);
+			if (retval)
+				return retval;
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
+			if (retval == 0 && buf->f_frsize == 0)
+				buf->f_frsize = buf->f_bsize;
+		}
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(vfs_statfs);
+
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+{
+	struct kstatfs st;
+	int retval;
+
+	retval = vfs_statfs(dentry, &st);
+	if (retval)
+		return retval;
+
+	if (sizeof(*buf) == sizeof(st))
+		memcpy(buf, &st, sizeof(st));
+	else {
+		if (sizeof buf->f_blocks == 4) {
+			if ((st.f_blocks | st.f_bfree | st.f_bavail |
+			     st.f_bsize | st.f_frsize) &
+			    0xffffffff00000000ULL)
+				return -EOVERFLOW;
+			/*
+			 * f_files and f_ffree may be -1; it's okay to stuff
+			 * that into 32 bits
+			 */
+			if (st.f_files != -1 &&
+			    (st.f_files & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+			if (st.f_ffree != -1 &&
+			    (st.f_ffree & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+		}
+
+		buf->f_type = st.f_type;
+		buf->f_bsize = st.f_bsize;
+		buf->f_blocks = st.f_blocks;
+		buf->f_bfree = st.f_bfree;
+		buf->f_bavail = st.f_bavail;
+		buf->f_files = st.f_files;
+		buf->f_ffree = st.f_ffree;
+		buf->f_fsid = st.f_fsid;
+		buf->f_namelen = st.f_namelen;
+		buf->f_frsize = st.f_frsize;
+		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+	}
+	return 0;
+}
+
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+{
+	struct kstatfs st;
+	int retval;
+
+	retval = vfs_statfs(dentry, &st);
+	if (retval)
+		return retval;
+
+	if (sizeof(*buf) == sizeof(st))
+		memcpy(buf, &st, sizeof(st));
+	else {
+		buf->f_type = st.f_type;
+		buf->f_bsize = st.f_bsize;
+		buf->f_blocks = st.f_blocks;
+		buf->f_bfree = st.f_bfree;
+		buf->f_bavail = st.f_bavail;
+		buf->f_files = st.f_files;
+		buf->f_ffree = st.f_ffree;
+		buf->f_fsid = st.f_fsid;
+		buf->f_namelen = st.f_namelen;
+		buf->f_frsize = st.f_frsize;
+		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+	}
+	return 0;
+}
+
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
+{
+	struct path path;
+	int error;
+
+	error = user_path(pathname, &path);
+	if (!error) {
+		struct statfs tmp;
+		error = vfs_statfs_native(path.dentry, &tmp);
+		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+			error = -EFAULT;
+		path_put(&path);
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct path path;
+	long error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+	error = user_path(pathname, &path);
+	if (!error) {
+		struct statfs64 tmp;
+		error = vfs_statfs64(path.dentry, &tmp);
+		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+			error = -EFAULT;
+		path_put(&path);
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
+{
+	struct file *file;
+	struct statfs tmp;
+	int error;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	error = vfs_statfs_native(file->f_path.dentry, &tmp);
+	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+		error = -EFAULT;
+	fput(file);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct file *file;
+	struct statfs64 tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	error = vfs_statfs64(file->f_path.dentry, &tmp);
+	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+		error = -EFAULT;
+	fput(file);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+{
+	struct super_block *s;
+	struct ustat tmp;
+	struct kstatfs sbuf;
+	int err;
+
+	s = user_get_super(new_decode_dev(dev));
+	if (!s)
+		return -EINVAL;
+
+	err = vfs_statfs(s->s_root, &sbuf);
+	drop_super(s);
+	if (err)
+		return err;
+
+	memset(&tmp,0,sizeof(struct ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+
+	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
+}
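The only non-mechanical logic carried into the new file is the EOVERFLOW policy in vfs_statfs_native(): a 64-bit count fits the legacy 32-bit statfs field only if its upper half is clear, with f_files/f_ffree values of -1 ("unknown") exempted. The test in isolation, as a sketch:

	/*
	 * Sketch of the check above: val must fit in 32 bits, except that
	 * an "unknown" count of -1 is always allowed through.
	 */
	static int fits_32bit(u64 val, int may_be_unknown)
	{
		if (may_be_unknown && val == (u64)-1)
			return 1;
		return !(val & 0xffffffff00000000ULL);
	}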
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee35..69688b15f1fa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,15 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
-#include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
 #include <linux/idr.h>
-#include <linux/kobject.h>
 #include <linux/mutex.h>
-#include <linux/file.h>
 #include <linux/backing-dev.h>
-#include <asm/uaccess.h>
 #include "internal.h"
 
 
@@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type)
 	 * subclass.
 	 */
	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
-	s->s_count = S_BIAS;
+	s->s_count = 1;
 	atomic_set(&s->s_active, 1);
 	mutex_init(&s->s_vfs_rename_mutex);
+	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
 	mutex_init(&s->s_dquot.dqio_mutex);
 	mutex_init(&s->s_dquot.dqonoff_mutex);
 	init_rwsem(&s->s_dquot.dqptr_sem);
@@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s)
 /* Superblock refcounting  */
 
 /*
- * Drop a superblock's refcount.  Returns non-zero if the superblock was
- * destroyed.  The caller must hold sb_lock.
+ * Drop a superblock's refcount.  The caller must hold sb_lock.
  */
-static int __put_super(struct super_block *sb)
+void __put_super(struct super_block *sb)
 {
-	int ret = 0;
-
 	if (!--sb->s_count) {
+		list_del_init(&sb->s_list);
 		destroy_super(sb);
-		ret = 1;
 	}
-	return ret;
-}
-
-/*
- * Drop a superblock's refcount.
- * Returns non-zero if the superblock is about to be destroyed and
- * at least is already removed from super_blocks list, so if we are
- * making a loop through super blocks then we need to restart.
- * The caller must hold sb_lock.
- */
-int __put_super_and_need_restart(struct super_block *sb)
-{
-	/* check for race with generic_shutdown_super() */
-	if (list_empty(&sb->s_list)) {
-		/* super block is removed, need to restart... */
-		__put_super(sb);
-		return 1;
-	}
-	/* can't be the last, since s_list is still in use */
-	sb->s_count--;
-	BUG_ON(sb->s_count == 0);
-	return 0;
 }
 
 /**
@@ -178,57 +146,48 @@ void put_super(struct super_block *sb)
 
 
 /**
- * deactivate_super - drop an active reference to superblock
+ * deactivate_locked_super - drop an active reference to superblock
  * @s: superblock to deactivate
  *
- * Drops an active reference to superblock, acquiring a temprory one if
- * there is no active references left. In that case we lock superblock,
+ * Drops an active reference to superblock, converting it into a temprory
+ * one if there is no other active references left. In that case we
  * tell fs driver to shut it down and drop the temporary reference we
  * had just acquired.
+ *
+ * Caller holds exclusive lock on superblock; that lock is released.
  */
-void deactivate_super(struct super_block *s)
+void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
-		s->s_count -= S_BIAS-1;
-		spin_unlock(&sb_lock);
+	if (atomic_dec_and_test(&s->s_active)) {
 		vfs_dq_off(s, 0);
-		down_write(&s->s_umount);
 		fs->kill_sb(s);
 		put_filesystem(fs);
 		put_super(s);
+	} else {
+		up_write(&s->s_umount);
 	}
 }
 
-EXPORT_SYMBOL(deactivate_super);
+EXPORT_SYMBOL(deactivate_locked_super);
 
 /**
- * deactivate_locked_super - drop an active reference to superblock
+ * deactivate_super - drop an active reference to superblock
  * @s: superblock to deactivate
  *
- * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
- * it does not unlock it until it's all over.  As the result, it's safe to
- * use to dispose of new superblock on ->get_sb() failure exits - nobody
- * will see the sucker until it's all over.  Equivalent using up_write +
- * deactivate_super is safe for that purpose only if superblock is either
- * safe to use or has NULL ->s_root when we unlock.
+ * Variant of deactivate_locked_super(), except that superblock is *not*
+ * locked by caller.  If we are going to drop the final active reference,
+ * lock will be acquired prior to that.
  */
-void deactivate_locked_super(struct super_block *s)
+void deactivate_super(struct super_block *s)
 {
-	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
-		s->s_count -= S_BIAS-1;
-		spin_unlock(&sb_lock);
-		vfs_dq_off(s, 0);
-		fs->kill_sb(s);
-		put_filesystem(fs);
-		put_super(s);
-	} else {
-		up_write(&s->s_umount);
+	if (!atomic_add_unless(&s->s_active, -1, 1)) {
+		down_write(&s->s_umount);
+		deactivate_locked_super(s);
 	}
 }
 
-EXPORT_SYMBOL(deactivate_locked_super);
+EXPORT_SYMBOL(deactivate_super);
 
 /**
  * grab_super - acquire an active reference
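With S_BIAS gone, s_active becomes a plain reference count that can only fall to zero under s_umount. The unlocked deactivate_super() exploits atomic_add_unless(): it decrements only while the count is above one, so the final reference is always dropped through the locked variant. A sketch of the resulting caller-side lifecycle (illustrative only):

	struct super_block *sb = get_active_super(bdev);	/* grabs s_active via grab_super() */
	if (sb) {
		/* ... use sb; s_umount is not held here ... */
		deactivate_super(sb);	/* last ref: takes s_umount, then ->kill_sb() */
	}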
@@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
  */
 static int grab_super(struct super_block *s) __releases(sb_lock)
 {
+	if (atomic_inc_not_zero(&s->s_active)) {
+		spin_unlock(&sb_lock);
+		return 1;
+	}
+	/* it's going away */
 	s->s_count++;
 	spin_unlock(&sb_lock);
+	/* wait for it to die */
 	down_write(&s->s_umount);
-	if (s->s_root) {
-		spin_lock(&sb_lock);
-		if (s->s_count > S_BIAS) {
-			atomic_inc(&s->s_active);
-			s->s_count--;
-			spin_unlock(&sb_lock);
-			return 1;
-		}
-		spin_unlock(&sb_lock);
-	}
 	up_write(&s->s_umount);
 	put_super(s);
-	yield();
 	return 0;
 }
 
@@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb)
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
-	list_del_init(&sb->s_list);
-	list_del(&sb->s_instances);
+	list_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
 }
@@ -357,6 +310,7 @@ retry:
 				up_write(&s->s_umount);
 				destroy_super(s);
 			}
+			down_write(&old->s_umount);
 			return old;
 		}
 	}
@@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super);
  */
 void sync_supers(void)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 
 	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
@@ -423,14 +378,43 @@ restart:
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+			__put_super(sb);
 		}
 	}
 	spin_unlock(&sb_lock);
 }
 
 /**
+ * iterate_supers - call function for all active superblocks
+ * @f: function to call
+ * @arg: argument to pass to it
+ *
+ * Scans the superblock list and calls given function, passing it
+ * locked superblock and given argument.
+ */
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *n;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+		__put_super(sb);
+	}
+	spin_unlock(&sb_lock);
+}
+
+/**
  * get_super - get the superblock of a device
  * @bdev: device to get the superblock for
  *
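iterate_supers() factors out the bump-ref / drop-lock / walk pattern that sync_supers() and friends used to open-code, made safe by the new rule that dying superblocks stay on s_list (with an empty s_instances) until the last passive reference is gone. A sketch of a typical callback; fs/sync.c later in this series converts sync_filesystems() to exactly this shape:

	/*
	 * Called with a passive ref held and s_umount taken for reading;
	 * only superblocks that still have a root are passed in.
	 */
	static void count_dirty_sb(struct super_block *sb, void *arg)
	{
		if (sb->s_dirt)
			(*(int *)arg)++;
	}

	/* usage: */
	int dirty = 0;
	iterate_supers(count_dirty_sb, &dirty);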
@@ -438,7 +422,7 @@ restart:
  * mounted on the device given. %NULL is returned if no match is found.
  */
 
-struct super_block * get_super(struct block_device *bdev)
+struct super_block *get_super(struct block_device *bdev)
 {
 	struct super_block *sb;
 
@@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev)
 	spin_lock(&sb_lock);
 rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_bdev == bdev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
+			/* still alive? */
 			if (sb->s_root)
 				return sb;
 			up_read(&sb->s_umount);
-			/* restart only when sb is no longer on the list */
+			/* nope, got unmounted */
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto rescan;
+			__put_super(sb);
+			goto rescan;
 		}
 	}
 	spin_unlock(&sb_lock);
@@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super);
  *
  * Scans the superblock list and finds the superblock of the file system
  * mounted on the device given. Returns the superblock with an active
- * reference and s_umount held exclusively or %NULL if none was found.
+ * reference or %NULL if none was found.
  */
 struct super_block *get_active_super(struct block_device *bdev)
 {
@@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev)
 	if (!bdev)
 		return NULL;
 
+restart:
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_bdev != bdev)
+		if (list_empty(&sb->s_instances))
 			continue;
-
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_write(&sb->s_umount);
-		if (sb->s_root) {
-			spin_lock(&sb_lock);
-			if (sb->s_count > S_BIAS) {
-				atomic_inc(&sb->s_active);
-				sb->s_count--;
-				spin_unlock(&sb_lock);
+		if (sb->s_bdev == bdev) {
+			if (grab_super(sb)) /* drops sb_lock */
 				return sb;
-			}
-			spin_unlock(&sb_lock);
+			else
+				goto restart;
 		}
-		up_write(&sb->s_umount);
-		put_super(sb);
-		yield();
-		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
 	return NULL;
 }
 
-struct super_block * user_get_super(dev_t dev)
+struct super_block *user_get_super(dev_t dev)
 {
 	struct super_block *sb;
 
 	spin_lock(&sb_lock);
 rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_dev == dev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
+			/* still alive? */
 			if (sb->s_root)
 				return sb;
 			up_read(&sb->s_umount);
-			/* restart only when sb is no longer on the list */
+			/* nope, got unmounted */
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto rescan;
+			__put_super(sb);
+			goto rescan;
 		}
 	}
 	spin_unlock(&sb_lock);
 	return NULL;
 }
 
-SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
-{
-	struct super_block *s;
-	struct ustat tmp;
-	struct kstatfs sbuf;
-	int err = -EINVAL;
-
-	s = user_get_super(new_decode_dev(dev));
-	if (s == NULL)
-		goto out;
-	err = vfs_statfs(s->s_root, &sbuf);
-	drop_super(s);
-	if (err)
-		goto out;
-
-	memset(&tmp,0,sizeof(struct ustat));
-	tmp.f_tfree = sbuf.f_bfree;
-	tmp.f_tinode = sbuf.f_ffree;
-
-	err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
-out:
-	return err;
-}
-
 /**
  * do_remount_sb - asks filesystem to change mount options.
  * @sb: superblock in question
@@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 
 static void do_emergency_remount(struct work_struct *work)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
 			/*
-			 * ->remount_fs needs lock_kernel().
-			 *
 			 * What lock protects sb->s_flags??
 			 */
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
 		}
 		up_write(&sb->s_umount);
-		put_super(sb);
 		spin_lock(&sb_lock);
+		__put_super(sb);
 	}
 	spin_unlock(&sb_lock);
 	kfree(work);
@@ -990,6 +945,96 @@ out:
 
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+/**
+ * freeze_super -- lock the filesystem and force it into a consistent state
+ * @super: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs. Subsequent calls to this without first thawing the fs will return
+ * -EBUSY.
+ */
+int freeze_super(struct super_block *sb)
+{
+	int ret;
+
+	atomic_inc(&sb->s_active);
+	down_write(&sb->s_umount);
+	if (sb->s_frozen) {
+		deactivate_locked_super(sb);
+		return -EBUSY;
+	}
+
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_frozen = SB_FREEZE_TRANS;
+		smp_wmb();
+		up_write(&sb->s_umount);
+		return 0;
+	}
+
+	sb->s_frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	sync_filesystem(sb);
+
+	sb->s_frozen = SB_FREEZE_TRANS;
+	smp_wmb();
+
+	sync_blockdev(sb->s_bdev);
+	if (sb->s_op->freeze_fs) {
+		ret = sb->s_op->freeze_fs(sb);
+		if (ret) {
+			printk(KERN_ERR
+				"VFS:Filesystem freeze failed\n");
+			sb->s_frozen = SB_UNFROZEN;
+			deactivate_locked_super(sb);
+			return ret;
+		}
+	}
+	up_write(&sb->s_umount);
+	return 0;
+}
+EXPORT_SYMBOL(freeze_super);
+
+/**
+ * thaw_super -- unlock filesystem
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super().
+ */
+int thaw_super(struct super_block *sb)
+{
+	int error;
+
+	down_write(&sb->s_umount);
+	if (sb->s_frozen == SB_UNFROZEN) {
+		up_write(&sb->s_umount);
+		return -EINVAL;
+	}
+
+	if (sb->s_flags & MS_RDONLY)
+		goto out;
+
+	if (sb->s_op->unfreeze_fs) {
+		error = sb->s_op->unfreeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem thaw failed\n");
+			sb->s_frozen = SB_FREEZE_TRANS;
+			up_write(&sb->s_umount);
+			return error;
+		}
+	}
+
+out:
+	sb->s_frozen = SB_UNFROZEN;
+	smp_wmb();
+	wake_up(&sb->s_wait_unfrozen);
+	deactivate_locked_super(sb);
+
+	return 0;
+}
+EXPORT_SYMBOL(thaw_super);
+
 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 {
 	int err;
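freeze_super()/thaw_super() lift to the VFS what block-device snapshots previously reached through freeze_bdev()/thaw_bdev(), keyed on s_frozen plus an extra s_active reference so an unmount cannot tear the superblock down mid-freeze. A sketch of how an ioctl-style caller would drive the pair (hypothetical caller; the FIFREEZE/FITHAW plumbing lives in fs/ioctl.c):

	int err = freeze_super(sb);	/* -EBUSY if already frozen */
	if (!err) {
		/* ... writes are now blocked; take the snapshot ... */
		err = thaw_super(sb);	/* wakes writers, drops the s_active ref */
	}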
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..e8cbd415e50a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 		sync_inodes_sb(sb);
 	else
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb_locked(sb);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
+static void sync_one_sb(struct super_block *sb, void *arg)
+{
+	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+		__sync_filesystem(sb, *(int *)arg);
+}
 /*
  * Sync all the data for all the filesystems (called by sys_sync() and
  * emergency sync)
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync
- * is used only here.  We set it against all filesystems and then clear it as
- * we sync them.  So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync
- * flags again, which will cause process A to resync everything.  Fix that with
- * a local mutex.
 */
 static void sync_filesystems(int wait)
 {
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list)
-		sb->s_need_sync = 1;
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync)
-			continue;
-		sb->s_need_sync = 0;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-
-		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
-			__sync_filesystem(sb, wait);
-		up_read(&sb->s_umount);
-
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
+	iterate_supers(sync_one_sb, &wait);
 }
 
 /*
@@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync);
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:	file to sync
- * @dentry:	dentry of @file
  * @start:	offset in bytes of the beginning of data range to sync
  * @end:	offset in bytes of the end of data range (inclusive)
  * @datasync:	perform only datasync
@@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync);
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
 */
-int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
-		    loff_t end, int datasync)
+int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	const struct file_operations *fop;
-	struct address_space *mapping;
+	struct address_space *mapping = file->f_mapping;
 	int err, ret;
 
-	/*
-	 * Get mapping and operations from the file in case we have
-	 * as file, or get the default values for them in case we
-	 * don't have a struct file available.  Damn nfsd..
-	 */
-	if (file) {
-		mapping = file->f_mapping;
-		fop = file->f_op;
-	} else {
-		mapping = dentry->d_inode->i_mapping;
-		fop = dentry->d_inode->i_fop;
-	}
-
-	if (!fop || !fop->fsync) {
+	if (!file->f_op || !file->f_op->fsync) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = file->f_op->fsync(file, file->f_path.dentry, datasync);
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
 /**
  * vfs_fsync - perform a fsync or fdatasync on a file
  * @file:	file to sync
- * @dentry:	dentry of @file
  * @datasync:	only perform a fdatasync operation
  *
  * Write back data and metadata for @file to disk.  If @datasync is
 * set only metadata needed to access modified file data is written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
 */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync(struct file *file, int datasync)
 {
-	return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync);
 
@@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync)
 
 	file = fget(fd);
 	if (file) {
-		ret = vfs_fsync(file, file->f_path.dentry, datasync);
+		ret = vfs_fsync(file, datasync);
 		fput(file);
 	}
 	return ret;
@@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 {
 	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 		return 0;
-	return vfs_fsync_range(file, file->f_path.dentry, pos,
-			       pos + count - 1,
+	return vfs_fsync_range(file, pos, pos + count - 1,
 			       (file->f_flags & __O_SYNC) ? 0 : 1);
 }
 EXPORT_SYMBOL(generic_write_sync);
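After this change every vfs_fsync() caller must supply a real struct file; the dentry parameter existed only for nfsd, which now always has one, so the mapping and the fsync op can both be derived from the file itself. Converted call sites look like:

	ret = vfs_fsync(file, 0);	/* fsync: data + metadata */
	ret = vfs_fsync(file, 1);	/* fdatasync */
	ret = vfs_fsync_range(file, pos, pos + count - 1, 1);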
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
 };
 
 static int
-fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
+fill_read(struct file *file, char *buffer, loff_t off, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 
 	rc = -EIO;
 	if (attr->read)
-		rc = attr->read(kobj, attr, buffer, off, count);
+		rc = attr->read(file, kobj, attr, buffer, off, count);
 
 	sysfs_put_active(attr_sd);
 
@@ -70,8 +70,7 @@ static ssize_t
 read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	int size = dentry->d_inode->i_size;
+	int size = file->f_path.dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 
 	mutex_lock(&bb->mutex);
 
-	count = fill_read(dentry, bb->buffer, offs, count);
+	count = fill_read(file, bb->buffer, offs, count);
 	if (count < 0) {
 		mutex_unlock(&bb->mutex);
 		goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 }
 
 static int
-flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
+flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 
 	rc = -EIO;
 	if (attr->write)
-		rc = attr->write(kobj, attr, buffer, offset, count);
+		rc = attr->write(file, kobj, attr, buffer, offset, count);
 
 	sysfs_put_active(attr_sd);
 
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 		     size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	int size = dentry->d_inode->i_size;
+	int size = file->f_path.dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 
 	memcpy(bb->buffer, temp, count);
 
-	count = flush_write(dentry, bb->buffer, offs, count);
+	count = flush_write(file, bb->buffer, offs, count);
 	mutex_unlock(&bb->mutex);
 
 	if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	if (!attr->mmap)
 		goto out_put;
 
-	rc = attr->mmap(kobj, attr, vma);
+	rc = attr->mmap(file, kobj, attr, vma);
 	if (rc)
 		goto out_put;
 
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
 }
 
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
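Every bin_attribute callback (read, write, mmap) now receives the struct file of the opener as its first argument, so handlers can tell individual opens apart instead of seeing only the kobject. A sketch of a read implementation against the new signature (hypothetical attribute):

	static ssize_t my_bin_read(struct file *file, struct kobject *kobj,
				   struct bin_attribute *attr,
				   char *buf, loff_t off, size_t count)
	{
		/* per-open state, e.g. file->private_data, is now reachable */
		return 0;	/* nothing to read in this sketch */
	}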
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;
 
 	sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 * Pointer to sysfs_dirent if found, NULL if not.
 */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const void *ns,
 				       const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
+	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
+		if (ns && sd->s_ns && (sd->s_ns != ns))
+			continue;
 		if (!strcmp(sd->s_name, name))
 			return sd;
+	}
 	return NULL;
 }
 
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 * Pointer to sysfs_dirent if found, NULL if not.
 */
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const void *ns,
 				      const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name);
+	sd = sysfs_find_dirent(parent_sd, ns, name);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      const char *name, struct sysfs_dirent **p_sd)
+		      enum kobj_ns_type type, const void *ns, const char *name,
+		      struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
 		return -ENOMEM;
+
+	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
+	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, name, p_sd);
+	return create_dir(kobj, kobj->sd,
+			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+}
+
+/**
+ * sysfs_read_ns_type: return associated ns_type
+ * @kobj: the kobject being queried
+ *
+ * Each kobject can be tagged with exactly one namespace type
+ * (i.e. network or user).  Return the ns_type associated with
+ * this object if any
+ */
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ops;
+	enum kobj_ns_type type;
+
+	ops = kobj_child_ns_ops(kobj);
+	if (!ops)
+		return KOBJ_NS_TYPE_NONE;
+
+	type = ops->type;
+	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+	BUG_ON(type >= KOBJ_NS_TYPES);
+	BUG_ON(!kobj_ns_type_registered(type));
+
+	return type;
 }
 
 /**
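sysfs_read_ns_type() is the bridge between the kobject layer and the tag stored in s_flags: a kobject whose children are namespace-tagged exposes kobj_child_ns_ops(), and each tagged child supplies its tag through ktype->namespace(). A sketch of the shape such a ktype takes (entirely hypothetical names; the real user is the network device code):

	static const void *my_namespace(struct kobject *kobj)
	{
		/* return the tag, e.g. the struct net this object lives in;
		 * my_net_of() is a stand-in for the real lookup */
		return my_net_of(kobj);
	}

	static struct kobj_type my_ktype = {
		.namespace = my_namespace,
		/* .sysfs_ops, .release, ... */
	};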
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 */
 int sysfs_create_dir(struct kobject * kobj)
 {
+	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
+	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
 	else
 		parent_sd = &sysfs_root;
 
-	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+	if (sysfs_ns_type(parent_sd))
+		ns = kobj->ktype->namespace(kobj);
+	type = sysfs_read_ns_type(kobj);
+
+	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	struct dentry *parent = dentry->d_parent;
+	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
+	enum kobj_ns_type type;
+	const void *ns;
 
 	mutex_lock(&sysfs_mutex);
 
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dir->i_sb)->ns[type];
+
+	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
 	if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
 }
 
 int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const char *new_name)
+	struct sysfs_dirent *new_parent_sd, const void *new_ns,
+	const char *new_name)
 {
 	const char *dup_name = NULL;
 	int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_mutex);
 
 	error = 0;
-	if ((sd->s_parent == new_parent_sd) &&
+	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
 	    (strcmp(sd->s_name, new_name) == 0))
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		sd->s_parent = new_parent_sd;
 		sysfs_link_sibling(sd);
 	}
+	sd->s_ns = new_ns;
 
 	error = 0;
  out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
 
 int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	const void *new_ns = NULL;
+
+	if (sysfs_ns_type(parent_sd))
+		new_ns = kobj->ktype->namespace(kobj);
+
+	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
 }
 
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
+	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
+	if (sysfs_ns_type(sd->s_parent))
+		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
-	ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
 {
 	if (pos) {
 		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
 			pos->s_parent == parent_sd &&
 			ino == pos->s_ino;
 		sysfs_put(pos);
-		if (valid)
-			return pos;
+		if (!valid)
+			pos = NULL;
 	}
-	pos = NULL;
-	if ((ino > 1) && (ino < INT_MAX)) {
+	if (!pos && (ino > 1) && (ino < INT_MAX)) {
 		pos = parent_sd->s_dir.children;
 		while (pos && (ino > pos->s_ino))
 			pos = pos->s_sibling;
 	}
+	while (pos && pos->s_ns && pos->s_ns != ns)
+		pos = pos->s_sibling;
 	return pos;
 }
 
-static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
-	ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
 {
-	pos = sysfs_dir_pos(parent_sd, ino, pos);
+	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
 	if (pos)
 		pos = pos->s_sibling;
+	while (pos && pos->s_ns && pos->s_ns != ns)
+		pos = pos->s_sibling;
 	return pos;
 }
 
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct dentry *dentry = filp->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
 	struct sysfs_dirent *pos = filp->private_data;
+	enum kobj_ns_type type;
+	const void *ns;
 	ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
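
The lookup and readdir paths above share one filtering rule: a dirent carrying a namespace tag is visible only when the tag matches the one recorded for the mounting task's superblock, while untagged entries stay visible everywhere. A minimal standalone sketch of that walk, with toy types standing in for sysfs_dirent and s_sibling (all names here are illustrative, not the kernel's):

#include <stdio.h>

struct dirent_sketch {
	const char *name;
	const void *ns;                 /* NULL: visible in every namespace */
	struct dirent_sketch *next;     /* stands in for s_sibling */
};

/* Skip tagged entries whose tag differs from ours, as sysfs_dir_pos()
 * and sysfs_dir_next_pos() do above. */
static struct dirent_sketch *next_visible(struct dirent_sketch *pos,
					  const void *ns)
{
	while (pos && pos->ns && pos->ns != ns)
		pos = pos->next;
	return pos;
}

int main(void)
{
	int netns_a, netns_b;           /* addresses serve as opaque tags */
	struct dirent_sketch eth1 = { "eth1", &netns_b, NULL };
	struct dirent_sketch eth0 = { "eth0", &netns_a, &eth1 };
	struct dirent_sketch lo   = { "lo",   NULL,     &eth0 };

	for (struct dirent_sketch *p = next_visible(&lo, &netns_a);
	     p; p = next_visible(p->next, &netns_a))
		printf("%s\n", p->name);  /* prints lo and eth0, not eth1 */
	return 0;
}
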
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 602 mutex_lock(&sysfs_mutex);
600 603
601 rc = -ENOENT; 604 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 605 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 606 if (!sd)
604 goto out; 607 goto out;
605 608
@@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 627
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 628void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 629{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 630 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 631}
629 632
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 633void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 649 struct sysfs_dirent *dir_sd;
647 650
648 if (group) 651 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 652 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 653 else
651 dir_sd = sysfs_get(kobj->sd); 654 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 655 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 656 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 657 sysfs_put(dir_sd);
655 } 658 }
656} 659}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a4a0a9419711..bbd77e95cf7f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -324,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
324 sysfs_put(sd); 324 sysfs_put(sd);
325} 325}
326 326
327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
328{ 328{
329 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
330 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
334 334
335 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
336 336
337 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
338 if (sd && (sd->s_ns != ns))
339 sd = NULL;
338 if (sd) 340 if (sd)
339 sysfs_remove_one(&acxt, sd); 341 sysfs_remove_one(&acxt, sd);
340 342
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 776137828dca..281c0c9bc39f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
72 return 0; 72 return 0;
73} 73}
74 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
75static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
76 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
77{ 100{
78 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
79} 146}
80 147
81static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
82 .name = "sysfs", 149 .name = "sysfs",
83 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
84 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
85}; 152};
86 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
 164 * list, the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
175
87int __init sysfs_init(void) 176int __init sysfs_init(void)
88{ 177{
89 int err = -ENOMEM; 178 int err = -ENOMEM;
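
sysfs_get_sb() now calls sget() with a test/set pair so that mounts sharing the same vector of namespace tags share one superblock, while a mount from a different namespace allocates a fresh one. A self-contained toy of that find-or-create pattern (toy_sget, NTAGS and the fixed-size table are invented for this sketch; the real code defers allocation, refcounting and locking to sget()):

#include <stdio.h>
#include <string.h>

#define NTAGS 2

struct toy_sb { const void *ns[NTAGS]; int used; };

static struct toy_sb supers[8];

/* Mirrors sysfs_test_super(): a superblock matches only if every
 * tag slot agrees. */
static int test_super(const struct toy_sb *sb, const void *ns[NTAGS])
{
	for (int t = 0; t < NTAGS; t++)
		if (sb->ns[t] != ns[t])
			return 0;
	return 1;
}

static struct toy_sb *toy_sget(const void *ns[NTAGS])
{
	for (int i = 0; i < 8; i++)
		if (supers[i].used && test_super(&supers[i], ns))
			return &supers[i];            /* reuse existing sb */
	for (int i = 0; i < 8; i++)
		if (!supers[i].used) {                /* "set": create new */
			memcpy(supers[i].ns, ns, sizeof(supers[i].ns));
			supers[i].used = 1;
			return &supers[i];
		}
	return NULL;
}

int main(void)
{
	int a, b;                                     /* opaque tag values */
	const void *ns1[NTAGS] = { &a, NULL };
	const void *ns2[NTAGS] = { &b, NULL };

	printf("%d\n", toy_sget(ns1) == toy_sget(ns1));  /* 1: shared sb */
	printf("%d\n", toy_sget(ns1) == toy_sget(ns2));  /* 0: fresh sb  */
	return 0;
}
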
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index b93ec51fa7ac..f71246bebfe4 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -58,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
58 if (!sd) 58 if (!sd)
59 goto out_put; 59 goto out_put;
60 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
61 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
62 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
63 65
@@ -107,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
107} 109}
108 110
109/** 111/**
112 * sysfs_delete_link - remove symlink in object's directory.
113 * @kobj: object we're acting for.
114 * @targ: object we're pointing to.
115 * @name: name of the symlink to remove.
116 *
 117 * Unlike sysfs_remove_link, sysfs_delete_link has enough information
118 * to successfully delete symlinks in tagged directories.
119 */
120void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
121 const char *name)
122{
123 const void *ns = NULL;
124 spin_lock(&sysfs_assoc_lock);
125 if (targ->sd)
126 ns = targ->sd->s_ns;
127 spin_unlock(&sysfs_assoc_lock);
128 sysfs_hash_and_remove(kobj->sd, ns, name);
129}
130
131/**
110 * sysfs_remove_link - remove symlink in object's directory. 132 * sysfs_remove_link - remove symlink in object's directory.
111 * @kobj: object we're acting for. 133 * @kobj: object we're acting for.
112 * @name: name of the symlink to remove. 134 * @name: name of the symlink to remove.
@@ -121,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
121 else 143 else
122 parent_sd = kobj->sd; 144 parent_sd = kobj->sd;
123 145
124 sysfs_hash_and_remove(parent_sd, name); 146 sysfs_hash_and_remove(parent_sd, NULL, name);
125} 147}
126 148
127/** 149/**
@@ -137,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new) 159 const char *old, const char *new)
138{ 160{
139 struct sysfs_dirent *parent_sd, *sd = NULL; 161 struct sysfs_dirent *parent_sd, *sd = NULL;
162 const void *old_ns = NULL, *new_ns = NULL;
140 int result; 163 int result;
141 164
142 if (!kobj) 165 if (!kobj)
@@ -144,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
144 else 167 else
145 parent_sd = kobj->sd; 168 parent_sd = kobj->sd;
146 169
170 if (targ->sd)
171 old_ns = targ->sd->s_ns;
172
147 result = -ENOENT; 173 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old); 174 sd = sysfs_get_dirent(parent_sd, old_ns, old);
149 if (!sd) 175 if (!sd)
150 goto out; 176 goto out;
151 177
@@ -155,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 181 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out; 182 goto out;
157 183
158 result = sysfs_rename(sd, parent_sd, new); 184 if (sysfs_ns_type(parent_sd))
185 new_ns = targ->ktype->namespace(targ);
186
187 result = sysfs_rename(sd, parent_sd, new_ns, new);
159 188
160out: 189out:
161 sysfs_put(sd); 190 sysfs_put(sd);
@@ -261,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
261 290
262EXPORT_SYMBOL_GPL(sysfs_create_link); 291EXPORT_SYMBOL_GPL(sysfs_create_link);
263EXPORT_SYMBOL_GPL(sysfs_remove_link); 292EXPORT_SYMBOL_GPL(sysfs_remove_link);
293EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..6a13105b5594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
 133 * Each sb is associated with a set of namespace tags (e.g.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
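
The new SYSFS_NS_TYPE_* macros pack the directory's namespace kind into the second byte of s_flags, which is why SYSFS_FLAG_REMOVED moves from 0x0200 up to 0x020000. A tiny standalone demonstration of the encoding; the SYSFS_TYPE_MASK value of 0x00ff and the 0x0001 stand-in for SYSFS_DIR are assumptions for this sketch, the rest is copied from the hunk above:

#include <assert.h>
#include <stdio.h>

enum kobj_ns_type { KOBJ_NS_TYPE_NONE, KOBJ_NS_TYPE_NET, KOBJ_NS_TYPES };

#define SYSFS_DIR           0x0001   /* assumed value for the sketch */
#define SYSFS_TYPE_MASK     0x00ff   /* assumed value for the sketch */
#define SYSFS_NS_TYPE_MASK  0xff00
#define SYSFS_NS_TYPE_SHIFT 8
#define SYSFS_FLAG_REMOVED  0x020000

int main(void)
{
	unsigned int flags = SYSFS_DIR
		| (KOBJ_NS_TYPE_NET << SYSFS_NS_TYPE_SHIFT)
		| SYSFS_FLAG_REMOVED;
	enum kobj_ns_type t =
		(flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;

	/* Recover the ns type exactly as sysfs_ns_type() does. */
	assert(t == KOBJ_NS_TYPE_NET);
	/* The flag bits proper now live above both masks. */
	printf("ns type %d, flag bits %#x\n",
	       t, flags & ~(SYSFS_NS_TYPE_MASK | SYSFS_TYPE_MASK));
	return 0;
}
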
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..bbd69bdb0fa8 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 161 dirty_sb(sb);
162 162 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 163 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 164 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 165 inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 168 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 169 mark_inode_dirty(inode);
178 170
179 inode->i_mode = mode; /* for sysv_write_inode() */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */ 171 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 172 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 173 /* That's it. */
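
This sysv hunk, like the ubifs, udf and ufs hunks below, replaces the same open-coded owner setup with inode_init_owner(). A userspace toy that mirrors the rule being centralized, as visible in the deleted lines: the new inode takes the caller's fsuid, but a setgid parent directory donates its gid, and a new subdirectory inherits the setgid bit. The struct and function names are stand-ins; the real helper reads current_fsuid()/current_fsgid() itself:

#include <stdio.h>
#include <sys/stat.h>

struct toy_inode { unsigned uid, gid; mode_t mode; };

static void toy_init_owner(struct toy_inode *inode,
			   const struct toy_inode *dir, mode_t mode,
			   unsigned fsuid, unsigned fsgid)
{
	inode->uid = fsuid;
	if (dir && (dir->mode & S_ISGID)) {
		inode->gid = dir->gid;        /* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;      /* subdirs keep the setgid bit */
	} else {
		inode->gid = fsgid;
	}
	inode->mode = mode;
}

int main(void)
{
	struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
	struct toy_inode inode;

	toy_init_owner(&inode, &dir, S_IFDIR | 0755, 1000, 1000);
	printf("gid=%u setgid=%d\n", inode.gid, !!(inode.mode & S_ISGID));
	return 0;   /* prints gid=100 setgid=1 */
}
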
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 98158de91d24..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
110 struct timerfd_ctx *ctx = file->private_data; 110 struct timerfd_ctx *ctx = file->private_data;
111 ssize_t res; 111 ssize_t res;
112 u64 ticks = 0; 112 u64 ticks = 0;
113 DECLARE_WAITQUEUE(wait, current);
114 113
115 if (count < sizeof(ticks)) 114 if (count < sizeof(ticks))
116 return -EINVAL; 115 return -EINVAL;
117 spin_lock_irq(&ctx->wqh.lock); 116 spin_lock_irq(&ctx->wqh.lock);
118 res = -EAGAIN; 117 if (file->f_flags & O_NONBLOCK)
119 if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { 118 res = -EAGAIN;
120 __add_wait_queue(&ctx->wqh, &wait); 119 else
121 for (res = 0;;) { 120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
122 set_current_state(TASK_INTERRUPTIBLE);
123 if (ctx->ticks) {
124 res = 0;
125 break;
126 }
127 if (signal_pending(current)) {
128 res = -ERESTARTSYS;
129 break;
130 }
131 spin_unlock_irq(&ctx->wqh.lock);
132 schedule();
133 spin_lock_irq(&ctx->wqh.lock);
134 }
135 __remove_wait_queue(&ctx->wqh, &wait);
136 __set_current_state(TASK_RUNNING);
137 }
138 if (ctx->ticks) { 121 if (ctx->ticks) {
139 ticks = ctx->ticks; 122 ticks = ctx->ticks;
140 if (ctx->expired && ctx->tintv.tv64) { 123 if (ctx->expired && ctx->tintv.tv64) {
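
The timerfd rewrite collapses the hand-rolled add_wait_queue/schedule loop into wait_event_interruptible_locked_irq(), with no behavior change visible from userspace: a blocking read still sleeps until the timer fires, then returns the expiration count as a u64. A small runnable check of that contract:

#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <unistd.h>

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 1 } };
	uint64_t ticks;
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0)
		return 1;
	/* Blocking read: sleeps in the wait the patch rewrites, then
	 * returns the number of expirations since the last read. */
	if (read(fd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("expirations: %llu\n", (unsigned long long)ticks);
	close(fd);
	return 0;
}
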
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current_fsuid(); 107 inode_init_owner(inode, dir, mode);
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 109 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 77d5cf4a7547..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
64 if (!c->ro_media) { 64 if (!c->ro_media) {
65 c->ro_media = 1; 65 c->ro_media = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY;
67 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
68 dbg_dump_stack(); 69 dbg_dump_stack();
69 } 70 }
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..3a84455c2a77 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -209,6 +209,6 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .read = generic_read_dir, 210 .read = generic_read_dir,
211 .readdir = udf_readdir, 211 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 212 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 213 .fsync = simple_fsync,
214}; 214};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 4b6a46ccbf46..baae3a723946 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -37,6 +37,7 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/aio.h> 39#include <linux/aio.h>
40#include <linux/smp_lock.h>
40 41
41#include "udf_i.h" 42#include "udf_i.h"
42#include "udf_sb.h" 43#include "udf_sb.h"
@@ -144,50 +145,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 return retval; 145 return retval;
145} 146}
146 147
147int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 148long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
148 unsigned long arg)
149{ 149{
150 struct inode *inode = filp->f_dentry->d_inode;
150 long old_block, new_block; 151 long old_block, new_block;
151 int result = -EINVAL; 152 int result = -EINVAL;
152 153
154 lock_kernel();
155
153 if (file_permission(filp, MAY_READ) != 0) { 156 if (file_permission(filp, MAY_READ) != 0) {
154 udf_debug("no permission to access inode %lu\n", 157 udf_debug("no permission to access inode %lu\n", inode->i_ino);
155 inode->i_ino); 158 result = -EPERM;
156 return -EPERM; 159 goto out;
157 } 160 }
158 161
159 if (!arg) { 162 if (!arg) {
160 udf_debug("invalid argument to udf_ioctl\n"); 163 udf_debug("invalid argument to udf_ioctl\n");
161 return -EINVAL; 164 result = -EINVAL;
165 goto out;
162 } 166 }
163 167
164 switch (cmd) { 168 switch (cmd) {
165 case UDF_GETVOLIDENT: 169 case UDF_GETVOLIDENT:
166 if (copy_to_user((char __user *)arg, 170 if (copy_to_user((char __user *)arg,
167 UDF_SB(inode->i_sb)->s_volume_ident, 32)) 171 UDF_SB(inode->i_sb)->s_volume_ident, 32))
168 return -EFAULT; 172 result = -EFAULT;
169 else 173 else
170 return 0; 174 result = 0;
175 goto out;
171 case UDF_RELOCATE_BLOCKS: 176 case UDF_RELOCATE_BLOCKS:
172 if (!capable(CAP_SYS_ADMIN)) 177 if (!capable(CAP_SYS_ADMIN)) {
173 return -EACCES; 178 result = -EACCES;
174 if (get_user(old_block, (long __user *)arg)) 179 goto out;
175 return -EFAULT; 180 }
181 if (get_user(old_block, (long __user *)arg)) {
182 result = -EFAULT;
183 goto out;
184 }
176 result = udf_relocate_blocks(inode->i_sb, 185 result = udf_relocate_blocks(inode->i_sb,
177 old_block, &new_block); 186 old_block, &new_block);
178 if (result == 0) 187 if (result == 0)
179 result = put_user(new_block, (long __user *)arg); 188 result = put_user(new_block, (long __user *)arg);
180 return result; 189 goto out;
181 case UDF_GETEASIZE: 190 case UDF_GETEASIZE:
182 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); 191 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
183 break; 192 goto out;
184 case UDF_GETEABLOCK: 193 case UDF_GETEABLOCK:
185 result = copy_to_user((char __user *)arg, 194 result = copy_to_user((char __user *)arg,
186 UDF_I(inode)->i_ext.i_data, 195 UDF_I(inode)->i_ext.i_data,
187 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; 196 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
188 break; 197 goto out;
189 } 198 }
190 199
200out:
201 unlock_kernel();
191 return result; 202 return result;
192} 203}
193 204
@@ -207,7 +218,7 @@ static int udf_release_file(struct inode *inode, struct file *filp)
207const struct file_operations udf_file_operations = { 218const struct file_operations udf_file_operations = {
208 .read = do_sync_read, 219 .read = do_sync_read,
209 .aio_read = generic_file_aio_read, 220 .aio_read = generic_file_aio_read,
210 .ioctl = udf_ioctl, 221 .unlocked_ioctl = udf_ioctl,
211 .open = dquot_file_open, 222 .open = dquot_file_open,
212 .mmap = generic_file_mmap, 223 .mmap = generic_file_mmap,
213 .write = do_sync_write, 224 .write = do_sync_write,
@@ -227,7 +238,7 @@ int udf_setattr(struct dentry *dentry, struct iattr *iattr)
227 if (error) 238 if (error)
228 return error; 239 return error;
229 240
230 if (iattr->ia_valid & ATTR_SIZE) 241 if (is_quota_modification(inode, iattr))
231 dquot_initialize(inode); 242 dquot_initialize(inode);
232 243
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
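
udf_ioctl now matches the unlocked_ioctl prototype, so it must take and drop the BKL itself; every early return becomes result-plus-goto so the unlock on the single exit path can never be skipped. The shape of that conversion, reduced to a runnable toy with a pthread mutex standing in for lock_kernel() (names and error values are illustrative; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static long toy_ioctl(unsigned int cmd, unsigned long arg)
{
	long result = -22;                 /* -EINVAL */

	pthread_mutex_lock(&big_lock);     /* was lock_kernel() */
	if (!arg)
		goto out;                  /* was: return -EINVAL */
	result = (cmd == 1) ? 0 : -25;     /* -ENOTTY for unknown cmds */
out:
	pthread_mutex_unlock(&big_lock);   /* single exit, never skipped */
	return result;
}

int main(void)
{
	printf("%ld %ld\n", toy_ioctl(1, 42), toy_ioctl(1, 0)); /* 0 -22 */
	return 0;
}
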
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..2b5586c7f02a 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
124 udf_updated_lvid(sb); 124 udf_updated_lvid(sb);
125 } 125 }
126 mutex_unlock(&sbi->s_alloc_mutex); 126 mutex_unlock(&sbi->s_alloc_mutex);
127 inode->i_mode = mode; 127
128 inode->i_uid = current_fsuid(); 128 inode_init_owner(inode, dir, mode);
129 if (dir->i_mode & S_ISGID) {
130 inode->i_gid = dir->i_gid;
131 if (S_ISDIR(mode))
132 mode |= S_ISGID;
133 } else {
134 inode->i_gid = current_fsgid();
135 }
136 129
137 iinfo->i_location.logicalBlockNum = block; 130 iinfo->i_location.logicalBlockNum = block;
138 iinfo->i_location.partitionReferenceNum = 131 iinfo->i_location.partitionReferenceNum =
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95f..585f733615dc 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
579 inode->i_data.a_ops = &udf_aops; 579 inode->i_data.a_ops = &udf_aops;
580 inode->i_op = &udf_file_inode_operations; 580 inode->i_op = &udf_file_inode_operations;
581 inode->i_fop = &udf_file_operations; 581 inode->i_fop = &udf_file_operations;
582 inode->i_mode = mode;
583 mark_inode_dirty(inode); 582 mark_inode_dirty(inode);
584 583
585 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 584 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 goto out; 626 goto out;
628 627
629 iinfo = UDF_I(inode); 628 iinfo = UDF_I(inode);
630 inode->i_uid = current_fsuid();
631 init_special_inode(inode, mode, rdev); 629 init_special_inode(inode, mode, rdev);
632 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 630 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
633 if (!fi) { 631 if (!fi) {
@@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
674 goto out; 672 goto out;
675 673
676 err = -EIO; 674 err = -EIO;
677 inode = udf_new_inode(dir, S_IFDIR, &err); 675 inode = udf_new_inode(dir, S_IFDIR | mode, &err);
678 if (!inode) 676 if (!inode)
679 goto out; 677 goto out;
680 678
@@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
697 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; 695 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
698 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); 696 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
699 brelse(fibh.sbh); 697 brelse(fibh.sbh);
700 inode->i_mode = S_IFDIR | mode;
701 if (dir->i_mode & S_ISGID)
702 inode->i_mode |= S_ISGID;
703 mark_inode_dirty(inode); 698 mark_inode_dirty(inode);
704 699
705 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 700 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 dquot_initialize(dir); 907 dquot_initialize(dir);
913 908
914 lock_kernel(); 909 lock_kernel();
915 inode = udf_new_inode(dir, S_IFLNK, &err); 910 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
916 if (!inode) 911 if (!inode)
917 goto out; 912 goto out;
918 913
@@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 } 918 }
924 919
925 iinfo = UDF_I(inode); 920 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 921 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &udf_symlink_inode_operations; 922 inode->i_op = &udf_symlink_inode_operations;
929 923
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 702a1148e702..9079ff7d6255 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,8 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
130 uint8_t *, uint8_t *); 130 uint8_t *, uint8_t *);
131 131
132/* file.c */ 132/* file.c */
133extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134 unsigned long);
135extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); 134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
136/* inode.c */ 135/* inode.c */
137extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..3a959d55084d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -303,15 +303,7 @@ cg_found:
303 sb->s_dirt = 1; 303 sb->s_dirt = 1;
304 304
305 inode->i_ino = cg * uspi->s_ipg + bit; 305 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 306 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 307 inode->i_blocks = 0;
316 inode->i_generation = 0; 308 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 309 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..cffa756f1047 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -603,7 +603,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 603 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 604 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 605 else {
606 inode->i_op = &page_symlink_inode_operations; 606 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 607 inode->i_mapping->a_ops = &ufs_aops;
608 } 608 }
609 } else 609 } else
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..eabc02eb1294 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -148,7 +148,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 148
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 150 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 151 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 152 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 153 err = page_symlink(inode, symname, l);
154 if (err) 154 if (err)
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
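
ufs previously used the generic page_symlink_inode_operations for slow symlinks; the split above gives both the fast flavor (target stored in the inode) and the page flavor (target stored in a data page) their own vectors so each can pick up ufs_setattr. Either flavor resolves the same way from userspace; a quick runnable check with readlink(2):

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	/* Works the same whether the filesystem stores the target
	 * inline ("fast" symlink) or in a data page ("slow" symlink). */
	if (symlink("/tmp", "demo_link") < 0)
		perror("symlink");
	n = readlink("demo_link", buf, sizeof(buf) - 1);
	if (n >= 0) {
		buf[n] = '\0';
		printf("-> %s\n", buf);
	}
	unlink("demo_link");
	return 0;
}
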
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..f294c44577dc 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -508,7 +508,7 @@ out:
508 * - there is no way to know old size 508 * - there is no way to know old size
509 * - there is no way inform user about error, if it happens in `truncate' 509 * - there is no way inform user about error, if it happens in `truncate'
510 */ 510 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr) 511int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 512{
513 struct inode *inode = dentry->d_inode; 513 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 514 unsigned int ia_valid = attr->ia_valid;
@@ -518,18 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 518 if (error)
519 return error; 519 return error;
520 520
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr); 526 error = dquot_transfer(inode, attr);
524 if (error) 527 if (error)
525 return error; 528 return error;
526 } 529 }
527 if (ia_valid & ATTR_SIZE && 530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 531 loff_t old_i_size = inode->i_size;
530 532
531 dquot_initialize(inode);
532
533 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
534 if (error) 534 if (error)
535 return error; 535 return error;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..179ae6b3180a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
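
Constifying the handler tables doesn't change the dispatch: generic_*xattr still walk s_xattr for the first handler whose prefix matches the attribute name and hand the handler the remainder. A const-correct toy of that resolution (toy_handler and the two prefixes are invented; the kernel also threads dentry, flags and handler->flags through):

#include <stdio.h>
#include <string.h>

struct toy_handler { const char *prefix; };

static const struct toy_handler user_h = { "user." };
static const struct toy_handler trusted_h = { "trusted." };
static const struct toy_handler *handlers[] = { &user_h, &trusted_h, NULL };

/* Find the handler matching the name's prefix and strip the prefix,
 * as xattr_resolve_name() does above. */
static const struct toy_handler *resolve(const char **name)
{
	for (const struct toy_handler **h = handlers; *h; h++) {
		size_t n = strlen((*h)->prefix);

		if (!strncmp(*name, (*h)->prefix, n)) {
			*name += n;        /* handler sees only the suffix */
			return *h;
		}
	}
	return NULL;
}

int main(void)
{
	const char *name = "user.comment";
	const struct toy_handler *h = resolve(&name);

	printf("%s -> %s\n", h ? h->prefix : "(none)", name);
	return 0;   /* prints: user. -> comment */
}
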
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d60..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
440 return error; 440 return error;
441} 441}
442 442
443struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
444 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
445 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
446 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
448}; 448};
449 449
450struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 0f8b9968a803..089eaca860b4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -45,6 +45,15 @@
45#include <linux/pagevec.h> 45#include <linux/pagevec.h>
46#include <linux/writeback.h> 46#include <linux/writeback.h>
47 47
48/*
49 * Types of I/O for bmap clustering and I/O completion tracking.
50 */
51enum {
52 IO_READ, /* mapping for a read */
53 IO_DELAY, /* mapping covers delalloc region */
54 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
55 IO_NEW /* just allocated */
56};
48 57
49/* 58/*
50 * Prime number of hash buckets since address is used as the key. 59 * Prime number of hash buckets since address is used as the key.
@@ -103,8 +112,9 @@ xfs_count_page_state(
103 112
104STATIC struct block_device * 113STATIC struct block_device *
105xfs_find_bdev_for_inode( 114xfs_find_bdev_for_inode(
106 struct xfs_inode *ip) 115 struct inode *inode)
107{ 116{
117 struct xfs_inode *ip = XFS_I(inode);
108 struct xfs_mount *mp = ip->i_mount; 118 struct xfs_mount *mp = ip->i_mount;
109 119
110 if (XFS_IS_REALTIME_INODE(ip)) 120 if (XFS_IS_REALTIME_INODE(ip))
@@ -183,7 +193,7 @@ xfs_setfilesize(
183 xfs_fsize_t isize; 193 xfs_fsize_t isize;
184 194
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 195 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IOMAP_READ); 196 ASSERT(ioend->io_type != IO_READ);
187 197
188 if (unlikely(ioend->io_error)) 198 if (unlikely(ioend->io_error))
189 return 0; 199 return 0;
@@ -214,7 +224,7 @@ xfs_finish_ioend(
214 if (atomic_dec_and_test(&ioend->io_remaining)) { 224 if (atomic_dec_and_test(&ioend->io_remaining)) {
215 struct workqueue_struct *wq; 225 struct workqueue_struct *wq;
216 226
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 227 wq = (ioend->io_type == IO_UNWRITTEN) ?
218 xfsconvertd_workqueue : xfsdatad_workqueue; 228 xfsconvertd_workqueue : xfsdatad_workqueue;
219 queue_work(wq, &ioend->io_work); 229 queue_work(wq, &ioend->io_work);
220 if (wait) 230 if (wait)
@@ -237,7 +247,7 @@ xfs_end_io(
237 * For unwritten extents we need to issue transactions to convert a 247 * For unwritten extents we need to issue transactions to convert a
 238 * range to normal written extents after the data I/O has finished. 248 * range to normal written extents after the data I/O has finished.
239 */ 249 */
240 if (ioend->io_type == IOMAP_UNWRITTEN && 250 if (ioend->io_type == IO_UNWRITTEN &&
241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 251 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
242 252
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 253 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -250,7 +260,7 @@ xfs_end_io(
250 * We might have to update the on-disk file size after extending 260 * We might have to update the on-disk file size after extending
251 * writes. 261 * writes.
252 */ 262 */
253 if (ioend->io_type != IOMAP_READ) { 263 if (ioend->io_type != IO_READ) {
254 error = xfs_setfilesize(ioend); 264 error = xfs_setfilesize(ioend);
255 ASSERT(!error || error == EAGAIN); 265 ASSERT(!error || error == EAGAIN);
256 } 266 }
@@ -309,21 +319,25 @@ xfs_map_blocks(
309 struct inode *inode, 319 struct inode *inode,
310 loff_t offset, 320 loff_t offset,
311 ssize_t count, 321 ssize_t count,
312 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
313 int flags) 323 int flags)
314{ 324{
315 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
316 327
317 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
318} 329}
319 330
320STATIC int 331STATIC int
321xfs_iomap_valid( 332xfs_imap_valid(
322 xfs_iomap_t *iomapp, 333 struct inode *inode,
323 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
324{ 336{
325 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
326 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
327} 341}
328 342
329/* 343/*
@@ -554,19 +568,23 @@ xfs_add_to_ioend(
554 568
555STATIC void 569STATIC void
556xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
557 struct buffer_head *bh, 572 struct buffer_head *bh,
558 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
559 xfs_off_t offset, 574 xfs_off_t offset)
560 uint block_bits)
561{ 575{
562 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
563 580
564 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
565 583
566 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
567 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
568 586
569 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
570 588
571 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
572 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
@@ -574,17 +592,17 @@ xfs_map_buffer(
574 592
575STATIC void 593STATIC void
576xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
577 struct buffer_head *bh, 596 struct buffer_head *bh,
578 loff_t offset, 597 struct xfs_bmbt_irec *imap,
579 int block_bits, 598 xfs_off_t offset)
580 xfs_iomap_t *iomapp)
581{ 599{
582 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
583 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
584 602
585 lock_buffer(bh); 603 lock_buffer(bh);
586 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
587 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
588 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
589 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
590 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -713,11 +731,11 @@ xfs_is_delayed_page(
713 bh = head = page_buffers(page); 731 bh = head = page_buffers(page);
714 do { 732 do {
715 if (buffer_unwritten(bh)) 733 if (buffer_unwritten(bh))
716 acceptable = (type == IOMAP_UNWRITTEN); 734 acceptable = (type == IO_UNWRITTEN);
717 else if (buffer_delay(bh)) 735 else if (buffer_delay(bh))
718 acceptable = (type == IOMAP_DELAY); 736 acceptable = (type == IO_DELAY);
719 else if (buffer_dirty(bh) && buffer_mapped(bh)) 737 else if (buffer_dirty(bh) && buffer_mapped(bh))
720 acceptable = (type == IOMAP_NEW); 738 acceptable = (type == IO_NEW);
721 else 739 else
722 break; 740 break;
723 } while ((bh = bh->b_this_page) != head); 741 } while ((bh = bh->b_this_page) != head);
@@ -740,7 +758,7 @@ xfs_convert_page(
740 struct inode *inode, 758 struct inode *inode,
741 struct page *page, 759 struct page *page,
742 loff_t tindex, 760 loff_t tindex,
743 xfs_iomap_t *mp, 761 struct xfs_bmbt_irec *imap,
744 xfs_ioend_t **ioendp, 762 xfs_ioend_t **ioendp,
745 struct writeback_control *wbc, 763 struct writeback_control *wbc,
746 int startio, 764 int startio,
@@ -750,7 +768,6 @@ xfs_convert_page(
750 xfs_off_t end_offset; 768 xfs_off_t end_offset;
751 unsigned long p_offset; 769 unsigned long p_offset;
752 unsigned int type; 770 unsigned int type;
753 int bbits = inode->i_blkbits;
754 int len, page_dirty; 771 int len, page_dirty;
755 int count = 0, done = 0, uptodate = 1; 772 int count = 0, done = 0, uptodate = 1;
756 xfs_off_t offset = page_offset(page); 773 xfs_off_t offset = page_offset(page);
@@ -802,19 +819,19 @@ xfs_convert_page(
802 819
803 if (buffer_unwritten(bh) || buffer_delay(bh)) { 820 if (buffer_unwritten(bh) || buffer_delay(bh)) {
804 if (buffer_unwritten(bh)) 821 if (buffer_unwritten(bh))
805 type = IOMAP_UNWRITTEN; 822 type = IO_UNWRITTEN;
806 else 823 else
807 type = IOMAP_DELAY; 824 type = IO_DELAY;
808 825
809 if (!xfs_iomap_valid(mp, offset)) { 826 if (!xfs_imap_valid(inode, imap, offset)) {
810 done = 1; 827 done = 1;
811 continue; 828 continue;
812 } 829 }
813 830
814 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 831 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
815 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
816 833
817 xfs_map_at_offset(bh, offset, bbits, mp); 834 xfs_map_at_offset(inode, bh, imap, offset);
818 if (startio) { 835 if (startio) {
819 xfs_add_to_ioend(inode, bh, offset, 836 xfs_add_to_ioend(inode, bh, offset,
820 type, ioendp, done); 837 type, ioendp, done);
@@ -826,7 +843,7 @@ xfs_convert_page(
826 page_dirty--; 843 page_dirty--;
827 count++; 844 count++;
828 } else { 845 } else {
829 type = IOMAP_NEW; 846 type = IO_NEW;
830 if (buffer_mapped(bh) && all_bh && startio) { 847 if (buffer_mapped(bh) && all_bh && startio) {
831 lock_buffer(bh); 848 lock_buffer(bh);
832 xfs_add_to_ioend(inode, bh, offset, 849 xfs_add_to_ioend(inode, bh, offset,
@@ -866,7 +883,7 @@ STATIC void
866xfs_cluster_write( 883xfs_cluster_write(
867 struct inode *inode, 884 struct inode *inode,
868 pgoff_t tindex, 885 pgoff_t tindex,
869 xfs_iomap_t *iomapp, 886 struct xfs_bmbt_irec *imap,
870 xfs_ioend_t **ioendp, 887 xfs_ioend_t **ioendp,
871 struct writeback_control *wbc, 888 struct writeback_control *wbc,
872 int startio, 889 int startio,
@@ -885,7 +902,7 @@ xfs_cluster_write(
885 902
886 for (i = 0; i < pagevec_count(&pvec); i++) { 903 for (i = 0; i < pagevec_count(&pvec); i++) {
887 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 904 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
888 iomapp, ioendp, wbc, startio, all_bh); 905 imap, ioendp, wbc, startio, all_bh);
889 if (done) 906 if (done)
890 break; 907 break;
891 } 908 }
@@ -930,7 +947,7 @@ xfs_aops_discard_page(
930 loff_t offset = page_offset(page); 947 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits; 948 ssize_t len = 1 << inode->i_blkbits;
932 949
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 950 if (!xfs_is_delayed_page(page, IO_DELAY))
934 goto out_invalidate; 951 goto out_invalidate;
935 952
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 953 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1042,15 +1059,15 @@ xfs_page_state_convert(
1042 int unmapped) /* also implies page uptodate */ 1059 int unmapped) /* also implies page uptodate */
1043{ 1060{
1044 struct buffer_head *bh, *head; 1061 struct buffer_head *bh, *head;
1045 xfs_iomap_t iomap; 1062 struct xfs_bmbt_irec imap;
1046 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1063 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1047 loff_t offset; 1064 loff_t offset;
1048 unsigned long p_offset = 0; 1065 unsigned long p_offset = 0;
1049 unsigned int type; 1066 unsigned int type;
1050 __uint64_t end_offset; 1067 __uint64_t end_offset;
1051 pgoff_t end_index, last_index, tlast; 1068 pgoff_t end_index, last_index;
1052 ssize_t size, len; 1069 ssize_t size, len;
1053 int flags, err, iomap_valid = 0, uptodate = 1; 1070 int flags, err, imap_valid = 0, uptodate = 1;
1054 int page_dirty, count = 0; 1071 int page_dirty, count = 0;
1055 int trylock = 0; 1072 int trylock = 0;
1056 int all_bh = unmapped; 1073 int all_bh = unmapped;
@@ -1097,7 +1114,7 @@ xfs_page_state_convert(
1097 bh = head = page_buffers(page); 1114 bh = head = page_buffers(page);
1098 offset = page_offset(page); 1115 offset = page_offset(page);
1099 flags = BMAPI_READ; 1116 flags = BMAPI_READ;
1100 type = IOMAP_NEW; 1117 type = IO_NEW;
1101 1118
1102 /* TODO: cleanup count and page_dirty */ 1119 /* TODO: cleanup count and page_dirty */
1103 1120
@@ -1111,12 +1128,12 @@ xfs_page_state_convert(
1111 * the iomap is actually still valid, but the ioend 1128 * the iomap is actually still valid, but the ioend
 1112 * isn't. This shouldn't happen too often. 1129 * isn't. This shouldn't happen too often.
1113 */ 1130 */
1114 iomap_valid = 0; 1131 imap_valid = 0;
1115 continue; 1132 continue;
1116 } 1133 }
1117 1134
1118 if (iomap_valid) 1135 if (imap_valid)
1119 iomap_valid = xfs_iomap_valid(&iomap, offset); 1136 imap_valid = xfs_imap_valid(inode, &imap, offset);
1120 1137
1121 /* 1138 /*
1122 * First case, map an unwritten extent and prepare for 1139 * First case, map an unwritten extent and prepare for
@@ -1137,20 +1154,20 @@ xfs_page_state_convert(
1137 * Make sure we don't use a read-only iomap 1154 * Make sure we don't use a read-only iomap
1138 */ 1155 */
1139 if (flags == BMAPI_READ) 1156 if (flags == BMAPI_READ)
1140 iomap_valid = 0; 1157 imap_valid = 0;
1141 1158
1142 if (buffer_unwritten(bh)) { 1159 if (buffer_unwritten(bh)) {
1143 type = IOMAP_UNWRITTEN; 1160 type = IO_UNWRITTEN;
1144 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1145 } else if (buffer_delay(bh)) { 1162 } else if (buffer_delay(bh)) {
1146 type = IOMAP_DELAY; 1163 type = IO_DELAY;
1147 flags = BMAPI_ALLOCATE | trylock; 1164 flags = BMAPI_ALLOCATE | trylock;
1148 } else { 1165 } else {
1149 type = IOMAP_NEW; 1166 type = IO_NEW;
1150 flags = BMAPI_WRITE | BMAPI_MMAP; 1167 flags = BMAPI_WRITE | BMAPI_MMAP;
1151 } 1168 }
1152 1169
1153 if (!iomap_valid) { 1170 if (!imap_valid) {
1154 /* 1171 /*
1155 * if we didn't have a valid mapping then we 1172 * if we didn't have a valid mapping then we
1156 * need to ensure that we put the new mapping 1173 * need to ensure that we put the new mapping
@@ -1160,7 +1177,7 @@ xfs_page_state_convert(
1160 * for unwritten extent conversion. 1177 * for unwritten extent conversion.
1161 */ 1178 */
1162 new_ioend = 1; 1179 new_ioend = 1;
1163 if (type == IOMAP_NEW) { 1180 if (type == IO_NEW) {
1164 size = xfs_probe_cluster(inode, 1181 size = xfs_probe_cluster(inode,
1165 page, bh, head, 0); 1182 page, bh, head, 0);
1166 } else { 1183 } else {
@@ -1168,14 +1185,14 @@ xfs_page_state_convert(
1168 } 1185 }
1169 1186
1170 err = xfs_map_blocks(inode, offset, size, 1187 err = xfs_map_blocks(inode, offset, size,
1171 &iomap, flags); 1188 &imap, flags);
1172 if (err) 1189 if (err)
1173 goto error; 1190 goto error;
1174 iomap_valid = xfs_iomap_valid(&iomap, offset); 1191 imap_valid = xfs_imap_valid(inode, &imap,
1192 offset);
1175 } 1193 }
1176 if (iomap_valid) { 1194 if (imap_valid) {
1177 xfs_map_at_offset(bh, offset, 1195 xfs_map_at_offset(inode, bh, &imap, offset);
1178 inode->i_blkbits, &iomap);
1179 if (startio) { 1196 if (startio) {
1180 xfs_add_to_ioend(inode, bh, offset, 1197 xfs_add_to_ioend(inode, bh, offset,
1181 type, &ioend, 1198 type, &ioend,
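
The branch rewritten above is the writeback state machine's classification step: each buffer_head is typed as unwritten, delalloc, or plain overwrite, and the BMAPI flags for the following mapping call fall out of that type. A minimal userspace sketch of just the classification; the struct and the demo in main() are the example's own, not from the patch:

#include <stdio.h>

enum io_type { IO_NEW, IO_DELAY, IO_UNWRITTEN };	/* names as in the patch */

struct bh_state { int unwritten, delay; };	/* stand-in for buffer_head bits */

/* Mirror of the classification above: unwritten extents get converted,
 * delalloc buffers get real blocks allocated, and everything else is a
 * plain overwrite/append ("new"). */
static enum io_type classify(const struct bh_state *bh)
{
	if (bh->unwritten)
		return IO_UNWRITTEN;
	if (bh->delay)
		return IO_DELAY;
	return IO_NEW;
}

int main(void)
{
	struct bh_state b = { .delay = 1 };

	printf("%d\n", classify(&b));	/* prints 1 == IO_DELAY */
	return 0;
}
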
@@ -1194,40 +1211,41 @@ xfs_page_state_convert(
1194 * That means it must already have extents allocated 1211 * That means it must already have extents allocated
1195 * underneath it. Map the extent by reading it. 1212 * underneath it. Map the extent by reading it.
1196 */ 1213 */
1197 if (!iomap_valid || flags != BMAPI_READ) { 1214 if (!imap_valid || flags != BMAPI_READ) {
1198 flags = BMAPI_READ; 1215 flags = BMAPI_READ;
1199 size = xfs_probe_cluster(inode, page, bh, 1216 size = xfs_probe_cluster(inode, page, bh,
1200 head, 1); 1217 head, 1);
1201 err = xfs_map_blocks(inode, offset, size, 1218 err = xfs_map_blocks(inode, offset, size,
1202 &iomap, flags); 1219 &imap, flags);
1203 if (err) 1220 if (err)
1204 goto error; 1221 goto error;
1205 iomap_valid = xfs_iomap_valid(&iomap, offset); 1222 imap_valid = xfs_imap_valid(inode, &imap,
1223 offset);
1206 } 1224 }
1207 1225
1208 /* 1226 /*
1209 * We set the type to IOMAP_NEW in case we are doing a 1227 * We set the type to IO_NEW in case we are doing a
1210 * small write at EOF that is extending the file but 1228 * small write at EOF that is extending the file but
1211 * without needing an allocation. We need to update the 1229 * without needing an allocation. We need to update the
1212 * file size on I/O completion in this case so it is 1230 * file size on I/O completion in this case so it is
1213 * the same case as having just allocated a new extent 1231 * the same case as having just allocated a new extent
1214 * that we are writing into for the first time. 1232 * that we are writing into for the first time.
1215 */ 1233 */
1216 type = IOMAP_NEW; 1234 type = IO_NEW;
1217 if (trylock_buffer(bh)) { 1235 if (trylock_buffer(bh)) {
1218 ASSERT(buffer_mapped(bh)); 1236 ASSERT(buffer_mapped(bh));
1219 if (iomap_valid) 1237 if (imap_valid)
1220 all_bh = 1; 1238 all_bh = 1;
1221 xfs_add_to_ioend(inode, bh, offset, type, 1239 xfs_add_to_ioend(inode, bh, offset, type,
1222 &ioend, !iomap_valid); 1240 &ioend, !imap_valid);
1223 page_dirty--; 1241 page_dirty--;
1224 count++; 1242 count++;
1225 } else { 1243 } else {
1226 iomap_valid = 0; 1244 imap_valid = 0;
1227 } 1245 }
1228 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1229 (unmapped || startio)) { 1247 (unmapped || startio)) {
1230 iomap_valid = 0; 1248 imap_valid = 0;
1231 } 1249 }
1232 1250
1233 if (!iohead) 1251 if (!iohead)
@@ -1241,12 +1259,23 @@ xfs_page_state_convert(
1241 if (startio) 1259 if (startio)
1242 xfs_start_page_writeback(page, 1, count); 1260 xfs_start_page_writeback(page, 1, count);
1243 1261
1244 if (ioend && iomap_valid) { 1262 if (ioend && imap_valid) {
1245 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> 1263 xfs_off_t end_index;
1246 PAGE_CACHE_SHIFT; 1264
1247 tlast = min_t(pgoff_t, offset, last_index); 1265 end_index = imap.br_startoff + imap.br_blockcount;
1248 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, 1266
1249 wbc, startio, all_bh, tlast); 1267 /* to bytes */
1268 end_index <<= inode->i_blkbits;
1269
1270 /* to pages */
1271 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1272
1273 /* check against file size */
1274 if (end_index > last_index)
1275 end_index = last_index;
1276
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index);
1250 } 1279 }
1251 1280
1252 if (iohead) 1281 if (iohead)
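
The replacement code above converts the extent end from filesystem blocks to bytes to a page index before clamping against the file size, instead of the old iomap byte arithmetic. A minimal, self-contained sketch of the same conversion; the 4k page size and the function name are assumptions of the example:

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12	/* assume 4k pages for the example */

/* Sketch of the block -> byte -> page clamping done above. */
static uint64_t cluster_end_page(uint64_t startoff_fsb, uint64_t blockcount_fsb,
				 unsigned blkbits, uint64_t last_index)
{
	uint64_t end = startoff_fsb + blockcount_fsb;	/* fs blocks */

	end <<= blkbits;				/* to bytes */
	end = (end - 1) >> PAGE_CACHE_SHIFT;		/* to last page index */
	if (end > last_index)				/* clamp to EOF page */
		end = last_index;
	return end;
}

int main(void)
{
	/* 8-block extent at block 100, 4k blocks, EOF on page 105 */
	printf("%llu\n",
	       (unsigned long long)cluster_end_page(100, 8, 12, 105));
	return 0;
}
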
@@ -1448,10 +1477,11 @@ __xfs_get_blocks(
1448 int direct, 1477 int direct,
1449 bmapi_flags_t flags) 1478 bmapi_flags_t flags)
1450{ 1479{
1451 xfs_iomap_t iomap; 1480 struct xfs_bmbt_irec imap;
1452 xfs_off_t offset; 1481 xfs_off_t offset;
1453 ssize_t size; 1482 ssize_t size;
1454 int niomap = 1; 1483 int nimap = 1;
1484 int new = 0;
1455 int error; 1485 int error;
1456 1486
1457 offset = (xfs_off_t)iblock << inode->i_blkbits; 1487 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1462,22 +1492,21 @@ __xfs_get_blocks(
1462 return 0; 1492 return 0;
1463 1493
1464 error = xfs_iomap(XFS_I(inode), offset, size, 1494 error = xfs_iomap(XFS_I(inode), offset, size,
1465 create ? flags : BMAPI_READ, &iomap, &niomap); 1495 create ? flags : BMAPI_READ, &imap, &nimap, &new);
1466 if (error) 1496 if (error)
1467 return -error; 1497 return -error;
1468 if (niomap == 0) 1498 if (nimap == 0)
1469 return 0; 1499 return 0;
1470 1500
1471 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1501 if (imap.br_startblock != HOLESTARTBLOCK &&
1502 imap.br_startblock != DELAYSTARTBLOCK) {
1472 /* 1503 /*
1473 * For unwritten extents do not report a disk address on 1504 * For unwritten extents do not report a disk address on
1474 * the read case (treat as if we're reading into a hole). 1505 * the read case (treat as if we're reading into a hole).
1475 */ 1506 */
1476 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1507 if (create || !ISUNWRITTEN(&imap))
1477 xfs_map_buffer(bh_result, &iomap, offset, 1508 xfs_map_buffer(inode, bh_result, &imap, offset);
1478 inode->i_blkbits); 1509 if (create && ISUNWRITTEN(&imap)) {
1479 }
1480 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1481 if (direct) 1510 if (direct)
1482 bh_result->b_private = inode; 1511 bh_result->b_private = inode;
1483 set_buffer_unwritten(bh_result); 1512 set_buffer_unwritten(bh_result);
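
With xfs_bmbt_irec replacing xfs_iomap_t, holes and delayed allocations are no longer flagged in a separate field; they are encoded as sentinel values in br_startblock itself, which is exactly what the two-comparison test above checks. A hedged sketch of the idiom; the sentinel values and struct here are illustrative, the real ones come from the XFS headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sentinels: encode "hole" and "delayed allocation"
 * directly in the start-block field rather than in separate flags. */
#define HOLESTARTBLOCK	((int64_t)-2)
#define DELAYSTARTBLOCK	((int64_t)-1)

struct irec {			/* stand-in for struct xfs_bmbt_irec */
	int64_t br_startblock;
};

static bool has_real_blocks(const struct irec *imap)
{
	return imap->br_startblock != HOLESTARTBLOCK &&
	       imap->br_startblock != DELAYSTARTBLOCK;
}

int main(void)
{
	struct irec hole = { HOLESTARTBLOCK }, real = { 12345 };

	printf("hole: %d, real: %d\n",
	       has_real_blocks(&hole), has_real_blocks(&real));
	return 0;
}
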
@@ -1488,7 +1517,7 @@ __xfs_get_blocks(
 1488 * If this is a realtime file, data may be on a different device 1517 * If this is a realtime file, data may be on a different device
1489 * to that pointed to from the buffer_head b_bdev currently. 1518 * to that pointed to from the buffer_head b_bdev currently.
1490 */ 1519 */
1491 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1520 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1492 1521
1493 /* 1522 /*
1494 * If we previously allocated a block out beyond eof and we are now 1523 * If we previously allocated a block out beyond eof and we are now
@@ -1502,10 +1531,10 @@ __xfs_get_blocks(
1502 if (create && 1531 if (create &&
1503 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1532 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1504 (offset >= i_size_read(inode)) || 1533 (offset >= i_size_read(inode)) ||
1505 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1534 (new || ISUNWRITTEN(&imap))))
1506 set_buffer_new(bh_result); 1535 set_buffer_new(bh_result);
1507 1536
1508 if (iomap.iomap_flags & IOMAP_DELAY) { 1537 if (imap.br_startblock == DELAYSTARTBLOCK) {
1509 BUG_ON(direct); 1538 BUG_ON(direct);
1510 if (create) { 1539 if (create) {
1511 set_buffer_uptodate(bh_result); 1540 set_buffer_uptodate(bh_result);
@@ -1514,11 +1543,23 @@ __xfs_get_blocks(
1514 } 1543 }
1515 } 1544 }
1516 1545
1546 /*
 1546 * If this is O_DIRECT or the mpage code calling, tell them how large
1548 * the mapping is, so that we can avoid repeated get_blocks calls.
1549 */
1517 if (direct || size > (1 << inode->i_blkbits)) { 1550 if (direct || size > (1 << inode->i_blkbits)) {
1518 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1551 xfs_off_t mapping_size;
1519 offset = min_t(xfs_off_t, 1552
1520 iomap.iomap_bsize - iomap.iomap_delta, size); 1553 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1521 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1554 mapping_size <<= inode->i_blkbits;
1555
1556 ASSERT(mapping_size > 0);
1557 if (mapping_size > size)
1558 mapping_size = size;
1559 if (mapping_size > LONG_MAX)
1560 mapping_size = LONG_MAX;
1561
1562 bh_result->b_size = mapping_size;
1522 } 1563 }
1523 1564
1524 return 0; 1565 return 0;
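
The new b_size computation above tells O_DIRECT and the mpage code how far the returned mapping extends beyond the queried block, clamped to the request size and to LONG_MAX, since b_size is a long-sized field. A standalone sketch of the arithmetic; the names and the 4k block size are the example's assumptions:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

/* Report the mapping length past the queried block, in bytes,
 * bounded by the caller's request and by LONG_MAX. Assumes the
 * mapping actually covers iblock, as the ASSERT above does. */
static long buffer_mapping_size(uint64_t startoff, uint64_t blockcount,
				uint64_t iblock, unsigned blkbits,
				int64_t size)
{
	int64_t mapping_size = (int64_t)(startoff + blockcount - iblock)
				<< blkbits;

	if (mapping_size > size)
		mapping_size = size;
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;
	return (long)mapping_size;
}

int main(void)
{
	/* 16-block extent at block 0, asked about block 4, 64k requested */
	printf("%ld\n", buffer_mapping_size(0, 16, 4, 12, 65536));
	return 0;
}
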
@@ -1576,7 +1617,7 @@ xfs_end_io_direct(
1576 */ 1617 */
1577 ioend->io_offset = offset; 1618 ioend->io_offset = offset;
1578 ioend->io_size = size; 1619 ioend->io_size = size;
1579 if (ioend->io_type == IOMAP_READ) { 1620 if (ioend->io_type == IO_READ) {
1580 xfs_finish_ioend(ioend, 0); 1621 xfs_finish_ioend(ioend, 0);
1581 } else if (private && size > 0) { 1622 } else if (private && size > 0) {
1582 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1623 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1587,7 +1628,7 @@ xfs_end_io_direct(
 1587 * didn't map an unwritten extent so switch its completion 1628 * didn't map an unwritten extent so switch its completion
1588 * handler. 1629 * handler.
1589 */ 1630 */
1590 ioend->io_type = IOMAP_NEW; 1631 ioend->io_type = IO_NEW;
1591 xfs_finish_ioend(ioend, 0); 1632 xfs_finish_ioend(ioend, 0);
1592 } 1633 }
1593 1634
@@ -1612,10 +1653,10 @@ xfs_vm_direct_IO(
1612 struct block_device *bdev; 1653 struct block_device *bdev;
1613 ssize_t ret; 1654 ssize_t ret;
1614 1655
1615 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1656 bdev = xfs_find_bdev_for_inode(inode);
1616 1657
1617 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1658 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1618 IOMAP_UNWRITTEN : IOMAP_READ); 1659 IO_UNWRITTEN : IO_READ);
1619 1660
1620 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1661 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1621 offset, nr_segs, 1662 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44c2b0ef9a41..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
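
The guard added to xfs_buf_lock() avoids a stall: a stale, pinned buffer stays locked until the log is pushed, so the would-be locker pushes the log itself rather than sleeping behind it. A userspace analogue of the pattern, under the assumption that "pinned" maps to deferred work that only some other agent would otherwise run; all names are illustrative:

#include <pthread.h>
#include <stdatomic.h>

struct buf {
	pthread_mutex_t lock;
	atomic_int	pin_count;	/* analogue of b_pin_count */
	int		stale;		/* analogue of XBF_STALE */
};

static void flush_log(void)
{
	/* stand-in for xfs_log_force(): run the deferred work that
	 * lets the current lock holder make progress and unlock */
}

static void buf_lock(struct buf *bp)
{
	/* kick the deferred work first instead of sleeping behind it */
	if (atomic_load(&bp->pin_count) && bp->stale)
		flush_log();
	pthread_mutex_lock(&bp->lock);
}

int main(void)
{
	struct buf b = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	buf_lock(&b);
	pthread_mutex_unlock(&b.lock);
	return 0;
}
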
@@ -1007,25 +1016,20 @@ xfs_bwrite(
1007 struct xfs_mount *mp, 1016 struct xfs_mount *mp,
1008 struct xfs_buf *bp) 1017 struct xfs_buf *bp)
1009{ 1018{
1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 1019 int error;
1011 int error = 0;
1012 1020
1013 bp->b_strat = xfs_bdstrat_cb; 1021 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp; 1022 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE; 1023 bp->b_flags |= XBF_WRITE;
1016 if (!iowait) 1024 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1018 1025
1019 xfs_buf_delwri_dequeue(bp); 1026 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp); 1027 xfs_buf_iostrategy(bp);
1021 1028
1022 if (iowait) { 1029 error = xfs_buf_iowait(bp);
1023 error = xfs_buf_iowait(bp); 1030 if (error)
1024 if (error) 1031 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1032 xfs_buf_relse(bp);
1026 xfs_buf_relse(bp);
1027 }
1028
1029 return error; 1033 return error;
1030} 1034}
1031 1035
@@ -1614,7 +1618,8 @@ xfs_mapping_buftarg(
1614 1618
1615STATIC int 1619STATIC int
1616xfs_alloc_delwrite_queue( 1620xfs_alloc_delwrite_queue(
1617 xfs_buftarg_t *btp) 1621 xfs_buftarg_t *btp,
1622 const char *fsname)
1618{ 1623{
1619 int error = 0; 1624 int error = 0;
1620 1625
@@ -1622,7 +1627,7 @@ xfs_alloc_delwrite_queue(
1622 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1627 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1623 spin_lock_init(&btp->bt_delwrite_lock); 1628 spin_lock_init(&btp->bt_delwrite_lock);
1624 btp->bt_flags = 0; 1629 btp->bt_flags = 0;
1625 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1630 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1626 if (IS_ERR(btp->bt_task)) { 1631 if (IS_ERR(btp->bt_task)) {
1627 error = PTR_ERR(btp->bt_task); 1632 error = PTR_ERR(btp->bt_task);
1628 goto out_error; 1633 goto out_error;
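
All of the per-mount daemons touched by this patch (xfsbufd here, xfsaild and xfssyncd further down) gain the filesystem name in their thread name, since kthread_run() accepts a printf-style name format. A kernel-style sketch of the idiom; the worker function and calling convention are invented for illustration, only the naming trick comes from the patch:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

/* placeholder worker; the naming idiom is the point */
static int my_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int start_per_fs_thread(struct task_struct **task, void *ctx,
			       const char *fsname)
{
	/* the thread shows up as "myfsd/<fsname>" in ps(1) */
	*task = kthread_run(my_worker, ctx, "myfsd/%s", fsname);
	if (IS_ERR(*task))
		return PTR_ERR(*task);
	return 0;
}
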
@@ -1635,7 +1640,8 @@ out_error:
1635xfs_buftarg_t * 1640xfs_buftarg_t *
1636xfs_alloc_buftarg( 1641xfs_alloc_buftarg(
1637 struct block_device *bdev, 1642 struct block_device *bdev,
1638 int external) 1643 int external,
1644 const char *fsname)
1639{ 1645{
1640 xfs_buftarg_t *btp; 1646 xfs_buftarg_t *btp;
1641 1647
@@ -1647,7 +1653,7 @@ xfs_alloc_buftarg(
1647 goto error; 1653 goto error;
1648 if (xfs_mapping_buftarg(btp, bdev)) 1654 if (xfs_mapping_buftarg(btp, bdev))
1649 goto error; 1655 goto error;
1650 if (xfs_alloc_delwrite_queue(btp)) 1656 if (xfs_alloc_delwrite_queue(btp, fsname))
1651 goto error; 1657 goto error;
1652 xfs_alloc_bufhash(btp, external); 1658 xfs_alloc_bufhash(btp, external);
1653 return btp; 1659 return btp;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 390/*
391 * Handling of buftargs. 391 * Handling of buftargs.
392 */ 392 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..d8fb1b5d6cb5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -115,6 +115,8 @@ xfs_file_fsync(
115 115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 117
118 xfs_ioend_wait(ip);
119
118 /* 120 /*
119 * We always need to make sure that the required inode state is safe on 121 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 122 * disk. The inode might be clean but we still might need to force the
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 7b26cc2fd284..699b60cbab9c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -527,6 +527,10 @@ xfs_attrmulti_by_handle(
527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
528 return -XFS_ERROR(EFAULT); 528 return -XFS_ERROR(EFAULT);
529 529
530 /* overflow check */
531 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
532 return -E2BIG;
533
530 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 534 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
531 if (IS_ERR(dentry)) 535 if (IS_ERR(dentry))
532 return PTR_ERR(dentry); 536 return PTR_ERR(dentry);
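
The added opcount check is the standard integer-overflow guard: validate a user-controlled count against INT_MAX / sizeof(element) before it ever participates in a size calculation, so count * sizeof(elem) cannot wrap. A self-contained sketch; the element type and the NULL-for-E2BIG convention are the example's choices:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

struct op { char name[32]; void *buf; };	/* stand-in element type */

/* Reject the count before it reaches the multiplication. */
static void *alloc_ops(size_t opcount)
{
	if (opcount >= INT_MAX / sizeof(struct op))
		return NULL;			/* would overflow: -E2BIG */
	return malloc(opcount * sizeof(struct op));
}

int main(void)
{
	void *p = alloc_ops(16);

	printf("small: %s\n", p ? "ok" : "rejected");
	printf("huge:  %s\n",
	       alloc_ops((size_t)INT_MAX) ? "ok" : "rejected");
	free(p);
	return 0;
}
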
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 593c05b4df8d..9287135e9bfc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -420,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
421 return -XFS_ERROR(EFAULT); 421 return -XFS_ERROR(EFAULT);
422 422
423 /* overflow check */
424 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
425 return -E2BIG;
426
423 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 427 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
424 if (IS_ERR(dentry)) 428 if (IS_ERR(dentry))
425 return PTR_ERR(dentry); 429 return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e65a7937f3a4..9c8019c78c92 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -673,7 +673,10 @@ xfs_vn_fiemap(
673 bm.bmv_length = BTOBB(length); 673 bm.bmv_length = BTOBB(length);
674 674
675 /* We add one because in getbmap world count includes the header */ 675 /* We add one because in getbmap world count includes the header */
676 bm.bmv_count = fieinfo->fi_extents_max + 1; 676 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 fieinfo->fi_extents_max + 1;
678 bm.bmv_count = min_t(__s32, bm.bmv_count,
679 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
677 bm.bmv_iflags = BMV_IF_PREALLOC; 680 bm.bmv_iflags = BMV_IF_PREALLOC;
678 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 681 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
679 bm.bmv_iflags |= BMV_IF_ATTRFORK; 682 bm.bmv_iflags |= BMV_IF_ATTRFORK;
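
The fiemap fix above handles two cases: fi_extents_max == 0 (the caller only wants the extent count) now maps to a large ceiling rather than a bmv_count of 1, and the result is bounded so the getbmapx working buffer stays within a few pages. A sketch of the clamping; the structure size and the MAXEXTNUM value are illustrative:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096
#define MAXEXTNUM	(1 << 21)	/* illustrative "no limit" ceiling */

struct getbmapx_like { uint64_t dummy[6]; };	/* ~48-byte stand-in */

static int32_t clamp_bmv_count(uint32_t fi_extents_max)
{
	/* 0 means the caller only wants the number of extents */
	int32_t count = !fi_extents_max ? MAXEXTNUM
					: (int32_t)fi_extents_max + 1;
	int32_t limit = PAGE_SIZE * 16 / sizeof(struct getbmapx_like);

	return count < limit ? count : limit;
}

int main(void)
{
	printf("unbounded -> %d, small -> %d\n",
	       clamp_bmv_count(0), clamp_bmv_count(8));
	return 0;
}
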
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
@@ -97,7 +98,7 @@ xfs_fs_set_xstate(
97} 98}
98 99
99STATIC int 100STATIC int
100xfs_fs_get_xquota( 101xfs_fs_get_dqblk(
101 struct super_block *sb, 102 struct super_block *sb,
102 int type, 103 int type,
103 qid_t id, 104 qid_t id,
@@ -114,7 +115,7 @@ xfs_fs_get_xquota(
114} 115}
115 116
116STATIC int 117STATIC int
117xfs_fs_set_xquota( 118xfs_fs_set_dqblk(
118 struct super_block *sb, 119 struct super_block *sb,
119 int type, 120 int type,
120 qid_t id, 121 qid_t id,
@@ -135,6 +136,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 136const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 137 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 138 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 139 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 140 .set_dqblk = xfs_fs_set_dqblk,
140}; 141};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 29f1edca76de..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
 122#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
 123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
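
The DELAYLOG entry is added to the same flag table that xfs_showargs() walks to reconstruct the mount option string for /proc/mounts: one mount-flag bit, one ",option" suffix. A minimal sketch of the table-driven printer; the flag bits and option names are illustrative:

#include <stdio.h>

#define MNT_DMAPI	(1 << 0)
#define MNT_GRPID	(1 << 1)
#define MNT_DELAYLOG	(1 << 2)

struct flag_str { unsigned long flag; const char *str; };

/* Emit the ",option" string for every set bit, table-driven. */
static void show_opts(unsigned long flags)
{
	static const struct flag_str tab[] = {
		{ MNT_DMAPI,	",dmapi" },
		{ MNT_GRPID,	",grpid" },
		{ MNT_DELAYLOG,	",delaylog" },
		{ 0, NULL }
	};
	const struct flag_str *p;

	for (p = tab; p->flag; p++)
		if (flags & p->flag)
			fputs(p->str, stdout);
	putchar('\n');
}

int main(void)
{
	show_opts(MNT_GRPID | MNT_DELAYLOG);	/* prints ,grpid,delaylog */
	return 0;
}
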
@@ -725,7 +735,8 @@ void
725xfs_blkdev_issue_flush( 735xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 736 xfs_buftarg_t *buftarg)
727{ 737{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 738 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
739 BLKDEV_IFL_WAIT);
729} 740}
730 741
731STATIC void 742STATIC void
@@ -789,18 +800,18 @@ xfs_open_devices(
789 * Setup xfs_mount buffer target pointers 800 * Setup xfs_mount buffer target pointers
790 */ 801 */
791 error = ENOMEM; 802 error = ENOMEM;
792 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 803 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
793 if (!mp->m_ddev_targp) 804 if (!mp->m_ddev_targp)
794 goto out_close_rtdev; 805 goto out_close_rtdev;
795 806
796 if (rtdev) { 807 if (rtdev) {
797 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 808 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
798 if (!mp->m_rtdev_targp) 809 if (!mp->m_rtdev_targp)
799 goto out_free_ddev_targ; 810 goto out_free_ddev_targ;
800 } 811 }
801 812
802 if (logdev && logdev != ddev) { 813 if (logdev && logdev != ddev) {
803 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 814 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
804 if (!mp->m_logdev_targp) 815 if (!mp->m_logdev_targp)
805 goto out_free_rtdev_targ; 816 goto out_free_rtdev_targ;
806 } else { 817 } else {
@@ -902,7 +913,8 @@ xfsaild_start(
902 struct xfs_ail *ailp) 913 struct xfs_ail *ailp)
903{ 914{
904 ailp->xa_target = 0; 915 ailp->xa_target = 0;
905 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 916 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
917 ailp->xa_mount->m_fsname);
906 if (IS_ERR(ailp->xa_task)) 918 if (IS_ERR(ailp->xa_task))
907 return -PTR_ERR(ailp->xa_task); 919 return -PTR_ERR(ailp->xa_task);
908 return 0; 920 return 0;
@@ -1092,6 +1104,7 @@ xfs_fs_write_inode(
1092 * the code will only flush the inode if it isn't already 1104 * the code will only flush the inode if it isn't already
1093 * being flushed. 1105 * being flushed.
1094 */ 1106 */
1107 xfs_ioend_wait(ip);
1095 xfs_ilock(ip, XFS_ILOCK_SHARED); 1108 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) { 1109 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip); 1110 error = xfs_log_inode(ip);
@@ -1752,7 +1765,7 @@ xfs_init_zones(void)
1752 * but it is much faster. 1765 * but it is much faster.
1753 */ 1766 */
1754 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1755 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1756 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1757 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1758 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
86 86
87extern const struct export_operations xfs_export_operations; 87extern const struct export_operations xfs_export_operations;
88extern struct xattr_handler *xfs_xattr_handlers[]; 88extern const struct xattr_handler *xfs_xattr_handlers[];
89extern const struct quotactl_ops xfs_quotactl_operations; 89extern const struct quotactl_ops xfs_quotactl_operations;
90 90
91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a427c638d909..3884e20bc14e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -356,68 +356,23 @@ xfs_commit_dummy_trans(
356 356
357STATIC int 357STATIC int
358xfs_sync_fsdata( 358xfs_sync_fsdata(
359 struct xfs_mount *mp, 359 struct xfs_mount *mp)
360 int flags)
361{ 360{
362 struct xfs_buf *bp; 361 struct xfs_buf *bp;
363 struct xfs_buf_log_item *bip;
364 int error = 0;
365 362
366 /* 363 /*
367 * If this is xfssyncd() then only sync the superblock if we can 364 * If the buffer is pinned then push on the log so we won't get stuck
368 * lock it without sleeping and it is not pinned. 365 * waiting in the write for someone, maybe ourselves, to flush the log.
366 *
367 * Even though we just pushed the log above, we did not have the
368 * superblock buffer locked at that point so it can become pinned in
369 * between there and here.
369 */ 370 */
370 if (flags & SYNC_TRYLOCK) { 371 bp = xfs_getsb(mp, 0);
371 ASSERT(!(flags & SYNC_WAIT)); 372 if (XFS_BUF_ISPINNED(bp))
372 373 xfs_log_force(mp, 0);
373 bp = xfs_getsb(mp, XBF_TRYLOCK);
374 if (!bp)
375 goto out;
376
377 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
378 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
379 goto out_brelse;
380 } else {
381 bp = xfs_getsb(mp, 0);
382
383 /*
384 * If the buffer is pinned then push on the log so we won't
385 * get stuck waiting in the write for someone, maybe
386 * ourselves, to flush the log.
387 *
388 * Even though we just pushed the log above, we did not have
389 * the superblock buffer locked at that point so it can
390 * become pinned in between there and here.
391 */
392 if (XFS_BUF_ISPINNED(bp))
393 xfs_log_force(mp, 0);
394 }
395
396
397 if (flags & SYNC_WAIT)
398 XFS_BUF_UNASYNC(bp);
399 else
400 XFS_BUF_ASYNC(bp);
401
402 error = xfs_bwrite(mp, bp);
403 if (error)
404 return error;
405
406 /*
407 * If this is a data integrity sync make sure all pending buffers
408 * are flushed out for the log coverage check below.
409 */
410 if (flags & SYNC_WAIT)
411 xfs_flush_buftarg(mp->m_ddev_targp, 1);
412
413 if (xfs_log_need_covered(mp))
414 error = xfs_commit_dummy_trans(mp, flags);
415 return error;
416 374
417 out_brelse: 375 return xfs_bwrite(mp, bp);
418 xfs_buf_relse(bp);
419 out:
420 return error;
421} 376}
422 377
423/* 378/*
@@ -441,7 +396,7 @@ int
441xfs_quiesce_data( 396xfs_quiesce_data(
442 struct xfs_mount *mp) 397 struct xfs_mount *mp)
443{ 398{
444 int error; 399 int error, error2 = 0;
445 400
446 /* push non-blocking */ 401 /* push non-blocking */
447 xfs_sync_data(mp, 0); 402 xfs_sync_data(mp, 0);
@@ -452,13 +407,20 @@ xfs_quiesce_data(
452 xfs_qm_sync(mp, SYNC_WAIT); 407 xfs_qm_sync(mp, SYNC_WAIT);
453 408
454 /* write superblock and hoover up shutdown errors */ 409 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 410 error = xfs_sync_fsdata(mp);
411
412 /* make sure all delwri buffers are written out */
413 xfs_flush_buftarg(mp->m_ddev_targp, 1);
414
415 /* mark the log as covered if needed */
416 if (xfs_log_need_covered(mp))
417 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
456 418
457 /* flush data-only devices */ 419 /* flush data-only devices */
458 if (mp->m_rtdev_targp) 420 if (mp->m_rtdev_targp)
459 XFS_bflush(mp->m_rtdev_targp); 421 XFS_bflush(mp->m_rtdev_targp);
460 422
461 return error; 423 return error ? error : error2;
462} 424}
463 425
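
Note the error/error2 shape introduced in xfs_quiesce_data() above: the superblock write no longer short-circuits the remaining steps; every stage runs, and the first failure wins when the results are combined at the end. A trivial sketch of the pattern, with invented step functions:

#include <stdio.h>

static int step_a(void) { return 0; }	/* e.g. write superblock */
static int step_b(void) { return -5; }	/* e.g. cover log; pretend it fails */

/* Run every step, remember the first failure, report it last. */
static int quiesce(void)
{
	int error, error2 = 0;

	error = step_a();
	/* ... unconditional flushing work still happens here ... */
	error2 = step_b();

	return error ? error : error2;
}

int main(void)
{
	printf("quiesce() = %d\n", quiesce());
	return 0;
}
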
464STATIC void 426STATIC void
@@ -581,9 +543,9 @@ xfs_flush_inodes(
581} 543}
582 544
583/* 545/*
584 * Every sync period we need to unpin all items, reclaim inodes, sync 546 * Every sync period we need to unpin all items, reclaim inodes and sync
585 * quota and write out the superblock. We might need to cover the log 547 * disk quotas. We might need to cover the log to indicate that the
586 * to indicate it is idle. 548 * filesystem is idle.
587 */ 549 */
588STATIC void 550STATIC void
589xfs_sync_worker( 551xfs_sync_worker(
@@ -597,7 +559,8 @@ xfs_sync_worker(
597 xfs_reclaim_inodes(mp, 0); 559 xfs_reclaim_inodes(mp, 0);
598 /* dgc: errors ignored here */ 560 /* dgc: errors ignored here */
599 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 561 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
600 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 562 if (xfs_log_need_covered(mp))
563 error = xfs_commit_dummy_trans(mp, 0);
601 } 564 }
602 mp->m_sync_seq++; 565 mp->m_sync_seq++;
603 wake_up(&mp->m_wait_single_sync_task); 566 wake_up(&mp->m_wait_single_sync_task);
@@ -660,7 +623,7 @@ xfs_syncd_init(
660 mp->m_sync_work.w_syncer = xfs_sync_worker; 623 mp->m_sync_work.w_syncer = xfs_sync_worker;
661 mp->m_sync_work.w_mount = mp; 624 mp->m_sync_work.w_mount = mp;
662 mp->m_sync_work.w_completion = NULL; 625 mp->m_sync_work.w_completion = NULL;
663 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 626 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
664 if (IS_ERR(mp->m_sync_task)) 627 if (IS_ERR(mp->m_sync_task))
665 return -PTR_ERR(mp->m_sync_task); 628 return -PTR_ERR(mp->m_sync_task);
666 return 0; 629 return 0;
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..207fa77f63ae 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
41#include "xfs_alloc.h" 41#include "xfs_alloc.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_attr.h" 43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 44#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 45#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
50#include "xfs_aops.h" 49#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h"
53 55
54/* 56/*
55 * We include this last to have the helpers above available for the trace 57 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35struct xlog_recover;
36struct xlog_recover_item;
37struct xfs_buf_log_format;
38struct xfs_inode_log_format;
35 39
36DECLARE_EVENT_CLASS(xfs_attr_list_class, 40DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx), 41 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
562 __field(dev_t, dev) 566 __field(dev_t, dev)
563 __field(xfs_ino_t, ino) 567 __field(xfs_ino_t, ino)
564 __field(int, count) 568 __field(int, count)
569 __field(int, pincount)
565 __field(unsigned long, caller_ip) 570 __field(unsigned long, caller_ip)
566 ), 571 ),
567 TP_fast_assign( 572 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev; 573 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino; 574 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count); 575 __entry->count = atomic_read(&VFS_I(ip)->i_count);
576 __entry->pincount = atomic_read(&ip->i_pincount);
571 __entry->caller_ip = caller_ip; 577 __entry->caller_ip = caller_ip;
572 ), 578 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", 579 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev), 580 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino, 581 __entry->ino,
576 __entry->count, 582 __entry->count,
583 __entry->pincount,
577 (char *)__entry->caller_ip) 584 (char *)__entry->caller_ip)
578) 585)
579 586
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
583 TP_ARGS(ip, caller_ip)) 590 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold); 591DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele); 592DEFINE_INODE_EVENT(xfs_irele);
593DEFINE_INODE_EVENT(xfs_inode_pin);
594DEFINE_INODE_EVENT(xfs_inode_unpin);
595DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
596
586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 597/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
587DEFINE_INODE_EVENT(xfs_inode); 598DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \ 599#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \ 653 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp)) 654 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust); 655DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); 656DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); 657DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); 658DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found); 667DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want); 668DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); 669DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done); 670DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit); 671DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss); 672DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1051,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1051 1059
1052); 1060);
1053 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1054TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1055 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1056 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1057 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1058 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1059 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1060 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1061 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1062 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1063 __field(int, slot) 1077 __field(int, sync)
1064 ), 1078 ),
1065 TP_fast_assign( 1079 TP_fast_assign(
1066 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1067 __entry->agno = agno; 1083 __entry->agno = agno;
1068 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1069 __entry->len = len; 1085 __entry->len = len;
1070 __entry->slot = slot; 1086 __entry->sync = sync;
1071 ), 1087 ),
1072 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1073 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1074 __entry->agno, 1092 __entry->agno,
1075 __entry->agbno, 1093 __entry->agbno,
1076 __entry->len, 1094 __entry->len,
1077 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1078 1096
1079); 1097);
1080 1098
1081#define XFS_BUSY_STATES \
1082 { 0, "found" }, \
1083 { 1, "missing" }
1084
1085TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1086 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1087 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1088 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1089 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1090 __field(dev_t, dev) 1104 __field(dev_t, dev)
1091 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1092 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1093 __field(int, found) 1107 __field(xfs_extlen_t, len)
1094 ), 1108 ),
1095 TP_fast_assign( 1109 TP_fast_assign(
1096 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1097 __entry->agno = agno; 1111 __entry->agno = agno;
1098 __entry->slot = slot; 1112 __entry->agbno = agbno;
1099 __entry->found = found; 1113 __entry->len = len;
1100 ), 1114 ),
1101 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1102 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1103 __entry->agno, 1117 __entry->agno,
1104 __entry->slot, 1118 __entry->agbno,
1105 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1106); 1120);
1107 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1108TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1110 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1111 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1112 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1113 __field(dev_t, dev) 1131 __field(dev_t, dev)
1114 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1115 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1116 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1117 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1118 ), 1136 ),
1119 TP_fast_assign( 1137 TP_fast_assign(
1120 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1121 __entry->agno = agno; 1139 __entry->agno = agno;
1122 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1123 __entry->len = len; 1141 __entry->len = len;
1124 __entry->lsn = lsn; 1142 __entry->found = found;
1125 ), 1143 ),
1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1127 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1128 __entry->agno, 1146 __entry->agno,
1129 __entry->agbno, 1147 __entry->agbno,
1130 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1131 __entry->lsn) 1168 __entry->lsn)
1132); 1169);
1133 1170
@@ -1495,6 +1532,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1532DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1533DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497 1534
1535DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1536 TP_PROTO(struct log *log, struct xlog_recover *trans,
1537 struct xlog_recover_item *item, int pass),
1538 TP_ARGS(log, trans, item, pass),
1539 TP_STRUCT__entry(
1540 __field(dev_t, dev)
1541 __field(unsigned long, item)
1542 __field(xlog_tid_t, tid)
1543 __field(int, type)
1544 __field(int, pass)
1545 __field(int, count)
1546 __field(int, total)
1547 ),
1548 TP_fast_assign(
1549 __entry->dev = log->l_mp->m_super->s_dev;
1550 __entry->item = (unsigned long)item;
1551 __entry->tid = trans->r_log_tid;
1552 __entry->type = ITEM_TYPE(item);
1553 __entry->pass = pass;
1554 __entry->count = item->ri_cnt;
1555 __entry->total = item->ri_total;
1556 ),
1557 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
1558 "item region count/total %d/%d",
1559 MAJOR(__entry->dev), MINOR(__entry->dev),
1560 __entry->tid,
1561 __entry->pass,
1562 (void *)__entry->item,
1563 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
1564 __entry->count,
1565 __entry->total)
1566)
1567
1568#define DEFINE_LOG_RECOVER_ITEM(name) \
1569DEFINE_EVENT(xfs_log_recover_item_class, name, \
1570 TP_PROTO(struct log *log, struct xlog_recover *trans, \
1571 struct xlog_recover_item *item, int pass), \
1572 TP_ARGS(log, trans, item, pass))
1573
1574DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
1575DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
1576DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
1577DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
1578DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
1579
1580DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
1581 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
1582 TP_ARGS(log, buf_f),
1583 TP_STRUCT__entry(
1584 __field(dev_t, dev)
1585 __field(__int64_t, blkno)
1586 __field(unsigned short, len)
1587 __field(unsigned short, flags)
1588 __field(unsigned short, size)
1589 __field(unsigned int, map_size)
1590 ),
1591 TP_fast_assign(
1592 __entry->dev = log->l_mp->m_super->s_dev;
1593 __entry->blkno = buf_f->blf_blkno;
1594 __entry->len = buf_f->blf_len;
1595 __entry->flags = buf_f->blf_flags;
1596 __entry->size = buf_f->blf_size;
1597 __entry->map_size = buf_f->blf_map_size;
1598 ),
1599 TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
1600 "map_size %d",
1601 MAJOR(__entry->dev), MINOR(__entry->dev),
1602 __entry->blkno,
1603 __entry->len,
1604 __entry->flags,
1605 __entry->size,
1606 __entry->map_size)
1607)
1608
1609#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
1610DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
1611 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
1612 TP_ARGS(log, buf_f))
1613
1614DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
1615DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
1616DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
1617DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
1618DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
1619DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
1620DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
1621DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
1622
1623DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
1624 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
1625 TP_ARGS(log, in_f),
1626 TP_STRUCT__entry(
1627 __field(dev_t, dev)
1628 __field(xfs_ino_t, ino)
1629 __field(unsigned short, size)
1630 __field(int, fields)
1631 __field(unsigned short, asize)
1632 __field(unsigned short, dsize)
1633 __field(__int64_t, blkno)
1634 __field(int, len)
1635 __field(int, boffset)
1636 ),
1637 TP_fast_assign(
1638 __entry->dev = log->l_mp->m_super->s_dev;
1639 __entry->ino = in_f->ilf_ino;
1640 __entry->size = in_f->ilf_size;
1641 __entry->fields = in_f->ilf_fields;
1642 __entry->asize = in_f->ilf_asize;
1643 __entry->dsize = in_f->ilf_dsize;
1644 __entry->blkno = in_f->ilf_blkno;
1645 __entry->len = in_f->ilf_len;
1646 __entry->boffset = in_f->ilf_boffset;
1647 ),
1648 TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
1649 "dsize %d, blkno 0x%llx, len %d, boffset %d",
1650 MAJOR(__entry->dev), MINOR(__entry->dev),
1651 __entry->ino,
1652 __entry->size,
1653 __entry->fields,
1654 __entry->asize,
1655 __entry->dsize,
1656 __entry->blkno,
1657 __entry->len,
1658 __entry->boffset)
1659)
1660#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
1661DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
1662 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
1663 TP_ARGS(log, in_f))
1664
1665DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1666DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1667DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1668
1498#endif /* _TRACE_XFS_H */ 1669#endif /* _TRACE_XFS_H */
1499 1670
1500#undef TRACE_INCLUDE_PATH 1671#undef TRACE_INCLUDE_PATH
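
Each DEFINE_EVENT above expands to a trace_<name>() function with its class's prototype, so the recovery code can fire these points with a single call. A hedged kernel-style sketch of a call site; the surrounding function is invented for illustration, only the tracepoint name comes from the patch:

/* Assumes the definitions above are visible via xfs_trace.h. */
static void recover_one_item(struct log *log, struct xlog_recover *trans,
			     struct xlog_recover_item *item, int pass)
{
	/* generated by DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover) */
	trace_xfs_log_recover_item_recover(log, trans, item, pass);

	/* ... item replay would follow ... */
}
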
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 101 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 102 */
103 if (brandnewdquot) { 103 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 104 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 105 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 106 init_waitqueue_head(&dqp->q_pinwait);
107 107
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 119 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 120 * So, we need to reset others.
121 */ 121 */
122 dqp->q_nrefs = 0; 122 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 123 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 124 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 125 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 126 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 127 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 128 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 129 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 130 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 131 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 132 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 133 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 134 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 135 ASSERT(list_empty(&dqp->q_freelist));
136 136
137 trace_xfs_dqreuse(dqp); 137 trace_xfs_dqreuse(dqp);
138 } 138 }
@@ -158,7 +158,7 @@ void
158xfs_qm_dqdestroy( 158xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 159 xfs_dquot_t *dqp)
160{ 160{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 161 ASSERT(list_empty(&dqp->q_freelist));
162 162
163 mutex_destroy(&dqp->q_qlock); 163 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 164 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 252 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 253 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 254 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 255 mp->m_quotainfo->qi_btimelimit);
256 } else { 256 } else {
257 d->d_bwarns = 0; 257 d->d_bwarns = 0;
258 } 258 }
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 275 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 276 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 277 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 278 mp->m_quotainfo->qi_itimelimit);
279 } else { 279 } else {
280 d->d_iwarns = 0; 280 d->d_iwarns = 0;
281 } 281 }
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 298 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 299 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 300 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 301 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 302 } else {
303 d->d_rtbwarns = 0; 303 d->d_rtbwarns = 0;
304 } 304 }
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 325 uint type,
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327{ 327{
328 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 329 xfs_dqblk_t *d;
329 int curid, i; 330 int curid, i;
330 331
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
337 /* 338 /*
338 * ID of the first dquot in the block - id's are zero based. 339 * ID of the first dquot in the block - id's are zero based.
339 */ 340 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 341 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 342 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 343 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 351}
351 352
352 353
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 420 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 421 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 422 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 423 mp->m_quotainfo->qi_dqchunklen,
423 0); 424 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 425 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 426 goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
500 */ 501 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 503 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 506 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 507 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
529 /* 531 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 532 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 533 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 535 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 536 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 537 /*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 561 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 562 * (in which case we already have the buf).
561 */ 563 */
562 if (! newdquot) { 564 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 565 trace_xfs_dqtobp_read(dqp);
564 566
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 568 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 569 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 570 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 571 if (error || !bp)
572 return XFS_ERROR(error); 572 return XFS_ERROR(error);
573 } 573 }
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
 	tp = NULL;
 	if (flags & XFS_QMOPT_DQALLOC) {
 		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-		if ((error = xfs_trans_reserve(tp,
-				       XFS_QM_DQALLOC_SPACE_RES(mp),
-				       XFS_WRITE_LOG_RES(mp) +
-				       BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
-				       128,
-				       0,
-				       XFS_TRANS_PERM_LOG_RES,
-				       XFS_WRITE_LOG_COUNT))) {
+		error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+				XFS_WRITE_LOG_RES(mp) +
+				BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
+				128,
+				0,
+				XFS_TRANS_PERM_LOG_RES,
+				XFS_WRITE_LOG_COUNT);
+		if (error) {
 			cancelflags = 0;
 			goto error0;
 		}
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
 {
 	xfs_dquot_t	*dqp;
 	uint		flist_locked;
-	xfs_dquot_t	*d;
 
 	ASSERT(mutex_is_locked(&qh->qh_lock));
 
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
 	/*
 	 * Traverse the hashchain looking for a match
 	 */
-	for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
+	list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
 		/*
 		 * We already have the hashlock. We don't need the
 		 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
 			/*
 			 * All in core dquots must be on the dqlist of mp
 			 */
-			ASSERT(dqp->MPL_PREVP != NULL);
+			ASSERT(!list_empty(&dqp->q_mplist));
 
 			xfs_dqlock(dqp);
 			if (dqp->q_nrefs == 0) {
-				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
-				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+				ASSERT(!list_empty(&dqp->q_freelist));
+				if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
 					trace_xfs_dqlookup_want(dqp);
 
 					/*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
 					 */
 					dqp->dq_flags |= XFS_DQ_WANT;
 					xfs_dqunlock(dqp);
-					xfs_qm_freelist_lock(xfs_Gqm);
+					mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 					xfs_dqlock(dqp);
 					dqp->dq_flags &= ~(XFS_DQ_WANT);
 				}
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
 
 			if (flist_locked) {
 				if (dqp->q_nrefs != 0) {
-					xfs_qm_freelist_unlock(xfs_Gqm);
+					mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 					flist_locked = B_FALSE;
 				} else {
-					/*
-					 * take it off the freelist
-					 */
+					/* take it off the freelist */
 					trace_xfs_dqlookup_freelist(dqp);
-					XQM_FREELIST_REMOVE(dqp);
-					/* xfs_qm_freelist_print(&(xfs_Gqm->
-							qm_dqfreelist),
-							"after removal"); */
+					list_del_init(&dqp->q_freelist);
+					xfs_Gqm->qm_dqfrlist_cnt--;
 				}
 			}
 
-			/*
-			 * grab a reference
-			 */
 			XFS_DQHOLD(dqp);
 
 			if (flist_locked)
-				xfs_qm_freelist_unlock(xfs_Gqm);
+				mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 			/*
 			 * move the dquot to the front of the hashchain
 			 */
 			ASSERT(mutex_is_locked(&qh->qh_lock));
-			if (dqp->HL_PREVP != &qh->qh_next) {
-				trace_xfs_dqlookup_move(dqp);
-				if ((d = dqp->HL_NEXT))
-					d->HL_PREVP = dqp->HL_PREVP;
-				*(dqp->HL_PREVP) = d;
-				d = qh->qh_next;
-				d->HL_PREVP = &dqp->HL_NEXT;
-				dqp->HL_NEXT = d;
-				dqp->HL_PREVP = &qh->qh_next;
-				qh->qh_next = dqp;
-			}
+			list_move(&dqp->q_hashlist, &qh->qh_list);
 			trace_xfs_dqlookup_done(dqp);
 			*O_dqpp = dqp;
-			ASSERT(mutex_is_locked(&qh->qh_lock));
-			return (0);
+			return 0;
 		}
 	}
 
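The nine lines of hand-rolled pointer surgery deleted above are exactly what list_move() does generically: unlink the node from wherever it sits and splice it in right after the head. A self-contained rendition of the idiom, simplified from include/linux/list.h:

    struct list_head {
            struct list_head *next, *prev;
    };

    static void list_del_entry(struct list_head *e)
    {
            e->prev->next = e->next;
            e->next->prev = e->prev;
    }

    static void list_add_head(struct list_head *e, struct list_head *head)
    {
            e->next = head->next;
            e->prev = head;
            head->next->prev = e;
            head->next = e;
    }

    /* list_move(): unlink 'e' from its current list, put it at the front. */
    static void list_move_front(struct list_head *e, struct list_head *head)
    {
            list_del_entry(e);
            list_add_head(e, head);
    }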
@@ -975,16 +956,17 @@ xfs_qm_dqget(
 	 */
 	if (ip) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (! XFS_IS_DQTYPE_ON(mp, type)) {
-			/* inode stays locked on return */
-			xfs_qm_dqdestroy(dqp);
-			return XFS_ERROR(ESRCH);
-		}
+
 		/*
 		 * A dquot could be attached to this inode by now, since
 		 * we had dropped the ilock.
 		 */
 		if (type == XFS_DQ_USER) {
+			if (!XFS_IS_UQUOTA_ON(mp)) {
+				/* inode stays locked on return */
+				xfs_qm_dqdestroy(dqp);
+				return XFS_ERROR(ESRCH);
+			}
 			if (ip->i_udquot) {
 				xfs_qm_dqdestroy(dqp);
 				dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
 				goto dqret;
 			}
 		} else {
+			if (!XFS_IS_OQUOTA_ON(mp)) {
+				/* inode stays locked on return */
+				xfs_qm_dqdestroy(dqp);
+				return XFS_ERROR(ESRCH);
+			}
 			if (ip->i_gdquot) {
 				xfs_qm_dqdestroy(dqp);
 				dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
 	 */
 	ASSERT(mutex_is_locked(&h->qh_lock));
 	dqp->q_hash = h;
-	XQM_HASHLIST_INSERT(h, dqp);
+	list_add(&dqp->q_hashlist, &h->qh_list);
+	h->qh_version++;
 
 	/*
 	 * Attach this dquot to this filesystem's list of all dquots,
 	 * kept inside the mount structure in m_quotainfo field
 	 */
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
 
 	/*
 	 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
 
-	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
-
-	xfs_qm_mplist_unlock(mp);
+	list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
+	mp->m_quotainfo->qi_dquots++;
+	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
 	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
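The qh_version bump added next to the insert matters because lookups may drop the chain lock while they sleep: a caller samples the version, blocks, retakes the lock, and restarts the walk if the number moved. A hedged userspace sketch of that revalidation check (types and names are illustrative):

    #include <pthread.h>

    struct chain_sketch {
            pthread_mutex_t lock;
            unsigned int    version;        /* bumped on every insert/remove */
    };

    /* Returns nonzero if the chain mutated since 'seen' was sampled. */
    static int chain_changed(struct chain_sketch *qh, unsigned int seen)
    {
            int changed;

            pthread_mutex_lock(&qh->lock);
            changed = (qh->version != seen);
            pthread_mutex_unlock(&qh->lock);
            return changed;
    }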
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
 	 * drop the dqlock and acquire the freelist and dqlock
 	 * in the right order; but try to get it out-of-order first
 	 */
-	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+	if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
 		trace_xfs_dqput_wait(dqp);
 		xfs_dqunlock(dqp);
-		xfs_qm_freelist_lock(xfs_Gqm);
+		mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 		xfs_dqlock(dqp);
 	}
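This trylock-then-backoff sequence is the standard answer to acquiring a lock that sorts *before* one already held: try it out of order, and only on failure drop, retake in order, and revalidate. In portable form (pthread names, purely illustrative):

    #include <pthread.h>

    /* Illustrative: acquire 'outer' while already holding 'inner', when the
     * documented order is outer -> inner. */
    static void lock_outer_with_inner_held(pthread_mutex_t *outer,
                                           pthread_mutex_t *inner)
    {
            if (pthread_mutex_trylock(outer) != 0) {
                    pthread_mutex_unlock(inner);    /* give up inner ... */
                    pthread_mutex_lock(outer);      /* ... take outer in order */
                    pthread_mutex_lock(inner);      /* ... and reacquire inner */
            }
            /* state guarded by 'inner' may have changed while it was
             * dropped and must be revalidated by the caller */
    }

Any state guarded by the dropped lock has to be rechecked afterwards, which is why the surrounding code only acts on the reference count after the dance completes.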
1095 1083
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1088 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1089 trace_xfs_dqput_free(dqp);
1102 1090
1103 /* 1091 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1092 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1093
1108 /* 1094 /*
1109 * If we just added a udquot to the freelist, then 1095 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1104 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1105 dqp->q_gdquot = NULL;
1120 } 1106 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1107 }
1126 xfs_dqunlock(dqp); 1108 xfs_dqunlock(dqp);
1127 1109
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
 			break;
 		dqp = gdqp;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 }
 
 /*
@@ -1386,10 +1368,10 @@ int
 xfs_qm_dqpurge(
 	xfs_dquot_t	*dqp)
 {
-	xfs_dqhash_t	*thishash;
+	xfs_dqhash_t	*qh = dqp->q_hash;
 	xfs_mount_t	*mp = dqp->q_mount;
 
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
 	ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
 	xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
 		return (1);
 	}
 
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(!list_empty(&dqp->q_freelist));
 
 	/*
 	 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
 	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
 	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
 
-	thishash = dqp->q_hash;
-	XQM_HASHLIST_REMOVE(thishash, dqp);
-	XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
+	list_del_init(&dqp->q_hashlist);
+	qh->qh_version++;
+	list_del_init(&dqp->q_mplist);
+	mp->m_quotainfo->qi_dqreclaims++;
+	mp->m_quotainfo->qi_dquots--;
 	/*
 	 * XXX Move this to the front of the freelist, if we can get the
 	 * freelist lock.
 	 */
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(!list_empty(&dqp->q_freelist));
 
 	dqp->q_mount = NULL;
 	dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
 	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
-	mutex_unlock(&thishash->qh_lock);
+	mutex_unlock(&qh->qh_lock);
 	return (0);
 }
 
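Note that the purge path removes the dquot with list_del_init() rather than plain list_del(): the _init variant re-points the node at itself, so the list_empty() ASSERTs used throughout this patch remain meaningful on detached entries. Simplified, using the list_head sketch above:

    /* list_del_init(): unlink 'e' and leave it self-linked, so a later
     * "is this on a list?" test via list_empty(&e) works. */
    static void list_del_init_sketch(struct list_head *e)
    {
            e->prev->next = e->next;
            e->next->prev = e->prev;
            e->next = e;            /* re-point at self ... */
            e->prev = e;            /* ... so "empty" tests work */
    }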
@@ -1517,6 +1501,7 @@ void
 xfs_qm_dqflock_pushbuf_wait(
 	xfs_dquot_t	*dqp)
 {
+	xfs_mount_t	*mp = dqp->q_mount;
 	xfs_buf_t	*bp;
 
 	/*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
 	 * out immediately. We'll be able to acquire
 	 * the flush lock when the I/O completes.
 	 */
-	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
-		    XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
+	bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
+			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	if (!bp)
 		goto out_lock;
 
 	if (XFS_BUF_ISDELAYWRITE(bp)) {
 		if (XFS_BUF_ISPINNED(bp))
-			xfs_log_force(dqp->q_mount, 0);
+			xfs_log_force(mp, 0);
 		xfs_buf_delwri_promote(bp);
 		wake_up_process(bp->b_target->bt_task);
 	}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
  * The hash chain headers (hash buckets)
  */
 typedef struct xfs_dqhash {
-	struct xfs_dquot *qh_next;
+	struct list_head  qh_list;
 	struct mutex	  qh_lock;
 	uint		  qh_version;	/* ever increasing version */
 	uint		  qh_nelems;	/* number of dquots on the list */
 } xfs_dqhash_t;
 
-typedef struct xfs_dqlink {
-	struct xfs_dquot  *ql_next;	/* forward link */
-	struct xfs_dquot **ql_prevp;	/* pointer to prev ql_next */
-} xfs_dqlink_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
 /*
- * This is the marker which is designed to occupy the first few
- * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
- * must come first.
- * This serves as the marker ("sentinel") when we have to restart list
- * iterations because of locking considerations.
- */
-typedef struct xfs_dqmarker {
-	struct xfs_dquot*dqm_flnext;	/* link to freelist: must be first */
-	struct xfs_dquot*dqm_flprev;
-	xfs_dqlink_t	dqm_mplist;	/* link to mount's list of dquots */
-	xfs_dqlink_t	dqm_hashlist;	/* link to the hash chain */
-	uint		dqm_flags;	/* various flags (XFS_DQ_*) */
-} xfs_dqmarker_t;
-
-/*
  * The incore dquot structure
  */
 typedef struct xfs_dquot {
-	xfs_dqmarker_t	 q_lists;	/* list ptrs, q_flags (marker) */
+	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
+	struct list_head q_freelist;	/* global free list of dquots */
+	struct list_head q_mplist;	/* mount's list of dquots */
+	struct list_head q_hashlist;	/* global hash list of dquots */
 	xfs_dqhash_t	*q_hash;	/* the hashchain header */
 	struct xfs_mount*q_mount;	/* filesystem this relates to */
 	struct xfs_trans*q_transp;	/* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
 } xfs_dquot_t;
 
-
-#define dq_flnext	q_lists.dqm_flnext
-#define dq_flprev	q_lists.dqm_flprev
-#define dq_mplist	q_lists.dqm_mplist
-#define dq_hashlist	q_lists.dqm_hashlist
-#define dq_flags	q_lists.dqm_flags
-
 /*
  * Lock hierarchy for q_qlock:
  *  XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 }
 
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
 #define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
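With the marker struct gone, each dquot simply embeds three independent list_head nodes, and generic list code recovers the containing dquot by subtracting the member's offset. That offset subtraction is the whole trick behind list_entry()/list_for_each_entry(); a self-contained sketch:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    /* Illustrative container with embedded list nodes. */
    struct dquot_sketch {
            unsigned int     flags;
            struct list_head freelist;      /* node on a global free list */
            struct list_head hashlist;      /* node on one hash chain */
    };

    #define container_of_sketch(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct dquot_sketch *from_hash_node(struct list_head *node)
    {
            return container_of_sketch(node, struct dquot_sketch, hashlist);
    }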
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
 /* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_unpin(
-	xfs_dq_logitem_t *logitem,
-	int		  stale)
+	xfs_dq_logitem_t *logitem)
 {
 	xfs_dquot_t *dqp = logitem->qli_dquot;
 
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
 	xfs_dq_logitem_t *logitem,
 	xfs_trans_t	 *tp)
 {
-	xfs_qm_dquot_logitem_unpin(logitem, 0);
+	xfs_qm_dquot_logitem_unpin(logitem);
 }
 
 /*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-		    XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
+		    mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	xfs_dqunlock(dqp);
 	if (!bp)
 		return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_dquot_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_dquot_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
 					xfs_qm_dquot_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
 	xfs_dq_logitem_t  *lp;
 	lp = &dqp->q_logitem;
 
-	lp->qli_item.li_type = XFS_LI_DQUOT;
-	lp->qli_item.li_ops = &xfs_dquot_item_ops;
-	lp->qli_item.li_mountp = dqp->q_mount;
+	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+					&xfs_dquot_item_ops);
 	lp->qli_dquot = dqp;
 	lp->qli_format.qlf_type = XFS_LI_DQUOT;
 	lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
  */
 /*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
+xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
 {
 	return;
 }
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_qoff_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t* ,int))
-					xfs_qm_qoff_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
 					xfs_qm_qoff_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_qoff_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_qoff_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
 					xfs_qm_qoff_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
 
 	qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
 
-	qf->qql_item.li_type = XFS_LI_QUOTAOFF;
-	if (start)
-		qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
-	else
-		qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
+	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
 	qf->qql_item.li_mountp = mp;
 	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
 	qf->qql_format.qf_flags = flags;
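Both conversions in this file trade three open-coded field assignments for one call to xfs_log_item_init(mp, item, type, ops). The helper's signature is inferred from the call sites above; a sketch of what such a helper plausibly does, with hypothetical stand-in types:

    /* Hypothetical shapes, for illustration only. */
    struct ops_sketch;
    struct item_sketch {
            int                      li_type;
            void                    *li_mountp;
            const struct ops_sketch *li_ops;
    };

    static void log_item_init_sketch(void *mp, struct item_sketch *item,
                                     int type, const struct ops_sketch *ops)
    {
            item->li_mountp = mp;   /* one place to grow common init later */
            item->li_type   = type;
            item->li_ops    = ops;
    }

Centralizing the init means later patches can add common log-item state without touching every item type's constructor.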
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..38e764146644 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
-STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
-STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
 STATIC int	xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
-#define XQM_LIST_PRINT(l, NXT, title) \
-{ \
-	xfs_dquot_t	*dqp; int i = 0; \
-	cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
-	for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
-		cmn_err(CE_DEBUG, "   %d. \"%d (%s)\" " \
-				  "bcnt = %d, icnt = %d, refs = %d", \
-			++i, (int) be32_to_cpu(dqp->q_core.d_id), \
-			DQFLAGTO_TYPESTR(dqp), \
-			(int) be64_to_cpu(dqp->q_core.d_bcount), \
-			(int) be64_to_cpu(dqp->q_core.d_icount), \
-			(int) dqp->q_nrefs);  } \
-}
+static void
+xfs_qm_dquot_list_print(
+	struct xfs_mount *mp)
+{
+	xfs_dquot_t	*dqp;
+	int		i = 0;
+
+	list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
+		cmn_err(CE_DEBUG, "   %d. \"%d (%s)\" "
+				  "bcnt = %lld, icnt = %lld, refs = %d",
+			i++, be32_to_cpu(dqp->q_core.d_id),
+			DQFLAGTO_TYPESTR(dqp),
+			(long long)be64_to_cpu(dqp->q_core.d_bcount),
+			(long long)be64_to_cpu(dqp->q_core.d_icount),
+			dqp->q_nrefs);
+	}
+}
 #else
-#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
+static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
 #endif
 
 /*
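The debug printer is also a compact demonstration of list_for_each_entry(): the iterator takes the list head (qi_dqlist) and the name of the embedded member (q_mplist), walking raw nodes and performing the container_of step at every hop. Roughly, reusing container_of_sketch from the header note above:

    /* Simplified expansion of list_for_each_entry(pos, head, member): */
    #define list_for_each_entry_sketch(pos, head, member, type)             \
            for (pos = container_of_sketch((head)->next, type, member);     \
                 &pos->member != (head);                                    \
                 pos = container_of_sketch(pos->member.next, type, member))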
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
 	/*
 	 * Freelist of all dquots of all file systems
 	 */
-	xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
+	INIT_LIST_HEAD(&xqm->qm_dqfrlist);
+	xqm->qm_dqfrlist_cnt = 0;
+	mutex_init(&xqm->qm_dqfrlist_lock);
 
 	/*
 	 * dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
 xfs_qm_destroy(
 	struct xfs_qm	*xqm)
 {
+	struct xfs_dquot *dqp, *n;
 	int		hsize, i;
 
 	ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
 	xqm->qm_usr_dqhtable = NULL;
 	xqm->qm_grp_dqhtable = NULL;
 	xqm->qm_dqhashmask = 0;
-	xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
+
+	/* frlist cleanup */
+	mutex_lock(&xqm->qm_dqfrlist_lock);
+	list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
+		xfs_dqlock(dqp);
+#ifdef QUOTADEBUG
+		cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
+#endif
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		xfs_dqunlock(dqp);
+		xfs_qm_dqdestroy(dqp);
+	}
+	mutex_unlock(&xqm->qm_dqfrlist_lock);
+	mutex_destroy(&xqm->qm_dqfrlist_lock);
 #ifdef DEBUG
 	mutex_destroy(&qcheck_lock);
 #endif
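The destroy path must use the _safe iterator because the loop body frees the current entry; the variant stashes the next node before the body runs, so freeing `pos` cannot corrupt the walk. Simplified, in the same style as the earlier sketches:

    /* Why the "_safe" variant: the body may free 'pos', so 'n' caches the
     * successor before the body executes. */
    #define list_for_each_entry_safe_sketch(pos, n, head, member, type)        \
            for (pos = container_of_sketch((head)->next, type, member),        \
                 n = container_of_sketch(pos->member.next, type, member);      \
                 &pos->member != (head);                                       \
                 pos = n, n = container_of_sketch(n->member.next, type, member))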
@@ -256,7 +274,7 @@ STATIC void
 xfs_qm_rele_quotafs_ref(
 	struct xfs_mount *mp)
 {
-	xfs_dquot_t	*dqp, *nextdqp;
+	xfs_dquot_t	*dqp, *n;
 
 	ASSERT(xfs_Gqm);
 	ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
 	/*
 	 * Go thru the freelist and destroy all inactive dquots.
 	 */
-	xfs_qm_freelist_lock(xfs_Gqm);
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
+	list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
 		xfs_dqlock(dqp);
-		nextdqp = dqp->dq_flnext;
 		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
 			ASSERT(dqp->q_mount == NULL);
 			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
+			ASSERT(list_empty(&dqp->q_hashlist));
+			ASSERT(list_empty(&dqp->q_mplist));
+			list_del_init(&dqp->q_freelist);
+			xfs_Gqm->qm_dqfrlist_cnt--;
 			xfs_dqunlock(dqp);
 			xfs_qm_dqdestroy(dqp);
 		} else {
 			xfs_dqunlock(dqp);
 		}
-		dqp = nextdqp;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 
 	/*
 	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
 	struct xfs_mount	*mp)
 {
 	if (mp->m_quotainfo) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
 		xfs_qm_destroy_quotainfo(mp);
 	}
 }
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
  */
 STATIC int
 xfs_qm_dqflush_all(
-	xfs_mount_t		*mp,
+	struct xfs_mount	*mp,
 	int			sync_mode)
 {
-	int		recl;
-	xfs_dquot_t	*dqp;
-	int		niters;
-	int		error;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	int			recl;
+	struct xfs_dquot	*dqp;
+	int			niters;
+	int			error;
 
-	if (mp->m_quotainfo == NULL)
+	if (!q)
 		return 0;
 	niters = 0;
 again:
-	xfs_qm_mplist_lock(mp);
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
+	mutex_lock(&q->qi_dqlist_lock);
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		xfs_dqlock(dqp);
 		if (! XFS_DQ_IS_DIRTY(dqp)) {
 			xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
 		}
 
 		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
+		recl = q->qi_dqreclaims;
 		if (!xfs_dqflock_nowait(dqp)) {
 			/*
 			 * If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write.
 		 */
-		xfs_qm_mplist_unlock(mp);
+		mutex_unlock(&q->qi_dqlist_lock);
 		error = xfs_qm_dqflush(dqp, sync_mode);
 		xfs_dqunlock(dqp);
 		if (error)
 			return error;
 
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
-			xfs_qm_mplist_unlock(mp);
+		mutex_lock(&q->qi_dqlist_lock);
+		if (recl != q->qi_dqreclaims) {
+			mutex_unlock(&q->qi_dqlist_lock);
 			/* XXX restart limit */
 			goto again;
 		}
 	}
 
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	/* return ! busy */
 	return 0;
 }
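qi_dqreclaims is doing duty here as a generation counter: sample it, drop the list lock for the disk write, retake the lock, and restart the whole walk if any dquot was reclaimed in between (the "XXX a sentinel would be better" comment concedes this can restart often). The pattern in isolation, as a hedged userspace sketch reusing the list_head type from earlier:

    #include <pthread.h>

    struct gen_list {
            pthread_mutex_t  lock;
            unsigned long    gen;           /* bumped on every removal */
            struct list_head head;
    };

    /* Walk the list doing blocking work per entry; restart whenever the
     * generation moved while the lock was dropped. Illustrative only. */
    static void walk_restarting(struct gen_list *l,
                                void (*work)(struct list_head *))
    {
            struct list_head *pos;
            unsigned long gen;

    again:
            pthread_mutex_lock(&l->lock);
            for (pos = l->head.next; pos != &l->head; pos = pos->next) {
                    gen = l->gen;
                    pthread_mutex_unlock(&l->lock);
                    work(pos);                      /* may sleep */
                    pthread_mutex_lock(&l->lock);
                    if (gen != l->gen) {            /* 'pos' may be stale */
                            pthread_mutex_unlock(&l->lock);
                            goto again;
                    }
            }
            pthread_mutex_unlock(&l->lock);
    }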
@@ -509,15 +526,15 @@ again:
  */
 STATIC void
 xfs_qm_detach_gdquots(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_dquot_t	*dqp, *gdqp;
-	int		nrecl;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_dquot	*dqp, *gdqp;
+	int			nrecl;
 
  again:
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		xfs_dqlock(dqp);
 		if ((gdqp = dqp->q_gdquot)) {
 			xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
 			 * Can't hold the mplist lock across a dqput.
 			 * XXXmust convert to marker based iterations here.
 			 */
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
+			nrecl = q->qi_dqreclaims;
+			mutex_unlock(&q->qi_dqlist_lock);
 			xfs_qm_dqput(gdqp);
 
-			xfs_qm_mplist_lock(mp);
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp))
+			mutex_lock(&q->qi_dqlist_lock);
+			if (nrecl != q->qi_dqreclaims)
 				goto again;
 		}
-		dqp = dqp->MPL_NEXT;
 	}
 }
 
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
  */
 STATIC int
 xfs_qm_dqpurge_int(
-	xfs_mount_t	*mp,
-	uint		flags)	/* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */
+	struct xfs_mount	*mp,
+	uint			flags)
 {
-	xfs_dquot_t	*dqp;
-	uint		dqtype;
-	int		nrecl;
-	xfs_dquot_t	*nextdqp;
-	int		nmisses;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_dquot	*dqp, *n;
+	uint			dqtype;
+	int			nrecl;
+	int			nmisses;
 
-	if (mp->m_quotainfo == NULL)
+	if (!q)
 		return 0;
 
 	dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
 	dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
 	dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
 
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&q->qi_dqlist_lock);
 
 	/*
 	 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
 
  again:
 	nmisses = 0;
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
 	/*
 	 * Try to get rid of all of the unwanted dquots. The idea is to
 	 * get them off mplist and hashlist, but leave them on freelist.
 	 */
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
+	list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
 		/*
 		 * It's OK to look at the type without taking dqlock here.
 		 * We're holding the mplist lock here, and that's needed for
 		 * a dqreclaim.
 		 */
-		if ((dqp->dq_flags & dqtype) == 0) {
-			dqp = dqp->MPL_NEXT;
+		if ((dqp->dq_flags & dqtype) == 0)
 			continue;
-		}
 
 		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
+			nrecl = q->qi_dqreclaims;
+			mutex_unlock(&q->qi_dqlist_lock);
 			mutex_lock(&dqp->q_hash->qh_lock);
-			xfs_qm_mplist_lock(mp);
+			mutex_lock(&q->qi_dqlist_lock);
 
 			/*
 			 * XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
 			 * No one can be adding dquots to the mplist at
 			 * this point, but somebody might be taking things off.
 			 */
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
+			if (nrecl != q->qi_dqreclaims) {
 				mutex_unlock(&dqp->q_hash->qh_lock);
 				goto again;
 			}
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
 		 * Take the dquot off the mplist and hashlist. It may remain on
 		 * freelist in INACTIVE state.
 		 */
-		nextdqp = dqp->MPL_NEXT;
 		nmisses += xfs_qm_dqpurge(dqp);
-		dqp = nextdqp;
 	}
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	return nmisses;
 }
 
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
 
 int
 xfs_qm_sync(
-	xfs_mount_t	*mp,
+	struct xfs_mount	*mp,
 	int		flags)
 {
-	int		recl, restarts;
-	xfs_dquot_t	*dqp;
-	int		error;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	int			recl, restarts;
+	struct xfs_dquot	*dqp;
+	int			error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
 	restarts = 0;
 
  again:
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&q->qi_dqlist_lock);
 	/*
 	 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
 	 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
 	 * when we have the mplist lock, we know that dquots will be consistent
 	 * as long as we have it locked.
 	 */
-	if (! XFS_IS_QUOTA_ON(mp)) {
-		xfs_qm_mplist_unlock(mp);
+	if (!XFS_IS_QUOTA_ON(mp)) {
+		mutex_unlock(&q->qi_dqlist_lock);
 		return 0;
 	}
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		/*
 		 * If this is vfs_sync calling, then skip the dquots that
 		 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
 		}
 
 		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
+		recl = q->qi_dqreclaims;
 		if (!xfs_dqflock_nowait(dqp)) {
 			if (flags & SYNC_TRYLOCK) {
 				xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write
 		 */
-		xfs_qm_mplist_unlock(mp);
+		mutex_unlock(&q->qi_dqlist_lock);
 		error = xfs_qm_dqflush(dqp, flags);
 		xfs_dqunlock(dqp);
 		if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
 		else if (error)
 			return error;
 
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+		mutex_lock(&q->qi_dqlist_lock);
+		if (recl != q->qi_dqreclaims) {
 			if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
 				break;
 
-			xfs_qm_mplist_unlock(mp);
+			mutex_unlock(&q->qi_dqlist_lock);
 			goto again;
 		}
 	}
 
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	return 0;
 }
 
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
 		return error;
 	}
 
-	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
-	lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
+	INIT_LIST_HEAD(&qinf->qi_dqlist);
+	mutex_init(&qinf->qi_dqlist_lock);
+	lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
 
 	qinf->qi_dqreclaims = 0;
 
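The lockdep_set_class() call survives the conversion because every mount's qi_dqlist_lock would otherwise share the generic mutex key; keying the quota mplist locks as their own class lets the validator model their ordering against the dquot and hash locks separately. The idiom in brief, as a hedged kernel-style fragment:

    #include <linux/mutex.h>
    #include <linux/lockdep.h>

    static struct lock_class_key xfs_quota_mplist_class;  /* one key per class */

    static void init_dqlist_lock(struct mutex *lock)
    {
            mutex_init(lock);
            /* re-key this mutex so lockdep tracks it as its own class */
            lockdep_set_class(lock, &xfs_quota_mplist_class);
    }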
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
 	 */
 	xfs_qm_rele_quotafs_ref(mp);
 
-	xfs_qm_list_destroy(&qi->qi_dqlist);
+	ASSERT(list_empty(&qi->qi_dqlist));
+	mutex_destroy(&qi->qi_dqlist_lock);
 
 	if (qi->qi_uquotaip) {
 		IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
 	int		n)
 {
 	mutex_init(&list->qh_lock);
-	list->qh_next = NULL;
+	INIT_LIST_HEAD(&list->qh_list);
 	list->qh_version = 0;
 	list->qh_nelems = 0;
 }
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
 	 */
 	spin_lock(&mp->m_sb_lock);
 	if (flags & XFS_QMOPT_SBVERSION) {
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		unsigned oldv = mp->m_sb.sb_versionnum;
-#endif
 		ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
 		ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
 				   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
 
 		/* qflags will get updated _after_ quotacheck */
 		mp->m_sb.sb_qflags = 0;
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		cmn_err(CE_NOTE,
-			"Old superblock version %x, converting to %x.",
-			oldv, mp->m_sb.sb_versionnum);
-#endif
 	}
 	if (flags & XFS_QMOPT_UQUOTA)
 		mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
 #ifdef DEBUG
 	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
 	do_div(j, sizeof(xfs_dqblk_t));
-	ASSERT(XFS_QM_DQPERBLK(mp) == j);
+	ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
 	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
-	for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
+	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
 		 * output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
 	while (blkcnt--) {
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
-			      (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
+			      mp->m_quotainfo->qi_dqchunklen, 0, &bp);
 		if (error)
 			break;
 
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
 		 * goto the next block.
 		 */
 		bno++;
-		firstid += XFS_QM_DQPERBLK(mp);
+		firstid += mp->m_quotainfo->qi_dqperchunk;
 	}
 	return error;
 }
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
 			continue;
 
 		firstid = (xfs_dqid_t) map[i].br_startoff *
-			  XFS_QM_DQPERBLK(mp);
+			  mp->m_quotainfo->qi_dqperchunk;
 		/*
 		 * Do a read-ahead on the next extent.
 		 */
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
 			while (rablkcnt--) {
 				xfs_baread(mp->m_ddev_targp,
 					XFS_FSB_TO_DADDR(mp, rablkno),
-					(int)XFS_QI_DQCHUNKLEN(mp));
+					mp->m_quotainfo->qi_dqchunklen);
 				rablkno++;
 			}
 		}
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
 
 	/*
 	 * Set default limits, adjust timers (since we changed usages)
+	 *
+	 * There are no timers for the default values set in the root dquot.
 	 */
-	if (! XFS_IS_SUSER_DQUOT(dqp)) {
+	if (dqp->q_core.d_id) {
 		xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
 		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
 	}
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
 	lastino = 0;
 	flags = 0;
 
-	ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
+	ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	/*
 	 * There should be no cached dquots. The (simplistic) quotacheck
 	 * algorithm doesn't like that.
 	 */
-	ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
+	ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
 
 	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
 
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
 	 * their counters to zero. We need a clean slate.
 	 * We don't log our changes till later.
 	 */
-	if ((uip = XFS_QI_UQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
+	uip = mp->m_quotainfo->qi_uquotaip;
+	if (uip) {
+		error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+		if (error)
 			goto error_return;
 		flags |= XFS_UQUOTA_CHKD;
 	}
 
-	if ((gip = XFS_QI_GQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
-					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA)))
+	gip = mp->m_quotainfo->qi_gquotaip;
+	if (gip) {
+		error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
+					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+		if (error)
 			goto error_return;
 		flags |= XFS_OQUOTA_CHKD;
 	}
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
 	 * at this point (because we intentionally didn't in dqget_noattach).
 	 */
 	if (error) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
 		goto error_return;
 	}
 
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
 	mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
 	mp->m_qflags |= flags;
 
-	XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
+	xfs_qm_dquot_list_print(mp);
 
  error_return:
 	if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
 		}
 	}
 
-	XFS_QI_UQIP(mp) = uip;
-	XFS_QI_GQIP(mp) = gip;
+	mp->m_quotainfo->qi_uquotaip = uip;
+	mp->m_quotainfo->qi_gquotaip = gip;
 
 	return 0;
 }
 
 
+
 /*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- * XXXsup merge this with qm_reclaim_one().
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
  */
-STATIC int
-xfs_qm_shake_freelist(
-	int howmany)
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
 {
-	int		nreclaimed;
-	xfs_dqhash_t	*hash;
-	xfs_dquot_t	*dqp, *nextdqp;
+	xfs_dquot_t	*dqpout;
+	xfs_dquot_t	*dqp;
 	int		restarts;
-	int		nflushes;
-
-	if (howmany <= 0)
-		return 0;
 
-	nreclaimed = 0;
 	restarts = 0;
-	nflushes = 0;
+	dqpout = NULL;
 
-#ifdef QUOTADEBUG
-	cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
-#endif
-	/* lock order is : hashchainlock, freelistlock, mplistlock */
- tryagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
+	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+startagain:
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
-	      nreclaimed < howmany); ) {
+	list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+		struct xfs_mount *mp = dqp->q_mount;
 		xfs_dqlock(dqp);
 
 		/*
 		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants.
+		 * want to reclaim a dquot that lookup wants. We release the
+		 * freelist lock and start over, so that lookup will grab
+		 * both the dquot and the freelistlock.
 		 */
 		if (dqp->dq_flags & XFS_DQ_WANT) {
+			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+
+			trace_xfs_dqreclaim_want(dqp);
+
 			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
+				return NULL;
 			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto tryagain;
+			goto startagain;
 		}
 
 		/*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
 		 * life easier.
 		 */
 		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
+			ASSERT(mp == NULL);
 			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
+			ASSERT(list_empty(&dqp->q_hashlist));
+			ASSERT(list_empty(&dqp->q_mplist));
+			list_del_init(&dqp->q_freelist);
+			xfs_Gqm->qm_dqfrlist_cnt--;
+			xfs_dqunlock(dqp);
+			dqpout = dqp;
 			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			nextdqp = dqp->dq_flnext;
-			goto off_freelist;
+			break;
 		}
 
-		ASSERT(dqp->MPL_PREVP);
+		ASSERT(dqp->q_hash);
+		ASSERT(!list_empty(&dqp->q_mplist));
+
 		/*
 		 * Try to grab the flush lock. If this dquot is in the process of
 		 * getting flushed to disk, we don't want to reclaim it.
 		 */
 		if (!xfs_dqflock_nowait(dqp)) {
 			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
 			continue;
 		}
 
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
 		if (XFS_DQ_IS_DIRTY(dqp)) {
 			int	error;
 
-			trace_xfs_dqshake_dirty(dqp);
+			trace_xfs_dqreclaim_dirty(dqp);
 
 			/*
 			 * We flush it delayed write, so don't bother
-			 * releasing the mplock.
+			 * releasing the freelist lock.
 			 */
 			error = xfs_qm_dqflush(dqp, 0);
 			if (error) {
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+				xfs_fs_cmn_err(CE_WARN, mp,
+			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
 			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			dqp = dqp->dq_flnext;
 			continue;
 		}
+
 		/*
 		 * We're trying to get the hashlock out of order. This races
 		 * with dqlookup; so, we giveup and goto the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
 		 * waiting for the freelist lock.
 		 */
 		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
-			continue;
+			restarts++;
+			goto dqfunlock;
 		}
+
 		/*
 		 * This races with dquot allocation code as well as dqflush_all
 		 * and reclaim code. So, if we failed to grab the mplist lock,
 		 * giveup everything and start over.
 		 */
-		hash = dqp->q_hash;
-		ASSERT(hash);
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			/* XXX put a sentinel so that we can come back here */
+		if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
+			restarts++;
+			mutex_unlock(&dqp->q_hash->qh_lock);
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
-			mutex_unlock(&hash->qh_lock);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
-			goto tryagain;
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+			if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return NULL;
+			goto startagain;
 		}
 
-		trace_xfs_dqshake_unlink(dqp);
-
-#ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
-			dqp, be32_to_cpu(dqp->q_core.d_id));
-#endif
 		ASSERT(dqp->q_nrefs == 0);
-		nextdqp = dqp->dq_flnext;
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(hash, dqp);
+		list_del_init(&dqp->q_mplist);
+		mp->m_quotainfo->qi_dquots--;
+		mp->m_quotainfo->qi_dqreclaims++;
+		list_del_init(&dqp->q_hashlist);
+		dqp->q_hash->qh_version++;
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		dqpout = dqp;
+		mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+		mutex_unlock(&dqp->q_hash->qh_lock);
+dqfunlock:
 		xfs_dqfunlock(dqp);
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		mutex_unlock(&hash->qh_lock);
-
- off_freelist:
-		XQM_FREELIST_REMOVE(dqp);
 		xfs_dqunlock(dqp);
-		nreclaimed++;
-		XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
+		if (dqpout)
+			break;
+		if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+			return NULL;
+	}
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+	return dqpout;
+}
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ */
+STATIC int
+xfs_qm_shake_freelist(
+	int howmany)
+{
+	int		nreclaimed = 0;
+	xfs_dquot_t	*dqp;
+
+	if (howmany <= 0)
+		return 0;
+
+	while (nreclaimed < howmany) {
+		dqp = xfs_qm_dqreclaim_one();
+		if (!dqp)
+			return nreclaimed;
 		xfs_qm_dqdestroy(dqp);
-		dqp = nextdqp;
+		nreclaimed++;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
 	return nreclaimed;
 }
 
-
 /*
  * The kmem_shake interface is invoked when memory is running low.
  */
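The net effect of this hunk is a classic extract-helper refactor: all of the locking hairiness lives in xfs_qm_dqreclaim_one(), and the shaker becomes a trivial counting loop over it. The shape of that split in isolation, with illustrative names:

    /* Sketch of the split: a helper that reclaims exactly one object, and a
     * thin loop that gives the shaker its batch semantics. */
    struct dquot_sketch;
    static struct dquot_sketch *reclaim_one(void);      /* NULL when stuck */
    static void destroy(struct dquot_sketch *dqp);

    static int shake_freelist_sketch(int howmany)
    {
            int nreclaimed = 0;

            while (nreclaimed < howmany) {
                    struct dquot_sketch *dqp = reclaim_one();

                    if (!dqp)
                            break;  /* freelist empty or too contended */
                    destroy(dqp);
                    nreclaimed++;
            }
            return nreclaimed;
    }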
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
 	if (!xfs_Gqm)
 		return 0;
 
-	nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
+	nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
 	/* incore dquots in all f/s's */
 	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
 
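For context on the xfs_qm_shake() prototype: shrinker callbacks of this kernel generation took (nr_to_scan, gfp_mask) and returned an estimate of the remaining freeable pool, which the VM uses to size its next scan. A hedged sketch of that contract (names and the stub are illustrative):

    /* Illustrative only; 'gfp_mask' is the allocation context of the caller. */
    static int freeable_count_sketch(void) { return 0; }   /* stub */

    static int shaker_sketch(int nr_to_scan, unsigned int gfp_mask)
    {
            if (nr_to_scan) {
                    /* free up to nr_to_scan cached objects, LRU first,
                     * honouring gfp_mask (e.g. no FS recursion without
                     * __GFP_FS) */
            }
            return freeable_count_sketch();  /* what's still reclaimable */
    }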
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
 }
 
 
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
-	xfs_dquot_t	*dqpout;
-	xfs_dquot_t	*dqp;
-	int		restarts;
-	int		nflushes;
-
-	restarts = 0;
-	dqpout = NULL;
-	nflushes = 0;
-
-	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
- startagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
-
-	FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
-		xfs_dqlock(dqp);
-
-		/*
-		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants. We release the
-		 * freelist lock and start over, so that lookup will grab
-		 * both the dquot and the freelistlock.
-		 */
-		if (dqp->dq_flags & XFS_DQ_WANT) {
-			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
-			trace_xfs_dqreclaim_want(dqp);
-
-			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
-			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto startagain;
-		}
-
-		/*
-		 * If the dquot is inactive, we are assured that it is
-		 * not on the mplist or the hashlist, and that makes our
-		 * life easier.
-		 */
-		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
-			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
-			xfs_dqunlock(dqp);
-			dqpout = dqp;
-			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			break;
-		}
-
-		ASSERT(dqp->q_hash);
-		ASSERT(dqp->MPL_PREVP);
-
-		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
-		 */
-		if (!xfs_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		/*
-		 * We have the flush lock so we know that this is not in the
-		 * process of being flushed. So, if this is dirty, flush it
-		 * DELWRI so that we don't get a freelist infested with
-		 * dirty dquots.
-		 */
-		if (XFS_DQ_IS_DIRTY(dqp)) {
-			int	error;
-
-			trace_xfs_dqreclaim_dirty(dqp);
-
-			/*
-			 * We flush it delayed write, so don't bother
-			 * releasing the freelist lock.
-			 */
-			error = xfs_qm_dqflush(dqp, 0);
-			if (error) {
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
-			}
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			continue;
-		}
-
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		if (!mutex_trylock(&dqp->q_hash->qh_lock))
-			goto mplistunlock;
-
-		trace_xfs_dqreclaim_unlink(dqp);
-
-		ASSERT(dqp->q_nrefs == 0);
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
-		XQM_FREELIST_REMOVE(dqp);
-		dqpout = dqp;
-		mutex_unlock(&dqp->q_hash->qh_lock);
- mplistunlock:
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		xfs_dqfunlock(dqp);
-		xfs_dqunlock(dqp);
-		if (dqpout)
-			break;
-	}
-
-	xfs_qm_freelist_unlock(xfs_Gqm);
-	return dqpout;
-}
-
-
 /*------------------------------------------------------------------*/
 
 /*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2566 }
2663} 2567}
2664 2568
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
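
The new qm_dqfrlist/qm_dqfrlist_lock/qm_dqfrlist_cnt trio replaces the removed xfs_frlist_t head. A sketch of how such fields would be brought up at quota-manager init, assuming the patched struct xfs_qm definition above; the function name here is hypothetical, and the real initialization lives elsewhere in this patch (not shown in this hunk):

	static void demo_qm_init_freelist(struct xfs_qm *xqm)
	{
		INIT_LIST_HEAD(&xqm->qm_dqfrlist);
		mutex_init(&xqm->qm_dqfrlist_lock);
		xqm->qm_dqfrlist_cnt = 0;
	}
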
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 55 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 59 return 0;
60} 60}
61 61
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 50bee07d6b0e..92b002f1805f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 79 xfs_mount_t *mp,
80 uint flags) 80 uint flags)
81{ 81{
82 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 83 uint dqtype;
83 int error; 84 int error;
84 uint inactivate_flags; 85 uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 103 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 104 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 105 */
105 ASSERT(mp->m_quotainfo); 106 ASSERT(q);
106 if (mp->m_quotainfo) 107 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 108
111 /* 109 /*
112 * If we're just turning off quota enforcement, change mp and go. 110 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 115 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 116 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 117 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 118 mutex_unlock(&q->qi_quotaofflock);
121 119
122 /* XXX what to do if error ? Revert back to old vals incore ? */ 120 /* XXX what to do if error ? Revert back to old vals incore ? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 121 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 148 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 149 * turning off quota enforcement.
152 */ 150 */
153 if ((mp->m_qflags & flags) == 0) { 151 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 152 goto out_unlock;
155 return (0);
156 }
157 153
158 /* 154 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 155 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
162 */ 158 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 159 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 160 if (error)
165 goto out_error; 161 goto out_unlock;
166 162
167 /* 163 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 164 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 200 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 201 * we can't get rid of the incore data structures.
206 */ 202 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 203 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 204 delay(10 * nculprits);
209 205
210 /* 206 /*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 218 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 219 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 220 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 221 goto out_unlock;
226 } 222 }
227 223
228 /* 224 /*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
230 */ 226 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 227 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 228 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 229 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 230 xfs_qm_destroy_quotainfo(mp);
235 return (0); 231 return (0);
236 } 232 }
237 233
238 /* 234 /*
239 * Release our quotainode references, and vn_purge them, 235 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 236 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 237 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 238 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 239 q->qi_uquotaip = NULL;
245 } 240 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 241 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 242 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 243 q->qi_gquotaip = NULL;
249 } 244 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 245
253 return (error); 246out_unlock:
247 mutex_unlock(&q->qi_quotaofflock);
248 return error;
254} 249}
255 250
256int 251int
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
379 /* 374 /*
380 * Switch on quota enforcement in core. 375 * Switch on quota enforcement in core.
381 */ 376 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 377 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 378 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 379 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 380
386 return (0); 381 return (0);
387} 382}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
392 */ 387 */
393int 388int
394xfs_qm_scall_getqstat( 389xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 390 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 391 struct fs_quota_stat *out)
397{ 392{
398 xfs_inode_t *uip, *gip; 393 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 394 struct xfs_inode *uip, *gip;
395 boolean_t tempuqip, tempgqip;
400 396
401 uip = gip = NULL; 397 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 398 tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 411 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 412 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 413
418 if (mp->m_quotainfo) { 414 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 415 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 416 gip = q->qi_gquotaip;
421 } 417 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 418 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 419 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,17 +437,20 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 437 if (tempgqip)
442 IRELE(gip); 438 IRELE(gip);
443 } 439 }
444 if (mp->m_quotainfo) { 440 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 441 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 442 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 443 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 444 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 445 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 446 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 447 }
452 return (0); 448 return 0;
453} 449}
454 450
451#define XFS_DQ_MASK \
452 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
453
455/* 454/*
456 * Adjust quota limits, and start/stop timers accordingly. 455 * Adjust quota limits, and start/stop timers accordingly.
457 */ 456 */
@@ -462,15 +461,17 @@ xfs_qm_scall_setqlim(
462 uint type, 461 uint type,
463 fs_disk_quota_t *newlim) 462 fs_disk_quota_t *newlim)
464{ 463{
464 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 465 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 466 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 467 xfs_trans_t *tp;
468 int error; 468 int error;
469 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
470 470
471 if ((newlim->d_fieldmask & 471 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 return EINVAL;
473 return (0); 473 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
474 return 0;
474 475
475 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 476 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
476 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 477 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
@@ -485,7 +486,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 486 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 487 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 488 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 489 mutex_lock(&q->qi_quotaofflock);
489 490
490 /* 491 /*
491 * Get the dquot (locked), and join it to the transaction. 492 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +494,8 @@ xfs_qm_scall_setqlim(
493 */ 494 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 495 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 496 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 497 ASSERT(error != ENOENT);
498 return (error); 498 goto out_unlock;
499 } 499 }
500 xfs_trans_dqjoin(tp, dqp); 500 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 501 ddq = &dqp->q_core;
@@ -513,8 +513,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 513 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 514 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 515 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 516 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 517 q->qi_bsoftlimit = soft;
518 } 518 }
519 } else { 519 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +529,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 529 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 530 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 531 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 532 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 533 q->qi_rtbsoftlimit = soft;
534 } 534 }
535 } else { 535 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +546,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 546 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 547 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 548 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 549 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 550 q->qi_isoftlimit = soft;
551 } 551 }
552 } else { 552 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +572,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 572 * for warnings.
573 */ 573 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 574 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 575 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 577 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 578 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 579 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 581 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 583 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 585 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 586 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 587 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 588 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 589 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 591 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 592 } else {
593 /* 593 /*
594 * If the user is now over quota, start the timelimit. 594 * If the user is now over quota, start the timelimit.
@@ -605,8 +605,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 605 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 606 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 607 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 608
609 out_unlock:
610 mutex_unlock(&q->qi_quotaofflock);
610 return error; 611 return error;
611} 612}
612 613
@@ -853,7 +854,8 @@ xfs_dqrele_inode(
853 int error; 854 int error;
854 855
855 /* skip quota inodes */ 856 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 857 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
858 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 859 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 860 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 861 read_unlock(&pag->pag_ici_lock);
@@ -931,7 +933,8 @@ struct mutex qcheck_lock;
931} 933}
932 934
933typedef struct dqtest { 935typedef struct dqtest {
934 xfs_dqmarker_t q_lists; 936 uint dq_flags; /* various flags (XFS_DQ_*) */
937 struct list_head q_hashlist;
935 xfs_dqhash_t *q_hash; /* the hashchain header */ 938 xfs_dqhash_t *q_hash; /* the hashchain header */
936 xfs_mount_t *q_mount; /* filesystem this relates to */ 939 xfs_mount_t *q_mount; /* filesystem this relates to */
937 xfs_dqid_t d_id; /* user id or group id */ 940 xfs_dqid_t d_id; /* user id or group id */
@@ -942,14 +945,9 @@ typedef struct dqtest {
942STATIC void 945STATIC void
943xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 946xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
944{ 947{
945 xfs_dquot_t *d; 948 list_add(&dqp->q_hashlist, &h->qh_list);
946 if (((d) = (h)->qh_next)) 949 h->qh_version++;
947 (d)->HL_PREVP = &((dqp)->HL_NEXT); 950 h->qh_nelems++;
948 (dqp)->HL_NEXT = d;
949 (dqp)->HL_PREVP = &((h)->qh_next);
950 (h)->qh_next = (xfs_dquot_t *)dqp;
951 (h)->qh_version++;
952 (h)->qh_nelems++;
953} 951}
954STATIC void 952STATIC void
955xfs_qm_dqtest_print( 953xfs_qm_dqtest_print(
@@ -1061,9 +1059,7 @@ xfs_qm_internalqcheck_dqget(
1061 xfs_dqhash_t *h; 1059 xfs_dqhash_t *h;
1062 1060
1063 h = DQTEST_HASH(mp, id, type); 1061 h = DQTEST_HASH(mp, id, type);
1064 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1062 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1065 d = (xfs_dqtest_t *) d->HL_NEXT) {
1066 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1067 if (d->d_id == id && mp == d->q_mount) { 1063 if (d->d_id == id && mp == d->q_mount) {
1068 *O_dq = d; 1064 *O_dq = d;
1069 return (0); 1065 return (0);
@@ -1074,6 +1070,7 @@ xfs_qm_internalqcheck_dqget(
1074 d->d_id = id; 1070 d->d_id = id;
1075 d->q_mount = mp; 1071 d->q_mount = mp;
1076 d->q_hash = h; 1072 d->q_hash = h;
1073 INIT_LIST_HEAD(&d->q_hashlist);
1077 xfs_qm_hashinsert(h, d); 1074 xfs_qm_hashinsert(h, d);
1078 *O_dq = d; 1075 *O_dq = d;
1079 return (0); 1076 return (0);
@@ -1180,8 +1177,6 @@ xfs_qm_internalqcheck(
1180 xfs_ino_t lastino; 1177 xfs_ino_t lastino;
1181 int done, count; 1178 int done, count;
1182 int i; 1179 int i;
1183 xfs_dqtest_t *d, *e;
1184 xfs_dqhash_t *h1;
1185 int error; 1180 int error;
1186 1181
1187 lastino = 0; 1182 lastino = 0;
@@ -1221,19 +1216,18 @@ xfs_qm_internalqcheck(
1221 } 1216 }
1222 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1217 cmn_err(CE_DEBUG, "Checking results against system dquots");
1223 for (i = 0; i < qmtest_hashmask; i++) { 1218 for (i = 0; i < qmtest_hashmask; i++) {
1224 h1 = &qmtest_udqtab[i]; 1219 xfs_dqtest_t *d, *n;
1225 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1220 xfs_dqhash_t *h;
1221
1222 h = &qmtest_udqtab[i];
1223 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1226 xfs_dqtest_cmp(d); 1224 xfs_dqtest_cmp(d);
1227 e = (xfs_dqtest_t *) d->HL_NEXT;
1228 kmem_free(d); 1225 kmem_free(d);
1229 d = e;
1230 } 1226 }
1231 h1 = &qmtest_gdqtab[i]; 1227 h = &qmtest_gdqtab[i];
1232 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1228 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1233 xfs_dqtest_cmp(d); 1229 xfs_dqtest_cmp(d);
1234 e = (xfs_dqtest_t *) d->HL_NEXT;
1235 kmem_free(d); 1230 kmem_free(d);
1236 d = e;
1237 } 1231 }
1238 } 1232 }
1239 1233
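
The internalqcheck hunk above frees each dqtest entry while walking its hash bucket, which is the textbook case for list_for_each_entry_safe(): the second cursor caches the successor before the current entry is destroyed. A generic sketch of the idiom with illustrative types:

	struct demo_node {
		struct list_head link;
	};

	static void demo_free_all(struct list_head *head)
	{
		struct demo_node *d, *n;

		/* 'n' holds the next entry, so freeing 'd' mid-walk is safe */
		list_for_each_entry_safe(d, n, head, link)
			kmem_free(d);	/* XFS's kmem.h wrapper around kfree() */
	}

A plain list_for_each_entry() here would dereference freed memory when advancing the cursor.
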
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 59 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 60 xfs_dquot_t *dqp)
61{ 61{
62 xfs_dq_logitem_t *lp; 62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63 63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 64 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 65 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 66 ASSERT(lp->qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 67
69 /* 68 /*
70 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
96{ 95{
97 xfs_log_item_desc_t *lidp; 96 xfs_log_item_desc_t *lidp;
98 97
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 98 ASSERT(dqp->q_transp == tp);
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 99 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 100
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); 101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
198 int i; 197 int i;
199 xfs_dqtrx_t *qa; 198 xfs_dqtrx_t *qa;
200 199
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 200 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 201 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 202
203 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 204 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 205 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 206 return &qa[i];
207 }
208 } 207 }
209 208
210 return (NULL); 209 return NULL;
211} 210}
212 211
213/* 212/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 380 break;
382 381
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 382 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 383 ASSERT(dqp->q_transp == tp);
385 384
386 /* 385 /*
387 * adjust the actual number of blocks used 386 * adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 638 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 639 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 640 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 641 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 642 resbcountp = &dqp->q_res_bcount;
644 } else { 643 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 644 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 650 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 651 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 652 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 653 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 654 resbcountp = &dqp->q_res_rtbcount;
656 } 655 }
657 656
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 690 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 691 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 692 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 693 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 694 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 695 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 696 hardlimit = q->qi_ihardlimit;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
226 int pag_ici_reclaimable; /* reclaimable inodes */ 233 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 234#endif
228 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 236} xfs_perag_t;
231 237
232/* 238/*
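
struct xfs_busy_extent above is indexed two ways at once: rb_node hangs it in the per-AG pagb_tree for by-bno overlap searches, while list threads it onto the owning transaction's busy list for cleanup at commit. Getting from a tree node back to the containing extent uses rb_entry(), the rbtree spelling of container_of(). A minimal sketch, assuming a node returned by a tree walk (the demo_ name is illustrative):

	#include <linux/rbtree.h>

	static struct xfs_busy_extent *
	demo_busy_from_node(struct rb_node *node)
	{
		/* rb_entry() maps the embedded rb_node back to its container */
		return rb_entry(node, struct xfs_busy_extent, rb_node);
	}
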
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * there can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extent free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID for identifying the transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different. i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, then used to mark the same
2568 * extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to ensure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial and an exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
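
The 0/-1/1 return convention of xfs_alloc_busy_search() reduces to half-open interval overlap plus an exact-match check. A standalone sketch of the classification, detached from the rbtree walk (the demo_ name is illustrative; wrap-around of the 32-bit block arithmetic is ignored for brevity):

	/*
	 *  0 - [bno, bno + len) does not touch the busy extent
	 * -1 - the ranges overlap but are not identical
	 *  1 - exact match of both start block and length
	 */
	static int
	demo_busy_classify(xfs_agblock_t bno, xfs_extlen_t len,
			   xfs_agblock_t busy_bno, xfs_extlen_t busy_len)
	{
		if (bno + len <= busy_bno || busy_bno + busy_len <= bno)
			return 0;		/* disjoint */
		if (bno == busy_bno && len == busy_len)
			return 1;		/* exact match */
		return -1;			/* partial overlap */
	}

Any non-zero result forces the allocating transaction synchronous; the exact/partial distinction is kept because xfs_alloc_busy_clear() asserts an exact match when removing an extent.
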
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
3829 } 3829 }
3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
3831 goto error2; 3831 goto error2;
3832 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); 3832 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3833 ASSERT(ip->i_df.if_ext_max == 3833 ASSERT(ip->i_df.if_ext_max ==
3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3835 return error; 3835 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
 261 * to the log as setting the XFS_BLF_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not; 334/* You would think we need to bump the nvecs here too, but we do not;
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -372,12 +392,12 @@ xfs_buf_item_pin(
372 */ 392 */
373STATIC void 393STATIC void
374xfs_buf_item_unpin( 394xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 395 xfs_buf_log_item_t *bip)
376 int stale)
377{ 396{
378 struct xfs_ail *ailp; 397 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 398 xfs_buf_t *bp;
380 int freed; 399 int freed;
400 int stale = bip->bli_flags & XFS_BLI_STALE;
381 401
382 bp = bip->bli_buf; 402 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 403 ASSERT(bp != NULL);
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -428,40 +448,34 @@ xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip, 448 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp) 449 xfs_trans_t *tp)
430{ 450{
431 xfs_buf_t *bp; 451 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) && 452 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) { 453 (bip->bli_flags & XFS_BLI_STALE)) {
454 /*
455 * yes -- We can safely do some work here and then call
456 * buf_item_unpin to do the rest because we are
 457 * holding the buffer locked so no one else will be
458 * able to bump up the refcount. We have to remove the
459 * log item from the transaction as we are about to release
460 * our reference to the buffer. If we don't, the unlock that
461 * occurs later in the xfs_trans_uncommit() will try to
462 * reference the buffer which we no longer have a hold on.
463 */
464 struct xfs_log_item_desc *lidp;
465
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 466 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip); 467 trace_xfs_buf_item_unpin_stale(bip);
443 468
444 /* 469 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp); 470 xfs_trans_free_item(tp, lidp);
471
455 /* 472 /*
456 * Since the transaction no longer refers to the buffer, 473 * Since the transaction no longer refers to the buffer, the
457 * the buffer should no longer refer to the transaction. 474 * buffer should no longer refer to the transaction.
458 */ 475 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 476 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
460 } 477 }
461 478 xfs_buf_item_unpin(bip);
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465} 479}
466 480
467/* 481/*
@@ -495,20 +509,23 @@ xfs_buf_item_trylock(
495} 509}
496 510
497/* 511/*
498 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
499 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
500 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
501 * buf log item and remove the reference to it in the 515 *
502 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
503 * 518 *
504 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
505 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
506 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
507 * 524 *
508 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
509 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
510 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
511 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
512 */ 529 */
513STATIC void 530STATIC void
514xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -520,73 +537,54 @@ xfs_buf_item_unlock(
520 537
521 bp = bip->bli_buf; 538 bp = bip->bli_buf;
522 539
523 /* 540 /* Clear the buffer's association with this transaction. */
524 * Clear the buffer's association with this transaction.
525 */
526 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
527 542
528 /* 543 /*
529 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
530 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
531 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
532 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
533 * cycle if we abort inside commit.
534 */ 548 */
535 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
536 550
537 /* 551 /*
538 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
539 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
540 * buffer is unpinned for the last time.
541 */ 554 */
542 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
543 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
544 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
546 if (!aborted)
547 return;
548 }
549 559
550 /* 560 /*
551 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
552 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
553 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
554 * the transaction is really through with the buffer.
555 */ 564 */
556 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
557 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
558 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
559 /* 568 if (!aborted) {
560 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
561 * transaction state. 570 return;
562 */ 571 }
563 bip->bli_flags &= ~XFS_BLI_LOGGED;
564 } 572 }
565 573
566 /*
567 * Before possibly freeing the buf item, determine if we should
568 * release the buffer at the end of this routine.
569 */
570 hold = bip->bli_flags & XFS_BLI_HOLD;
571 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
572 575
573 /* 576 /*
574 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
575 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
576 */ 579 */
577 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
578 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
579 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
580 } else if (hold) { 583 else
581 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
582 }
583 585
584 /* 586 if (!hold)
585 * Release the buffer if XFS_BLI_HOLD was not set.
586 */
587 if (!hold) {
588 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
589 }
590} 588}
591 589
592/* 590/*
@@ -675,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 673 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
676 xfs_buf_item_format, 674 xfs_buf_item_format,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 675 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 676 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 677 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
680 xfs_buf_item_unpin_remove, 678 xfs_buf_item_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 679 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -723,20 +721,17 @@ xfs_buf_item_init(
723 } 721 }
724 722
725 /* 723 /*
726 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
727 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
728 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
729 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
730 */ 728 */
731 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
732 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
733 731
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 733 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 734 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 735 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -799,8 +794,8 @@ xfs_buf_item_log(
799 /* 794 /*
800 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
801 */ 796 */
802 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
803 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
804 799
805 /* 800 /*
806 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
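
The xfs_buf_item.c changes above amount to a simpler reference-counting protocol for the buf log item: ->iop_pin now takes its own reference, so ->iop_unlock can drop the transaction's reference unconditionally and the final unpin frees the bli. A toy user-space model of that lifecycle (all names hypothetical; illustration only, not kernel code):

	#include <assert.h>
	#include <stdio.h>

	struct toy_bli { int refcount; };

	static void toy_trans_hold(struct toy_bli *b) { b->refcount++; }
	static void toy_pin(struct toy_bli *b)        { b->refcount++; }
	static void toy_unlock(struct toy_bli *b)     { b->refcount--; }
	static int  toy_unpin(struct toy_bli *b)      { return --b->refcount == 0; }

	int main(void)
	{
		struct toy_bli bli = { 0 };

		toy_trans_hold(&bli);	/* transaction references the bli */
		toy_pin(&bli);		/* commit pins it: second reference */
		toy_unlock(&bli);	/* trans ref dropped unconditionally */
		assert(bli.refcount == 1);	/* pin ref keeps it alive */
		if (toy_unpin(&bli))	/* I/O completion drops the last ref */
			printf("bli freed at unpin\n");
		return 0;
	}
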
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
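
The renamed XFS_BLF_* constants above drive the dirty-chunk bitmap sizing in xfs_buf_item_init(): the buffer is divided into 128-byte chunks, with one dirty bit per chunk, rounded up to whole bitmap words. Worked through for a 4096-byte buffer (standalone user-space sketch; constants copied from the header):

	#include <stdio.h>

	#define XFS_BLF_CHUNK		128
	#define XFS_BLF_SHIFT		7
	#define BIT_TO_WORD_SHIFT	5
	#define NBWORD			(8 * sizeof(unsigned int))	/* NBBY == 8 */

	int main(void)
	{
		unsigned int count = 4096;	/* buffer size in bytes */
		/* round up so no partial chunk is truncated */
		int chunks = (count + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT;
		/* one bit per chunk, rounded up to whole words */
		int map_size = (chunks + NBWORD) >> BIT_TO_WORD_SHIFT;

		/* prints: 4096 bytes -> 32 chunks -> 2 bitmap words */
		printf("%u bytes -> %d chunks -> %d bitmap words\n",
		       count, chunks, map_size);
		return 0;
	}
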
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
186 186
187void 187void
188xfs_error_report( 188xfs_error_report(
189 char *tag, 189 const char *tag,
190 int level, 190 int level,
191 xfs_mount_t *mp, 191 struct xfs_mount *mp,
192 char *fname, 192 const char *filename,
193 int linenum, 193 int linenum,
194 inst_t *ra) 194 inst_t *ra)
195{ 195{
196 if (level <= xfs_error_level) { 196 if (level <= xfs_error_level) {
197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
198 CE_ALERT, mp, 198 CE_ALERT, mp,
199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n", 199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
200 tag, linenum, fname, ra); 200 tag, linenum, filename, ra);
201 201
202 xfs_stack_trace(); 202 xfs_stack_trace();
203 } 203 }
@@ -205,15 +205,15 @@ xfs_error_report(
205 205
206void 206void
207xfs_corruption_error( 207xfs_corruption_error(
208 char *tag, 208 const char *tag,
209 int level, 209 int level,
210 xfs_mount_t *mp, 210 struct xfs_mount *mp,
211 void *p, 211 void *p,
212 char *fname, 212 const char *filename,
213 int linenum, 213 int linenum,
214 inst_t *ra) 214 inst_t *ra)
215{ 215{
216 if (level <= xfs_error_level) 216 if (level <= xfs_error_level)
217 xfs_hex_dump(p, 16); 217 xfs_hex_dump(p, 16);
218 xfs_error_report(tag, level, mp, fname, linenum, ra); 218 xfs_error_report(tag, level, mp, filename, linenum, ra);
219} 219}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
29 29
30struct xfs_mount; 30struct xfs_mount;
31 31
32extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, 32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
33 char *fname, int linenum, inst_t *ra); 33 const char *filename, int linenum, inst_t *ra);
34extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, 34extern void xfs_corruption_error(const char *tag, int level,
35 void *p, char *fname, int linenum, inst_t *ra); 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra);
36 37
37#define XFS_ERROR_REPORT(e, lvl, mp) \ 38#define XFS_ERROR_REPORT(e, lvl, mp) \
38 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
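
The constification above changes no call sites; the macro still captures the location automatically. A hypothetical caller, for illustration only:

	if (bad_magic)		/* "bad_magic" is a made-up condition */
		XFS_ERROR_REPORT("bad magic number", XFS_ERRLEVEL_LOW, mp);

	/* which expands to:
	 * xfs_error_report("bad magic number", XFS_ERRLEVEL_LOW, mp,
	 *		    __FILE__, __LINE__, __return_address);
	 */
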
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
106 */ 106 */
107/*ARGSUSED*/ 107/*ARGSUSED*/
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
110{ 110{
111 struct xfs_ail *ailp = efip->efi_item.li_ailp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
225 xfs_efi_item_format, 225 xfs_efi_item_format,
226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, 226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
227 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, 227 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
229 xfs_efi_item_unpin_remove, 229 xfs_efi_item_unpin_remove,
230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, 230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
259 KM_SLEEP); 259 KM_SLEEP);
260 } 260 }
261 261
262 efip->efi_item.li_type = XFS_LI_EFI; 262 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
263 efip->efi_item.li_ops = &xfs_efi_item_ops;
264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
266 efip->efi_format.efi_nextents = nextents; 263 efip->efi_format.efi_nextents = nextents;
267 efip->efi_format.efi_id = (__psint_t)(void*)efip; 264 efip->efi_format.efi_id = (__psint_t)(void*)efip;
268 265
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
428 */ 425 */
429/*ARGSUSED*/ 426/*ARGSUSED*/
430STATIC void 427STATIC void
431xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) 428xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
432{ 429{
433 return; 430 return;
434} 431}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
518 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 515 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
519 xfs_efd_item_format, 516 xfs_efd_item_format,
520 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, 517 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
521 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, 518 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
522 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 519 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
523 xfs_efd_item_unpin_remove, 520 xfs_efd_item_unpin_remove,
524 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, 521 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
554 KM_SLEEP); 551 KM_SLEEP);
555 } 552 }
556 553
557 efdp->efd_item.li_type = XFS_LI_EFD; 554 xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
561 efdp->efd_efip = efip; 555 efdp->efd_efip = efip;
562 efdp->efd_format.efd_nextents = nextents; 556 efdp->efd_format.efd_nextents = nextents;
563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 557 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..8cd6e8d8fe9c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
2449{ 2449{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2451
2452 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2453
2452 /* Give the log a push to start the unpinning I/O */ 2454 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2455 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2456
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
543{ 543{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
545 545
546 trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
546 atomic_inc(&iip->ili_inode->i_pincount); 547 atomic_inc(&iip->ili_inode->i_pincount);
547} 548}
548 549
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
556/* ARGSUSED */ 557/* ARGSUSED */
557STATIC void 558STATIC void
558xfs_inode_item_unpin( 559xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 560 xfs_inode_log_item_t *iip)
560 int stale)
561{ 561{
562 struct xfs_inode *ip = iip->ili_inode; 562 struct xfs_inode *ip = iip->ili_inode;
563 563
564 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 565 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 566 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 567 wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip, 573 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp) 574 xfs_trans_t *tp)
574{ 575{
575 xfs_inode_item_unpin(iip, 0); 576 xfs_inode_item_unpin(iip);
576} 577}
577 578
578/* 579/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 839 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
839 xfs_inode_item_format, 840 xfs_inode_item_format,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 841 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 842 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 843 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
843 xfs_inode_item_unpin_remove, 844 xfs_inode_item_unpin_remove,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 845 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
865 ASSERT(ip->i_itemp == NULL); 866 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 867 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 868
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 869 iip->ili_inode = ip;
873 870 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 871 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 872 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 873 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 874 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
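
The inode item change above, like the EFI/EFD changes before it, makes the same substitution everywhere: the hand-rolled four-field setup is replaced by the new xfs_log_item_init() helper, which also prepares the CIL state (li_lv and the li_ail/li_cil list heads) that delayed logging depends on. Schematically (a fragment; field names as in the patch):

	/* before: each log item type open-coded this */
	iip->ili_item.li_type   = XFS_LI_INODE;
	iip->ili_item.li_ops    = &xfs_inode_item_ops;
	iip->ili_item.li_mountp = mp;
	iip->ili_item.li_ailp   = mp->m_ail;

	/* after: one helper, which also initialises the CIL fields */
	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
			  &xfs_inode_item_ops);
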
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
55#define XFS_STRAT_WRITE_IMAPS 2 55#define XFS_STRAT_WRITE_IMAPS 2
56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
57 57
58STATIC int 58STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
59xfs_imap_to_bmap( 59 int, struct xfs_bmbt_irec *, int *);
60 xfs_inode_t *ip, 60STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
61 xfs_off_t offset, 61 struct xfs_bmbt_irec *, int *);
62 xfs_bmbt_irec_t *imap, 62STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
63 xfs_iomap_t *iomapp, 63 struct xfs_bmbt_irec *, int *);
64 int imaps, /* Number of imap entries */
65 int iomaps, /* Number of iomap entries */
66 int flags)
67{
68 xfs_mount_t *mp = ip->i_mount;
69 int pbm;
70 xfs_fsblock_t start_block;
71
72
73 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
74 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
75 iomapp->iomap_delta = offset - iomapp->iomap_offset;
76 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
77 iomapp->iomap_flags = flags;
78
79 if (XFS_IS_REALTIME_INODE(ip)) {
80 iomapp->iomap_flags |= IOMAP_REALTIME;
81 iomapp->iomap_target = mp->m_rtdev_targp;
82 } else {
83 iomapp->iomap_target = mp->m_ddev_targp;
84 }
85 start_block = imap->br_startblock;
86 if (start_block == HOLESTARTBLOCK) {
87 iomapp->iomap_bn = IOMAP_DADDR_NULL;
88 iomapp->iomap_flags |= IOMAP_HOLE;
89 } else if (start_block == DELAYSTARTBLOCK) {
90 iomapp->iomap_bn = IOMAP_DADDR_NULL;
91 iomapp->iomap_flags |= IOMAP_DELAY;
92 } else {
93 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
94 if (ISUNWRITTEN(imap))
95 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
96 }
97
98 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
99 }
100 return pbm; /* Return the number filled */
101}
102 64
103int 65int
104xfs_iomap( 66xfs_iomap(
105 xfs_inode_t *ip, 67 struct xfs_inode *ip,
106 xfs_off_t offset, 68 xfs_off_t offset,
107 ssize_t count, 69 ssize_t count,
108 int flags, 70 int flags,
109 xfs_iomap_t *iomapp, 71 struct xfs_bmbt_irec *imap,
110 int *niomaps) 72 int *nimaps,
73 int *new)
111{ 74{
112 xfs_mount_t *mp = ip->i_mount; 75 struct xfs_mount *mp = ip->i_mount;
113 xfs_fileoff_t offset_fsb, end_fsb; 76 xfs_fileoff_t offset_fsb, end_fsb;
114 int error = 0; 77 int error = 0;
115 int lockmode = 0; 78 int lockmode = 0;
116 xfs_bmbt_irec_t imap; 79 int bmapi_flags = 0;
117 int nimaps = 1;
118 int bmapi_flags = 0;
119 int iomap_flags = 0;
120 80
121 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 81 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
122 82
83 *new = 0;
84
123 if (XFS_FORCED_SHUTDOWN(mp)) 85 if (XFS_FORCED_SHUTDOWN(mp))
124 return XFS_ERROR(EIO); 86 return XFS_ERROR(EIO);
125 87
@@ -160,8 +122,8 @@ xfs_iomap(
160 122
161 error = xfs_bmapi(NULL, ip, offset_fsb, 123 error = xfs_bmapi(NULL, ip, offset_fsb,
162 (xfs_filblks_t)(end_fsb - offset_fsb), 124 (xfs_filblks_t)(end_fsb - offset_fsb),
163 bmapi_flags, NULL, 0, &imap, 125 bmapi_flags, NULL, 0, imap,
164 &nimaps, NULL, NULL); 126 nimaps, NULL, NULL);
165 127
166 if (error) 128 if (error)
167 goto out; 129 goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
169 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { 131 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
170 case BMAPI_WRITE: 132 case BMAPI_WRITE:
171 /* If we found an extent, return it */ 133 /* If we found an extent, return it */
172 if (nimaps && 134 if (*nimaps &&
173 (imap.br_startblock != HOLESTARTBLOCK) && 135 (imap->br_startblock != HOLESTARTBLOCK) &&
174 (imap.br_startblock != DELAYSTARTBLOCK)) { 136 (imap->br_startblock != DELAYSTARTBLOCK)) {
175 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 137 trace_xfs_iomap_found(ip, offset, count, flags, imap);
176 break; 138 break;
177 } 139 }
178 140
179 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { 141 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
180 error = xfs_iomap_write_direct(ip, offset, count, flags, 142 error = xfs_iomap_write_direct(ip, offset, count, flags,
181 &imap, &nimaps, nimaps); 143 imap, nimaps);
182 } else { 144 } else {
183 error = xfs_iomap_write_delay(ip, offset, count, flags, 145 error = xfs_iomap_write_delay(ip, offset, count, flags,
184 &imap, &nimaps); 146 imap, nimaps);
185 } 147 }
186 if (!error) { 148 if (!error) {
187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); 149 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
188 } 150 }
189 iomap_flags = IOMAP_NEW; 151 *new = 1;
190 break; 152 break;
191 case BMAPI_ALLOCATE: 153 case BMAPI_ALLOCATE:
192 /* If we found an extent, return it */ 154 /* If we found an extent, return it */
193 xfs_iunlock(ip, lockmode); 155 xfs_iunlock(ip, lockmode);
194 lockmode = 0; 156 lockmode = 0;
195 157
196 if (nimaps && !isnullstartblock(imap.br_startblock)) { 158 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
197 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 159 trace_xfs_iomap_found(ip, offset, count, flags, imap);
198 break; 160 break;
199 } 161 }
200 162
201 error = xfs_iomap_write_allocate(ip, offset, count, 163 error = xfs_iomap_write_allocate(ip, offset, count,
202 &imap, &nimaps); 164 imap, nimaps);
203 break; 165 break;
204 } 166 }
205 167
206 if (nimaps) { 168 ASSERT(*nimaps <= 1);
207 *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
208 iomapp, nimaps, *niomaps, iomap_flags);
209 } else if (niomaps) {
210 *niomaps = 0;
211 }
212 169
213out: 170out:
214 if (lockmode) 171 if (lockmode)
@@ -216,7 +173,6 @@ out:
216 return XFS_ERROR(error); 173 return XFS_ERROR(error);
217} 174}
218 175
219
220STATIC int 176STATIC int
221xfs_iomap_eof_align_last_fsb( 177xfs_iomap_eof_align_last_fsb(
222 xfs_mount_t *mp, 178 xfs_mount_t *mp,
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
285 return EFSCORRUPTED; 241 return EFSCORRUPTED;
286} 242}
287 243
288int 244STATIC int
289xfs_iomap_write_direct( 245xfs_iomap_write_direct(
290 xfs_inode_t *ip, 246 xfs_inode_t *ip,
291 xfs_off_t offset, 247 xfs_off_t offset,
292 size_t count, 248 size_t count,
293 int flags, 249 int flags,
294 xfs_bmbt_irec_t *ret_imap, 250 xfs_bmbt_irec_t *ret_imap,
295 int *nmaps, 251 int *nmaps)
296 int found)
297{ 252{
298 xfs_mount_t *mp = ip->i_mount; 253 xfs_mount_t *mp = ip->i_mount;
299 xfs_fileoff_t offset_fsb; 254 xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
330 if (error) 285 if (error)
331 goto error_out; 286 goto error_out;
332 } else { 287 } else {
333 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) 288 if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
334 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 289 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
335 ret_imap->br_blockcount + 290 ret_imap->br_blockcount +
336 ret_imap->br_startoff); 291 ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
485 return 0; 440 return 0;
486} 441}
487 442
488int 443STATIC int
489xfs_iomap_write_delay( 444xfs_iomap_write_delay(
490 xfs_inode_t *ip, 445 xfs_inode_t *ip,
491 xfs_off_t offset, 446 xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
588 * We no longer bother to look at the incoming map - all we have to 543 * We no longer bother to look at the incoming map - all we have to
589 * guarantee is that whatever we allocate fills the required range. 544 * guarantee is that whatever we allocate fills the required range.
590 */ 545 */
591int 546STATIC int
592xfs_iomap_write_allocate( 547xfs_iomap_write_allocate(
593 xfs_inode_t *ip, 548 xfs_inode_t *ip,
594 xfs_off_t offset, 549 xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
22
23
24typedef enum { /* iomap_flags values */
25 IOMAP_READ = 0, /* mapping for a read */
26 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
28 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
29 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
30 /* but uninitialized file data */
31 IOMAP_NEW = 0x40 /* just allocate */
32} iomap_flags_t;
33
34typedef enum { 21typedef enum {
35 /* base extent manipulation calls */ 22 /* base extent manipulation calls */
36 BMAPI_READ = (1 << 0), /* read extents */ 23 BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
52 { BMAPI_MMAP, "MMAP" }, \ 39 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" } 40 { BMAPI_TRYLOCK, "TRYLOCK" }
54 41
55/*
56 * xfs_iomap_t: File system I/O map
57 *
58 * The iomap_bn field is expressed in 512-byte blocks, and is where the
59 * mapping starts on disk.
60 *
61 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
62 * iomap_offset is the offset of the mapping in the file itself.
63 * iomap_bsize is the size of the mapping, iomap_delta is the
64 * desired data's offset into the mapping, given the offset supplied
65 * to the file I/O map routine.
66 *
67 * When a request is made to read beyond the logical end of the object,
68 * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
69 * to the actual amount of underlying storage that has been allocated, if any.
70 */
71
72typedef struct xfs_iomap {
73 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
74 xfs_buftarg_t *iomap_target;
75 xfs_off_t iomap_offset; /* offset of mapping, bytes */
76 xfs_off_t iomap_bsize; /* size of mapping, bytes */
77 xfs_off_t iomap_delta; /* offset into mapping, bytes */
78 iomap_flags_t iomap_flags;
79} xfs_iomap_t;
80
81struct xfs_inode; 42struct xfs_inode;
82struct xfs_bmbt_irec; 43struct xfs_bmbt_irec;
83 44
84extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 45extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
85 struct xfs_iomap *, int *); 46 struct xfs_bmbt_irec *, int *, int *);
86extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
87 int, struct xfs_bmbt_irec *, int *, int);
88extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
89 struct xfs_bmbt_irec *, int *);
90extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
91 struct xfs_bmbt_irec *, int *);
92extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 47extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
93 48
94#endif /* __XFS_IOMAP_H__*/ 49#endif /* __XFS_IOMAP_H__*/
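
With xfs_iomap_t gone, xfs_iomap() hands back the raw struct xfs_bmbt_irec plus a separate new-allocation flag, and callers do their own conversion to disk addresses. A hypothetical caller under the new signature (sketch only):

	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	int			new_alloc = 0;
	int			error;

	error = xfs_iomap(ip, offset, count, BMAPI_WRITE,
			  &imap, &nimaps, &new_alloc);
	if (!error && nimaps) {
		/*
		 * imap.br_startoff/br_startblock/br_blockcount describe the
		 * mapping in filesystem blocks; the caller now applies the
		 * xfs_fsb_to_db()-style conversion that the removed
		 * xfs_imap_to_bmap() used to do.
		 */
	}
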
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2be019136287..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
44 44
45kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
46 46
47#define xlog_write_adv_cnt(ptr, len, off, bytes) \
48 { (ptr) += (bytes); \
49 (len) -= (bytes); \
50 (off) += (bytes);}
51
52/* Local miscellaneous function prototypes */ 47/* Local miscellaneous function prototypes */
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 48STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 49 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 50STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
56 xfs_buftarg_t *log_target, 51 xfs_buftarg_t *log_target,
@@ -59,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
59STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog,
66 uint flags);
67 57
68/* local state machine functions */ 58/* local state machine functions */
69STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -93,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
93STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
94 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
95 85
96
97/* local ticket functions */
98STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
99 int unit_bytes,
100 int count,
101 char clientid,
102 uint flags);
103
104#if defined(DEBUG) 86#if defined(DEBUG)
105STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
106STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
107STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 89STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
108 int count, boolean_t syncing); 90 int count, boolean_t syncing);
@@ -258,7 +240,7 @@ xfs_log_done(
258 * If we get an error, just continue and give back the log ticket. 240 * If we get an error, just continue and give back the log ticket.
259 */ 241 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 242 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) { 243 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
262 lsn = (xfs_lsn_t) -1; 244 lsn = (xfs_lsn_t) -1;
263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 245 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
264 flags |= XFS_LOG_REL_PERM_RESERV; 246 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -367,6 +349,15 @@ xfs_log_reserve(
367 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
368 internal_ticket = *ticket; 350 internal_ticket = *ticket;
369 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
370 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
371 362
372 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -374,7 +365,8 @@ xfs_log_reserve(
374 } else { 365 } else {
375 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
376 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
377 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
378 if (!internal_ticket) 370 if (!internal_ticket)
379 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
380 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
@@ -459,6 +451,13 @@ xfs_log_mount(
459 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
460 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
461 453
454 /*
 455 * Now the log has been fully initialised and we know where our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
462 return 0; 461 return 0;
463 462
464out_destroy_ail: 463out_destroy_ail:
@@ -516,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
516#ifdef DEBUG 515#ifdef DEBUG
517 xlog_in_core_t *first_iclog; 516 xlog_in_core_t *first_iclog;
518#endif 517#endif
519 xfs_log_iovec_t reg[1];
520 xlog_ticket_t *tic = NULL; 518 xlog_ticket_t *tic = NULL;
521 xfs_lsn_t lsn; 519 xfs_lsn_t lsn;
522 int error; 520 int error;
523 521
524 /* the data section must be 32 bit size aligned */
525 struct {
526 __uint16_t magic;
527 __uint16_t pad1;
528 __uint32_t pad2; /* may as well make it 64 bits */
529 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
530
531 /* 522 /*
532 * Don't write out unmount record on read-only mounts. 523 * Don't write out unmount record on read-only mounts.
533 * Or, if we are doing a forced umount (typically because of IO errors). 524 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
549 } while (iclog != first_iclog); 540 } while (iclog != first_iclog);
550#endif 541#endif
551 if (! (XLOG_FORCED_SHUTDOWN(log))) { 542 if (! (XLOG_FORCED_SHUTDOWN(log))) {
552 reg[0].i_addr = (void*)&magic;
553 reg[0].i_len = sizeof(magic);
554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
555
556 error = xfs_log_reserve(mp, 600, 1, &tic, 543 error = xfs_log_reserve(mp, 600, 1, &tic,
557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 544 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
558 if (!error) { 545 if (!error) {
546 /* the data section must be 32 bit size aligned */
547 struct {
548 __uint16_t magic;
549 __uint16_t pad1;
550 __uint32_t pad2; /* may as well make it 64 bits */
551 } magic = {
552 .magic = XLOG_UNMOUNT_TYPE,
553 };
554 struct xfs_log_iovec reg = {
555 .i_addr = (void *)&magic,
556 .i_len = sizeof(magic),
557 .i_type = XLOG_REG_TYPE_UNMOUNT,
558 };
559 struct xfs_log_vec vec = {
560 .lv_niovecs = 1,
561 .lv_iovecp = &reg,
562 };
563
559 /* remove inited flag */ 564 /* remove inited flag */
560 ((xlog_ticket_t *)tic)->t_flags = 0; 565 tic->t_flags = 0;
561 error = xlog_write(mp, reg, 1, tic, &lsn, 566 error = xlog_write(log, &vec, tic, &lsn,
562 NULL, XLOG_UNMOUNT_TRANS); 567 NULL, XLOG_UNMOUNT_TRANS);
563 /* 568 /*
564 * At this point, we're umounting anyway, 569 * At this point, we're umounting anyway,
@@ -648,10 +653,30 @@ xfs_log_unmount(xfs_mount_t *mp)
648 xlog_dealloc_log(mp->m_log); 653 xlog_dealloc_log(mp->m_log);
649} 654}
650 655
656void
657xfs_log_item_init(
658 struct xfs_mount *mp,
659 struct xfs_log_item *item,
660 int type,
661 struct xfs_item_ops *ops)
662{
663 item->li_mountp = mp;
664 item->li_ailp = mp->m_ail;
665 item->li_type = type;
666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
671}
672
651/* 673/*
652 * Write region vectors to log. The write happens using the space reservation 674 * Write region vectors to log. The write happens using the space reservation
653 * of the ticket (tic). It is not a requirement that all writes for a given 675 * of the ticket (tic). It is not a requirement that all writes for a given
654 * transaction occur with one call to xfs_log_write(). 676 * transaction occur with one call to xfs_log_write(). However, it is important
677 * to note that the transaction reservation code makes an assumption about the
678 * number of log headers a transaction requires that may be violated if you
679 * don't pass all the transaction vectors in one call....
655 */ 680 */
656int 681int
657xfs_log_write( 682xfs_log_write(
@@ -663,11 +688,15 @@ xfs_log_write(
663{ 688{
664 struct log *log = mp->m_log; 689 struct log *log = mp->m_log;
665 int error; 690 int error;
691 struct xfs_log_vec vec = {
692 .lv_niovecs = nentries,
693 .lv_iovecp = reg,
694 };
666 695
667 if (XLOG_FORCED_SHUTDOWN(log)) 696 if (XLOG_FORCED_SHUTDOWN(log))
668 return XFS_ERROR(EIO); 697 return XFS_ERROR(EIO);
669 698
670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); 699 error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
671 if (error) 700 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 701 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 return error; 702 return error;
@@ -1020,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1020 int i; 1049 int i;
1021 int iclogsize; 1050 int iclogsize;
1022 int error = ENOMEM; 1051 int error = ENOMEM;
1052 uint log2_size = 0;
1023 1053
1024 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1054 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1025 if (!log) { 1055 if (!log) {
@@ -1045,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp,
1045 1075
1046 error = EFSCORRUPTED; 1076 error = EFSCORRUPTED;
1047 if (xfs_sb_version_hassector(&mp->m_sb)) { 1077 if (xfs_sb_version_hassector(&mp->m_sb)) {
1048 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1078 log2_size = mp->m_sb.sb_logsectlog;
1049 if (log->l_sectbb_log < 0 || 1079 if (log2_size < BBSHIFT) {
1050 log->l_sectbb_log > mp->m_sectbb_log) { 1080 xlog_warn("XFS: Log sector size too small "
1051 xlog_warn("XFS: Log sector size (0x%x) out of range.", 1081 "(0x%x < 0x%x)", log2_size, BBSHIFT);
1052 log->l_sectbb_log);
1053 goto out_free_log; 1082 goto out_free_log;
1054 } 1083 }
1055 1084
1056 /* for larger sector sizes, must have v2 or external log */ 1085 log2_size -= BBSHIFT;
1057 if (log->l_sectbb_log != 0 && 1086 if (log2_size > mp->m_sectbb_log) {
1058 (log->l_logBBstart != 0 && 1087 xlog_warn("XFS: Log sector size too large "
1059 !xfs_sb_version_haslogv2(&mp->m_sb))) { 1088 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
1060 xlog_warn("XFS: log sector size (0x%x) invalid "
1061 "for configuration.", log->l_sectbb_log);
1062 goto out_free_log; 1089 goto out_free_log;
1063 } 1090 }
1064 if (mp->m_sb.sb_logsectlog < BBSHIFT) { 1091
1065 xlog_warn("XFS: Log sector log (0x%x) too small.", 1092 /* for larger sector sizes, must have v2 or external log */
1066 mp->m_sb.sb_logsectlog); 1093 if (log2_size && log->l_logBBstart > 0 &&
1094 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1095
1096 xlog_warn("XFS: log sector size (0x%x) invalid "
1097 "for configuration.", log2_size);
1067 goto out_free_log; 1098 goto out_free_log;
1068 } 1099 }
1069 } 1100 }
1070 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1101 log->l_sectBBsize = 1 << log2_size;
1071 1102
1072 xlog_get_iclog_buffer_size(mp, log); 1103 xlog_get_iclog_buffer_size(mp, log);
1073 1104
@@ -1147,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1147 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1148 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1149 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1150 return log; 1184 return log;
1151 1185
1152out_free_iclog: 1186out_free_iclog:
@@ -1174,26 +1208,31 @@ out:
1174 * ticket. Return the lsn of the commit record. 1208 * ticket. Return the lsn of the commit record.
1175 */ 1209 */
1176STATIC int 1210STATIC int
1177xlog_commit_record(xfs_mount_t *mp, 1211xlog_commit_record(
1178 xlog_ticket_t *ticket, 1212 struct log *log,
1179 xlog_in_core_t **iclog, 1213 struct xlog_ticket *ticket,
1180 xfs_lsn_t *commitlsnp) 1214 struct xlog_in_core **iclog,
1215 xfs_lsn_t *commitlsnp)
1181{ 1216{
1182 int error; 1217 struct xfs_mount *mp = log->l_mp;
1183 xfs_log_iovec_t reg[1]; 1218 int error;
1184 1219 struct xfs_log_iovec reg = {
1185 reg[0].i_addr = NULL; 1220 .i_addr = NULL,
1186 reg[0].i_len = 0; 1221 .i_len = 0,
1187 reg[0].i_type = XLOG_REG_TYPE_COMMIT; 1222 .i_type = XLOG_REG_TYPE_COMMIT,
1223 };
1224 struct xfs_log_vec vec = {
1225 .lv_niovecs = 1,
1226 .lv_iovecp = &reg,
1227 };
1188 1228
1189 ASSERT_ALWAYS(iclog); 1229 ASSERT_ALWAYS(iclog);
1190 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1230 error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1191 iclog, XLOG_COMMIT_TRANS))) { 1231 XLOG_COMMIT_TRANS);
1232 if (error)
1192 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1233 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1193 }
1194 return error; 1234 return error;
1195} /* xlog_commit_record */ 1235}
1196
1197 1236
1198/* 1237/*
1199 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1238 * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1468,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1468 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1469 int i; 1508 int i;
1470 1509
1510 xlog_cil_destroy(log);
1511
1471 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1472 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1473 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1510,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1510 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1511 * the reservation 1552 * the reservation
1512 */ 1553 */
1513STATIC void 1554void
1514xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1515{ 1558{
1516 uint i; 1559 uint i;
1517 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1611,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
 			    "bad-rtype" : res_type_str[r_type-1]),
 			    ticket->t_res_arr[i].r_len);
 	}
+
+	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+		"xfs_log_write: reservation ran out. Need to up reservation");
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+}
+
+/*
+ * Calculate the potential space needed by the log vector.  Each region gets
+ * its own xlog_op_header_t and may need to be double word aligned.
+ */
+static int
+xlog_write_calc_vec_length(
+	struct xlog_ticket	*ticket,
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+	int			headers = 0;
+	int			len = 0;
+	int			i;
+
+	/* acct for start rec of xact */
+	if (ticket->t_flags & XLOG_TIC_INITED)
+		headers++;
+
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		headers += lv->lv_niovecs;
+
+		for (i = 0; i < lv->lv_niovecs; i++) {
+			struct xfs_log_iovec	*vecp = &lv->lv_iovecp[i];
+
+			len += vecp->i_len;
+			xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+		}
+	}
+
+	ticket->t_res_num_ophdrs += headers;
+	len += headers * sizeof(struct xlog_op_header);
+
+	return len;
+}
+
+/*
+ * If this is the first write for the transaction, insert the start record.
+ * We can't be trying to commit if we are inited.  We can't have any
+ * "partial_copy" if we are inited.
+ */
+static int
+xlog_write_start_rec(
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket)
+{
+	if (!(ticket->t_flags & XLOG_TIC_INITED))
+		return 0;
+
+	ophdr->oh_tid	= cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_len = 0;
+	ophdr->oh_flags = XLOG_START_TRANS;
+	ophdr->oh_res2 = 0;
+
+	ticket->t_flags &= ~XLOG_TIC_INITED;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static xlog_op_header_t *
+xlog_write_setup_ophdr(
+	struct log		*log,
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket,
+	uint			flags)
+{
+	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_res2 = 0;
+
+	/* are we copying a commit or unmount record? */
+	ophdr->oh_flags = flags;
+
+	/*
+	 * We've seen logs corrupted with bad transaction client ids.  This
+	 * makes sure that XFS doesn't generate them on.  Turn this into an EIO
+	 * and shut down the filesystem.
+	 */
+	switch (ophdr->oh_clientid) {
+	case XFS_TRANSACTION:
+	case XFS_VOLUME:
+	case XFS_LOG:
+		break;
+	default:
+		xfs_fs_cmn_err(CE_WARN, log->l_mp,
+			"Bad XFS transaction clientid 0x%x in ticket 0x%p",
+			ophdr->oh_clientid, ticket);
+		return NULL;
+	}
+
+	return ophdr;
+}
+
+/*
+ * Set up the parameters of the region copy into the log.  This has to handle
+ * a region write split across multiple log buffers - this state is kept
+ * external to this function so that the code can be written in an obvious,
+ * self documenting manner.
+ */
+static int
+xlog_write_setup_copy(
+	struct xlog_ticket	*ticket,
+	struct xlog_op_header	*ophdr,
+	int			space_available,
+	int			space_required,
+	int			*copy_off,
+	int			*copy_len,
+	int			*last_was_partial_copy,
+	int			*bytes_consumed)
+{
+	int			still_to_copy;
+
+	still_to_copy = space_required - *bytes_consumed;
+	*copy_off = *bytes_consumed;
+
+	if (still_to_copy <= space_available) {
+		/* write of region completes here */
+		*copy_len = still_to_copy;
+		ophdr->oh_len = cpu_to_be32(*copy_len);
+		if (*last_was_partial_copy)
+			ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+		*last_was_partial_copy = 0;
+		*bytes_consumed = 0;
+		return 0;
+	}
+
+	/* partial write of region, needs extra log op header reservation */
+	*copy_len = space_available;
+	ophdr->oh_len = cpu_to_be32(*copy_len);
+	ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+	if (*last_was_partial_copy)
+		ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
+	*bytes_consumed += *copy_len;
+	(*last_was_partial_copy)++;
+
+	/* account for new log op header */
+	ticket->t_curr_res -= sizeof(struct xlog_op_header);
+	ticket->t_res_num_ophdrs++;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static int
+xlog_write_copy_finish(
+	struct log		*log,
+	struct xlog_in_core	*iclog,
+	uint			flags,
+	int			*record_cnt,
+	int			*data_cnt,
+	int			*partial_copy,
+	int			*partial_copy_len,
+	int			log_offset,
+	struct xlog_in_core	**commit_iclog)
+{
+	if (*partial_copy) {
+		/*
+		 * This iclog has already been marked WANT_SYNC by
+		 * xlog_state_get_iclog_space.
+		 */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+		return xlog_state_release_iclog(log, iclog);
+	}
+
+	*partial_copy = 0;
+	*partial_copy_len = 0;
+
+	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+		/* no more space in this iclog - push it. */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+
+		spin_lock(&log->l_icloglock);
+		xlog_state_want_sync(log, iclog);
+		spin_unlock(&log->l_icloglock);
+
+		if (!commit_iclog)
+			return xlog_state_release_iclog(log, iclog);
+		ASSERT(flags & XLOG_COMMIT_TRANS);
+		*commit_iclog = iclog;
+	}
+
+	return 0;
 }
 
 /*
@@ -1653,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
  * we don't update ic_offset until the end when we know exactly how many
  * bytes have been written out.
  */
-STATIC int
+int
 xlog_write(
-	struct xfs_mount	*mp,
-	struct xfs_log_iovec	reg[],
-	int			nentries,
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
 	struct xlog_ticket	*ticket,
 	xfs_lsn_t		*start_lsn,
 	struct xlog_in_core	**commit_iclog,
 	uint			flags)
 {
-	xlog_t		*log = mp->m_log;
-	xlog_in_core_t	*iclog = NULL;	 /* ptr to current in-core log */
-	xlog_op_header_t *logop_head;	 /* ptr to log operation header */
-	__psint_t	 ptr;		 /* copy address into data region */
-	int		 len;		 /* # xlog_write() bytes 2 still copy */
-	int		 index;		 /* region index currently copying */
-	int		 log_offset;	 /* offset (from 0) into data region */
-	int		 start_rec_copy; /* # bytes to copy for start record */
-	int		 partial_copy;	 /* did we split a region? */
-	int		 partial_copy_len;/* # bytes copied if split region */
-	int		 need_copy;	 /* # bytes need to memcpy this region */
-	int		 copy_len;	 /* # bytes actually memcpy'ing */
-	int		 copy_off;	 /* # bytes from entry start */
-	int		 contwr;	 /* continued write of in-core log? */
-	int		 error;
-	int		 record_cnt = 0, data_cnt = 0;
-
-	partial_copy_len = partial_copy = 0;
-
-	/* Calculate potential maximum space.  Each region gets its own
-	 * xlog_op_header_t and may need to be double word aligned.
-	 */
-	len = 0;
-	if (ticket->t_flags & XLOG_TIC_INITED) {    /* acct for start rec of xact */
-		len += sizeof(xlog_op_header_t);
-		ticket->t_res_num_ophdrs++;
-	}
-
-	for (index = 0; index < nentries; index++) {
-		len += sizeof(xlog_op_header_t);    /* each region gets >= 1 */
-		ticket->t_res_num_ophdrs++;
-		len += reg[index].i_len;
-		xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
-	}
-	contwr = *start_lsn = 0;
-
-	if (ticket->t_curr_res < len) {
-		xlog_print_tic_res(mp, ticket);
-#ifdef DEBUG
-		xlog_panic(
-	"xfs_log_write: reservation ran out. Need to up reservation");
-#else
-		/* Customer configurable panic */
-		xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
-	"xfs_log_write: reservation ran out. Need to up reservation");
-		/* If we did not panic, shutdown the filesystem */
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-#endif
-	} else
-		ticket->t_curr_res -= len;
-
-	for (index = 0; index < nentries; ) {
-		if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
-						  &contwr, &log_offset)))
-			return error;
-
-		ASSERT(log_offset <= iclog->ic_size - 1);
-		ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
-
-		/* start_lsn is the first lsn written to. That's all we need. */
-		if (! *start_lsn)
-			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
-
-		/* This loop writes out as many regions as can fit in the amount
-		 * of space which was allocated by xlog_state_get_iclog_space().
-		 */
-		while (index < nentries) {
-			ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
-			ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
-			start_rec_copy = 0;
-
-			/* If first write for transaction, insert start record.
-			 * We can't be trying to commit if we are inited.  We can't
-			 * have any "partial_copy" if we are inited.
-			 */
-			if (ticket->t_flags & XLOG_TIC_INITED) {
-				logop_head		= (xlog_op_header_t *)ptr;
-				logop_head->oh_tid	= cpu_to_be32(ticket->t_tid);
-				logop_head->oh_clientid	= ticket->t_clientid;
-				logop_head->oh_len	= 0;
-				logop_head->oh_flags	= XLOG_START_TRANS;
-				logop_head->oh_res2	= 0;
-				ticket->t_flags		&= ~XLOG_TIC_INITED;	/* clear bit */
-				record_cnt++;
-
-				start_rec_copy = sizeof(xlog_op_header_t);
-				xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
-			}
-
-			/* Copy log operation header directly into data section */
-			logop_head			= (xlog_op_header_t *)ptr;
-			logop_head->oh_tid		= cpu_to_be32(ticket->t_tid);
-			logop_head->oh_clientid		= ticket->t_clientid;
-			logop_head->oh_res2		= 0;
-
-			/* header copied directly */
-			xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
-
-			/* are we copying a commit or unmount record? */
-			logop_head->oh_flags = flags;
-
-			/*
-			 * We've seen logs corrupted with bad transaction client
-			 * ids.  This makes sure that XFS doesn't generate them on.
-			 * Turn this into an EIO and shut down the filesystem.
-			 */
-			switch (logop_head->oh_clientid)  {
-			case XFS_TRANSACTION:
-			case XFS_VOLUME:
-			case XFS_LOG:
-				break;
-			default:
-				xfs_fs_cmn_err(CE_WARN, mp,
-					"Bad XFS transaction clientid 0x%x in ticket 0x%p",
-					logop_head->oh_clientid, ticket);
-				return XFS_ERROR(EIO);
-			}
-
-			/* Partial write last time? => (partial_copy != 0)
-			 * need_copy is the amount we'd like to copy if everything could
-			 * fit in the current memcpy.
-			 */
-			need_copy = reg[index].i_len - partial_copy_len;
-
-			copy_off = partial_copy_len;
-			if (need_copy <= iclog->ic_size - log_offset) { /* complete write */
-				copy_len = need_copy;
-				logop_head->oh_len = cpu_to_be32(copy_len);
-				if (partial_copy)
-					logop_head->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
-				partial_copy_len = partial_copy = 0;
-			} else {					/* partial write */
-				copy_len = iclog->ic_size - log_offset;
-				logop_head->oh_len = cpu_to_be32(copy_len);
-				logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
-				if (partial_copy)
-					logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
-				partial_copy_len += copy_len;
-				partial_copy++;
-				len += sizeof(xlog_op_header_t); /* from splitting of region */
-				/* account for new log op header */
-				ticket->t_curr_res -= sizeof(xlog_op_header_t);
-				ticket->t_res_num_ophdrs++;
-			}
-			xlog_verify_dest_ptr(log, ptr);
-
-			/* copy region */
-			ASSERT(copy_len >= 0);
-			memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
-			xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
-
-			/* make copy_len total bytes copied, including headers */
-			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
-			record_cnt++;
-			data_cnt += contwr ? copy_len : 0;
-			if (partial_copy) {			/* copied partial region */
-				/* already marked WANT_SYNC by xlog_state_get_iclog_space */
-				xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-				record_cnt = data_cnt = 0;
-				if ((error = xlog_state_release_iclog(log, iclog)))
-					return error;
-				break;			/* don't increment index */
-			} else {				/* copied entire region */
-				index++;
-				partial_copy_len = partial_copy = 0;
-
-				if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
-					xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-					record_cnt = data_cnt = 0;
-					spin_lock(&log->l_icloglock);
-					xlog_state_want_sync(log, iclog);
-					spin_unlock(&log->l_icloglock);
-					if (commit_iclog) {
-						ASSERT(flags & XLOG_COMMIT_TRANS);
-						*commit_iclog = iclog;
-					} else if ((error = xlog_state_release_iclog(log, iclog)))
-						return error;
-					if (index == nentries)
-						return 0;	/* we are done */
-					else
-						break;
-				}
-			}	/* if (partial_copy) */
-		}	/* while (index < nentries) */
-	}	/* for (index = 0; index < nentries; ) */
-	ASSERT(len == 0);
+	struct xlog_in_core	*iclog = NULL;
+	struct xfs_log_iovec	*vecp;
+	struct xfs_log_vec	*lv;
+	int			len;
+	int			index;
+	int			partial_copy = 0;
+	int			partial_copy_len = 0;
+	int			contwr = 0;
+	int			record_cnt = 0;
+	int			data_cnt = 0;
+	int			error;
+
+	*start_lsn = 0;
+
+	len = xlog_write_calc_vec_length(ticket, log_vector);
+	if (log->l_cilp) {
+		/*
+		 * Region headers and bytes are already accounted for.
+		 * We only need to take into account start records and
+		 * split regions in this function.
+		 */
+		if (ticket->t_flags & XLOG_TIC_INITED)
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+		/*
+		 * Commit record headers need to be accounted for. These
+		 * come in as separate writes so are easy to detect.
+		 */
+		if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
+	} else
+		ticket->t_curr_res -= len;
+
+	if (ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, ticket);
+
+	index = 0;
+	lv = log_vector;
+	vecp = lv->lv_iovecp;
+	while (lv && index < lv->lv_niovecs) {
+		void		*ptr;
+		int		log_offset;
+
+		error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+						   &contwr, &log_offset);
+		if (error)
+			return error;
+
+		ASSERT(log_offset <= iclog->ic_size - 1);
+		ptr = iclog->ic_datap + log_offset;
+
+		/* start_lsn is the first lsn written to. That's all we need. */
+		if (!*start_lsn)
+			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+		/*
+		 * This loop writes out as many regions as can fit in the amount
+		 * of space which was allocated by xlog_state_get_iclog_space().
+		 */
+		while (lv && index < lv->lv_niovecs) {
+			struct xfs_log_iovec	*reg = &vecp[index];
+			struct xlog_op_header	*ophdr;
+			int			start_rec_copy;
+			int			copy_len;
+			int			copy_off;
+
+			ASSERT(reg->i_len % sizeof(__int32_t) == 0);
+			ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+
+			start_rec_copy = xlog_write_start_rec(ptr, ticket);
+			if (start_rec_copy) {
+				record_cnt++;
+				xlog_write_adv_cnt(&ptr, &len, &log_offset,
+						   start_rec_copy);
+			}
+
+			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+			if (!ophdr)
+				return XFS_ERROR(EIO);
+
+			xlog_write_adv_cnt(&ptr, &len, &log_offset,
+					   sizeof(struct xlog_op_header));
+
+			len += xlog_write_setup_copy(ticket, ophdr,
+						     iclog->ic_size-log_offset,
+						     reg->i_len,
+						     &copy_off, &copy_len,
+						     &partial_copy,
+						     &partial_copy_len);
+			xlog_verify_dest_ptr(log, ptr);
+
+			/* copy region */
+			ASSERT(copy_len >= 0);
+			memcpy(ptr, reg->i_addr + copy_off, copy_len);
+			xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
+
+			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+			record_cnt++;
+			data_cnt += contwr ? copy_len : 0;
+
+			error = xlog_write_copy_finish(log, iclog, flags,
+						       &record_cnt, &data_cnt,
+						       &partial_copy,
+						       &partial_copy_len,
+						       log_offset,
+						       commit_iclog);
+			if (error)
+				return error;
+
+			/*
+			 * if we had a partial copy, we need to get more iclog
+			 * space but we don't want to increment the region
+			 * index because there is still more in this region to
+			 * write.
+			 *
+			 * If we completed writing this region, and we flushed
+			 * the iclog (indicated by resetting of the record
+			 * count), then we also need to get more log space. If
+			 * this was the last record, though, we are done and
+			 * can just return.
+			 */
+			if (partial_copy)
+				break;
+
+			if (++index == lv->lv_niovecs) {
+				lv = lv->lv_next;
+				index = 0;
+				if (lv)
+					vecp = lv->lv_iovecp;
+			}
+			if (record_cnt == 0) {
+				if (!lv)
+					return 0;
+				break;
+			}
+		}
+	}
+
+	ASSERT(len == 0);
 
 	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-	if (commit_iclog) {
-		ASSERT(flags & XLOG_COMMIT_TRANS);
-		*commit_iclog = iclog;
-		return 0;
-	}
-	return xlog_state_release_iclog(log, iclog);
-}	/* xlog_write */
+	if (!commit_iclog)
+		return xlog_state_release_iclog(log, iclog);
 
+	ASSERT(flags & XLOG_COMMIT_TRANS);
+	*commit_iclog = iclog;
+	return 0;
+}
 
 /*****************************************************************************
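
The split-region state that xlog_write() used to juggle inline now lives in xlog_write_setup_copy(), which decides per pass whether a region completes in the current iclog or must be continued in the next one with a fresh op header. The following standalone C sketch models only that decision; it is illustrative, not part of the patch: setup_copy(), OPHDR_SIZE and the byte counts in main() are invented stand-ins for the kernel's function, sizeof(xlog_op_header_t) and real region sizes.

#include <stdio.h>

#define OPHDR_SIZE 16	/* stand-in for sizeof(xlog_op_header_t) */

/* Returns the extra reservation consumed: zero when the region completes,
 * one op header's worth when the copy is split and must be continued. */
static int setup_copy(int space_available, int space_required,
		      int *copy_len, int *bytes_consumed)
{
	int still_to_copy = space_required - *bytes_consumed;

	if (still_to_copy <= space_available) {
		*copy_len = still_to_copy;	/* region completes here */
		*bytes_consumed = 0;
		return 0;
	}
	*copy_len = space_available;		/* partial copy, continue later */
	*bytes_consumed += *copy_len;
	return OPHDR_SIZE;			/* continuation needs a new ophdr */
}

int main(void)
{
	int consumed = 0, copy_len, extra;

	/* a 300-byte region meets a 200-byte hole, then a fresh iclog */
	extra = setup_copy(200, 300, &copy_len, &consumed);
	printf("first pass:  copied %d, extra reservation %d\n", copy_len, extra);
	extra = setup_copy(32768, 300, &copy_len, &consumed);
	printf("second pass: copied %d, extra reservation %d\n", copy_len, extra);
	return 0;
}

The first call copies 200 bytes and charges one extra op header; the second copies the remaining 100 bytes and resets the split state, mirroring the CONTINUE/END flag handling above.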
@@ -2840,6 +3025,8 @@ _xfs_log_force(
 
 	XFS_STATS_INC(xs_log_force);
 
+	xlog_cil_push(log, 1);
+
 	spin_lock(&log->l_icloglock);
 
 	iclog = log->l_iclog;
@@ -2989,6 +3176,12 @@ _xfs_log_force_lsn(
 
 	XFS_STATS_INC(xs_log_force);
 
+	if (log->l_cilp) {
+		lsn = xlog_cil_push_lsn(log, lsn);
+		if (lsn == NULLCOMMITLSN)
+			return 0;
+	}
+
 try_again:
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
@@ -3153,20 +3346,30 @@ xfs_log_ticket_get(
 	return ticket;
 }
 
+xlog_tid_t
+xfs_log_get_trans_ident(
+	struct xfs_trans	*tp)
+{
+	return tp->t_ticket->t_tid;
+}
+
 /*
  * Allocate and initialise a new log ticket.
  */
-STATIC xlog_ticket_t *
-xlog_ticket_alloc(xlog_t *log,
-		int unit_bytes,
-		int cnt,
-		char client,
-		uint xflags)
+xlog_ticket_t *
+xlog_ticket_alloc(
+	struct log	*log,
+	int		unit_bytes,
+	int		cnt,
+	char		client,
+	uint		xflags,
+	int		alloc_flags)
 {
-	xlog_ticket_t	*tic;
+	struct xlog_ticket *tic;
 	uint		num_headers;
+	int		iclog_space;
 
-	tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
 	if (!tic)
 		return NULL;
 
@@ -3208,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log,
 	/* for start-rec */
 	unit_bytes += sizeof(xlog_op_header_t);
 
-	/* for LR headers */
-	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
+	/*
+	 * for LR headers - the space for data in an iclog is the size minus
+	 * the space used for the headers. If we use the iclog size, then we
+	 * undercalculate the number of headers required.
+	 *
+	 * Furthermore - the addition of op headers for split-recs might
+	 * increase the space required enough to require more log and op
+	 * headers, so take that into account too.
+	 *
+	 * IMPORTANT: This reservation makes the assumption that if this
+	 * transaction is the first in an iclog and hence has the LR headers
+	 * accounted to it, then the remaining space in the iclog is
+	 * exclusively for this transaction.  i.e. if the transaction is
+	 * larger than the iclog, it will be the only thing in that iclog.
+	 * Fundamentally, this means we must pass the entire log vector to
+	 * xlog_write to guarantee this.
+	 */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	num_headers = howmany(unit_bytes, iclog_space);
+
+	/* for split-recs - ophdrs added when data split over LRs */
+	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+	/* add extra header reservations if we overrun */
+	while (!num_headers ||
+	       howmany(unit_bytes, iclog_space) > num_headers) {
+		unit_bytes += sizeof(xlog_op_header_t);
+		num_headers++;
+	}
 	unit_bytes += log->l_iclog_hsize * num_headers;
 
 	/* for commit-rec LR header - note: padding will subsume the ophdr */
 	unit_bytes += log->l_iclog_hsize;
 
-	/* for split-recs - ophdrs added when data split over LRs */
-	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
-
 	/* for roundoff padding for transaction data and one for commit record */
 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
 	    log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3233,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log,
 	tic->t_curr_res		= unit_bytes;
 	tic->t_cnt		= cnt;
 	tic->t_ocnt		= cnt;
-	tic->t_tid		= (xlog_tid_t)((__psint_t)tic & 0xffffffff);
+	tic->t_tid		= random32();
 	tic->t_clientid		= client;
 	tic->t_flags		= XLOG_TIC_INITED;
 	tic->t_trans_type	= 0;
 	if (xflags & XFS_LOG_PERM_RESERV)
 		tic->t_flags |= XLOG_TIC_PERM_RESERV;
-	sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
+	sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
 
 	xlog_tic_reset_res(tic);
 
@@ -3260,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log,
  * part of the log in case we trash the log structure.
  */
 void
-xlog_verify_dest_ptr(xlog_t     *log,
-		     __psint_t  ptr)
+xlog_verify_dest_ptr(
+	struct log	*log,
+	char		*ptr)
 {
 	int i;
 	int good_ptr = 0;
 
-	for (i=0; i < log->l_iclog_bufs; i++) {
-		if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
-		    ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		if (ptr >= log->l_iclog_bak[i] &&
+		    ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
 			good_ptr++;
 	}
-	if (! good_ptr)
+
+	if (!good_ptr)
 		xlog_panic("xlog_verify_dest_ptr: invalid ptr");
-}	/* xlog_verify_dest_ptr */
+}
 
 STATIC void
 xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3459,6 +3688,11 @@ xlog_state_ioerror(
  *	c. nothing new gets queued up after (a) and (b) are done.
  *	d. if !logerror, flush the iclogs to disk, then seal them off
  *	   for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
  */
 int
 xfs_log_force_umount(
@@ -3492,6 +3726,16 @@ xfs_log_force_umount(
 		return 1;
 	}
 	retval = 0;
+
+	/*
+	 * Flush the in memory commit item list before marking the log as
+	 * being shut down. We need to do it in this order to ensure all the
+	 * completed transactions are flushed to disk with the xfs_log_force()
+	 * call below.
+	 */
+	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+		xlog_cil_push(log, 1);
+
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
 	 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
 #define __XFS_LOG_H__
 
 /* get lsn fields */
-
 #define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
 #define BLOCK_LSN(lsn) ((uint)(lsn))
 
@@ -110,6 +109,15 @@ typedef struct xfs_log_iovec {
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
 
+struct xfs_log_vec {
+	struct xfs_log_vec	*lv_next;	/* next lv in build list */
+	int			lv_niovecs;	/* number of iovecs in lv */
+	struct xfs_log_iovec	*lv_iovecp;	/* iovec array */
+	struct xfs_log_item	*lv_item;	/* owner */
+	char			*lv_buf;	/* formatted buffer */
+	int			lv_buf_len;	/* size of formatted buffer */
+};
+
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
@@ -126,6 +134,14 @@ typedef struct xfs_log_callback {
 struct xfs_mount;
 struct xlog_in_core;
 struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+struct xfs_trans;
+
+void	xfs_log_item_init(struct xfs_mount	*mp,
+			struct xfs_log_item	*item,
+			int			type,
+			struct xfs_item_ops	*ops);
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       struct xlog_ticket *ticket,
@@ -174,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
 
 void	  xlog_iodone(struct xfs_buf *);
 
-struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
+xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
+
+int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+				struct xfs_log_vec *log_vector,
+				xfs_lsn_t *commit_lsn, int flags);
+bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
 #endif
 
 
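The new struct xfs_log_vec is deliberately chain-friendly: each dirty log item gets one vector, and callers link them through lv_next so an entire transaction can be handed to xlog_write() in one call, which the ticket reservation logic above depends on. A hedged sketch of that shape follows; the types are stand-ins, not the kernel definitions, and prepend() is an invented helper rather than a patch function.

#include <stddef.h>

struct sk_log_vec {
	struct sk_log_vec	*lv_next;
	int			lv_niovecs;
	void			*lv_iovecp;
};

/* Prepend a header vector to an item chain, the way xlog_cil_push()
 * later links the checkpoint transaction header in front of lv_chain. */
static struct sk_log_vec *prepend(struct sk_log_vec *hdr,
				  struct sk_log_vec *chain)
{
	hdr->lv_next = chain;
	return hdr;		/* the writer walks hdr, then the chain */
}

int main(void)
{
	struct sk_log_vec item_b = { NULL, 1, NULL };
	struct sk_log_vec item_a = { &item_b, 1, NULL };
	struct sk_log_vec hdr = { NULL, 1, NULL };

	return prepend(&hdr, &item_a) == &hdr ? 0 : 1;
}
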
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+
+/*
+ * Perform initial CIL structure initialisation. If the CIL is not
+ * enabled in this filesystem, ensure the log->l_cilp is null so
+ * we can check this conditional to determine if we are doing delayed
+ * logging or not.
+ */
+int
+xlog_cil_init(
+	struct log	*log)
+{
+	struct xfs_cil	*cil;
+	struct xfs_cil_ctx *ctx;
+
+	log->l_cilp = NULL;
+	if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
+		return 0;
+
+	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	if (!cil)
+		return ENOMEM;
+
+	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	if (!ctx) {
+		kmem_free(cil);
+		return ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&cil->xc_cil);
+	INIT_LIST_HEAD(&cil->xc_committing);
+	spin_lock_init(&cil->xc_cil_lock);
+	init_rwsem(&cil->xc_ctx_lock);
+	sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+
+	INIT_LIST_HEAD(&ctx->committing);
+	INIT_LIST_HEAD(&ctx->busy_extents);
+	ctx->sequence = 1;
+	ctx->cil = cil;
+	cil->xc_ctx = ctx;
+
+	cil->xc_log = log;
+	log->l_cilp = cil;
+	return 0;
+}
+
+void
+xlog_cil_destroy(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	if (log->l_cilp->xc_ctx) {
+		if (log->l_cilp->xc_ctx->ticket)
+			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+		kmem_free(log->l_cilp->xc_ctx);
+	}
+
+	ASSERT(list_empty(&log->l_cilp->xc_cil));
+	kmem_free(log->l_cilp);
+}
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+	struct log	*log)
+{
+	struct xlog_ticket *tic;
+
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+				KM_SLEEP|KM_NOFS);
+	tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+	/*
+	 * set the current reservation to zero so we know to steal the basic
+	 * transaction overhead reservation from the first transaction commit.
+	 */
+	tic->t_curr_res = 0;
+	return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push. This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+	log->l_cilp->xc_ctx->sequence = 1;
+	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+							  log->l_curr_block);
+}
+
+/*
+ * Insert the log item into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ *
+ * If this is the first time the item is being placed into the CIL in this
+ * context, pin it so it can't be written to disk until the CIL is flushed to
+ * the iclog and the iclog written to disk.
+ */
+static void
+xlog_cil_insert(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xfs_log_item	*item,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	int			len;
+	int			diff_iovecs;
+	int			iclog_space;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		len = lv->lv_buf_len - old->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&item->li_cil));
+
+		len = lv->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	spin_lock(&cil->xc_cil_lock);
+	list_move_tail(&item->li_cil, &cil->xc_cil);
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * If this is the first time the item is being committed to the CIL,
+	 * store the sequence number on the log item so we can tell
+	 * in future commits whether this is the first checkpoint the item is
+	 * being committed into.
+	 */
+	if (!item->li_seq)
+		item->li_seq = ctx->sequence;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+/*
+ * Format log items into flat buffers.
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffers are then attached to the log items and inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer.  Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundaries without needing a change in the format of
+ * the item/region encapsulation.
+ *
+ * Hence what we need to do now is rewrite the vector array to point to the
+ * copied region inside the buffer we just allocated. This allows us to format
+ * the regions into the iclog as though they are being formatted directly out
+ * of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn)
+{
+	struct xfs_log_vec *lv;
+
+	if (start_lsn)
+		*start_lsn = log->l_cilp->xc_ctx->sequence;
+
+	ASSERT(log_vector);
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		void	*ptr;
+		int	index;
+		int	len = 0;
+
+		/* build the vector array and calculate its length */
+		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+		for (index = 0; index < lv->lv_niovecs; index++)
+			len += lv->lv_iovecp[index].i_len;
+
+		lv->lv_buf_len = len;
+		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		ptr = lv->lv_buf;
+
+		for (index = 0; index < lv->lv_niovecs; index++) {
+			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+			memcpy(ptr, vec->i_addr, vec->i_len);
+			vec->i_addr = ptr;
+			ptr += vec->i_len;
+		}
+		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+
+		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+	}
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+	void	*args,
+	int	abort)
+{
+	struct xfs_cil_ctx	*ctx = args;
+	struct xfs_log_vec	*lv;
+	int			abortflag = abort ? XFS_LI_ABORTED : 0;
+	struct xfs_busy_extent	*busyp, *n;
+
+	/* unpin all the log items */
+	for (lv = ctx->lv_chain; lv; lv = lv->lv_next) {
+		xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
+							abortflag);
+	}
+
+	list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+		xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+
+	spin_lock(&ctx->cil->xc_cil_lock);
+	list_del(&ctx->committing);
+	spin_unlock(&ctx->cil->xc_cil_lock);
+
+	xlog_cil_free_logvec(ctx->lv_chain);
+	kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If the push_now flag is not set,
+ * then it is a background flush and so we can choose to ignore it.
+ */
+int
+xlog_cil_push(
+	struct log	*log,
+	int		push_now)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*lv;
+	struct xfs_cil_ctx	*ctx;
+	struct xfs_cil_ctx	*new_ctx;
+	struct xlog_in_core	*commit_iclog;
+	struct xlog_ticket	*tic;
+	int			num_lv;
+	int			num_iovecs;
+	int			len;
+	int			error = 0;
+	struct xfs_trans_header thdr;
+	struct xfs_log_iovec	lhdr;
+	struct xfs_log_vec	lvhdr = { NULL };
+	xfs_lsn_t		commit_lsn;
+
+	if (!cil)
+		return 0;
+
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+	/* lock out transaction commit, but don't block on background push */
+	if (!down_write_trylock(&cil->xc_ctx_lock)) {
+		if (!push_now)
+			goto out_free_ticket;
+		down_write(&cil->xc_ctx_lock);
+	}
+	ctx = cil->xc_ctx;
+
+	/* check if we've anything to push */
+	if (list_empty(&cil->xc_cil))
+		goto out_skip;
+
+	/* check for spurious background flush */
+	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/*
+	 * pull all the log vectors off the items in the CIL, and
+	 * remove the items from the CIL. We don't need the CIL lock
+	 * here because it's only needed on the transaction commit
+	 * side which is currently locked out by the flush lock.
+	 */
+	lv = NULL;
+	num_lv = 0;
+	num_iovecs = 0;
+	len = 0;
+	while (!list_empty(&cil->xc_cil)) {
+		struct xfs_log_item	*item;
+		int			i;
+
+		item = list_first_entry(&cil->xc_cil,
+					struct xfs_log_item, li_cil);
+		list_del_init(&item->li_cil);
+		if (!ctx->lv_chain)
+			ctx->lv_chain = item->li_lv;
+		else
+			lv->lv_next = item->li_lv;
+		lv = item->li_lv;
+		item->li_lv = NULL;
+
+		num_lv++;
+		num_iovecs += lv->lv_niovecs;
+		for (i = 0; i < lv->lv_niovecs; i++)
+			len += lv->lv_iovecp[i].i_len;
+	}
+
+	/*
+	 * initialise the new context and attach it to the CIL. Then attach
+	 * the current context to the CIL committing list so it can be found
+	 * during log forces to extract the commit lsn of the sequence that
+	 * needs to be forced.
+	 */
+	INIT_LIST_HEAD(&new_ctx->committing);
+	INIT_LIST_HEAD(&new_ctx->busy_extents);
+	new_ctx->sequence = ctx->sequence + 1;
+	new_ctx->cil = cil;
+	cil->xc_ctx = new_ctx;
+
+	/*
+	 * The switch is now done, so we can drop the context lock and move out
+	 * of a shared context. We can't just go straight to the commit record,
+	 * though - we need to synchronise with previous and future commits so
+	 * that the commit records are correctly ordered in the log to ensure
+	 * that we process items during log IO completion in the correct order.
+	 *
+	 * For example, if we get an EFI in one checkpoint and the EFD in the
+	 * next (e.g. due to log forces), we do not want the checkpoint with
+	 * the EFD to be committed before the checkpoint with the EFI. Hence
+	 * we must strictly order the commit records of the checkpoints so
+	 * that: a) the checkpoint callbacks are attached to the iclogs in the
+	 * correct order; and b) the checkpoints are replayed in correct order
+	 * in log recovery.
+	 *
+	 * Hence we need to add this context to the committing context list so
+	 * that higher sequences will wait for us to write out a commit record
+	 * before they do.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+
+	/*
+	 * Build a checkpoint transaction header and write it to the log to
+	 * begin the transaction. We need to account for the space used by the
+	 * transaction header here as it is not accounted for in xlog_write().
+	 *
+	 * The LSN we need to pass to the log items on transaction commit is
+	 * the LSN reported by the first log vector write. If we use the commit
+	 * record lsn then we can move the tail beyond the grant write head.
+	 */
+	tic = ctx->ticket;
+	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+	thdr.th_type = XFS_TRANS_CHECKPOINT;
+	thdr.th_tid = tic->t_tid;
+	thdr.th_num_items = num_iovecs;
+	lhdr.i_addr = (xfs_caddr_t)&thdr;
+	lhdr.i_len = sizeof(xfs_trans_header_t);
+	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+	lvhdr.lv_niovecs = 1;
+	lvhdr.lv_iovecp = &lhdr;
+	lvhdr.lv_next = ctx->lv_chain;
+
+	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now that we've written the checkpoint into the log, strictly
+	 * order the commit records so replay will get them in the right order.
+	 */
+restart:
+	spin_lock(&cil->xc_cil_lock);
+	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+		/*
+		 * Higher sequences will wait for this one so skip them.
+		 * Don't wait for our own sequence, either.
+		 */
+		if (new_ctx->sequence >= ctx->sequence)
+			continue;
+		if (!new_ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+	}
+	spin_unlock(&cil->xc_cil_lock);
+
+	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+	if (error || commit_lsn == -1)
+		goto out_abort;
+
+	/* attach all the transactions w/ busy extents to iclog */
+	ctx->log_cb.cb_func = xlog_cil_committed;
+	ctx->log_cb.cb_arg = ctx;
+	error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now the checkpoint commit is complete and we've attached the
+	 * callbacks to the iclog we can assign the commit LSN to the context
+	 * and wake up anyone who is waiting for the commit to complete.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	ctx->commit_lsn = commit_lsn;
+	sv_broadcast(&cil->xc_commit_wait);
+	spin_unlock(&cil->xc_cil_lock);
+
+	/* release the hounds! */
+	return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+	up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+	xfs_log_ticket_put(new_ctx->ticket);
+	kmem_free(new_ctx);
+	return 0;
+
+out_abort:
+	xlog_cil_committed(ctx, XFS_LI_ABORTED);
+	return XFS_ERROR(EIO);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if an
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ */
+xfs_lsn_t
+xlog_cil_push_lsn(
+	struct log	*log,
+	xfs_lsn_t	push_seq)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx;
+	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
+
+restart:
+	down_write(&cil->xc_ctx_lock);
+	ASSERT(push_seq <= cil->xc_ctx->sequence);
+
+	/* check to see if we need to force out the current context */
+	if (push_seq == cil->xc_ctx->sequence) {
+		up_write(&cil->xc_ctx_lock);
+		xlog_cil_push(log, 1);
+		goto restart;
+	}
+
+	/*
+	 * See if we can find a previous sequence still committing.
+	 * We can drop the flush lock as soon as we have the cil lock
+	 * because we are now only comparing contexts protected by
+	 * the cil lock.
+	 *
+	 * We need to wait for all previous sequence commits to complete
+	 * before allowing the force of push_seq to go ahead. Hence block
+	 * on commits for those as well.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+	list_for_each_entry(ctx, &cil->xc_committing, committing) {
+		if (ctx->sequence > push_seq)
+			continue;
+		if (!ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+		if (ctx->sequence != push_seq)
+			continue;
+		/* found it! */
+		commit_lsn = ctx->commit_lsn;
+	}
+	spin_unlock(&cil->xc_cil_lock);
+	return commit_lsn;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+	struct xfs_log_item *lip)
+{
+	struct xfs_cil_ctx *ctx;
+
+	if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
+		return false;
+	if (list_empty(&lip->li_cil))
+		return false;
+
+	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+	/*
+	 * li_seq is written on the first commit of a log item to record the
+	 * first checkpoint it is written to. Hence if it is different to the
+	 * current sequence, we're in a new checkpoint.
+	 */
+	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+		return false;
+	return true;
+}
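
The "reservation stealing" in xlog_cil_insert() above is easy to sanity-check outside the kernel. The model below is an assumption-laden sketch: the ticket struct, the 512-byte unit reservation and the byte counts are all invented for illustration. The transfer logic, however, mirrors the function above: the first commit into a checkpoint donates the context ticket's unit reservation, and every commit then donates the bytes its formatted changes consume.

#include <stdio.h>

struct sk_ticket {
	int	t_curr_res;	/* bytes of reservation held */
	int	t_unit_res;	/* per-checkpoint overhead unit */
};

static int space_used;		/* models ctx->space_used */

static void steal(struct sk_ticket *trans, struct sk_ticket *ctx, int len)
{
	if (ctx->t_curr_res == 0) {
		/* first commit in this checkpoint: steal the header unit */
		ctx->t_curr_res = ctx->t_unit_res;
		trans->t_curr_res -= ctx->t_unit_res;
	}
	trans->t_curr_res -= len;	/* item bytes come off the trans ticket */
	space_used += len;
}

int main(void)
{
	struct sk_ticket trans = { .t_curr_res = 4096 };
	struct sk_ticket ctx = { .t_curr_res = 0, .t_unit_res = 512 };

	steal(&trans, &ctx, 1024);	/* donates 512 + 1024 */
	steal(&trans, &ctx, 256);	/* donates just 256 */

	/* prints: trans left 2304, ctx holds 512, cil space 1280 */
	printf("trans left %d, ctx holds %d, cil space %d\n",
	       trans.t_curr_res, ctx.t_curr_res, space_used);
	return 0;
}
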
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
 #define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
 #define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
 					   shutdown */
-typedef __uint32_t xlog_tid_t;
-
 
 #ifdef __KERNEL__
 /*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
 } xlog_in_core_t;
 
 /*
+ * The CIL context is used to aggregate per-transaction details as well as be
+ * passed to the iclog for checkpoint post-commit processing.  After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+	struct xfs_cil		*cil;
+	xfs_lsn_t		sequence;	/* chkpt sequence # */
+	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
+	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
+	struct xlog_ticket	*ticket;	/* chkpt ticket */
+	int			nvecs;		/* number of regions */
+	int			space_used;	/* aggregate size of regions */
+	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
+	xfs_log_callback_t	log_cb;		/* completion callback hook. */
+	struct list_head	committing;	/* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+	struct log		*xc_log;
+	struct list_head	xc_cil;
+	spinlock_t		xc_cil_lock;
+	struct xfs_cil_ctx	*xc_ctx;
+	struct rw_semaphore	xc_ctx_lock;
+	struct list_head	xc_committing;
+	sv_t			xc_commit_wait;
+};
+
+/*
+ * The amount of log space we should allow the CIL to aggregate is difficult
+ * to size. Whatever we choose, we have to make sure we can get a reservation
+ * for the log space effectively, that it is large enough to capture
+ * sufficient relogging to reduce log buffer IO significantly, but it is not
+ * too large for the log or induces too much latency when writing out through
+ * the iclogs. We track both space consumed and the number of vectors in the
+ * checkpoint context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need
+ * to specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Hence the initial maximum size for the checkpoint transaction will be set
+ * to a quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary
+ * limit right now based on the latency of writing out a large amount of data
+ * through the circular iclog buffers.
+ */
+
+#define XLOG_CIL_SPACE_LIMIT(log)	\
+	(min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+
+/*
  * The reservation head lsn is not made up of a cycle number and block number.
  * Instead, it uses a cycle number and byte number.  Logs don't expect to
  * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
 	/* The following fields don't need locking */
 	struct xfs_mount	*l_mp;	        /* mount point */
 	struct xfs_ail		*l_ailp;	/* AIL log is working with */
+	struct xfs_cil		*l_cilp;	/* CIL log is working with */
 	struct xfs_buf		*l_xbuf;	/* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
@@ -396,9 +488,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 488 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 489 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 490 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 491 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 492 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 493 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 494 int l_iclog_bufs; /* number of iclog buffers */
@@ -440,14 +530,40 @@ typedef struct log {
440 530
441#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
442 532
443
444/* common routines */ 533/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
447extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 538
450extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
544
545static inline void
546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
547{
548 *ptr += bytes;
549 *len -= bytes;
550 *off += bytes;
551}
552
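A hedged usage sketch of the cursor-advance helper above, as a self-contained userspace mock (the buffer and byte counts are invented; the kernel version relies on the GCC extension of void-pointer arithmetic, made explicit here with a cast):

#include <stddef.h>
#include <stdio.h>

/* Same semantics as xlog_write_adv_cnt(): advance the write cursor,
 * shrink the remaining length and grow the record offset in lockstep. */
static inline void adv_cnt(void **ptr, int *len, int *off, size_t bytes)
{
	*ptr = (char *)*ptr + bytes;
	*len -= bytes;
	*off += bytes;
}

int main(void)
{
	char buf[64];
	void *p = buf;
	int len = sizeof(buf), off = 0;

	adv_cnt(&p, &len, &off, 16);	/* pretend we copied a 16-byte region */
	printf("remaining=%d offset=%d\n", len, off);	/* remaining=48 offset=16 */
	return 0;
}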
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
451 567
452/* 568/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 56#define xlog_recover_check_summary(log)
57#endif 57#endif
58 58
59
60/* 59/*
61 * Sector aligned buffer routines for buffer create/read/write/access 60 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 61 */
63 62
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 63/*
 65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 64 * Verify that the given count of basic blocks is a valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 65 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68 68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
77/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
69STATIC xfs_buf_t * 82STATIC xfs_buf_t *
70xlog_get_bp( 83xlog_get_bp(
71 xlog_t *log, 84 xlog_t *log,
72 int nbblks) 85 int nbblks)
73{ 86{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 87 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 89 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 91 return NULL;
79 } 92 }
80 93
81 if (log->l_sectbb_log) { 94 /*
82 if (nbblks > 1) 95 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 96 * multiple of the basic block size), so we round up the
 84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 97 * requested size to accommodate the basic blocks required
85 } 98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
 108 * there's space to accommodate this possibility.
109 */
110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
113
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 115}
88 116
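To make the sector-rounding logic above concrete, a small standalone sketch of the buffer sizing (plain C with a stand-in round_up; the sector and block counts are example values):

#include <stdio.h>

/* Stand-in for the kernel's round_up(); this version works for any
 * positive multiple, while the kernel macro requires a power of two. */
#define round_up(x, to)	((((x) + (to) - 1) / (to)) * (to))

/* Model of the xlog_get_bp() sizing: pad multi-block reads by one
 * log sector to cover unaligned offsets, then round to whole sectors. */
static int bp_size_bblks(int nbblks, int sect_bbsize)
{
	if (nbblks > 1 && sect_bbsize > 1)
		nbblks += sect_bbsize;
	return round_up(nbblks, sect_bbsize);
}

int main(void)
{
	/* e.g. a 5-block read on a log with 4BB (2KB) sectors */
	printf("%d\n", bp_size_bblks(5, 4));	/* 5 + 4, rounded up -> 12 */
	/* a single-block read never straddles a sector: no padding */
	printf("%d\n", bp_size_bblks(1, 4));	/* rounds up to 4 */
	return 0;
}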
@@ -93,6 +121,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 121 xfs_buf_free(bp);
94} 122}
95 123
124/*
125 * Return the address of the start of the given block number's data
126 * in a log buffer. The buffer covers a log sector-aligned region.
127 */
96STATIC xfs_caddr_t 128STATIC xfs_caddr_t
97xlog_align( 129xlog_align(
98 xlog_t *log, 130 xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
100 int nbblks, 132 int nbblks,
101 xfs_buf_t *bp) 133 xfs_buf_t *bp)
102{ 134{
135 xfs_daddr_t offset;
103 xfs_caddr_t ptr; 136 xfs_caddr_t ptr;
104 137
105 if (!log->l_sectbb_log) 138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
106 return XFS_BUF_PTR(bp); 139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
107 142
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr; 143 return ptr;
112} 144}
113 145
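The alignment above relies on l_sectBBsize being a power of two, so masking with (size - 1) yields the offset within a sector; a tiny illustrative sketch (values invented):

#include <stdio.h>

int main(void)
{
	long sect_bbsize = 4;	/* power-of-2 sector size in basic blocks */
	long blk_no = 123;

	/* offset of blk_no within its sector, as xlog_align() computes it */
	long offset = blk_no & (sect_bbsize - 1);	/* 123 mod 4 == 3 */

	printf("offset in sector: %ld\n", offset);
	return 0;
}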
@@ -124,21 +156,18 @@ xlog_bread_noalign(
124{ 156{
125 int error; 157 int error;
126 158
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 159 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 160 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 161 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 162 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 163 return EFSCORRUPTED;
132 } 164 }
133 165
134 if (log->l_sectbb_log) { 166 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 167 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 168
139 ASSERT(nbblks > 0); 169 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 170 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 171
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 172 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 173 XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
186{ 215{
187 int error; 216 int error;
188 217
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 218 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 219 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 220 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 221 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 222 return EFSCORRUPTED;
194 } 223 }
195 224
196 if (log->l_sectbb_log) { 225 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 226 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 227
201 ASSERT(nbblks > 0); 228 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 229 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
327{ 354{
328 xfs_caddr_t offset; 355 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 356 xfs_daddr_t mid_blk;
357 xfs_daddr_t end_blk;
330 uint mid_cycle; 358 uint mid_cycle;
331 int error; 359 int error;
332 360
333 mid_blk = BLK_AVG(first_blk, *last_blk); 361 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 362 mid_blk = BLK_AVG(first_blk, end_blk);
363 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 364 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 365 if (error)
337 return error; 366 return error;
338 mid_cycle = xlog_get_cycle(offset); 367 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 368 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 369 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 370 else
342 } else { 371 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 372 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 373 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 374 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 375 (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377 *last_blk = end_blk;
350 378
351 return 0; 379 return 0;
352} 380}
353 381
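A userspace model of the binary search above may help: cycles[] stands in for reading one block at a time with xlog_bread(), and the loop mirrors the first/end narrowing (the on-disk layout shown is hypothetical):

#include <stdio.h>

#define BLK_AVG(a, b)	(((a) + (b)) >> 1)

/* Model of xlog_find_cycle_start(): binary-search for the first block
 * whose cycle number equals 'cycle'. */
static long find_cycle_start(const unsigned *cycles, long first_blk,
			     long end_blk, unsigned cycle)
{
	long mid_blk = BLK_AVG(first_blk, end_blk);

	while (mid_blk != first_blk && mid_blk != end_blk) {
		if (cycles[mid_blk] == cycle)
			end_blk = mid_blk;	/* match: look earlier */
		else
			first_blk = mid_blk;	/* no match: look later */
		mid_blk = BLK_AVG(first_blk, end_blk);
	}
	return end_blk;
}

int main(void)
{
	/* the cycle number drops from 2 to 1 at block 5 */
	unsigned cycles[] = { 2, 2, 2, 2, 2, 1, 1, 1 };

	printf("%ld\n", find_cycle_start(cycles, 0, 7, 1));	/* prints 5 */
	return 0;
}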
354/* 382/*
355 * Check that the range of blocks does not contain the cycle number 383 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 384 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 385 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 386 * block in the range. The scan needs to occur from front to back
359 * last block number. 387 * and the pointer into the region must be updated since a later
360 * 388 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 389 */
364STATIC int 390STATIC int
365xlog_find_verify_cycle( 391xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 402 xfs_caddr_t buf = NULL;
377 int error = 0; 403 int error = 0;
378 404
405 /*
406 * Greedily allocate a buffer big enough to handle the full
407 * range of basic blocks we'll be examining. If that fails,
408 * try a smaller size. We need to be able to read at least
409 * a log sector, or we're out of luck.
410 */
379 bufblks = 1 << ffs(nbblks); 411 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 412 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 413 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 414 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 415 return ENOMEM;
386 } 416 }
387 417
@@ -629,7 +659,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 659 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 660 * number matching last_half_cycle. We expect the log to be
631 * some variation on 661 * some variation on
632 * x + 1 ... | x ... 662 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 663 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 664 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 665 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 669 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 670 * at the end of the log. The cases we're looking for look
641 * like 671 * like
642 * x + 1 ... | x | x + 1 | x ... 672 * v binary search stopped here
643 * ^ binary search stopped here 673 * x + 1 ... | x | x + 1 | x ... | x
674 * ^ but we want to locate this spot
644 * or 675 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 676 * <---------> less than scan distance
677 * x + 1 ... | x ... | x - 1 | x
678 * ^ we want to locate this spot
647 */ 679 */
648 stop_on_cycle = last_half_cycle; 680 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 681 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 731 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 732 * last_half_cycle-1 we accomplish that.
701 */ 733 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 734 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 735 (xfs_daddr_t) num_scan_bblks >= head_blk);
736 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 737 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 738 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 739 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 740 goto bp_err;
709 if (new_blk != -1) { 741 if (new_blk != -1) {
710 head_blk = new_blk; 742 head_blk = new_blk;
711 goto bad_blk; 743 goto validate_head;
712 } 744 }
713 745
714 /* 746 /*
@@ -726,7 +758,7 @@ xlog_find_head(
726 head_blk = new_blk; 758 head_blk = new_blk;
727 } 759 }
728 760
729 bad_blk: 761validate_head:
730 /* 762 /*
731 * Now we need to make sure head_blk is not pointing to a block in 763 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 764 * the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 780 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 781 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 782 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 783 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 784 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 785 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 786 (xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +865,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 865 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 866 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 867 if (error)
836 goto bread_err; 868 goto done;
837 869
838 if (xlog_get_cycle(offset) == 0) { 870 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 871 *tail_blk = 0;
840 /* leave all other log inited values alone */ 872 /* leave all other log inited values alone */
841 goto exit; 873 goto done;
842 } 874 }
843 } 875 }
844 876
@@ -849,7 +881,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 881 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 882 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 883 if (error)
852 goto bread_err; 884 goto done;
853 885
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 886 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 887 found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 898 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 899 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 900 if (error)
869 goto bread_err; 901 goto done;
870 902
871 if (XLOG_HEADER_MAGIC_NUM == 903 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 904 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 973 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 974 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 975 if (error)
944 goto bread_err; 976 goto done;
945 977
946 op_head = (xlog_op_header_t *)offset; 978 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 979 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1019 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1020 * We can't recover this device anyway, so it won't matter.
989 */ 1021 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1022 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1023 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1024
994bread_err: 1025done:
995exit:
996 xlog_put_bp(bp); 1026 xlog_put_bp(bp);
997 1027
998 if (error) 1028 if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1182 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1183 xfs_buf_t *bp;
1154 int balign, ealign; 1184 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1185 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1186 int end_block = start_block + blocks;
1157 int bufblks; 1187 int bufblks;
1158 int error = 0; 1188 int error = 0;
1159 int i, j = 0; 1189 int i, j = 0;
1160 1190
1191 /*
1192 * Greedily allocate a buffer big enough to handle the full
1193 * range of basic blocks to be written. If that fails, try
1194 * a smaller size. We need to be able to write at least a
1195 * log sector, or we're out of luck.
1196 */
1161 bufblks = 1 << ffs(blocks); 1197 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1198 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1199 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1200 if (bufblks < sectbb)
1165 return ENOMEM; 1201 return ENOMEM;
1166 } 1202 }
1167 1203
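A rough standalone model of the greedy allocation fallback above (try_alloc() is an invented stand-in for xlog_get_bp(); the memory threshold is arbitrary):

#include <stdio.h>
#include <strings.h>	/* ffs() */

/* Invented stand-in for xlog_get_bp(): pretend only buffers of 64
 * blocks or fewer can be allocated. */
static int try_alloc(int bufblks)
{
	return bufblks <= 64;
}

/* Model of the greedy sizing: start from 1 << ffs(blocks), halve on
 * allocation failure, give up below one log sector. */
static int pick_bufblks(int blocks, int sectbb)
{
	int bufblks = 1 << ffs(blocks);

	while (!try_alloc(bufblks)) {
		bufblks >>= 1;
		if (bufblks < sectbb)
			return -1;	/* the kernel code returns ENOMEM */
	}
	return bufblks;
}

int main(void)
{
	printf("%d\n", pick_bufblks(200, 4));	/* 1 << ffs(200) = 16 */
	return 0;
}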
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1205 * the buffer in the starting sector not covered by the first
1170 * write below. 1206 * write below.
1171 */ 1207 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1208 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1209 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1210 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1211 if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1224 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1225 * If this is the same sector as the above read, skip it.
1190 */ 1226 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1227 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1228 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1229 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1230 balign = BBTOB(ealign - start_block);
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
1408 1444
1409STATIC int 1445STATIC int
1410xlog_recover_add_to_cont_trans( 1446xlog_recover_add_to_cont_trans(
1447 struct log *log,
1411 xlog_recover_t *trans, 1448 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1449 xfs_caddr_t dp,
1413 int len) 1450 int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1471 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1472 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1473 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1475 return 0;
1438} 1476}
1439 1477
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1490 */
1453STATIC int 1491STATIC int
1454xlog_recover_add_to_trans( 1492xlog_recover_add_to_trans(
1493 struct log *log,
1455 xlog_recover_t *trans, 1494 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1495 xfs_caddr_t dp,
1457 int len) 1496 int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1549 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1550 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1551 item->ri_cnt++;
1552 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1553 return 0;
1514} 1554}
1515 1555
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
1521 */ 1561 */
1522STATIC int 1562STATIC int
1523xlog_recover_reorder_trans( 1563xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1564 struct log *log,
1565 xlog_recover_t *trans,
1566 int pass)
1525{ 1567{
1526 xlog_recover_item_t *item, *n; 1568 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1569 LIST_HEAD(sort_list);
@@ -1534,7 +1576,9 @@ xlog_recover_reorder_trans(
1534 1576
1535 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1583 break;
1540 } 1584 }
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1587 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1588 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1589 case XFS_LI_EFI:
1590 trace_xfs_log_recover_item_reorder_tail(log,
1591 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1592 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1593 break;
1548 default: 1594 default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1638 /*
1593 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1594 */ 1640 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1643 return;
1644 }
1597 1645
1598 /* 1646 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1647 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1675 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1676 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1677 nextp->bc_refcount++;
1678 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1679 return;
1631 } 1680 }
1632 prevp = nextp; 1681 prevp = nextp;
@@ -1640,13 +1689,14 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1689 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1690 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1691 prevp->bc_next = bcp;
1692 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1693}
1644 1694
1645/* 1695/*
1646 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1647 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1648 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1649 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1650 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1651 * if this is the last reference. 1701 * if this is the last reference.
1652 * 1702 *
@@ -1671,7 +1721,7 @@ xlog_check_buffer_cancelled(
1671 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1672 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1673 */ 1723 */
1674 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1675 return 0; 1725 return 0;
1676 } 1726 }
1677 1727
@@ -1683,7 +1733,7 @@ xlog_check_buffer_cancelled(
1683 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1684 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1685 */ 1735 */
1686 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1687 return 0; 1737 return 0;
1688 } 1738 }
1689 1739
@@ -1702,7 +1752,7 @@ xlog_check_buffer_cancelled(
1702 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1703 * last reference. 1753 * last reference.
1704 */ 1754 */
1705 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1706 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1707 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1708 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1722,7 +1772,7 @@ xlog_check_buffer_cancelled(
1722 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1723 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1724 */ 1774 */
1725 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0; 1776 return 0;
1727} 1777}
1728 1778
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1829 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1830 unsigned int map_size = 0;
1781 1831
1832 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1782 switch (buf_f->blf_type) { 1834 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1835 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1836 data_map = buf_f->blf_data_map;
@@ -1822,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1822 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1823 bit); 1875 bit);
1824 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1825 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1826 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1827 item_index++; 1879 item_index++;
1828 } 1880 }
1829 1881
@@ -1837,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1837 } 1889 }
1838 1890
1839 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1840 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1841 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1842 1894
1843 /* 1895 /*
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1926/*ARGSUSED*/
1875STATIC void 1927STATIC void
1876xlog_recover_do_reg_buffer( 1928xlog_recover_do_reg_buffer(
1929 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1930 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1931 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1932 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1938 unsigned int map_size = 0;
1886 int error; 1939 int error;
1887 1940
1941 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1888 switch (buf_f->blf_type) { 1943 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1944 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1945 data_map = buf_f->blf_data_map;
@@ -1900,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1900 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1901 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1902 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1903 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1904 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1905 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1906 1961
1907 /* 1962 /*
1908 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1911,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1911 */ 1966 */
1912 error = 0; 1967 error = 0;
1913 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1914 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1915 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1916 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1917 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1932,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1932 } 1987 }
1933 1988
1934 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1935 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1936 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1937 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1938 next: 1993 next:
1939 i++; 1994 i++;
1940 bit += nbits; 1995 bit += nbits;
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2138{
2084 uint type; 2139 uint type;
2085 2140
2141 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2086 /* 2143 /*
2087 * Filesystems are required to send in quota flags at mount time. 2144 * Filesystems are required to send in quota flags at mount time.
2088 */ 2145 */
@@ -2091,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2091 } 2148 }
2092 2149
2093 type = 0; 2150 type = 0;
2094 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2095 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2096 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2097 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2098 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2099 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2100 /* 2157 /*
2101 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2160 if (log->l_quotaoffs_flag & type)
2104 return; 2161 return;
2105 2162
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2163 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2164}
2108 2165
2109/* 2166/*
@@ -2116,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2116 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2117 * 2174 *
2118 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2119 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2120 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2121 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2122 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2150,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2150 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2151 /* 2208 /*
2152 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2153 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2154 */ 2211 */
2155 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2156 return 0; 2213 return 0;
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2221 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2222 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2223 if (cancel) {
2224 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2225 return 0;
2168 } 2226 }
2169 } 2227 }
2228 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2229 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2230 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2231 blkno = buf_f->blf_blkno;
@@ -2185,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2185 2244
2186 mp = log->l_mp; 2245 mp = log->l_mp;
2187 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2188 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2189 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2190 2249
2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2198,13 +2257,13 @@ xlog_recover_do_buffer_trans(
2198 } 2257 }
2199 2258
2200 error = 0; 2259 error = 0;
2201 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2202 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2203 } else if (flags & 2262 } else if (flags &
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2265 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2267 }
2209 if (error) 2268 if (error)
2210 return XFS_ERROR(error); 2269 return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2343 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2344 in_f->ilf_len, 0)) {
2286 error = 0; 2345 error = 0;
2346 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2347 goto error;
2288 } 2348 }
2349 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2350
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2351 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2352 XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2398 /* do nothing */
2338 } else { 2399 } else {
2339 xfs_buf_relse(bp); 2400 xfs_buf_relse(bp);
2401 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2402 error = 0;
2341 goto error; 2403 goto error;
2342 } 2404 }
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2820 int error = 0;
2759 xlog_recover_item_t *item; 2821 xlog_recover_item_t *item;
2760 2822
2761 error = xlog_recover_reorder_trans(trans); 2823 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2824 if (error)
2763 return error; 2825 return error;
2764 2826
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2827 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2829 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2830 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2831 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2982 error = xlog_recover_unmount_trans(trans);
2920 break; 2983 break;
2921 case XLOG_WAS_CONT_TRANS: 2984 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2985 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2986 trans, dp,
2987 be32_to_cpu(ohead->oh_len));
2924 break; 2988 break;
2925 case XLOG_START_TRANS: 2989 case XLOG_START_TRANS:
2926 xlog_warn( 2990 xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
2930 break; 2994 break;
2931 case 0: 2995 case 0:
2932 case XLOG_CONTINUE_TRANS: 2996 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2997 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2998 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2999 break;
2936 default: 3000 default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
3331 } 3395 }
3332} 3396}
3333 3397
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
3370STATIC void 3398STATIC void
3371xlog_unpack_data( 3399xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3400 xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3418 dp += BBSIZE;
3391 } 3419 }
3392 } 3420 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3421}
3396 3422
3397STATIC int 3423STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3516 hblks = 1;
3491 } 3517 }
3492 } else { 3518 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3519 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3520 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3521 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3522 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3972 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3973 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3974 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3975 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3976 __uint64_t freeblks;
3955 __uint64_t itotal; 3977 __uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 4006 xfs_buf_relse(agibp);
3985 } 4007 }
3986 } 4008 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 4009}
4013#endif /* DEBUG */ 4010#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..d7bf38c8cd1c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1405 xfs_qm_mount_quotas(mp);
1406 } 1406 }
1407 1407
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1408 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1409 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1410 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,24 +44,14 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47 47#include "xfs_trace.h"
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56 48
57kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
58 50
59
60/* 51/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 52 * Reservation functions here avoid a huge stack in xfs_trans_init
62 * due to register overflow from temporaries in the calculations. 53 * due to register overflow from temporaries in the calculations.
63 */ 54 */
64
65STATIC uint 55STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 56xfs_calc_write_reservation(xfs_mount_t *mp)
67{ 57{
@@ -254,13 +244,30 @@ _xfs_trans_alloc(
254 tp->t_type = type; 244 tp->t_type = type;
255 tp->t_mountp = mp; 245 tp->t_mountp = mp;
256 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
257 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
258 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
259 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
260 return tp; 249 return tp;
261} 250}
262 251
263/* 252/*
253 * Free the transaction structure. If there is more clean up
254 * to do when the structure is freed, add it here.
255 */
256STATIC void
257xfs_trans_free(
258 struct xfs_trans *tp)
259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
265 atomic_dec(&tp->t_mountp->m_active_trans);
266 xfs_trans_free_dqinfo(tp);
267 kmem_zone_free(xfs_trans_zone, tp);
268}
269
270/*
264 * This is called to create a new transaction which will share the 271 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 272 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 273 * unused block and rt extent reservations are also inherited. This
@@ -283,9 +290,8 @@ xfs_trans_dup(
283 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
284 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
285 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
286 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
287 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
288 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
289 295
290 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
291 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -421,7 +427,6 @@ undo_blocks:
421 return error; 427 return error;
422} 428}
423 429
424
425/* 430/*
426 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
427 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -650,7 +655,7 @@ xfs_trans_apply_sb_deltas(
650 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
651 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
652 */ 657 */
653STATIC void 658void
654xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
655 xfs_trans_t *tp) 660 xfs_trans_t *tp)
656{ 661{
@@ -764,94 +769,256 @@ xfs_trans_unreserve_and_mod_sb(
764 } 769 }
765} 770}
766 771
772/*
773 * Total up the number of log iovecs needed to commit this
774 * transaction. The transaction itself needs one for the
775 * transaction header. Ask each dirty item in turn how many
776 * it needs to get the total.
777 */
778static uint
779xfs_trans_count_vecs(
780 struct xfs_trans *tp)
781{
782 int nvecs;
783 xfs_log_item_desc_t *lidp;
784
785 nvecs = 1;
786 lidp = xfs_trans_first_item(tp);
787 ASSERT(lidp != NULL);
788
 789	/* In the non-debug case we need to bail out if we didn't
 790	 * find a log item here: return zero and let trans_commit
 791	 * deal with it.
792 */
793 if (lidp == NULL)
794 return 0;
795
796 while (lidp != NULL) {
797 /*
798 * Skip items which aren't dirty in this transaction.
799 */
800 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
801 lidp = xfs_trans_next_item(tp, lidp);
802 continue;
803 }
804 lidp->lid_size = IOP_SIZE(lidp->lid_item);
805 nvecs += lidp->lid_size;
806 lidp = xfs_trans_next_item(tp, lidp);
807 }
808
809 return nvecs;
810}
767 811
768/* 812/*
769 * xfs_trans_commit 813 * Fill in the vector with pointers to data to be logged
814 * by this transaction. The transaction header takes
815 * the first vector, and then each dirty item takes the
816 * number of vectors it indicated it needed in xfs_trans_count_vecs().
770 * 817 *
771 * Commit the given transaction to the log a/synchronously. 818 * As each item fills in the entries it needs, also pin the item
819 * so that it cannot be flushed out until the log write completes.
820 */
821static void
822xfs_trans_fill_vecs(
823 struct xfs_trans *tp,
824 struct xfs_log_iovec *log_vector)
825{
826 xfs_log_item_desc_t *lidp;
827 struct xfs_log_iovec *vecp;
828 uint nitems;
829
830 /*
831 * Skip over the entry for the transaction header, we'll
832 * fill that in at the end.
833 */
834 vecp = log_vector + 1;
835
836 nitems = 0;
837 lidp = xfs_trans_first_item(tp);
838 ASSERT(lidp);
839 while (lidp) {
840 /* Skip items which aren't dirty in this transaction. */
841 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
842 lidp = xfs_trans_next_item(tp, lidp);
843 continue;
844 }
845
846 /*
 847		 * An item may be marked dirty yet log nothing; this is used
 848		 * to get a callback when the transaction commits.
849 */
850 if (lidp->lid_size)
851 nitems++;
852 IOP_FORMAT(lidp->lid_item, vecp);
853 vecp += lidp->lid_size;
854 IOP_PIN(lidp->lid_item);
855 lidp = xfs_trans_next_item(tp, lidp);
856 }
857
858 /*
859 * Now that we've counted the number of items in this transaction, fill
860 * in the transaction header. Note that the transaction header does not
861 * have a log item.
862 */
863 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
864 tp->t_header.th_type = tp->t_type;
865 tp->t_header.th_num_items = nitems;
866 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
867 log_vector->i_len = sizeof(xfs_trans_header_t);
868 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
869}
870
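A toy model of the count-then-fill pattern above: one vector for the transaction header plus however many each dirty item reports (the item layout and counts are invented for illustration):

#include <stdio.h>

/* Invented item descriptor: dirty flag plus the number of iovecs the
 * item said it needs (what IOP_SIZE() reports in the kernel). */
struct item {
	int	dirty;
	int	nvecs;
};

/* One vector for the transaction header, plus the vectors of every
 * dirty item -- the same total xfs_trans_count_vecs() computes. */
static int count_vecs(const struct item *items, int n)
{
	int nvecs = 1;

	for (int i = 0; i < n; i++)
		if (items[i].dirty)
			nvecs += items[i].nvecs;
	return nvecs;
}

int main(void)
{
	struct item items[] = { { 1, 2 }, { 0, 3 }, { 1, 1 } };

	printf("%d\n", count_vecs(items, 3));	/* 1 + 2 + 1 = 4 */
	return 0;
}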
871/*
872 * The committed item processing consists of calling the committed routine of
873 * each logged item, updating the item's position in the AIL if necessary, and
874 * unpinning each item. If the committed routine returns -1, then do nothing
875 * further with the item because it may have been freed.
772 * 876 *
773 * XFS disk error handling mechanism is not based on a typical 877 * Since items are unlocked when they are copied to the incore log, it is
774 * transaction abort mechanism. Logically after the filesystem 878 * possible for two transactions to be completing and manipulating the same
775 * gets marked 'SHUTDOWN', we can't let any new transactions 879 * item simultaneously. The AIL lock will protect the lsn field of each item.
776 * be durable - ie. committed to disk - because some metadata might 880 * The value of this field can never go backwards.
777 * be inconsistent. In such cases, this returns an error, and the 881 *
778 * caller may assume that all locked objects joined to the transaction 882 * We unpin the items after repositioning them in the AIL, because otherwise
779 * have already been unlocked as if the commit had succeeded. 883 * they could be immediately flushed and we'd have to race with the flusher
780 * Do not reference the transaction structure after this call. 884 * trying to pull the item from the AIL as we add it.
781 */ 885 */
782 /*ARGSUSED*/ 886void
783int 887xfs_trans_item_committed(
784_xfs_trans_commit( 888 struct xfs_log_item *lip,
785 xfs_trans_t *tp, 889 xfs_lsn_t commit_lsn,
786 uint flags, 890 int aborted)
787 int *log_flushed)
788{ 891{
789 xfs_log_iovec_t *log_vector; 892 xfs_lsn_t item_lsn;
790 int nvec; 893 struct xfs_ail *ailp;
791 xfs_mount_t *mp;
792 xfs_lsn_t commit_lsn;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 894
802 commit_lsn = -1; 895 if (aborted)
896 lip->li_flags |= XFS_LI_ABORTED;
897 item_lsn = IOP_COMMITTED(lip, commit_lsn);
898
899 /* If the committed routine returns -1, item has been freed. */
900 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
901 return;
803 902
804 /* 903 /*
805 * Determine whether this commit is releasing a permanent 904 * If the returned lsn is greater than what it contained before, update
806 * log reservation or not. 905 * the location of the item in the AIL. If it is not, then do nothing.
906 * Items can never move backwards in the AIL.
907 *
908 * While the new lsn should usually be greater, it is possible that a
909 * later transaction completing simultaneously with an earlier one
910 * using the same item could complete first with a higher lsn. This
911 * would cause the earlier transaction to fail the test below.
807 */ 912 */
808 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 913 ailp = lip->li_ailp;
809 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 914 spin_lock(&ailp->xa_lock);
810 log_flags = XFS_LOG_REL_PERM_RESERV; 915 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
916 /*
917 * This will set the item's lsn to item_lsn and update the
918 * position of the item in the AIL.
919 *
920 * xfs_trans_ail_update() drops the AIL lock.
921 */
922 xfs_trans_ail_update(ailp, lip, item_lsn);
811 } else { 923 } else {
812 log_flags = 0; 924 spin_unlock(&ailp->xa_lock);
813 } 925 }
814 mp = tp->t_mountp;
815 926
816 /* 927 /*
817 * If there is nothing to be logged by the transaction, 928 * Now that we've repositioned the item in the AIL, unpin it so it can
818 * then unlock all of the items associated with the 929 * be flushed. Pass information about buffer stale state down from the
819 * transaction and free the transaction structure. 930 * log item flags, if anyone else stales the buffer we do not want to
820 * Also make sure to return any reserved blocks to 931 * pay any attention to it.
821 * the free pool.
822 */ 932 */
823shut_us_down: 933 IOP_UNPIN(lip);
824 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; 934}
825 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { 935
826 xfs_trans_unreserve_and_mod_sb(tp); 936/*
 937 * This is typically called by the log manager when a transaction has been fully
938 * committed to disk. It needs to unpin the items which have
939 * been logged by the transaction and update their positions
940 * in the AIL if necessary.
941 *
 942 * This also gets called when a transaction didn't get written out
 943 * because of an I/O error; in that case the abort flag (XFS_LI_ABORTED) is set.
944 */
945STATIC void
946xfs_trans_committed(
947 struct xfs_trans *tp,
948 int abortflag)
949{
950 xfs_log_item_desc_t *lidp;
951 xfs_log_item_chunk_t *licp;
952 xfs_log_item_chunk_t *next_licp;
953
954 /* Call the transaction's completion callback if there is one. */
955 if (tp->t_callback != NULL)
956 tp->t_callback(tp, tp->t_callarg);
957
958 for (lidp = xfs_trans_first_item(tp);
959 lidp != NULL;
960 lidp = xfs_trans_next_item(tp, lidp)) {
961 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
962 }
963
964 /* free the item chunks, ignoring the embedded chunk */
965 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
966 next_licp = licp->lic_next;
967 kmem_free(licp);
968 }
969
970 xfs_trans_free(tp);
971}
972
973/*
974 * Called from the trans_commit code when we notice that
975 * the filesystem is in the middle of a forced shutdown.
976 */
977STATIC void
978xfs_trans_uncommit(
979 struct xfs_trans *tp,
980 uint flags)
981{
982 xfs_log_item_desc_t *lidp;
983
984 for (lidp = xfs_trans_first_item(tp);
985 lidp != NULL;
986 lidp = xfs_trans_next_item(tp, lidp)) {
827 /* 987 /*
828 * It is indeed possible for the transaction to be 988 * Unpin all but those that aren't dirty.
829 * not dirty but the dqinfo portion to be. All that
830 * means is that we have some (non-persistent) quota
831 * reservations that need to be unreserved.
832 */ 989 */
833 xfs_trans_unreserve_and_mod_dquots(tp); 990 if (lidp->lid_flags & XFS_LID_DIRTY)
834 if (tp->t_ticket) { 991 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
835 commit_lsn = xfs_log_done(mp, tp->t_ticket,
836 NULL, log_flags);
837 if (commit_lsn == -1 && !shutdown)
838 shutdown = XFS_ERROR(EIO);
839 }
840 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
841 xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
842 xfs_trans_free_busy(tp);
843 xfs_trans_free(tp);
844 XFS_STATS_INC(xs_trans_empty);
845 return (shutdown);
846 } 992 }
847 ASSERT(tp->t_ticket != NULL);
848 993
849 /* 994 xfs_trans_unreserve_and_mod_sb(tp);
850 * If we need to update the superblock, then do it now. 995 xfs_trans_unreserve_and_mod_dquots(tp);
851 */ 996
852 if (tp->t_flags & XFS_TRANS_SB_DIRTY) 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
853 xfs_trans_apply_sb_deltas(tp); 998 xfs_trans_free(tp);
854 xfs_trans_apply_dquot_deltas(tp); 999}
1000
1001/*
1002 * Format the transaction direct to the iclog. This isolates the physical
1003 * transaction commit operation from the logical operation and hence allows
1004 * other methods to be introduced without affecting the existing commit path.
1005 */
1006static int
1007xfs_trans_commit_iclog(
1008 struct xfs_mount *mp,
1009 struct xfs_trans *tp,
1010 xfs_lsn_t *commit_lsn,
1011 int flags)
1012{
1013 int shutdown;
1014 int error;
1015 int log_flags = 0;
1016 struct xlog_in_core *commit_iclog;
1017#define XFS_TRANS_LOGVEC_COUNT 16
1018 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1019 struct xfs_log_iovec *log_vector;
1020 uint nvec;
1021
855 1022
856 /* 1023 /*
857 * Ask each log item how many log_vector entries it will 1024 * Ask each log item how many log_vector entries it will
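The forward-only rule in the new xfs_trans_item_committed() above is the subtle part: an item's LSN may only move forward in the AIL, even when two transactions complete against the same item out of order. The following stand-alone sketch models just the XFS_LSN_CMP test (LSNs reduced to signed 64-bit integers; the AIL lock and list handling are deliberately elided, and all names here are illustrative stubs, not kernel code):

#include <assert.h>
#include <stdint.h>

typedef int64_t lsn_t;			/* stand-in for xfs_lsn_t */

/*
 * Model of the forward-only LSN update: return the LSN an item ends up
 * with after a commit completes at commit_lsn.  The item only moves if
 * the new LSN is strictly greater, mirroring the XFS_LSN_CMP test.
 */
static lsn_t item_committed_lsn(lsn_t item_lsn, lsn_t commit_lsn)
{
	return commit_lsn > item_lsn ? commit_lsn : item_lsn;
}

int main(void)
{
	lsn_t lsn = 100;

	lsn = item_committed_lsn(lsn, 150);	/* later commit: moves forward */
	assert(lsn == 150);
	lsn = item_committed_lsn(lsn, 120);	/* earlier commit finishing late: no move */
	assert(lsn == 150);
	return 0;
}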
@@ -861,8 +1028,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		goto shut_us_down;
+		return ENOMEM;	/* triggers a shutdown! */
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
 	} else {
@@ -877,6 +1043,9 @@ shut_us_down:
 	 */
 	xfs_trans_fill_vecs(tp, log_vector);
 
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
 	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
 
 	/*
@@ -884,18 +1053,19 @@ shut_us_down:
 	 * at any time after this call. However, all the items associated
 	 * with the transaction are still locked and pinned in memory.
 	 */
-	commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+	*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
 
-	tp->t_commit_lsn = commit_lsn;
-	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+	tp->t_commit_lsn = *commit_lsn;
+	trace_xfs_trans_commit_lsn(tp);
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT)
 		kmem_free(log_vector);
-	}
 
 	/*
 	 * If we got a log write error. Unpin the logitems that we
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
-	if (error || commit_lsn == -1) {
+	if (error || *commit_lsn == -1) {
 		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
@@ -909,8 +1079,6 @@ shut_us_down:
 	 */
 	xfs_trans_unreserve_and_mod_sb(tp);
 
-	sync = tp->t_flags & XFS_TRANS_SYNC;
-
 	/*
 	 * Tell the LM to call the transaction completion routine
 	 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1121,7 @@ shut_us_down:
 	 * the commit lsn of this transaction for dependency tracking
 	 * purposes.
 	 */
-	xfs_trans_unlock_items(tp, commit_lsn);
+	xfs_trans_unlock_items(tp, *commit_lsn);
 
 	/*
 	 * If we detected a log error earlier, finish committing
@@ -973,156 +1141,204 @@ shut_us_down:
  * and the items are released we can finally allow the iclog to
  * go to disk.
  */
-	error = xfs_log_release_iclog(mp, commit_iclog);
-
-	/*
-	 * If the transaction needs to be synchronous, then force the
-	 * log out now and wait for it.
-	 */
-	if (sync) {
-		if (!error) {
-			error = _xfs_log_force_lsn(mp, commit_lsn,
-				      XFS_LOG_SYNC, log_flushed);
-		}
-		XFS_STATS_INC(xs_trans_sync);
-	} else {
-		XFS_STATS_INC(xs_trans_async);
-	}
-
-	return (error);
+	return xfs_log_release_iclog(mp, commit_iclog);
 }
 
-
 /*
- * Total up the number of log iovecs needed to commit this
- * transaction.  The transaction itself needs one for the
- * transaction header.  Ask each dirty item in turn how many
- * it needs to get the total.
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
  */
-STATIC uint
-xfs_trans_count_vecs(
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
 	xfs_trans_t	*tp)
 {
-	int			nvecs;
 	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*lv = NULL;
+	struct xfs_log_vec	*ret_lv = NULL;
 
-	nvecs = 1;
 	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
 
-	/* In the non-debug case we need to start bailing out if we
-	 * didn't find a log_item here, return zero and let trans_commit
-	 * deal with it.
-	 */
-	if (lidp == NULL)
-		return 0;
+	/* Bail out if we didn't find a log item. */
+	if (!lidp) {
+		ASSERT(0);
+		return NULL;
+	}
 
 	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
+		struct xfs_log_vec *new_lv;
+
+		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
 			lidp = xfs_trans_next_item(tp, lidp);
 			continue;
 		}
+
+		/* Skip items that do not have any vectors for writing */
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
-		nvecs += lidp->lid_size;
+		if (!lidp->lid_size) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+
+		new_lv = kmem_zalloc(sizeof(*new_lv) +
+				lidp->lid_size * sizeof(struct xfs_log_iovec),
+				KM_SLEEP);
+
+		/* The allocated iovec region lies beyond the log vector. */
+		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+		new_lv->lv_niovecs = lidp->lid_size;
+		new_lv->lv_item = lidp->lid_item;
+		if (!ret_lv)
+			ret_lv = new_lv;
+		else
+			lv->lv_next = new_lv;
+		lv = new_lv;
 		lidp = xfs_trans_next_item(tp, lidp);
 	}
 
-	return nvecs;
+	return ret_lv;
 }
 
-/*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
- */
-STATIC void
-xfs_trans_uncommit(
-	xfs_trans_t		*tp,
-	uint			flags)
+static int
+xfs_trans_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*log_vector;
+	int			error;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
-		if (lidp->lid_flags & XFS_LID_DIRTY)
-			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
-	}
+	/*
+	 * Get each log item to allocate a vector structure for
+	 * the log item to pass to the log write code. The
+	 * CIL commit code will format the vector and save it away.
+	 */
+	log_vector = xfs_trans_alloc_log_vecs(tp);
+	if (!log_vector)
+		return ENOMEM;
 
-	xfs_trans_unreserve_and_mod_sb(tp);
-	xfs_trans_unreserve_and_mod_dquots(tp);
+	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+	if (error)
+		return error;
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	/* xfs_trans_free_items() unlocks them first */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
 	xfs_trans_free(tp);
+	return 0;
 }
 
 /*
- * Fill in the vector with pointers to data to be logged
- * by this transaction.  The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ * xfs_trans_commit
  *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
  */
-STATIC void
-xfs_trans_fill_vecs(
-	xfs_trans_t		*tp,
-	xfs_log_iovec_t		*log_vector)
+int
+_xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_iovec_t		*vecp;
-	uint			nitems;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
 	/*
-	 * Skip over the entry for the transaction header, we'll
-	 * fill that in at the end.
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
 	 */
-	vecp = log_vector + 1;		/* pointer arithmetic */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
 
-	nitems = 0;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
-			continue;
-		}
-		/*
-		 * The item may be marked dirty but not log anything.
-		 * This can be used to get called when a transaction
-		 * is committed.
-		 */
-		if (lidp->lid_size) {
-			nitems++;
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+		error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+	else
+		error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
+	if (error == ENOMEM) {
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		if (!error) {
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
-		IOP_FORMAT(lidp->lid_item, vecp);
-		vecp += lidp->lid_size;	/* pointer arithmetic */
-		IOP_PIN(lidp->lid_item);
-		lidp = xfs_trans_next_item(tp, lidp);
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
 	}
 
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
+
 	/*
-	 * Now that we've counted the number of items in this
-	 * transaction, fill in the transaction header.
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be. All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
 	 */
-	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
-	tp->t_header.th_type = tp->t_type;
-	tp->t_header.th_num_items = nitems;
-	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
-	log_vector->i_len = sizeof(xfs_trans_header_t);
-	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = XFS_ERROR(EIO);
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
 
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
 
 /*
  * Unlock all of the transaction's items and free the transaction.
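xfs_trans_alloc_log_vecs() above sizes each allocation to hold the xfs_log_vec header plus its trailing iovec array in a single kmem_zalloc(), then points lv_iovecp just past the header. A minimal user-space model of that single-allocation layout may help make the pointer arithmetic concrete (the stub types and names below are hypothetical, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct iovec_stub {			/* stand-in for struct xfs_log_iovec */
	void	*addr;
	int	len;
};

struct log_vec_stub {			/* stand-in for struct xfs_log_vec */
	struct log_vec_stub	*next;
	struct iovec_stub	*iovecp;	/* points into the same allocation */
	int			niovecs;
};

/* One zeroed allocation covers the header and its trailing iovec array. */
static struct log_vec_stub *alloc_log_vec(int niovecs)
{
	struct log_vec_stub *lv;

	lv = calloc(1, sizeof(*lv) + niovecs * sizeof(struct iovec_stub));
	if (!lv)
		return NULL;
	lv->iovecp = (struct iovec_stub *)&lv[1];	/* region beyond the header */
	lv->niovecs = niovecs;
	return lv;
}

int main(void)
{
	struct log_vec_stub *lv = alloc_log_vec(4);

	if (!lv)
		return 1;
	printf("header %p, iovecs start %p\n", (void *)lv, (void *)lv->iovecp);
	free(lv);			/* one free releases both regions */
	return 0;
}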
@@ -1195,25 +1411,10 @@ xfs_trans_cancel(
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
-
-/*
- * Free the transaction structure.  If there is more clean up
- * to do when the structure is freed, add it here.
- */
-STATIC void
-xfs_trans_free(
-	xfs_trans_t	*tp)
-{
-	atomic_dec(&tp->t_mountp->m_active_trans);
-	xfs_trans_free_dqinfo(tp);
-	kmem_zone_free(xfs_trans_zone, tp);
-}
-
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
@@ -1283,174 +1484,3 @@ xfs_trans_roll(
 	xfs_trans_ihold(trans, dp);
 	return 0;
 }
1286
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transactions didn't get written out
1295 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags, if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
 #define XFS_LI_DQUOT		0x123d
 #define XFS_LI_QUOTAOFF		0x123e
 
+#define XFS_LI_TYPE_DESC \
+	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
+	{ XFS_LI_EFD,		"XFS_LI_EFD" }, \
+	{ XFS_LI_IUNLINK,	"XFS_LI_IUNLINK" }, \
+	{ XFS_LI_INODE,		"XFS_LI_INODE" }, \
+	{ XFS_LI_BUF,		"XFS_LI_BUF" }, \
+	{ XFS_LI_DQUOT,		"XFS_LI_DQUOT" }, \
+	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }
+
 /*
  * Transaction types.  Used to distinguish types of buffers.
  */
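The XFS_LI_TYPE_DESC table added above pairs each log item type with a printable name; in the kernel such value/name tables are typically consumed by trace formatting helpers. A stand-alone sketch of the same lookup pattern (values copied from the table above, helper names invented here for illustration):

#include <stdio.h>

/* Value/name pairs in the same shape as the XFS_LI_TYPE_DESC table. */
static const struct { unsigned val; const char *name; } li_types[] = {
	{ 0x123d, "XFS_LI_DQUOT" },
	{ 0x123e, "XFS_LI_QUOTAOFF" },
};

static const char *li_type_name(unsigned val)
{
	for (size_t i = 0; i < sizeof(li_types) / sizeof(li_types[0]); i++)
		if (li_types[i].val == val)
			return li_types[i].name;
	return "UNKNOWN";
}

int main(void)
{
	printf("%s\n", li_type_name(0x123d));	/* -> XFS_LI_DQUOT */
	return 0;
}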
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFSRT_FREE		39
 #define XFS_TRANS_SWAPEXT		40
 #define XFS_TRANS_SB_COUNT		41
-#define XFS_TRANS_TYPE_MAX		41
+#define XFS_TRANS_CHECKPOINT		42
+#define XFS_TRANS_TYPE_MAX		42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
@@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc {
 
 #define XFS_LID_DIRTY		0x1
 #define XFS_LID_PINNED		0x2
-#define XFS_LID_BUF_STALE	0x8
 
 /*
  * This structure is used to maintain a chunk list of log_item_desc
@@ -805,6 +815,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -820,6 +831,11 @@ typedef struct xfs_log_item {
 						/* buffer item iodone */
 						/* callback func */
 	struct xfs_item_ops		*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
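The three delayed-logging fields added to xfs_log_item above let an item sit on the CIL in commit order, tagged with the checkpoint sequence it belongs to. A user-space sketch of that intrusive-list shape (a toy list_head and stub item, assuming nothing beyond what the structure change shows; this is not the kernel implementation):

#include <assert.h>

/* Minimal intrusive list in the style of the kernel's list_head. */
struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

/* A log item stub carrying the delayed-logging fields added above. */
struct log_item_stub {
	struct list_head	li_cil;		/* CIL membership */
	long long		li_seq;		/* checkpoint sequence */
};

int main(void)
{
	struct list_head cil;
	struct log_item_stub a = { .li_seq = 1 }, b = { .li_seq = 1 };

	list_init(&cil);
	list_add_tail(&a.li_cil, &cil);		/* items accumulate in commit order */
	list_add_tail(&b.li_cil, &cil);
	assert(cil.next == &a.li_cil && cil.prev == &b.li_cil);
	return 0;
}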
@@ -833,7 +849,7 @@ typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *, int);
+	void (*iop_unpin)(xfs_log_item_t *);
 	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
 	uint (*iop_trylock)(xfs_log_item_t *);
 	void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +862,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
 #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
@@ -864,34 +880,6 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_PUSHBUF	3
 
 /*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
-	xfs_agnumber_t		lbc_ag;
-	ushort			lbc_idx;	/* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS	31
-typedef struct xfs_log_busy_chunk {
-	struct xfs_log_busy_chunk	*lbc_next;
-	uint				lbc_free;	/* free slots bitmask */
-	ushort				lbc_unused;	/* first unused */
-	xfs_log_busy_slot_t		lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define	XFS_LBC_MAX_SLOT	(XFS_LBC_NUM_SLOTS - 1)
-#define	XFS_LBC_FREEMASK	((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define	XFS_LBC_INIT(cp)	((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define	XFS_LBC_CLAIM(cp, slot)	((cp)->lbc_free &= ~(1 << (slot)))
-#define	XFS_LBC_SLOT(cp, slot)	(&((cp)->lbc_busy[(slot)]))
-#define	XFS_LBC_VACANCY(cp)	(((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define	XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
  * This is the type of function which can be given to xfs_trans_callback()
  * to be called upon the transaction's commit to disk.
  */
@@ -942,8 +930,7 @@ typedef struct xfs_trans {
 	unsigned int		t_items_free;	/* log item descs free */
 	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
 	xfs_trans_header_t	t_header;	/* header for in-log trans */
-	unsigned int		t_busy_free;	/* busy descs free */
-	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
+	struct list_head	t_busy;		/* list of busy extents */
 	unsigned long		t_pflags;	/* saved process flags state */
 } xfs_trans_t;
 
@@ -1017,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
 void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
-xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
-					xfs_agnumber_t ag,
-					xfs_extlen_t idx);
 
 extern kmem_zone_t	*xfs_trans_zone;
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42 42
43/*
44 * Check to see if a buffer matching the given parameters is already
45 * a part of the given transaction.
46 */
47STATIC struct xfs_buf *
48xfs_trans_buf_item_match(
49 struct xfs_trans *tp,
50 struct xfs_buftarg *target,
51 xfs_daddr_t blkno,
52 int len)
53{
54 xfs_log_item_chunk_t *licp;
55 xfs_log_item_desc_t *lidp;
56 xfs_buf_log_item_t *blip;
57 int i;
58
59 len = BBTOB(len);
60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
61 if (xfs_lic_are_all_free(licp)) {
62 ASSERT(licp == &tp->t_items);
63 ASSERT(licp->lic_next == NULL);
64 return NULL;
65 }
66
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 }
43 85
44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 86 return NULL;
45 xfs_daddr_t, int); 87}
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 xfs_daddr_t, int);
48 88
49/* 89/*
50 * Add the locked buffer to the transaction. 90 * Add the locked buffer to the transaction.
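The new xfs_trans_buf_item_match() above is a straight linear scan of the transaction's buf items for a (target, blkno, length) triple. The same matching logic, reduced to a stand-alone sketch with hypothetical stub types (no chunk list, just an array):

#include <stdio.h>
#include <stddef.h>

struct buf_stub {			/* stand-in for the fields the match checks */
	int		target;		/* which device */
	long long	blkno;		/* starting disk address */
	int		len;		/* length in bytes */
};

/* Linear scan over a transaction's buffers, as in xfs_trans_buf_item_match(). */
static struct buf_stub *buf_match(struct buf_stub *bufs, size_t n,
				  int target, long long blkno, int len)
{
	for (size_t i = 0; i < n; i++)
		if (bufs[i].target == target &&
		    bufs[i].blkno == blkno &&
		    bufs[i].len == len)
			return &bufs[i];
	return NULL;			/* caller falls back to get_buf()/read_buf() */
}

int main(void)
{
	struct buf_stub bufs[] = { { 1, 64, 4096 }, { 1, 72, 4096 } };

	printf("hit:  %p\n", (void *)buf_match(bufs, 2, 1, 72, 4096));
	printf("miss: %p\n", (void *)buf_match(bufs, 2, 1, 80, 4096));
	return 0;
}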
@@ -74,7 +114,7 @@ _xfs_trans_bjoin(
 	xfs_buf_item_init(bp, tp->t_mountp);
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
 	if (reset_recur)
 		bip->bli_recur = 0;
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
  * within the transaction, just increment its lock recursion count
  * and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use get_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * get_buf() call.
  */
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 	 * have it locked.  In this case we just increment the lock
 	 * recursion count and return the buffer to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
  * within the transaction and already read in, just increment its
  * lock recursion count and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use read_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * read_buf() call.
  */
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
 	 * If the buffer is not yet read in, then we read it in, increment
 	 * the lock recursion count, and return it to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -495,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	/*
@@ -603,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bip->bli_flags |= XFS_BLI_HOLD;
 	trace_xfs_trans_bhold(bip);
@@ -625,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
 	bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -688,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 		bip->bli_flags &= ~XFS_BLI_STALE;
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		XFS_BUF_UNSTALE(bp);
-		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	lidp->lid_flags |= XFS_LID_DIRTY;
-	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
 }
@@ -747,8 +762,8 @@ xfs_trans_binval(
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
-		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
 		return;
@@ -759,7 +774,7 @@ xfs_trans_binval(
 	 * in the buf log item.  The STALE flag will be used in
 	 * xfs_buf_item_unpin() to determine if it should clean up
 	 * when the last reference to the buf item is given up.
-	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
+	 * We set the XFS_BLF_CANCEL flag in the buf log format structure
 	 * and log the buf item.  This will be used at recovery time
 	 * to determine that copies of the buffer in the log before
 	 * this should not be replayed.
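The comment above states the recovery contract behind XFS_BLF_CANCEL: once a buffer's latest log copy is marked cancelled, earlier log copies of it must not be replayed. A toy model of that rule (the flag value and all names here are illustrative only; this is not the kernel's recovery code):

#include <stdio.h>

#define BLF_CANCEL	(1 << 2)	/* illustrative bit, not the kernel value */

struct logged_buf {
	long long	blkno;
	unsigned	flags;
};

/*
 * Recovery-time model: a buffer whose most recent log copy is marked
 * cancelled must not have earlier log copies replayed over it.
 */
static int should_replay(const struct logged_buf *lb, int cancel_table_hit)
{
	if (lb->flags & BLF_CANCEL)
		return 0;		/* records the cancellation instead */
	return !cancel_table_hit;	/* suppressed if a later copy cancelled it */
}

int main(void)
{
	struct logged_buf early = { 128, 0 };
	struct logged_buf late  = { 128, BLF_CANCEL };

	printf("late copy replayed?  %d\n", should_replay(&late, 0));	/* 0 */
	printf("early copy replayed? %d\n", should_replay(&early, 1));	/* 0: cancelled later */
	return 0;
}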
@@ -777,26 +792,26 @@ xfs_trans_binval(
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_STALE(bp);
 	bip->bli_flags |= XFS_BLI_STALE;
-	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
-	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
-	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	       (bip->bli_format.blf_map_size * sizeof(uint)));
-	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
+	lidp->lid_flags |= XFS_LID_DIRTY;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
 /*
- * This call is used to indicate that the buffer contains on-disk
- * inodes which must be handled specially during recovery.  They
- * require special handling because only the di_next_unlinked from
- * the inodes in the buffer should be recovered.  The rest of the
- * data in the buffer is logged via the inodes themselves.
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery.  They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered.  The rest of the data in the buffer is logged via the inodes
+ * themselves.
  *
- * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
- * format structure so that we'll know what to do at recovery time.
+ * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
  */
-/* ARGSUSED */
 void
 xfs_trans_inode_buf(
 	xfs_trans_t	*tp,
@@ -811,7 +826,7 @@ xfs_trans_inode_buf(
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
-	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+	bip->bli_flags |= XFS_BLI_INODE_BUF;
 }
 
 /*
@@ -893,120 +908,12 @@ xfs_trans_dquot_buf(
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
-	       type == XFS_BLI_PDQUOT_BUF ||
-	       type == XFS_BLI_GDQUOT_BUF);
+	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+	       type == XFS_BLF_PDQUOT_BUF ||
+	       type == XFS_BLF_GDQUOT_BUF);
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_format.blf_flags |= type;
 }
905
906/*
907 * Check to see if a buffer matching the given parameters is already
908 * a part of the given transaction. Only check the first, embedded
909 * chunk, since we don't want to spend all day scanning large transactions.
910 */
911STATIC xfs_buf_t *
912xfs_trans_buf_item_match(
913 xfs_trans_t *tp,
914 xfs_buftarg_t *target,
915 xfs_daddr_t blkno,
916 int len)
917{
918 xfs_log_item_chunk_t *licp;
919 xfs_log_item_desc_t *lidp;
920 xfs_buf_log_item_t *blip;
921 xfs_buf_t *bp;
922 int i;
923
924 bp = NULL;
925 len = BBTOB(len);
926 licp = &tp->t_items;
927 if (!xfs_lic_are_all_free(licp)) {
928 for (i = 0; i < licp->lic_unused; i++) {
929 /*
930 * Skip unoccupied slots.
931 */
932 if (xfs_lic_isfree(licp, i)) {
933 continue;
934 }
935
936 lidp = xfs_lic_slot(licp, i);
937 blip = (xfs_buf_log_item_t *)lidp->lid_item;
938 if (blip->bli_item.li_type != XFS_LI_BUF) {
939 continue;
940 }
941
942 bp = blip->bli_buf;
943 if ((XFS_BUF_TARGET(bp) == target) &&
944 (XFS_BUF_ADDR(bp) == blkno) &&
945 (XFS_BUF_COUNT(bp) == len)) {
946 /*
947 * We found it. Break out and
948 * return the pointer to the buffer.
949 */
950 break;
951 } else {
952 bp = NULL;
953 }
954 }
955 }
956 return bp;
957}
958
959/*
960 * Check to see if a buffer matching the given parameters is already
961 * a part of the given transaction. Check all the chunks, we
962 * want to be thorough.
963 */
964STATIC xfs_buf_t *
965xfs_trans_buf_item_match_all(
966 xfs_trans_t *tp,
967 xfs_buftarg_t *target,
968 xfs_daddr_t blkno,
969 int len)
970{
971 xfs_log_item_chunk_t *licp;
972 xfs_log_item_desc_t *lidp;
973 xfs_buf_log_item_t *blip;
974 xfs_buf_t *bp;
975 int i;
976
977 bp = NULL;
978 len = BBTOB(len);
979 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
980 if (xfs_lic_are_all_free(licp)) {
981 ASSERT(licp == &tp->t_items);
982 ASSERT(licp->lic_next == NULL);
983 return NULL;
984 }
985 for (i = 0; i < licp->lic_unused; i++) {
986 /*
987 * Skip unoccupied slots.
988 */
989 if (xfs_lic_isfree(licp, i)) {
990 continue;
991 }
992
993 lidp = xfs_lic_slot(licp, i);
994 blip = (xfs_buf_log_item_t *)lidp->lid_item;
995 if (blip->bli_item.li_type != XFS_LI_BUF) {
996 continue;
997 }
998
999 bp = blip->bli_buf;
1000 if ((XFS_BUF_TARGET(bp) == target) &&
1001 (XFS_BUF_ADDR(bp) == blkno) &&
1002 (XFS_BUF_COUNT(bp) == len)) {
1003 /*
1004 * We found it. Break out and
1005 * return the pointer to the buffer.
1006 */
1007 return bp;
1008 }
1009 }
1010 }
1011 return NULL;
1012}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 void
 xfs_trans_free_items(
 	xfs_trans_t	*tp,
+	xfs_lsn_t	commit_lsn,
 	int		flags)
 {
 	xfs_log_item_chunk_t	*licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
 	 * Special case the embedded chunk so we don't free it below.
 	 */
 	if (!xfs_lic_are_all_free(licp)) {
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		xfs_lic_all_free(licp);
 		licp->lic_unused = 0;
 	}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
 	 */
 	while (licp != NULL) {
 		ASSERT(!xfs_lic_are_all_free(licp));
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		next_licp = licp->lic_next;
 		kmem_free(licp);
 		licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
 
 	return freed;
 }
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
 struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
 struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
 					struct xfs_log_item_desc *);
-void				xfs_trans_free_items(struct xfs_trans *, int);
-void				xfs_trans_unlock_items(struct xfs_trans *,
-						xfs_lsn_t);
-void				xfs_trans_free_busy(xfs_trans_t *tp);
-xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
-					xfs_agnumber_t ag,
-					xfs_extlen_t idx);
+
+void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
+
+void	xfs_trans_item_committed(struct xfs_log_item *lip,
+					xfs_lsn_t commit_lsn, int aborted);
+void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 /*
  * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
 
 typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
 
+typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
+
 /*
  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
  * Disk based types: