aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorTakashi Iwai <tiwai@suse.de>2009-11-01 05:11:07 -0500
committerTakashi Iwai <tiwai@suse.de>2009-11-01 05:11:07 -0500
commite87a3dd33eab30b4db539500064a9584867e4f2c (patch)
tree2f7ad16e46ae30518ff63bb5391b63f7f7cc74dd /fs
parentb14f5de731ae657d498d18d713c6431bfbeefb4b (diff)
parent3d00941371a765779c4e3509214c7e5793cce1fe (diff)
Merge branch 'fix/misc' into topic/misc
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig9
-rw-r--r--fs/9p/Makefile3
-rw-r--r--fs/9p/cache.c474
-rw-r--r--fs/9p/cache.h176
-rw-r--r--fs/9p/v9fs.c196
-rw-r--r--fs/9p/v9fs.h13
-rw-r--r--fs/9p/v9fs_vfs.h6
-rw-r--r--fs/9p/vfs_addr.c88
-rw-r--r--fs/9p/vfs_file.c25
-rw-r--r--fs/9p/vfs_inode.c61
-rw-r--r--fs/9p/vfs_super.c16
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/adfs/inode.c7
-rw-r--r--fs/afs/cache.h12
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/proc.c8
-rw-r--r--fs/aio.c57
-rw-r--r--fs/anon_inodes.c70
-rw-r--r--fs/attr.c46
-rw-r--r--fs/autofs/dirhash.c2
-rw-r--r--fs/befs/linuxvfs.c9
-rw-r--r--fs/binfmt_elf.c96
-rw-r--r--fs/binfmt_elf_fdpic.c73
-rw-r--r--fs/binfmt_flat.c22
-rw-r--r--fs/bio.c49
-rw-r--r--fs/block_dev.c142
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c254
-rw-r--r--fs/btrfs/async-thread.h12
-rw-r--r--fs/btrfs/btrfs_inode.h9
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c6
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/dir-item.c47
-rw-r--r--fs/btrfs/disk-io.c247
-rw-r--r--fs/btrfs/export.c133
-rw-r--r--fs/btrfs/extent-tree.c2035
-rw-r--r--fs/btrfs/extent_io.c416
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/extent_map.c103
-rw-r--r--fs/btrfs/extent_map.h5
-rw-r--r--fs/btrfs/file.c70
-rw-r--r--fs/btrfs/free-space-cache.c36
-rw-r--r--fs/btrfs/inode-item.c4
-rw-r--r--fs/btrfs/inode-map.c93
-rw-r--r--fs/btrfs/inode.c929
-rw-r--r--fs/btrfs/ioctl.c399
-rw-r--r--fs/btrfs/ioctl.h3
-rw-r--r--fs/btrfs/ordered-data.c126
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/orphan.c20
-rw-r--r--fs/btrfs/relocation.c280
-rw-r--r--fs/btrfs/root-tree.c138
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/btrfs/transaction.c48
-rw-r--r--fs/btrfs/tree-log.c27
-rw-r--r--fs/btrfs/volumes.c121
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/buffer.c77
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifsfs.c100
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h21
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c1
-rw-r--r--fs/cifs/connect.c1
-rw-r--r--fs/cifs/dir.c64
-rw-r--r--fs/cifs/file.c137
-rw-r--r--fs/cifs/inode.c53
-rw-r--r--fs/cifs/misc.c34
-rw-r--r--fs/cifs/readdir.c4
-rw-r--r--fs/cifs/transport.c50
-rw-r--r--fs/coda/coda_int.h1
-rw-r--r--fs/coda/psdev.c1
-rw-r--r--fs/compat.c31
-rw-r--r--fs/devpts/inode.c3
-rw-r--r--fs/dlm/debug_fs.c12
-rw-r--r--fs/drop_caches.c4
-rw-r--r--fs/ecryptfs/Kconfig4
-rw-r--r--fs/ecryptfs/crypto.c39
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/keystore.c39
-rw-r--r--fs/ecryptfs/kthread.c24
-rw-r--r--fs/ecryptfs/main.c3
-rw-r--r--fs/ecryptfs/mmap.c6
-rw-r--r--fs/ecryptfs/read_write.c32
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/eventfd.c67
-rw-r--r--fs/exec.c125
-rw-r--r--fs/exofs/super.c6
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/xip.c2
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/Kconfig14
-rw-r--r--fs/ext4/ext4.h54
-rw-r--r--fs/ext4/ext4_extents.h7
-rw-r--r--fs/ext4/ext4_jbd2.h6
-rw-r--r--fs/ext4/extents.c444
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/inode.c584
-rw-r--r--fs/ext4/mballoc.c305
-rw-r--r--fs/ext4/mballoc.h35
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c20
-rw-r--r--fs/ext4/namei.c3
-rw-r--r--fs/ext4/super.c134
-rw-r--r--fs/fat/fat.h2
-rw-r--r--fs/fat/inode.c34
-rw-r--r--fs/fat/misc.c8
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c108
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fs-writeback.c165
-rw-r--r--fs/fuse/dir.c14
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/fuse/inode.c11
-rw-r--r--fs/gfs2/aops.c3
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/ops_inode.c1
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/hfs/mdb.c6
-rw-r--r--fs/hfsplus/super.c6
-rw-r--r--fs/hugetlbfs/inode.c48
-rw-r--r--fs/inode.c124
-rw-r--r--fs/internal.h1
-rw-r--r--fs/ioctl.c9
-rw-r--r--fs/isofs/inode.c8
-rw-r--r--fs/jbd2/checkpoint.c7
-rw-r--r--fs/jbd2/commit.c59
-rw-r--r--fs/jbd2/journal.c200
-rw-r--r--fs/jffs2/background.c20
-rw-r--r--fs/jffs2/malloc.c4
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/super.c9
-rw-r--r--fs/libfs.c13
-rw-r--r--fs/lockd/clntlock.c2
-rw-r--r--fs/lockd/clntproc.c2
-rw-r--r--fs/lockd/host.c4
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/lockd/xdr.c1
-rw-r--r--fs/lockd/xdr4.c1
-rw-r--r--fs/locks.c2
-rw-r--r--fs/minix/dir.c22
-rw-r--r--fs/namespace.c77
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/ncpfs/inode.c12
-rw-r--r--fs/ncpfs/ioctl.c8
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/client.c27
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/fscache.c25
-rw-r--r--fs/nfs/fscache.h6
-rw-r--r--fs/nfs/inode.c54
-rw-r--r--fs/nfs/nfs2xdr.c1
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs4proc.c1
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfs4xdr.c1
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c80
-rw-r--r--fs/nfsd/export.c4
-rw-r--r--fs/nfsd/nfs3xdr.c75
-rw-r--r--fs/nfsd/nfs4acl.c4
-rw-r--r--fs/nfsd/nfs4callback.c263
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfs4proc.c89
-rw-r--r--fs/nfsd/nfs4state.c685
-rw-r--r--fs/nfsd/nfs4xdr.c42
-rw-r--r--fs/nfsd/nfsctl.c10
-rw-r--r--fs/nfsd/nfsfh.c158
-rw-r--r--fs/nfsd/nfssvc.c54
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/nilfs2/btnode.c3
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/file.c6
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/inode.c3
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h14
-rw-r--r--fs/nilfs2/super.c4
-rw-r--r--fs/nls/nls_base.c11
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/file.c42
-rw-r--r--fs/ntfs/layout.h2
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ntfs/super.c10
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c1342
-rw-r--r--fs/ocfs2/alloc.h101
-rw-r--r--fs/ocfs2/aops.c38
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/buffer_head_io.c47
-rw-r--r--fs/ocfs2/buffer_head_io.h8
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/dir.c107
-rw-r--r--fs/ocfs2/dlm/dlmast.c1
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c1
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c11
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmlock.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlm/dlmthread.c7
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c1
-rw-r--r--fs/ocfs2/dlmglue.c105
-rw-r--r--fs/ocfs2/dlmglue.h6
-rw-r--r--fs/ocfs2/extent_map.c33
-rw-r--r--fs/ocfs2/extent_map.h8
-rw-r--r--fs/ocfs2/file.c151
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/inode.c86
-rw-r--r--fs/ocfs2/inode.h20
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/journal.c82
-rw-r--r--fs/ocfs2/journal.h94
-rw-r--r--fs/ocfs2/localalloc.c12
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/namei.c341
-rw-r--r--fs/ocfs2/namei.h6
-rw-r--r--fs/ocfs2/ocfs2.h52
-rw-r--r--fs/ocfs2/ocfs2_fs.h107
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c9
-rw-r--r--fs/ocfs2/quota_local.c26
-rw-r--r--fs/ocfs2/refcounttree.c4313
-rw-r--r--fs/ocfs2/refcounttree.h106
-rw-r--r--fs/ocfs2/resize.c16
-rw-r--r--fs/ocfs2/slot_map.c10
-rw-r--r--fs/ocfs2/suballoc.c35
-rw-r--r--fs/ocfs2/super.c18
-rw-r--r--fs/ocfs2/symlink.c1
-rw-r--r--fs/ocfs2/uptodate.c265
-rw-r--r--fs/ocfs2/uptodate.h51
-rw-r--r--fs/ocfs2/xattr.c2056
-rw-r--r--fs/ocfs2/xattr.h15
-rw-r--r--fs/omfs/dir.c4
-rw-r--r--fs/omfs/file.c6
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/omfs/omfs.h10
-rw-r--r--fs/open.c5
-rw-r--r--fs/partitions/check.c14
-rw-r--r--fs/proc/array.c92
-rw-r--r--fs/proc/base.c67
-rw-r--r--fs/proc/kcore.c335
-rw-r--r--fs/proc/meminfo.c13
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/page.c5
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/task_mmu.c57
-rw-r--r--fs/proc/uptime.c7
-rw-r--r--fs/qnx4/Kconfig11
-rw-r--r--fs/qnx4/Makefile2
-rw-r--r--fs/qnx4/bitmap.c81
-rw-r--r--fs/qnx4/dir.c5
-rw-r--r--fs/qnx4/file.c40
-rw-r--r--fs/qnx4/inode.c84
-rw-r--r--fs/qnx4/namei.c105
-rw-r--r--fs/qnx4/qnx4.h8
-rw-r--r--fs/qnx4/truncate.c34
-rw-r--r--fs/quota/dquot.c4
-rw-r--r--fs/ramfs/file-nommu.c18
-rw-r--r--fs/ramfs/inode.c4
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/romfs/super.c4
-rw-r--r--fs/select.c15
-rw-r--r--fs/seq_file.c74
-rw-r--r--fs/smbfs/inode.c10
-rw-r--r--fs/smbfs/proc.c2
-rw-r--r--fs/squashfs/super.c4
-rw-r--r--fs/super.c69
-rw-r--r--fs/sync.c1
-rw-r--r--fs/sysfs/bin.c4
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c112
-rw-r--r--fs/ubifs/debug.h5
-rw-r--r--fs/ubifs/file.c64
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/io.c29
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/key.h35
-rw-r--r--fs/ubifs/log.c17
-rw-r--r--fs/ubifs/lprops.c43
-rw-r--r--fs/ubifs/master.c20
-rw-r--r--fs/ubifs/orphan.c7
-rw-r--r--fs/ubifs/recovery.c4
-rw-r--r--fs/ubifs/replay.c6
-rw-r--r--fs/ubifs/scan.c32
-rw-r--r--fs/ubifs/super.c33
-rw-r--r--fs/ubifs/tnc.c76
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h13
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c3
-rw-r--r--fs/xfs/xfs_fs.h2
320 files changed, 17274 insertions, 6948 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 74e0723e90bc..795233702a4e 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -8,3 +8,12 @@ config 9P_FS
8 See <http://v9fs.sf.net> for more information. 8 See <http://v9fs.sf.net> for more information.
9 9
10 If unsure, say N. 10 If unsure, say N.
11
12config 9P_FSCACHE
13 bool "Enable 9P client caching support (EXPERIMENTAL)"
14 depends on EXPERIMENTAL
15 depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
16 help
17 Choose Y here to enable persistent, read-only local
18 caching support for 9p clients using FS-Cache
19
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index bc7f0d1551e6..1a940ec7af61 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o
8 vfs_dir.o \ 8 vfs_dir.o \
9 vfs_dentry.o \ 9 vfs_dentry.o \
10 v9fs.o \ 10 v9fs.o \
11 fid.o \ 11 fid.o
12 12
139p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
new file mode 100644
index 000000000000..51c94e26a346
--- /dev/null
+++ b/fs/9p/cache.c
@@ -0,0 +1,474 @@
1/*
2 * V9FS cache definitions.
3 *
4 * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to:
17 * Free Software Foundation
18 * 51 Franklin Street, Fifth Floor
19 * Boston, MA 02111-1301 USA
20 *
21 */
22
23#include <linux/jiffies.h>
24#include <linux/file.h>
25#include <linux/stat.h>
26#include <linux/sched.h>
27#include <linux/fs.h>
28#include <net/9p/9p.h>
29
30#include "v9fs.h"
31#include "cache.h"
32
33#define CACHETAG_LEN 11
34
35struct kmem_cache *vcookie_cache;
36
37struct fscache_netfs v9fs_cache_netfs = {
38 .name = "9p",
39 .version = 0,
40};
41
42static void init_once(void *foo)
43{
44 struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
45 vcookie->fscache = NULL;
46 vcookie->qid = NULL;
47 inode_init_once(&vcookie->inode);
48}
49
50/**
51 * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
52 * vcookie to inode mapping
53 *
54 * Returns 0 on success.
55 */
56
57static int v9fs_init_vcookiecache(void)
58{
59 vcookie_cache = kmem_cache_create("vcookie_cache",
60 sizeof(struct v9fs_cookie),
61 0, (SLAB_RECLAIM_ACCOUNT|
62 SLAB_MEM_SPREAD),
63 init_once);
64 if (!vcookie_cache)
65 return -ENOMEM;
66
67 return 0;
68}
69
70/**
71 * v9fs_destroy_vcookiecache - destroy the cache of vcookies
72 *
73 */
74
75static void v9fs_destroy_vcookiecache(void)
76{
77 kmem_cache_destroy(vcookie_cache);
78}
79
80int __v9fs_cache_register(void)
81{
82 int ret;
83 ret = v9fs_init_vcookiecache();
84 if (ret < 0)
85 return ret;
86
87 return fscache_register_netfs(&v9fs_cache_netfs);
88}
89
90void __v9fs_cache_unregister(void)
91{
92 v9fs_destroy_vcookiecache();
93 fscache_unregister_netfs(&v9fs_cache_netfs);
94}
95
96/**
97 * v9fs_random_cachetag - Generate a random tag to be associated
98 * with a new cache session.
99 *
100 * The value of jiffies is used for a fairly randomly cache tag.
101 */
102
103static
104int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
105{
106 v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
107 if (!v9ses->cachetag)
108 return -ENOMEM;
109
110 return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
111}
112
113static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
114 void *buffer, uint16_t bufmax)
115{
116 struct v9fs_session_info *v9ses;
117 uint16_t klen = 0;
118
119 v9ses = (struct v9fs_session_info *)cookie_netfs_data;
120 P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
121 buffer, bufmax);
122
123 if (v9ses->cachetag)
124 klen = strlen(v9ses->cachetag);
125
126 if (klen > bufmax)
127 return 0;
128
129 memcpy(buffer, v9ses->cachetag, klen);
130 P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
131 return klen;
132}
133
134const struct fscache_cookie_def v9fs_cache_session_index_def = {
135 .name = "9P.session",
136 .type = FSCACHE_COOKIE_TYPE_INDEX,
137 .get_key = v9fs_cache_session_get_key,
138};
139
140void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
141{
142 /* If no cache session tag was specified, we generate a random one. */
143 if (!v9ses->cachetag)
144 v9fs_random_cachetag(v9ses);
145
146 v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
147 &v9fs_cache_session_index_def,
148 v9ses);
149 P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
150 v9ses->fscache);
151}
152
153void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
154{
155 P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
156 v9ses->fscache);
157 fscache_relinquish_cookie(v9ses->fscache, 0);
158 v9ses->fscache = NULL;
159}
160
161
162static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
163 void *buffer, uint16_t bufmax)
164{
165 const struct v9fs_cookie *vcookie = cookie_netfs_data;
166 memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
167
168 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
169 vcookie->qid->path);
170 return sizeof(vcookie->qid->path);
171}
172
173static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
174 uint64_t *size)
175{
176 const struct v9fs_cookie *vcookie = cookie_netfs_data;
177 *size = i_size_read(&vcookie->inode);
178
179 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
180 *size);
181}
182
183static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
184 void *buffer, uint16_t buflen)
185{
186 const struct v9fs_cookie *vcookie = cookie_netfs_data;
187 memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
188
189 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
190 vcookie->qid->version);
191 return sizeof(vcookie->qid->version);
192}
193
194static enum
195fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
196 const void *buffer,
197 uint16_t buflen)
198{
199 const struct v9fs_cookie *vcookie = cookie_netfs_data;
200
201 if (buflen != sizeof(vcookie->qid->version))
202 return FSCACHE_CHECKAUX_OBSOLETE;
203
204 if (memcmp(buffer, &vcookie->qid->version,
205 sizeof(vcookie->qid->version)))
206 return FSCACHE_CHECKAUX_OBSOLETE;
207
208 return FSCACHE_CHECKAUX_OKAY;
209}
210
211static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
212{
213 struct v9fs_cookie *vcookie = cookie_netfs_data;
214 struct pagevec pvec;
215 pgoff_t first;
216 int loop, nr_pages;
217
218 pagevec_init(&pvec, 0);
219 first = 0;
220
221 for (;;) {
222 nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
223 first,
224 PAGEVEC_SIZE - pagevec_count(&pvec));
225 if (!nr_pages)
226 break;
227
228 for (loop = 0; loop < nr_pages; loop++)
229 ClearPageFsCache(pvec.pages[loop]);
230
231 first = pvec.pages[nr_pages - 1]->index + 1;
232
233 pvec.nr = nr_pages;
234 pagevec_release(&pvec);
235 cond_resched();
236 }
237}
238
239const struct fscache_cookie_def v9fs_cache_inode_index_def = {
240 .name = "9p.inode",
241 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
242 .get_key = v9fs_cache_inode_get_key,
243 .get_attr = v9fs_cache_inode_get_attr,
244 .get_aux = v9fs_cache_inode_get_aux,
245 .check_aux = v9fs_cache_inode_check_aux,
246 .now_uncached = v9fs_cache_inode_now_uncached,
247};
248
249void v9fs_cache_inode_get_cookie(struct inode *inode)
250{
251 struct v9fs_cookie *vcookie;
252 struct v9fs_session_info *v9ses;
253
254 if (!S_ISREG(inode->i_mode))
255 return;
256
257 vcookie = v9fs_inode2cookie(inode);
258 if (vcookie->fscache)
259 return;
260
261 v9ses = v9fs_inode2v9ses(inode);
262 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
263 &v9fs_cache_inode_index_def,
264 vcookie);
265
266 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
267 vcookie->fscache);
268}
269
270void v9fs_cache_inode_put_cookie(struct inode *inode)
271{
272 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
273
274 if (!vcookie->fscache)
275 return;
276 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
277 vcookie->fscache);
278
279 fscache_relinquish_cookie(vcookie->fscache, 0);
280 vcookie->fscache = NULL;
281}
282
283void v9fs_cache_inode_flush_cookie(struct inode *inode)
284{
285 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
286
287 if (!vcookie->fscache)
288 return;
289 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
290 vcookie->fscache);
291
292 fscache_relinquish_cookie(vcookie->fscache, 1);
293 vcookie->fscache = NULL;
294}
295
296void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
297{
298 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
299 struct p9_fid *fid;
300
301 if (!vcookie->fscache)
302 return;
303
304 spin_lock(&vcookie->lock);
305 fid = filp->private_data;
306 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
307 v9fs_cache_inode_flush_cookie(inode);
308 else
309 v9fs_cache_inode_get_cookie(inode);
310
311 spin_unlock(&vcookie->lock);
312}
313
314void v9fs_cache_inode_reset_cookie(struct inode *inode)
315{
316 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
317 struct v9fs_session_info *v9ses;
318 struct fscache_cookie *old;
319
320 if (!vcookie->fscache)
321 return;
322
323 old = vcookie->fscache;
324
325 spin_lock(&vcookie->lock);
326 fscache_relinquish_cookie(vcookie->fscache, 1);
327
328 v9ses = v9fs_inode2v9ses(inode);
329 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
330 &v9fs_cache_inode_index_def,
331 vcookie);
332
333 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
334 inode, old, vcookie->fscache);
335
336 spin_unlock(&vcookie->lock);
337}
338
339int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
340{
341 struct inode *inode = page->mapping->host;
342 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
343
344 BUG_ON(!vcookie->fscache);
345
346 if (PageFsCache(page)) {
347 if (fscache_check_page_write(vcookie->fscache, page)) {
348 if (!(gfp & __GFP_WAIT))
349 return 0;
350 fscache_wait_on_page_write(vcookie->fscache, page);
351 }
352
353 fscache_uncache_page(vcookie->fscache, page);
354 ClearPageFsCache(page);
355 }
356
357 return 1;
358}
359
360void __v9fs_fscache_invalidate_page(struct page *page)
361{
362 struct inode *inode = page->mapping->host;
363 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
364
365 BUG_ON(!vcookie->fscache);
366
367 if (PageFsCache(page)) {
368 fscache_wait_on_page_write(vcookie->fscache, page);
369 BUG_ON(!PageLocked(page));
370 fscache_uncache_page(vcookie->fscache, page);
371 ClearPageFsCache(page);
372 }
373}
374
375static void v9fs_vfs_readpage_complete(struct page *page, void *data,
376 int error)
377{
378 if (!error)
379 SetPageUptodate(page);
380
381 unlock_page(page);
382}
383
384/**
385 * __v9fs_readpage_from_fscache - read a page from cache
386 *
387 * Returns 0 if the pages are in cache and a BIO is submitted,
388 * 1 if the pages are not in cache and -error otherwise.
389 */
390
391int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
392{
393 int ret;
394 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
395
396 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
397 if (!vcookie->fscache)
398 return -ENOBUFS;
399
400 ret = fscache_read_or_alloc_page(vcookie->fscache,
401 page,
402 v9fs_vfs_readpage_complete,
403 NULL,
404 GFP_KERNEL);
405 switch (ret) {
406 case -ENOBUFS:
407 case -ENODATA:
408 P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
409 return 1;
410 case 0:
411 P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
412 return ret;
413 default:
414 P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
415 return ret;
416 }
417}
418
419/**
420 * __v9fs_readpages_from_fscache - read multiple pages from cache
421 *
422 * Returns 0 if the pages are in cache and a BIO is submitted,
423 * 1 if the pages are not in cache and -error otherwise.
424 */
425
426int __v9fs_readpages_from_fscache(struct inode *inode,
427 struct address_space *mapping,
428 struct list_head *pages,
429 unsigned *nr_pages)
430{
431 int ret;
432 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
433
434 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
435 if (!vcookie->fscache)
436 return -ENOBUFS;
437
438 ret = fscache_read_or_alloc_pages(vcookie->fscache,
439 mapping, pages, nr_pages,
440 v9fs_vfs_readpage_complete,
441 NULL,
442 mapping_gfp_mask(mapping));
443 switch (ret) {
444 case -ENOBUFS:
445 case -ENODATA:
446 P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
447 return 1;
448 case 0:
449 BUG_ON(!list_empty(pages));
450 BUG_ON(*nr_pages != 0);
451 P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
452 return ret;
453 default:
454 P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
455 return ret;
456 }
457}
458
459/**
460 * __v9fs_readpage_to_fscache - write a page to the cache
461 *
462 */
463
464void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
465{
466 int ret;
467 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
468
469 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
470 ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
471 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
472 if (ret != 0)
473 v9fs_uncache_page(inode, page);
474}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
new file mode 100644
index 000000000000..a94192bfaee8
--- /dev/null
+++ b/fs/9p/cache.h
@@ -0,0 +1,176 @@
1/*
2 * V9FS cache definitions.
3 *
4 * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to:
17 * Free Software Foundation
18 * 51 Franklin Street, Fifth Floor
19 * Boston, MA 02111-1301 USA
20 *
21 */
22
23#ifndef _9P_CACHE_H
24#ifdef CONFIG_9P_FSCACHE
25#include <linux/fscache.h>
26#include <linux/spinlock.h>
27
28extern struct kmem_cache *vcookie_cache;
29
30struct v9fs_cookie {
31 spinlock_t lock;
32 struct inode inode;
33 struct fscache_cookie *fscache;
34 struct p9_qid *qid;
35};
36
37static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
38{
39 return container_of(inode, struct v9fs_cookie, inode);
40}
41
42extern struct fscache_netfs v9fs_cache_netfs;
43extern const struct fscache_cookie_def v9fs_cache_session_index_def;
44extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
45
46extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
47extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
48
49extern void v9fs_cache_inode_get_cookie(struct inode *inode);
50extern void v9fs_cache_inode_put_cookie(struct inode *inode);
51extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
52extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
53extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
54
55extern int __v9fs_cache_register(void);
56extern void __v9fs_cache_unregister(void);
57
58extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
59extern void __v9fs_fscache_invalidate_page(struct page *page);
60extern int __v9fs_readpage_from_fscache(struct inode *inode,
61 struct page *page);
62extern int __v9fs_readpages_from_fscache(struct inode *inode,
63 struct address_space *mapping,
64 struct list_head *pages,
65 unsigned *nr_pages);
66extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
67
68
69/**
70 * v9fs_cache_register - Register v9fs file system with the cache
71 */
72static inline int v9fs_cache_register(void)
73{
74 return __v9fs_cache_register();
75}
76
77/**
78 * v9fs_cache_unregister - Unregister v9fs from the cache
79 */
80static inline void v9fs_cache_unregister(void)
81{
82 __v9fs_cache_unregister();
83}
84
85static inline int v9fs_fscache_release_page(struct page *page,
86 gfp_t gfp)
87{
88 return __v9fs_fscache_release_page(page, gfp);
89}
90
91static inline void v9fs_fscache_invalidate_page(struct page *page)
92{
93 __v9fs_fscache_invalidate_page(page);
94}
95
96static inline int v9fs_readpage_from_fscache(struct inode *inode,
97 struct page *page)
98{
99 return __v9fs_readpage_from_fscache(inode, page);
100}
101
102static inline int v9fs_readpages_from_fscache(struct inode *inode,
103 struct address_space *mapping,
104 struct list_head *pages,
105 unsigned *nr_pages)
106{
107 return __v9fs_readpages_from_fscache(inode, mapping, pages,
108 nr_pages);
109}
110
111static inline void v9fs_readpage_to_fscache(struct inode *inode,
112 struct page *page)
113{
114 if (PageFsCache(page))
115 __v9fs_readpage_to_fscache(inode, page);
116}
117
118static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
119{
120 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
121 fscache_uncache_page(vcookie->fscache, page);
122 BUG_ON(PageFsCache(page));
123}
124
125static inline void v9fs_vcookie_set_qid(struct inode *inode,
126 struct p9_qid *qid)
127{
128 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
129 spin_lock(&vcookie->lock);
130 vcookie->qid = qid;
131 spin_unlock(&vcookie->lock);
132}
133
134#else /* CONFIG_9P_FSCACHE */
135
136static inline int v9fs_cache_register(void)
137{
138 return 1;
139}
140
141static inline void v9fs_cache_unregister(void) {}
142
143static inline int v9fs_fscache_release_page(struct page *page,
144 gfp_t gfp) {
145 return 1;
146}
147
148static inline void v9fs_fscache_invalidate_page(struct page *page) {}
149
150static inline int v9fs_readpage_from_fscache(struct inode *inode,
151 struct page *page)
152{
153 return -ENOBUFS;
154}
155
156static inline int v9fs_readpages_from_fscache(struct inode *inode,
157 struct address_space *mapping,
158 struct list_head *pages,
159 unsigned *nr_pages)
160{
161 return -ENOBUFS;
162}
163
164static inline void v9fs_readpage_to_fscache(struct inode *inode,
165 struct page *page)
166{}
167
168static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
169{}
170
171static inline void v9fs_vcookie_set_qid(struct inode *inode,
172 struct p9_qid *qid)
173{}
174
175#endif /* CONFIG_9P_FSCACHE */
176#endif /* _9P_CACHE_H */
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f7003cfac63d..cf62b05e296a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -34,21 +34,25 @@
34#include <net/9p/transport.h> 34#include <net/9p/transport.h>
35#include "v9fs.h" 35#include "v9fs.h"
36#include "v9fs_vfs.h" 36#include "v9fs_vfs.h"
37#include "cache.h"
38
39static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
40static LIST_HEAD(v9fs_sessionlist);
37 41
38/* 42/*
39 * Option Parsing (code inspired by NFS code) 43 * Option Parsing (code inspired by NFS code)
40 * NOTE: each transport will parse its own options 44 * NOTE: each transport will parse its own options
41 */ 45 */
42 46
43enum { 47enum {
44 /* Options that take integer arguments */ 48 /* Options that take integer arguments */
45 Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, 49 Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
46 /* String options */ 50 /* String options */
47 Opt_uname, Opt_remotename, Opt_trans, 51 Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
48 /* Options that take no arguments */ 52 /* Options that take no arguments */
49 Opt_nodevmap, 53 Opt_nodevmap,
50 /* Cache options */ 54 /* Cache options */
51 Opt_cache_loose, 55 Opt_cache_loose, Opt_fscache,
52 /* Access options */ 56 /* Access options */
53 Opt_access, 57 Opt_access,
54 /* Error token */ 58 /* Error token */
@@ -63,8 +67,10 @@ static const match_table_t tokens = {
63 {Opt_uname, "uname=%s"}, 67 {Opt_uname, "uname=%s"},
64 {Opt_remotename, "aname=%s"}, 68 {Opt_remotename, "aname=%s"},
65 {Opt_nodevmap, "nodevmap"}, 69 {Opt_nodevmap, "nodevmap"},
66 {Opt_cache_loose, "cache=loose"}, 70 {Opt_cache, "cache=%s"},
67 {Opt_cache_loose, "loose"}, 71 {Opt_cache_loose, "loose"},
72 {Opt_fscache, "fscache"},
73 {Opt_cachetag, "cachetag=%s"},
68 {Opt_access, "access=%s"}, 74 {Opt_access, "access=%s"},
69 {Opt_err, NULL} 75 {Opt_err, NULL}
70}; 76};
@@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
89 v9ses->afid = ~0; 95 v9ses->afid = ~0;
90 v9ses->debug = 0; 96 v9ses->debug = 0;
91 v9ses->cache = 0; 97 v9ses->cache = 0;
98#ifdef CONFIG_9P_FSCACHE
99 v9ses->cachetag = NULL;
100#endif
92 101
93 if (!opts) 102 if (!opts)
94 return 0; 103 return 0;
95 104
96 options = kstrdup(opts, GFP_KERNEL); 105 options = kstrdup(opts, GFP_KERNEL);
97 if (!options) { 106 if (!options)
98 P9_DPRINTK(P9_DEBUG_ERROR, 107 goto fail_option_alloc;
99 "failed to allocate copy of option string\n");
100 return -ENOMEM;
101 }
102 108
103 while ((p = strsep(&options, ",")) != NULL) { 109 while ((p = strsep(&options, ",")) != NULL) {
104 int token; 110 int token;
@@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
143 case Opt_cache_loose: 149 case Opt_cache_loose:
144 v9ses->cache = CACHE_LOOSE; 150 v9ses->cache = CACHE_LOOSE;
145 break; 151 break;
152 case Opt_fscache:
153 v9ses->cache = CACHE_FSCACHE;
154 break;
155 case Opt_cachetag:
156#ifdef CONFIG_9P_FSCACHE
157 v9ses->cachetag = match_strdup(&args[0]);
158#endif
159 break;
160 case Opt_cache:
161 s = match_strdup(&args[0]);
162 if (!s)
163 goto fail_option_alloc;
164
165 if (strcmp(s, "loose") == 0)
166 v9ses->cache = CACHE_LOOSE;
167 else if (strcmp(s, "fscache") == 0)
168 v9ses->cache = CACHE_FSCACHE;
169 else
170 v9ses->cache = CACHE_NONE;
171 kfree(s);
172 break;
146 173
147 case Opt_access: 174 case Opt_access:
148 s = match_strdup(&args[0]); 175 s = match_strdup(&args[0]);
149 if (!s) { 176 if (!s)
150 P9_DPRINTK(P9_DEBUG_ERROR, 177 goto fail_option_alloc;
151 "failed to allocate copy" 178
152 " of option argument\n");
153 ret = -ENOMEM;
154 break;
155 }
156 v9ses->flags &= ~V9FS_ACCESS_MASK; 179 v9ses->flags &= ~V9FS_ACCESS_MASK;
157 if (strcmp(s, "user") == 0) 180 if (strcmp(s, "user") == 0)
158 v9ses->flags |= V9FS_ACCESS_USER; 181 v9ses->flags |= V9FS_ACCESS_USER;
@@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
173 } 196 }
174 kfree(options); 197 kfree(options);
175 return ret; 198 return ret;
199
200fail_option_alloc:
201 P9_DPRINTK(P9_DEBUG_ERROR,
202 "failed to allocate copy of option argument\n");
203 return -ENOMEM;
176} 204}
177 205
178/** 206/**
@@ -200,6 +228,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
200 return ERR_PTR(-ENOMEM); 228 return ERR_PTR(-ENOMEM);
201 } 229 }
202 230
231 spin_lock(&v9fs_sessionlist_lock);
232 list_add(&v9ses->slist, &v9fs_sessionlist);
233 spin_unlock(&v9fs_sessionlist_lock);
234
203 v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; 235 v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
204 strcpy(v9ses->uname, V9FS_DEFUSER); 236 strcpy(v9ses->uname, V9FS_DEFUSER);
205 strcpy(v9ses->aname, V9FS_DEFANAME); 237 strcpy(v9ses->aname, V9FS_DEFANAME);
@@ -249,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
249 else 281 else
250 fid->uid = ~0; 282 fid->uid = ~0;
251 283
284#ifdef CONFIG_9P_FSCACHE
285 /* register the session for caching */
286 v9fs_cache_session_get_cookie(v9ses);
287#endif
288
252 return fid; 289 return fid;
253 290
254error: 291error:
@@ -268,8 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
268 v9ses->clnt = NULL; 305 v9ses->clnt = NULL;
269 } 306 }
270 307
308#ifdef CONFIG_9P_FSCACHE
309 if (v9ses->fscache) {
310 v9fs_cache_session_put_cookie(v9ses);
311 kfree(v9ses->cachetag);
312 }
313#endif
271 __putname(v9ses->uname); 314 __putname(v9ses->uname);
272 __putname(v9ses->aname); 315 __putname(v9ses->aname);
316
317 spin_lock(&v9fs_sessionlist_lock);
318 list_del(&v9ses->slist);
319 spin_unlock(&v9fs_sessionlist_lock);
273} 320}
274 321
275/** 322/**
@@ -286,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
286 333
287extern int v9fs_error_init(void); 334extern int v9fs_error_init(void);
288 335
336static struct kobject *v9fs_kobj;
337
338#ifdef CONFIG_9P_FSCACHE
289/** 339/**
290 * v9fs_init - Initialize module 340 * caches_show - list caches associated with a session
341 *
342 * Returns the size of buffer written.
343 */
344
345static ssize_t caches_show(struct kobject *kobj,
346 struct kobj_attribute *attr,
347 char *buf)
348{
349 ssize_t n = 0, count = 0, limit = PAGE_SIZE;
350 struct v9fs_session_info *v9ses;
351
352 spin_lock(&v9fs_sessionlist_lock);
353 list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
354 if (v9ses->cachetag) {
355 n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
356 if (n < 0) {
357 count = n;
358 break;
359 }
360
361 count += n;
362 limit -= n;
363 }
364 }
365
366 spin_unlock(&v9fs_sessionlist_lock);
367 return count;
368}
369
370static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
371#endif /* CONFIG_9P_FSCACHE */
372
373static struct attribute *v9fs_attrs[] = {
374#ifdef CONFIG_9P_FSCACHE
375 &v9fs_attr_cache.attr,
376#endif
377 NULL,
378};
379
380static struct attribute_group v9fs_attr_group = {
381 .attrs = v9fs_attrs,
382};
383
384/**
385 * v9fs_sysfs_init - Initialize the v9fs sysfs interface
386 *
387 */
388
389static int v9fs_sysfs_init(void)
390{
391 v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
392 if (!v9fs_kobj)
393 return -ENOMEM;
394
395 if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
396 kobject_put(v9fs_kobj);
397 return -ENOMEM;
398 }
399
400 return 0;
401}
402
403/**
404 * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
405 *
406 */
407
408static void v9fs_sysfs_cleanup(void)
409{
410 sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
411 kobject_put(v9fs_kobj);
412}
413
414/**
415 * init_v9fs - Initialize module
291 * 416 *
292 */ 417 */
293 418
294static int __init init_v9fs(void) 419static int __init init_v9fs(void)
295{ 420{
421 int err;
296 printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); 422 printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
297 /* TODO: Setup list of registered trasnport modules */ 423 /* TODO: Setup list of registered trasnport modules */
298 return register_filesystem(&v9fs_fs_type); 424 err = register_filesystem(&v9fs_fs_type);
425 if (err < 0) {
426 printk(KERN_ERR "Failed to register filesystem\n");
427 return err;
428 }
429
430 err = v9fs_cache_register();
431 if (err < 0) {
432 printk(KERN_ERR "Failed to register v9fs for caching\n");
433 goto out_fs_unreg;
434 }
435
436 err = v9fs_sysfs_init();
437 if (err < 0) {
438 printk(KERN_ERR "Failed to register with sysfs\n");
439 goto out_sysfs_cleanup;
440 }
441
442 return 0;
443
444out_sysfs_cleanup:
445 v9fs_sysfs_cleanup();
446
447out_fs_unreg:
448 unregister_filesystem(&v9fs_fs_type);
449
450 return err;
299} 451}
300 452
301/** 453/**
302 * v9fs_init - shutdown module 454 * exit_v9fs - shutdown module
303 * 455 *
304 */ 456 */
305 457
306static void __exit exit_v9fs(void) 458static void __exit exit_v9fs(void)
307{ 459{
460 v9fs_sysfs_cleanup();
461 v9fs_cache_unregister();
308 unregister_filesystem(&v9fs_fs_type); 462 unregister_filesystem(&v9fs_fs_type);
309} 463}
310 464
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 38762bf102a9..019f4ccb70c1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -51,6 +51,7 @@ enum p9_session_flags {
51enum p9_cache_modes { 51enum p9_cache_modes {
52 CACHE_NONE, 52 CACHE_NONE,
53 CACHE_LOOSE, 53 CACHE_LOOSE,
54 CACHE_FSCACHE,
54}; 55};
55 56
56/** 57/**
@@ -60,6 +61,8 @@ enum p9_cache_modes {
60 * @debug: debug level 61 * @debug: debug level
61 * @afid: authentication handle 62 * @afid: authentication handle
62 * @cache: cache mode of type &p9_cache_modes 63 * @cache: cache mode of type &p9_cache_modes
64 * @cachetag: the tag of the cache associated with this session
65 * @fscache: session cookie associated with FS-Cache
63 * @options: copy of options string given by user 66 * @options: copy of options string given by user
64 * @uname: string user name to mount hierarchy as 67 * @uname: string user name to mount hierarchy as
65 * @aname: mount specifier for remote hierarchy 68 * @aname: mount specifier for remote hierarchy
@@ -68,7 +71,7 @@ enum p9_cache_modes {
68 * @dfltgid: default numeric groupid to mount hierarchy as 71 * @dfltgid: default numeric groupid to mount hierarchy as
69 * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy 72 * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
70 * @clnt: reference to 9P network client instantiated for this session 73 * @clnt: reference to 9P network client instantiated for this session
71 * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug 74 * @slist: reference to list of registered 9p sessions
72 * 75 *
73 * This structure holds state for each session instance established during 76 * This structure holds state for each session instance established during
74 * a sys_mount() . 77 * a sys_mount() .
@@ -84,6 +87,10 @@ struct v9fs_session_info {
84 unsigned short debug; 87 unsigned short debug;
85 unsigned int afid; 88 unsigned int afid;
86 unsigned int cache; 89 unsigned int cache;
90#ifdef CONFIG_9P_FSCACHE
91 char *cachetag;
92 struct fscache_cookie *fscache;
93#endif
87 94
88 char *uname; /* user name to mount as */ 95 char *uname; /* user name to mount as */
89 char *aname; /* name of remote hierarchy being mounted */ 96 char *aname; /* name of remote hierarchy being mounted */
@@ -92,11 +99,9 @@ struct v9fs_session_info {
92 unsigned int dfltgid; /* default gid for legacy support */ 99 unsigned int dfltgid; /* default gid for legacy support */
93 u32 uid; /* if ACCESS_SINGLE, the uid that has access */ 100 u32 uid; /* if ACCESS_SINGLE, the uid that has access */
94 struct p9_client *clnt; /* 9p client */ 101 struct p9_client *clnt; /* 9p client */
95 struct dentry *debugfs_dir; 102 struct list_head slist; /* list of sessions registered with v9fs */
96}; 103};
97 104
98extern struct dentry *v9fs_debugfs_root;
99
100struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 105struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
101 char *); 106 char *);
102void v9fs_session_close(struct v9fs_session_info *v9ses); 107void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f0c7de78e205..3a7560e35865 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations;
44extern const struct dentry_operations v9fs_dentry_operations; 44extern const struct dentry_operations v9fs_dentry_operations;
45extern const struct dentry_operations v9fs_cached_dentry_operations; 45extern const struct dentry_operations v9fs_cached_dentry_operations;
46 46
47#ifdef CONFIG_9P_FSCACHE
48struct inode *v9fs_alloc_inode(struct super_block *sb);
49void v9fs_destroy_inode(struct inode *inode);
50#endif
51
47struct inode *v9fs_get_inode(struct super_block *sb, int mode); 52struct inode *v9fs_get_inode(struct super_block *sb, int mode);
53void v9fs_clear_inode(struct inode *inode);
48ino_t v9fs_qid2ino(struct p9_qid *qid); 54ino_t v9fs_qid2ino(struct p9_qid *qid);
49void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); 55void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
50int v9fs_dir_release(struct inode *inode, struct file *filp); 56int v9fs_dir_release(struct inode *inode, struct file *filp);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 92828281a30b..90e38449f4b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,6 +38,7 @@
38 38
39#include "v9fs.h" 39#include "v9fs.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "cache.h"
41 42
42/** 43/**
43 * v9fs_vfs_readpage - read an entire page in from 9P 44 * v9fs_vfs_readpage - read an entire page in from 9P
@@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52 int retval; 53 int retval;
53 loff_t offset; 54 loff_t offset;
54 char *buffer; 55 char *buffer;
56 struct inode *inode;
55 57
58 inode = page->mapping->host;
56 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 59 P9_DPRINTK(P9_DEBUG_VFS, "\n");
60
61 BUG_ON(!PageLocked(page));
62
63 retval = v9fs_readpage_from_fscache(inode, page);
64 if (retval == 0)
65 return retval;
66
57 buffer = kmap(page); 67 buffer = kmap(page);
58 offset = page_offset(page); 68 offset = page_offset(page);
59 69
60 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); 70 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
61 if (retval < 0) 71 if (retval < 0) {
72 v9fs_uncache_page(inode, page);
62 goto done; 73 goto done;
74 }
63 75
64 memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval); 76 memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
65 flush_dcache_page(page); 77 flush_dcache_page(page);
66 SetPageUptodate(page); 78 SetPageUptodate(page);
79
80 v9fs_readpage_to_fscache(inode, page);
67 retval = 0; 81 retval = 0;
68 82
69done: 83done:
@@ -72,6 +86,78 @@ done:
72 return retval; 86 return retval;
73} 87}
74 88
89/**
90 * v9fs_vfs_readpages - read a set of pages from 9P
91 *
92 * @filp: file being read
93 * @mapping: the address space
94 * @pages: list of pages to read
95 * @nr_pages: count of pages to read
96 *
97 */
98
99static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
100 struct list_head *pages, unsigned nr_pages)
101{
102 int ret = 0;
103 struct inode *inode;
104
105 inode = mapping->host;
106 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
107
108 ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
109 if (ret == 0)
110 return ret;
111
112 ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
113 P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
114 return ret;
115}
116
117/**
118 * v9fs_release_page - release the private state associated with a page
119 *
120 * Returns 1 if the page can be released, false otherwise.
121 */
122
123static int v9fs_release_page(struct page *page, gfp_t gfp)
124{
125 if (PagePrivate(page))
126 return 0;
127
128 return v9fs_fscache_release_page(page, gfp);
129}
130
131/**
132 * v9fs_invalidate_page - Invalidate a page completely or partially
133 *
134 * @page: structure to page
135 * @offset: offset in the page
136 */
137
138static void v9fs_invalidate_page(struct page *page, unsigned long offset)
139{
140 if (offset == 0)
141 v9fs_fscache_invalidate_page(page);
142}
143
144/**
145 * v9fs_launder_page - Writeback a dirty page
146 * Since the writes go directly to the server, we simply return a 0
147 * here to indicate success.
148 *
149 * Returns 0 on success.
150 */
151
152static int v9fs_launder_page(struct page *page)
153{
154 return 0;
155}
156
75const struct address_space_operations v9fs_addr_operations = { 157const struct address_space_operations v9fs_addr_operations = {
76 .readpage = v9fs_vfs_readpage, 158 .readpage = v9fs_vfs_readpage,
159 .readpages = v9fs_vfs_readpages,
160 .releasepage = v9fs_release_page,
161 .invalidatepage = v9fs_invalidate_page,
162 .launder_page = v9fs_launder_page,
77}; 163};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 68bf2af6c389..3902bf43a088 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/inet.h> 33#include <linux/inet.h>
34#include <linux/list.h> 34#include <linux/list.h>
35#include <linux/pagemap.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <linux/idr.h> 37#include <linux/idr.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
@@ -40,6 +41,7 @@
40#include "v9fs.h" 41#include "v9fs.h"
41#include "v9fs_vfs.h" 42#include "v9fs_vfs.h"
42#include "fid.h" 43#include "fid.h"
44#include "cache.h"
43 45
44static const struct file_operations v9fs_cached_file_operations; 46static const struct file_operations v9fs_cached_file_operations;
45 47
@@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
72 return err; 74 return err;
73 } 75 }
74 if (omode & P9_OTRUNC) { 76 if (omode & P9_OTRUNC) {
75 inode->i_size = 0; 77 i_size_write(inode, 0);
76 inode->i_blocks = 0; 78 inode->i_blocks = 0;
77 } 79 }
78 if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) 80 if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
@@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
85 /* enable cached file options */ 87 /* enable cached file options */
86 if(file->f_op == &v9fs_file_operations) 88 if(file->f_op == &v9fs_file_operations)
87 file->f_op = &v9fs_cached_file_operations; 89 file->f_op = &v9fs_cached_file_operations;
90
91#ifdef CONFIG_9P_FSCACHE
92 v9fs_cache_inode_set_cookie(inode, file);
93#endif
88 } 94 }
89 95
90 return 0; 96 return 0;
@@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
210 struct p9_client *clnt; 216 struct p9_client *clnt;
211 struct inode *inode = filp->f_path.dentry->d_inode; 217 struct inode *inode = filp->f_path.dentry->d_inode;
212 int origin = *offset; 218 int origin = *offset;
219 unsigned long pg_start, pg_end;
213 220
214 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
215 (int)count, (int)*offset); 222 (int)count, (int)*offset);
@@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
225 if (count < rsize) 232 if (count < rsize)
226 rsize = count; 233 rsize = count;
227 234
228 n = p9_client_write(fid, NULL, data+total, *offset+total, 235 n = p9_client_write(fid, NULL, data+total, origin+total,
229 rsize); 236 rsize);
230 if (n <= 0) 237 if (n <= 0)
231 break; 238 break;
@@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
234 } while (count > 0); 241 } while (count > 0);
235 242
236 if (total > 0) { 243 if (total > 0) {
237 invalidate_inode_pages2_range(inode->i_mapping, origin, 244 pg_start = origin >> PAGE_CACHE_SHIFT;
238 origin+total); 245 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
246 if (inode->i_mapping && inode->i_mapping->nrpages)
247 invalidate_inode_pages2_range(inode->i_mapping,
248 pg_start, pg_end);
239 *offset += total; 249 *offset += total;
240 } 250 i_size_write(inode, i_size_read(inode) + total);
241 251 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
242 if (*offset > inode->i_size) {
243 inode->i_size = *offset;
244 inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
245 } 252 }
246 253
247 if (n < 0) 254 if (n < 0)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 06a223d50a81..5947628aefef 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,6 +40,7 @@
40#include "v9fs.h" 40#include "v9fs.h"
41#include "v9fs_vfs.h" 41#include "v9fs_vfs.h"
42#include "fid.h" 42#include "fid.h"
43#include "cache.h"
43 44
44static const struct inode_operations v9fs_dir_inode_operations; 45static const struct inode_operations v9fs_dir_inode_operations;
45static const struct inode_operations v9fs_dir_inode_operations_ext; 46static const struct inode_operations v9fs_dir_inode_operations_ext;
@@ -197,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
197 wstat->extension = NULL; 198 wstat->extension = NULL;
198} 199}
199 200
201#ifdef CONFIG_9P_FSCACHE
202/**
203 * v9fs_alloc_inode - helper function to allocate an inode
204 * This callback is executed before setting up the inode so that we
205 * can associate a vcookie with each inode.
206 *
207 */
208
209struct inode *v9fs_alloc_inode(struct super_block *sb)
210{
211 struct v9fs_cookie *vcookie;
212 vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
213 GFP_KERNEL);
214 if (!vcookie)
215 return NULL;
216
217 vcookie->fscache = NULL;
218 vcookie->qid = NULL;
219 spin_lock_init(&vcookie->lock);
220 return &vcookie->inode;
221}
222
223/**
224 * v9fs_destroy_inode - destroy an inode
225 *
226 */
227
228void v9fs_destroy_inode(struct inode *inode)
229{
230 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
231}
232#endif
233
200/** 234/**
201 * v9fs_get_inode - helper function to setup an inode 235 * v9fs_get_inode - helper function to setup an inode
202 * @sb: superblock 236 * @sb: superblock
@@ -326,6 +360,21 @@ error:
326} 360}
327*/ 361*/
328 362
363
364/**
365 * v9fs_clear_inode - release an inode
366 * @inode: inode to release
367 *
368 */
369void v9fs_clear_inode(struct inode *inode)
370{
371 filemap_fdatawrite(inode->i_mapping);
372
373#ifdef CONFIG_9P_FSCACHE
374 v9fs_cache_inode_put_cookie(inode);
375#endif
376}
377
329/** 378/**
330 * v9fs_inode_from_fid - populate an inode by issuing a attribute request 379 * v9fs_inode_from_fid - populate an inode by issuing a attribute request
331 * @v9ses: session information 380 * @v9ses: session information
@@ -356,8 +405,14 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
356 405
357 v9fs_stat2inode(st, ret, sb); 406 v9fs_stat2inode(st, ret, sb);
358 ret->i_ino = v9fs_qid2ino(&st->qid); 407 ret->i_ino = v9fs_qid2ino(&st->qid);
408
409#ifdef CONFIG_9P_FSCACHE
410 v9fs_vcookie_set_qid(ret, &st->qid);
411 v9fs_cache_inode_get_cookie(ret);
412#endif
359 p9stat_free(st); 413 p9stat_free(st);
360 kfree(st); 414 kfree(st);
415
361 return ret; 416 return ret;
362 417
363error: 418error:
@@ -751,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
751 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 806 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
752 err = -EPERM; 807 err = -EPERM;
753 v9ses = v9fs_inode2v9ses(dentry->d_inode); 808 v9ses = v9fs_inode2v9ses(dentry->d_inode);
754 if (v9ses->cache == CACHE_LOOSE) 809 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
755 return simple_getattr(mnt, dentry, stat); 810 return simple_getattr(mnt, dentry, stat);
756 811
757 fid = v9fs_fid_lookup(dentry); 812 fid = v9fs_fid_lookup(dentry);
@@ -872,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
872 } else 927 } else
873 inode->i_rdev = 0; 928 inode->i_rdev = 0;
874 929
875 inode->i_size = stat->length; 930 i_size_write(inode, stat->length);
876 931
877 /* not real number of blocks, but 512 byte ones ... */ 932 /* not real number of blocks, but 512 byte ones ... */
878 inode->i_blocks = (inode->i_size + 512 - 1) >> 9; 933 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
879} 934}
880 935
881/** 936/**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8961f1a8f668..14a86448572c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,21 +44,9 @@
44#include "v9fs_vfs.h" 44#include "v9fs_vfs.h"
45#include "fid.h" 45#include "fid.h"
46 46
47static void v9fs_clear_inode(struct inode *);
48static const struct super_operations v9fs_super_ops; 47static const struct super_operations v9fs_super_ops;
49 48
50/** 49/**
51 * v9fs_clear_inode - release an inode
52 * @inode: inode to release
53 *
54 */
55
56static void v9fs_clear_inode(struct inode *inode)
57{
58 filemap_fdatawrite(inode->i_mapping);
59}
60
61/**
62 * v9fs_set_super - set the superblock 50 * v9fs_set_super - set the superblock
63 * @s: super block 51 * @s: super block
64 * @data: file system specific data 52 * @data: file system specific data
@@ -220,6 +208,10 @@ v9fs_umount_begin(struct super_block *sb)
220} 208}
221 209
222static const struct super_operations v9fs_super_ops = { 210static const struct super_operations v9fs_super_ops = {
211#ifdef CONFIG_9P_FSCACHE
212 .alloc_inode = v9fs_alloc_inode,
213 .destroy_inode = v9fs_destroy_inode,
214#endif
223 .statfs = simple_statfs, 215 .statfs = simple_statfs,
224 .clear_inode = v9fs_clear_inode, 216 .clear_inode = v9fs_clear_inode,
225 .show_options = generic_show_options, 217 .show_options = generic_show_options,
diff --git a/fs/Kconfig b/fs/Kconfig
index 455aa207e67e..d4bf8caad8d0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,6 +109,7 @@ source "fs/sysfs/Kconfig"
109 109
110config TMPFS 110config TMPFS
111 bool "Virtual memory file system support (former shm fs)" 111 bool "Virtual memory file system support (former shm fs)"
112 depends on SHMEM
112 help 113 help
113 Tmpfs is a file system which keeps all files in virtual memory. 114 Tmpfs is a file system which keeps all files in virtual memory.
114 115
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d132..3f57ce4bee5d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
19adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, 19adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
20 int create) 20 int create)
21{ 21{
22 if (block < 0)
23 goto abort_negative;
24
25 if (!create) { 22 if (!create) {
26 if (block >= inode->i_blocks) 23 if (block >= inode->i_blocks)
27 goto abort_toobig; 24 goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
34 /* don't support allocation of blocks yet */ 31 /* don't support allocation of blocks yet */
35 return -EIO; 32 return -EIO;
36 33
37abort_negative:
38 adfs_error(inode->i_sb, "block %d < 0", block);
39 return -EIO;
40
41abort_toobig: 34abort_toobig:
42 return 0; 35 return 0;
43} 36}
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e90..000000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
1/* AFS local cache management interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fscache.h>
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
21static struct workqueue_struct *afs_lock_manager; 21static struct workqueue_struct *afs_lock_manager;
22static DEFINE_MUTEX(afs_lock_manager_mutex); 22static DEFINE_MUTEX(afs_lock_manager_mutex);
23 23
24static struct file_lock_operations afs_lock_ops = { 24static const struct file_lock_operations afs_lock_ops = {
25 .fl_copy_lock = afs_fl_copy_lock, 25 .fl_copy_lock = afs_fl_copy_lock,
26 .fl_release_private = afs_fl_release_private, 26 .fl_release_private = afs_fl_release_private,
27}; 27};
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd2..6ece2a13bf71 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
18#include <linux/key.h> 18#include <linux/key.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fscache.h>
21 22
22#include "afs.h" 23#include "afs.h"
23#include "afs_vl.h" 24#include "afs_vl.h"
24#include "cache.h"
25 25
26#define AFS_CELL_MAX_ADDRS 15 26#define AFS_CELL_MAX_ADDRS 15
27 27
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 8630615e57fe..852739d262a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v);
28static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, 28static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
29 size_t size, loff_t *_pos); 29 size_t size, loff_t *_pos);
30 30
31static struct seq_operations afs_proc_cells_ops = { 31static const struct seq_operations afs_proc_cells_ops = {
32 .start = afs_proc_cells_start, 32 .start = afs_proc_cells_start,
33 .next = afs_proc_cells_next, 33 .next = afs_proc_cells_next,
34 .stop = afs_proc_cells_stop, 34 .stop = afs_proc_cells_stop,
@@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
70static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); 70static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v);
71static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); 71static int afs_proc_cell_volumes_show(struct seq_file *m, void *v);
72 72
73static struct seq_operations afs_proc_cell_volumes_ops = { 73static const struct seq_operations afs_proc_cell_volumes_ops = {
74 .start = afs_proc_cell_volumes_start, 74 .start = afs_proc_cell_volumes_start,
75 .next = afs_proc_cell_volumes_next, 75 .next = afs_proc_cell_volumes_next,
76 .stop = afs_proc_cell_volumes_stop, 76 .stop = afs_proc_cell_volumes_stop,
@@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
95static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); 95static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v);
96static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); 96static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v);
97 97
98static struct seq_operations afs_proc_cell_vlservers_ops = { 98static const struct seq_operations afs_proc_cell_vlservers_ops = {
99 .start = afs_proc_cell_vlservers_start, 99 .start = afs_proc_cell_vlservers_start,
100 .next = afs_proc_cell_vlservers_next, 100 .next = afs_proc_cell_vlservers_next,
101 .stop = afs_proc_cell_vlservers_stop, 101 .stop = afs_proc_cell_vlservers_stop,
@@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
119static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); 119static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
120static int afs_proc_cell_servers_show(struct seq_file *m, void *v); 120static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
121 121
122static struct seq_operations afs_proc_cell_servers_ops = { 122static const struct seq_operations afs_proc_cell_servers_ops = {
123 .start = afs_proc_cell_servers_start, 123 .start = afs_proc_cell_servers_start,
124 .next = afs_proc_cell_servers_next, 124 .next = afs_proc_cell_servers_next,
125 .stop = afs_proc_cell_servers_stop, 125 .stop = afs_proc_cell_servers_stop,
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..02a2c9340573 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/mmu_context.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <linux/timer.h> 29#include <linux/timer.h>
29#include <linux/aio.h> 30#include <linux/aio.h>
@@ -34,7 +35,6 @@
34 35
35#include <asm/kmap_types.h> 36#include <asm/kmap_types.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <asm/mmu_context.h>
38 38
39#if DEBUG > 1 39#if DEBUG > 1
40#define dprintk printk 40#define dprintk printk
@@ -78,6 +78,7 @@ static int __init aio_setup(void)
78 78
79 return 0; 79 return 0;
80} 80}
81__initcall(aio_setup);
81 82
82static void aio_free_ring(struct kioctx *ctx) 83static void aio_free_ring(struct kioctx *ctx)
83{ 84{
@@ -380,6 +381,7 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
380 __set_current_state(TASK_RUNNING); 381 __set_current_state(TASK_RUNNING);
381 return iocb->ki_user_data; 382 return iocb->ki_user_data;
382} 383}
384EXPORT_SYMBOL(wait_on_sync_kiocb);
383 385
384/* exit_aio: called when the last user of mm goes away. At this point, 386/* exit_aio: called when the last user of mm goes away. At this point,
385 * there is no way for any new requests to be submited or any of the 387 * there is no way for any new requests to be submited or any of the
@@ -573,6 +575,7 @@ int aio_put_req(struct kiocb *req)
573 spin_unlock_irq(&ctx->ctx_lock); 575 spin_unlock_irq(&ctx->ctx_lock);
574 return ret; 576 return ret;
575} 577}
578EXPORT_SYMBOL(aio_put_req);
576 579
577static struct kioctx *lookup_ioctx(unsigned long ctx_id) 580static struct kioctx *lookup_ioctx(unsigned long ctx_id)
578{ 581{
@@ -595,51 +598,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
595} 598}
596 599
597/* 600/*
598 * use_mm
599 * Makes the calling kernel thread take on the specified
600 * mm context.
601 * Called by the retry thread execute retries within the
602 * iocb issuer's mm context, so that copy_from/to_user
603 * operations work seamlessly for aio.
604 * (Note: this routine is intended to be called only
605 * from a kernel thread context)
606 */
607static void use_mm(struct mm_struct *mm)
608{
609 struct mm_struct *active_mm;
610 struct task_struct *tsk = current;
611
612 task_lock(tsk);
613 active_mm = tsk->active_mm;
614 atomic_inc(&mm->mm_count);
615 tsk->mm = mm;
616 tsk->active_mm = mm;
617 switch_mm(active_mm, mm, tsk);
618 task_unlock(tsk);
619
620 mmdrop(active_mm);
621}
622
623/*
624 * unuse_mm
625 * Reverses the effect of use_mm, i.e. releases the
626 * specified mm context which was earlier taken on
627 * by the calling kernel thread
628 * (Note: this routine is intended to be called only
629 * from a kernel thread context)
630 */
631static void unuse_mm(struct mm_struct *mm)
632{
633 struct task_struct *tsk = current;
634
635 task_lock(tsk);
636 tsk->mm = NULL;
637 /* active_mm is still 'mm' */
638 enter_lazy_tlb(mm, tsk);
639 task_unlock(tsk);
640}
641
642/*
643 * Queue up a kiocb to be retried. Assumes that the kiocb 601 * Queue up a kiocb to be retried. Assumes that the kiocb
644 * has already been marked as kicked, and places it on 602 * has already been marked as kicked, and places it on
645 * the retry run list for the corresponding ioctx, if it 603 * the retry run list for the corresponding ioctx, if it
@@ -1037,6 +995,7 @@ put_rq:
1037 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 995 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
1038 return ret; 996 return ret;
1039} 997}
998EXPORT_SYMBOL(aio_complete);
1040 999
1041/* aio_read_evt 1000/* aio_read_evt
1042 * Pull an event off of the ioctx's event ring. Returns the number of 1001 * Pull an event off of the ioctx's event ring. Returns the number of
@@ -1825,9 +1784,3 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1825 asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); 1784 asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
1826 return ret; 1785 return ret;
1827} 1786}
1828
1829__initcall(aio_setup);
1830
1831EXPORT_SYMBOL(aio_complete);
1832EXPORT_SYMBOL(aio_put_req);
1833EXPORT_SYMBOL(wait_on_sync_kiocb);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47d4a01c5393..2ca7a7cafdbf 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/cred.h>
11#include <linux/file.h> 12#include <linux/file.h>
12#include <linux/poll.h> 13#include <linux/poll.h>
14#include <linux/sched.h>
13#include <linux/slab.h> 15#include <linux/slab.h>
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/fs.h> 17#include <linux/fs.h>
@@ -77,28 +79,24 @@ static const struct address_space_operations anon_aops = {
77 * 79 *
78 * Creates a new file by hooking it on a single inode. This is useful for files 80 * Creates a new file by hooking it on a single inode. This is useful for files
79 * that do not need to have a full-fledged inode in order to operate correctly. 81 * that do not need to have a full-fledged inode in order to operate correctly.
80 * All the files created with anon_inode_getfd() will share a single inode, 82 * All the files created with anon_inode_getfile() will share a single inode,
81 * hence saving memory and avoiding code duplication for the file/inode/dentry 83 * hence saving memory and avoiding code duplication for the file/inode/dentry
82 * setup. Returns new descriptor or -error. 84 * setup. Returns the newly created file* or an error pointer.
83 */ 85 */
84int anon_inode_getfd(const char *name, const struct file_operations *fops, 86struct file *anon_inode_getfile(const char *name,
85 void *priv, int flags) 87 const struct file_operations *fops,
88 void *priv, int flags)
86{ 89{
87 struct qstr this; 90 struct qstr this;
88 struct dentry *dentry; 91 struct dentry *dentry;
89 struct file *file; 92 struct file *file;
90 int error, fd; 93 int error;
91 94
92 if (IS_ERR(anon_inode_inode)) 95 if (IS_ERR(anon_inode_inode))
93 return -ENODEV; 96 return ERR_PTR(-ENODEV);
94 97
95 if (fops->owner && !try_module_get(fops->owner)) 98 if (fops->owner && !try_module_get(fops->owner))
96 return -ENOENT; 99 return ERR_PTR(-ENOENT);
97
98 error = get_unused_fd_flags(flags);
99 if (error < 0)
100 goto err_module;
101 fd = error;
102 100
103 /* 101 /*
104 * Link the inode to a directory entry by creating a unique name 102 * Link the inode to a directory entry by creating a unique name
@@ -110,7 +108,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
110 this.hash = 0; 108 this.hash = 0;
111 dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); 109 dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
112 if (!dentry) 110 if (!dentry)
113 goto err_put_unused_fd; 111 goto err_module;
114 112
115 /* 113 /*
116 * We know the anon_inode inode count is always greater than zero, 114 * We know the anon_inode inode count is always greater than zero,
@@ -136,16 +134,54 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
136 file->f_version = 0; 134 file->f_version = 0;
137 file->private_data = priv; 135 file->private_data = priv;
138 136
137 return file;
138
139err_dput:
140 dput(dentry);
141err_module:
142 module_put(fops->owner);
143 return ERR_PTR(error);
144}
145EXPORT_SYMBOL_GPL(anon_inode_getfile);
146
147/**
148 * anon_inode_getfd - creates a new file instance by hooking it up to an
149 * anonymous inode, and a dentry that describe the "class"
150 * of the file
151 *
152 * @name: [in] name of the "class" of the new file
153 * @fops: [in] file operations for the new file
154 * @priv: [in] private data for the new file (will be file's private_data)
155 * @flags: [in] flags
156 *
157 * Creates a new file by hooking it on a single inode. This is useful for files
158 * that do not need to have a full-fledged inode in order to operate correctly.
159 * All the files created with anon_inode_getfd() will share a single inode,
160 * hence saving memory and avoiding code duplication for the file/inode/dentry
161 * setup. Returns new descriptor or an error code.
162 */
163int anon_inode_getfd(const char *name, const struct file_operations *fops,
164 void *priv, int flags)
165{
166 int error, fd;
167 struct file *file;
168
169 error = get_unused_fd_flags(flags);
170 if (error < 0)
171 return error;
172 fd = error;
173
174 file = anon_inode_getfile(name, fops, priv, flags);
175 if (IS_ERR(file)) {
176 error = PTR_ERR(file);
177 goto err_put_unused_fd;
178 }
139 fd_install(fd, file); 179 fd_install(fd, file);
140 180
141 return fd; 181 return fd;
142 182
143err_dput:
144 dput(dentry);
145err_put_unused_fd: 183err_put_unused_fd:
146 put_unused_fd(fd); 184 put_unused_fd(fd);
147err_module:
148 module_put(fops->owner);
149 return error; 185 return error;
150} 186}
151EXPORT_SYMBOL_GPL(anon_inode_getfd); 187EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/attr.c b/fs/attr.c
index 9fe1b1bd30a8..96d394bdaddf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
18/* Taken over from the old code... */ 18/* Taken over from the old code... */
19 19
20/* POSIX UID/GID verification for setting inode attributes. */ 20/* POSIX UID/GID verification for setting inode attributes. */
21int inode_change_ok(struct inode *inode, struct iattr *attr) 21int inode_change_ok(const struct inode *inode, struct iattr *attr)
22{ 22{
23 int retval = -EPERM; 23 int retval = -EPERM;
24 unsigned int ia_valid = attr->ia_valid; 24 unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
60error: 60error:
61 return retval; 61 return retval;
62} 62}
63
64EXPORT_SYMBOL(inode_change_ok); 63EXPORT_SYMBOL(inode_change_ok);
65 64
65/**
66 * inode_newsize_ok - may this inode be truncated to a given size
67 * @inode: the inode to be truncated
68 * @offset: the new size to assign to the inode
69 * @Returns: 0 on success, -ve errno on failure
70 *
71 * inode_newsize_ok will check filesystem limits and ulimits to check that the
72 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
73 * when necessary. Caller must not proceed with inode size change if failure is
74 * returned. @inode must be a file (not directory), with appropriate
75 * permissions to allow truncate (inode_newsize_ok does NOT check these
76 * conditions).
77 *
78 * inode_newsize_ok must be called with i_mutex held.
79 */
80int inode_newsize_ok(const struct inode *inode, loff_t offset)
81{
82 if (inode->i_size < offset) {
83 unsigned long limit;
84
85 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
86 if (limit != RLIM_INFINITY && offset > limit)
87 goto out_sig;
88 if (offset > inode->i_sb->s_maxbytes)
89 goto out_big;
90 } else {
91 /*
92 * truncation of in-use swapfiles is disallowed - it would
93 * cause subsequent swapout to scribble on the now-freed
94 * blocks.
95 */
96 if (IS_SWAPFILE(inode))
97 return -ETXTBSY;
98 }
99
100 return 0;
101out_sig:
102 send_sig(SIGXFSZ, current, 0);
103out_big:
104 return -EFBIG;
105}
106EXPORT_SYMBOL(inode_newsize_ok);
107
66int inode_setattr(struct inode * inode, struct iattr * attr) 108int inode_setattr(struct inode * inode, struct iattr * attr)
67{ 109{
68 unsigned int ia_valid = attr->ia_valid; 110 unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 2316e944a109..e947915109e5 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); 90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue; 91 continue;
92 } 92 }
93 while (d_mountpoint(path.dentry) && follow_down(&path)); 93 while (d_mountpoint(path.dentry) && follow_down(&path))
94 ; 94 ;
95 umount_ok = may_umount(path.mnt); 95 umount_ok = may_umount(path.mnt);
96 path_put(&path); 96 path_put(&path);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 615d5496fe0f..33baf27fac78 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb)
737{ 737{
738 kfree(BEFS_SB(sb)->mount_opts.iocharset); 738 kfree(BEFS_SB(sb)->mount_opts.iocharset);
739 BEFS_SB(sb)->mount_opts.iocharset = NULL; 739 BEFS_SB(sb)->mount_opts.iocharset = NULL;
740 740 unload_nls(BEFS_SB(sb)->nls);
741 if (BEFS_SB(sb)->nls) {
742 unload_nls(BEFS_SB(sb)->nls);
743 BEFS_SB(sb)->nls = NULL;
744 }
745
746 kfree(sb->s_fs_info); 741 kfree(sb->s_fs_info);
747 sb->s_fs_info = NULL; 742 sb->s_fs_info = NULL;
748} 743}
@@ -842,7 +837,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
842 sb->s_magic = BEFS_SUPER_MAGIC; 837 sb->s_magic = BEFS_SUPER_MAGIC;
843 /* Set real blocksize of fs */ 838 /* Set real blocksize of fs */
844 sb_set_blocksize(sb, (ulong) befs_sb->block_size); 839 sb_set_blocksize(sb, (ulong) befs_sb->block_size);
845 sb->s_op = (struct super_operations *) &befs_sops; 840 sb->s_op = &befs_sops;
846 root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); 841 root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
847 if (IS_ERR(root)) { 842 if (IS_ERR(root)) {
848 ret = PTR_ERR(root); 843 ret = PTR_ERR(root);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7c1e65d54872..b9b3bb51b1e4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
1280#define DUMP_WRITE(addr, nr) \ 1280#define DUMP_WRITE(addr, nr) \
1281 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ 1281 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1282 goto end_coredump; 1282 goto end_coredump;
1283#define DUMP_SEEK(off) \
1284 if (!dump_seek(file, (off))) \
1285 goto end_coredump;
1286 1283
1287static void fill_elf_header(struct elfhdr *elf, int segs, 1284static void fill_elf_header(struct elfhdr *elf, int segs,
1288 u16 machine, u32 flags, u8 osabi) 1285 u16 machine, u32 flags, u8 osabi)
@@ -1714,42 +1711,52 @@ struct elf_note_info {
1714 int numnote; 1711 int numnote;
1715}; 1712};
1716 1713
1717static int fill_note_info(struct elfhdr *elf, int phdrs, 1714static int elf_note_info_init(struct elf_note_info *info)
1718 struct elf_note_info *info,
1719 long signr, struct pt_regs *regs)
1720{ 1715{
1721#define NUM_NOTES 6 1716 memset(info, 0, sizeof(*info));
1722 struct list_head *t;
1723
1724 info->notes = NULL;
1725 info->prstatus = NULL;
1726 info->psinfo = NULL;
1727 info->fpu = NULL;
1728#ifdef ELF_CORE_COPY_XFPREGS
1729 info->xfpu = NULL;
1730#endif
1731 INIT_LIST_HEAD(&info->thread_list); 1717 INIT_LIST_HEAD(&info->thread_list);
1732 1718
1733 info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), 1719 /* Allocate space for six ELF notes */
1734 GFP_KERNEL); 1720 info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
1735 if (!info->notes) 1721 if (!info->notes)
1736 return 0; 1722 return 0;
1737 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); 1723 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
1738 if (!info->psinfo) 1724 if (!info->psinfo)
1739 return 0; 1725 goto notes_free;
1740 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); 1726 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
1741 if (!info->prstatus) 1727 if (!info->prstatus)
1742 return 0; 1728 goto psinfo_free;
1743 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); 1729 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
1744 if (!info->fpu) 1730 if (!info->fpu)
1745 return 0; 1731 goto prstatus_free;
1746#ifdef ELF_CORE_COPY_XFPREGS 1732#ifdef ELF_CORE_COPY_XFPREGS
1747 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); 1733 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
1748 if (!info->xfpu) 1734 if (!info->xfpu)
1749 return 0; 1735 goto fpu_free;
1736#endif
1737 return 1;
1738#ifdef ELF_CORE_COPY_XFPREGS
1739 fpu_free:
1740 kfree(info->fpu);
1750#endif 1741#endif
1742 prstatus_free:
1743 kfree(info->prstatus);
1744 psinfo_free:
1745 kfree(info->psinfo);
1746 notes_free:
1747 kfree(info->notes);
1748 return 0;
1749}
1750
1751static int fill_note_info(struct elfhdr *elf, int phdrs,
1752 struct elf_note_info *info,
1753 long signr, struct pt_regs *regs)
1754{
1755 struct list_head *t;
1756
1757 if (!elf_note_info_init(info))
1758 return 0;
1751 1759
1752 info->thread_status_size = 0;
1753 if (signr) { 1760 if (signr) {
1754 struct core_thread *ct; 1761 struct core_thread *ct;
1755 struct elf_thread_status *ets; 1762 struct elf_thread_status *ets;
@@ -1809,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1809#endif 1816#endif
1810 1817
1811 return 1; 1818 return 1;
1812
1813#undef NUM_NOTES
1814} 1819}
1815 1820
1816static size_t get_note_info_size(struct elf_note_info *info) 1821static size_t get_note_info_size(struct elf_note_info *info)
@@ -2016,7 +2021,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2016 goto end_coredump; 2021 goto end_coredump;
2017 2022
2018 /* Align to page */ 2023 /* Align to page */
2019 DUMP_SEEK(dataoff - foffset); 2024 if (!dump_seek(file, dataoff - foffset))
2025 goto end_coredump;
2020 2026
2021 for (vma = first_vma(current, gate_vma); vma != NULL; 2027 for (vma = first_vma(current, gate_vma); vma != NULL;
2022 vma = next_vma(vma, gate_vma)) { 2028 vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2033,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2027 2033
2028 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { 2034 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
2029 struct page *page; 2035 struct page *page;
2030 struct vm_area_struct *tmp_vma; 2036 int stop;
2031 2037
2032 if (get_user_pages(current, current->mm, addr, 1, 0, 1, 2038 page = get_dump_page(addr);
2033 &page, &tmp_vma) <= 0) { 2039 if (page) {
2034 DUMP_SEEK(PAGE_SIZE); 2040 void *kaddr = kmap(page);
2035 } else { 2041 stop = ((size += PAGE_SIZE) > limit) ||
2036 if (page == ZERO_PAGE(0)) { 2042 !dump_write(file, kaddr, PAGE_SIZE);
2037 if (!dump_seek(file, PAGE_SIZE)) { 2043 kunmap(page);
2038 page_cache_release(page);
2039 goto end_coredump;
2040 }
2041 } else {
2042 void *kaddr;
2043 flush_cache_page(tmp_vma, addr,
2044 page_to_pfn(page));
2045 kaddr = kmap(page);
2046 if ((size += PAGE_SIZE) > limit ||
2047 !dump_write(file, kaddr,
2048 PAGE_SIZE)) {
2049 kunmap(page);
2050 page_cache_release(page);
2051 goto end_coredump;
2052 }
2053 kunmap(page);
2054 }
2055 page_cache_release(page); 2044 page_cache_release(page);
2056 } 2045 } else
2046 stop = !dump_seek(file, PAGE_SIZE);
2047 if (stop)
2048 goto end_coredump;
2057 } 2049 }
2058 } 2050 }
2059 2051
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..38502c67987c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
283 } 283 }
284 284
285 stack_size = exec_params.stack_size; 285 stack_size = exec_params.stack_size;
286 if (stack_size < interp_params.stack_size)
287 stack_size = interp_params.stack_size;
288
289 if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) 286 if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
290 executable_stack = EXSTACK_ENABLE_X; 287 executable_stack = EXSTACK_ENABLE_X;
291 else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) 288 else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
292 executable_stack = EXSTACK_DISABLE_X; 289 executable_stack = EXSTACK_DISABLE_X;
293 else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
294 executable_stack = EXSTACK_ENABLE_X;
295 else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
296 executable_stack = EXSTACK_DISABLE_X;
297 else 290 else
298 executable_stack = EXSTACK_DEFAULT; 291 executable_stack = EXSTACK_DEFAULT;
299 292
293 if (stack_size == 0) {
294 stack_size = interp_params.stack_size;
295 if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
296 executable_stack = EXSTACK_ENABLE_X;
297 else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
298 executable_stack = EXSTACK_DISABLE_X;
299 else
300 executable_stack = EXSTACK_DEFAULT;
301 }
302
300 retval = -ENOEXEC; 303 retval = -ENOEXEC;
301 if (stack_size == 0) 304 if (stack_size == 0)
302 goto error; 305 goto error;
@@ -1325,9 +1328,6 @@ static int writenote(struct memelfnote *men, struct file *file)
1325#define DUMP_WRITE(addr, nr) \ 1328#define DUMP_WRITE(addr, nr) \
1326 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ 1329 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1327 goto end_coredump; 1330 goto end_coredump;
1328#define DUMP_SEEK(off) \
1329 if (!dump_seek(file, (off))) \
1330 goto end_coredump;
1331 1331
1332static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) 1332static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
1333{ 1333{
@@ -1518,6 +1518,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1518 unsigned long *limit, unsigned long mm_flags) 1518 unsigned long *limit, unsigned long mm_flags)
1519{ 1519{
1520 struct vm_area_struct *vma; 1520 struct vm_area_struct *vma;
1521 int err = 0;
1521 1522
1522 for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1523 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1523 unsigned long addr; 1524 unsigned long addr;
@@ -1525,43 +1526,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1525 if (!maydump(vma, mm_flags)) 1526 if (!maydump(vma, mm_flags))
1526 continue; 1527 continue;
1527 1528
1528 for (addr = vma->vm_start; 1529 for (addr = vma->vm_start; addr < vma->vm_end;
1529 addr < vma->vm_end; 1530 addr += PAGE_SIZE) {
1530 addr += PAGE_SIZE 1531 struct page *page = get_dump_page(addr);
1531 ) { 1532 if (page) {
1532 struct vm_area_struct *vma; 1533 void *kaddr = kmap(page);
1533 struct page *page; 1534 *size += PAGE_SIZE;
1534 1535 if (*size > *limit)
1535 if (get_user_pages(current, current->mm, addr, 1, 0, 1, 1536 err = -EFBIG;
1536 &page, &vma) <= 0) { 1537 else if (!dump_write(file, kaddr, PAGE_SIZE))
1537 DUMP_SEEK(file->f_pos + PAGE_SIZE); 1538 err = -EIO;
1538 }
1539 else if (page == ZERO_PAGE(0)) {
1540 page_cache_release(page);
1541 DUMP_SEEK(file->f_pos + PAGE_SIZE);
1542 }
1543 else {
1544 void *kaddr;
1545
1546 flush_cache_page(vma, addr, page_to_pfn(page));
1547 kaddr = kmap(page);
1548 if ((*size += PAGE_SIZE) > *limit ||
1549 !dump_write(file, kaddr, PAGE_SIZE)
1550 ) {
1551 kunmap(page);
1552 page_cache_release(page);
1553 return -EIO;
1554 }
1555 kunmap(page); 1539 kunmap(page);
1556 page_cache_release(page); 1540 page_cache_release(page);
1557 } 1541 } else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
1542 err = -EFBIG;
1543 if (err)
1544 goto out;
1558 } 1545 }
1559 } 1546 }
1560 1547out:
1561 return 0; 1548 return err;
1562
1563end_coredump:
1564 return -EFBIG;
1565} 1549}
1566#endif 1550#endif
1567 1551
@@ -1802,7 +1786,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1802 goto end_coredump; 1786 goto end_coredump;
1803 } 1787 }
1804 1788
1805 DUMP_SEEK(dataoff); 1789 if (!dump_seek(file, dataoff))
1790 goto end_coredump;
1806 1791
1807 if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) 1792 if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
1808 goto end_coredump; 1793 goto end_coredump;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6e..a2796651e756 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
278 ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); 278 ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
279 if (ret <= 0) 279 if (ret <= 0)
280 break; 280 break;
281 if (ret >= (unsigned long) -4096)
282 break;
283 len -= ret; 281 len -= ret;
284 282
285 strm.next_in = buf; 283 strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
335 "(%d != %d)", (unsigned) r, curid, id); 333 "(%d != %d)", (unsigned) r, curid, id);
336 goto failed; 334 goto failed;
337 } else if ( ! p->lib_list[id].loaded && 335 } else if ( ! p->lib_list[id].loaded &&
338 load_flat_shared_library(id, p) > (unsigned long) -4096) { 336 IS_ERR_VALUE(load_flat_shared_library(id, p))) {
339 printk("BINFMT_FLAT: failed to load library %d", id); 337 printk("BINFMT_FLAT: failed to load library %d", id);
340 goto failed; 338 goto failed;
341 } 339 }
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
545 textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, 543 textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
546 MAP_PRIVATE|MAP_EXECUTABLE, 0); 544 MAP_PRIVATE|MAP_EXECUTABLE, 0);
547 up_write(&current->mm->mmap_sem); 545 up_write(&current->mm->mmap_sem);
548 if (!textpos || textpos >= (unsigned long) -4096) { 546 if (!textpos || IS_ERR_VALUE(textpos)) {
549 if (!textpos) 547 if (!textpos)
550 textpos = (unsigned long) -ENOMEM; 548 textpos = (unsigned long) -ENOMEM;
551 printk("Unable to mmap process text, errno %d\n", (int)-textpos); 549 printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
560 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 558 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
561 up_write(&current->mm->mmap_sem); 559 up_write(&current->mm->mmap_sem);
562 560
563 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 561 if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
564 if (!realdatastart) 562 if (!realdatastart)
565 realdatastart = (unsigned long) -ENOMEM; 563 realdatastart = (unsigned long) -ENOMEM;
566 printk("Unable to allocate RAM for process data, errno %d\n", 564 printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
587 result = bprm->file->f_op->read(bprm->file, (char *) datapos, 585 result = bprm->file->f_op->read(bprm->file, (char *) datapos,
588 data_len + (relocs * sizeof(unsigned long)), &fpos); 586 data_len + (relocs * sizeof(unsigned long)), &fpos);
589 } 587 }
590 if (result >= (unsigned long)-4096) { 588 if (IS_ERR_VALUE(result)) {
591 printk("Unable to read data+bss, errno %d\n", (int)-result); 589 printk("Unable to read data+bss, errno %d\n", (int)-result);
592 do_munmap(current->mm, textpos, text_len); 590 do_munmap(current->mm, textpos, text_len);
593 do_munmap(current->mm, realdatastart, data_len + extra); 591 do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
607 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 605 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
608 up_write(&current->mm->mmap_sem); 606 up_write(&current->mm->mmap_sem);
609 607
610 if (!textpos || textpos >= (unsigned long) -4096) { 608 if (!textpos || IS_ERR_VALUE(textpos)) {
611 if (!textpos) 609 if (!textpos)
612 textpos = (unsigned long) -ENOMEM; 610 textpos = (unsigned long) -ENOMEM;
613 printk("Unable to allocate RAM for process text/data, errno %d\n", 611 printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
641 fpos = 0; 639 fpos = 0;
642 result = bprm->file->f_op->read(bprm->file, 640 result = bprm->file->f_op->read(bprm->file,
643 (char *) textpos, text_len, &fpos); 641 (char *) textpos, text_len, &fpos);
644 if (result < (unsigned long) -4096) 642 if (!IS_ERR_VALUE(result))
645 result = decompress_exec(bprm, text_len, (char *) datapos, 643 result = decompress_exec(bprm, text_len, (char *) datapos,
646 data_len + (relocs * sizeof(unsigned long)), 0); 644 data_len + (relocs * sizeof(unsigned long)), 0);
647 } 645 }
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
651 fpos = 0; 649 fpos = 0;
652 result = bprm->file->f_op->read(bprm->file, 650 result = bprm->file->f_op->read(bprm->file,
653 (char *) textpos, text_len, &fpos); 651 (char *) textpos, text_len, &fpos);
654 if (result < (unsigned long) -4096) { 652 if (!IS_ERR_VALUE(result)) {
655 fpos = ntohl(hdr->data_start); 653 fpos = ntohl(hdr->data_start);
656 result = bprm->file->f_op->read(bprm->file, (char *) datapos, 654 result = bprm->file->f_op->read(bprm->file, (char *) datapos,
657 data_len + (relocs * sizeof(unsigned long)), &fpos); 655 data_len + (relocs * sizeof(unsigned long)), &fpos);
658 } 656 }
659 } 657 }
660 if (result >= (unsigned long)-4096) { 658 if (IS_ERR_VALUE(result)) {
661 printk("Unable to read code+data+bss, errno %d\n",(int)-result); 659 printk("Unable to read code+data+bss, errno %d\n",(int)-result);
662 do_munmap(current->mm, textpos, text_len + data_len + extra + 660 do_munmap(current->mm, textpos, text_len + data_len + extra +
663 MAX_SHARED_LIBS * sizeof(unsigned long)); 661 MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
835 833
836 res = prepare_binprm(&bprm); 834 res = prepare_binprm(&bprm);
837 835
838 if (res <= (unsigned long)-4096) 836 if (!IS_ERR_VALUE(res))
839 res = load_flat_file(&bprm, libs, id, NULL); 837 res = load_flat_file(&bprm, libs, id, NULL);
840 838
841 abort_creds(bprm.cred); 839 abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
880 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ 878 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */
881 879
882 res = load_flat_file(bprm, &libinfo, 0, &stack_len); 880 res = load_flat_file(bprm, &libinfo, 0, &stack_len);
883 if (res > (unsigned long)-4096) 881 if (IS_ERR_VALUE(res))
884 return res; 882 return res;
885 883
886 /* Update data segment pointers for all libraries */ 884 /* Update data segment pointers for all libraries */
diff --git a/fs/bio.c b/fs/bio.c
index 76738005c8e8..402cb84a92a1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
249 249
250 mempool_free(p, bs->bio_pool); 250 mempool_free(p, bs->bio_pool);
251} 251}
252EXPORT_SYMBOL(bio_free);
252 253
253void bio_init(struct bio *bio) 254void bio_init(struct bio *bio)
254{ 255{
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
257 bio->bi_comp_cpu = -1; 258 bio->bi_comp_cpu = -1;
258 atomic_set(&bio->bi_cnt, 1); 259 atomic_set(&bio->bi_cnt, 1);
259} 260}
261EXPORT_SYMBOL(bio_init);
260 262
261/** 263/**
262 * bio_alloc_bioset - allocate a bio for I/O 264 * bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
311 mempool_free(p, bs->bio_pool); 313 mempool_free(p, bs->bio_pool);
312 return NULL; 314 return NULL;
313} 315}
316EXPORT_SYMBOL(bio_alloc_bioset);
314 317
315static void bio_fs_destructor(struct bio *bio) 318static void bio_fs_destructor(struct bio *bio)
316{ 319{
@@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
337 340
338 return bio; 341 return bio;
339} 342}
343EXPORT_SYMBOL(bio_alloc);
340 344
341static void bio_kmalloc_destructor(struct bio *bio) 345static void bio_kmalloc_destructor(struct bio *bio)
342{ 346{
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
380 384
381 return bio; 385 return bio;
382} 386}
387EXPORT_SYMBOL(bio_kmalloc);
383 388
384void zero_fill_bio(struct bio *bio) 389void zero_fill_bio(struct bio *bio)
385{ 390{
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
416 bio->bi_destructor(bio); 421 bio->bi_destructor(bio);
417 } 422 }
418} 423}
424EXPORT_SYMBOL(bio_put);
419 425
420inline int bio_phys_segments(struct request_queue *q, struct bio *bio) 426inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
421{ 427{
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
424 430
425 return bio->bi_phys_segments; 431 return bio->bi_phys_segments;
426} 432}
433EXPORT_SYMBOL(bio_phys_segments);
427 434
428/** 435/**
429 * __bio_clone - clone a bio 436 * __bio_clone - clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
451 bio->bi_size = bio_src->bi_size; 458 bio->bi_size = bio_src->bi_size;
452 bio->bi_idx = bio_src->bi_idx; 459 bio->bi_idx = bio_src->bi_idx;
453} 460}
461EXPORT_SYMBOL(__bio_clone);
454 462
455/** 463/**
456 * bio_clone - clone a bio 464 * bio_clone - clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
482 490
483 return b; 491 return b;
484} 492}
493EXPORT_SYMBOL(bio_clone);
485 494
486/** 495/**
487 * bio_get_nr_vecs - return approx number of vecs 496 * bio_get_nr_vecs - return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
505 514
506 return nr_pages; 515 return nr_pages;
507} 516}
517EXPORT_SYMBOL(bio_get_nr_vecs);
508 518
509static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page 519static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
510 *page, unsigned int len, unsigned int offset, 520 *page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
635 return __bio_add_page(q, bio, page, len, offset, 645 return __bio_add_page(q, bio, page, len, offset,
636 queue_max_hw_sectors(q)); 646 queue_max_hw_sectors(q));
637} 647}
648EXPORT_SYMBOL(bio_add_pc_page);
638 649
639/** 650/**
640 * bio_add_page - attempt to add page to bio 651 * bio_add_page - attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
655 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 666 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
656 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); 667 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
657} 668}
669EXPORT_SYMBOL(bio_add_page);
658 670
659struct bio_map_data { 671struct bio_map_data {
660 struct bio_vec *iovecs; 672 struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
776 bio_put(bio); 788 bio_put(bio);
777 return ret; 789 return ret;
778} 790}
791EXPORT_SYMBOL(bio_uncopy_user);
779 792
780/** 793/**
781 * bio_copy_user_iov - copy user data to bio 794 * bio_copy_user_iov - copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
920 933
921 return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); 934 return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
922} 935}
936EXPORT_SYMBOL(bio_copy_user);
923 937
924static struct bio *__bio_map_user_iov(struct request_queue *q, 938static struct bio *__bio_map_user_iov(struct request_queue *q,
925 struct block_device *bdev, 939 struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
1050 1064
1051 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); 1065 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
1052} 1066}
1067EXPORT_SYMBOL(bio_map_user);
1053 1068
1054/** 1069/**
1055 * bio_map_user_iov - map user sg_iovec table into bio 1070 * bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
1117 __bio_unmap_user(bio); 1132 __bio_unmap_user(bio);
1118 bio_put(bio); 1133 bio_put(bio);
1119} 1134}
1135EXPORT_SYMBOL(bio_unmap_user);
1120 1136
1121static void bio_map_kern_endio(struct bio *bio, int err) 1137static void bio_map_kern_endio(struct bio *bio, int err)
1122{ 1138{
1123 bio_put(bio); 1139 bio_put(bio);
1124} 1140}
1125 1141
1126
1127static struct bio *__bio_map_kern(struct request_queue *q, void *data, 1142static struct bio *__bio_map_kern(struct request_queue *q, void *data,
1128 unsigned int len, gfp_t gfp_mask) 1143 unsigned int len, gfp_t gfp_mask)
1129{ 1144{
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1189 bio_put(bio); 1204 bio_put(bio);
1190 return ERR_PTR(-EINVAL); 1205 return ERR_PTR(-EINVAL);
1191} 1206}
1207EXPORT_SYMBOL(bio_map_kern);
1192 1208
1193static void bio_copy_kern_endio(struct bio *bio, int err) 1209static void bio_copy_kern_endio(struct bio *bio, int err)
1194{ 1210{
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1250 1266
1251 return bio; 1267 return bio;
1252} 1268}
1269EXPORT_SYMBOL(bio_copy_kern);
1253 1270
1254/* 1271/*
1255 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions 1272 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
1400 if (bio->bi_end_io) 1417 if (bio->bi_end_io)
1401 bio->bi_end_io(bio, error); 1418 bio->bi_end_io(bio, error);
1402} 1419}
1420EXPORT_SYMBOL(bio_endio);
1403 1421
1404void bio_pair_release(struct bio_pair *bp) 1422void bio_pair_release(struct bio_pair *bp)
1405{ 1423{
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
1410 mempool_free(bp, bp->bio2.bi_private); 1428 mempool_free(bp, bp->bio2.bi_private);
1411 } 1429 }
1412} 1430}
1431EXPORT_SYMBOL(bio_pair_release);
1413 1432
1414static void bio_pair_end_1(struct bio *bi, int err) 1433static void bio_pair_end_1(struct bio *bi, int err)
1415{ 1434{
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1477 1496
1478 return bp; 1497 return bp;
1479} 1498}
1499EXPORT_SYMBOL(bio_split);
1480 1500
1481/** 1501/**
1482 * bio_sector_offset - Find hardware sector offset in bio 1502 * bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
1547 1567
1548 kfree(bs); 1568 kfree(bs);
1549} 1569}
1570EXPORT_SYMBOL(bioset_free);
1550 1571
1551/** 1572/**
1552 * bioset_create - Create a bio_set 1573 * bioset_create - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
1592 bioset_free(bs); 1613 bioset_free(bs);
1593 return NULL; 1614 return NULL;
1594} 1615}
1616EXPORT_SYMBOL(bioset_create);
1595 1617
1596static void __init biovec_init_slabs(void) 1618static void __init biovec_init_slabs(void)
1597{ 1619{
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
1636 1658
1637 return 0; 1659 return 0;
1638} 1660}
1639
1640subsys_initcall(init_bio); 1661subsys_initcall(init_bio);
1641
1642EXPORT_SYMBOL(bio_alloc);
1643EXPORT_SYMBOL(bio_kmalloc);
1644EXPORT_SYMBOL(bio_put);
1645EXPORT_SYMBOL(bio_free);
1646EXPORT_SYMBOL(bio_endio);
1647EXPORT_SYMBOL(bio_init);
1648EXPORT_SYMBOL(__bio_clone);
1649EXPORT_SYMBOL(bio_clone);
1650EXPORT_SYMBOL(bio_phys_segments);
1651EXPORT_SYMBOL(bio_add_page);
1652EXPORT_SYMBOL(bio_add_pc_page);
1653EXPORT_SYMBOL(bio_get_nr_vecs);
1654EXPORT_SYMBOL(bio_map_user);
1655EXPORT_SYMBOL(bio_unmap_user);
1656EXPORT_SYMBOL(bio_map_kern);
1657EXPORT_SYMBOL(bio_copy_kern);
1658EXPORT_SYMBOL(bio_pair_release);
1659EXPORT_SYMBOL(bio_split);
1660EXPORT_SYMBOL(bio_copy_user);
1661EXPORT_SYMBOL(bio_uncopy_user);
1662EXPORT_SYMBOL(bioset_create);
1663EXPORT_SYMBOL(bioset_free);
1664EXPORT_SYMBOL(bio_alloc_bioset);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 71e7e03ac343..9cf4b926f8e4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
216 * freeze_bdev -- lock a filesystem and force it into a consistent state 216 * freeze_bdev -- lock a filesystem and force it into a consistent state
217 * @bdev: blockdevice to lock 217 * @bdev: blockdevice to lock
218 * 218 *
219 * This takes the block device bd_mount_sem to make sure no new mounts
220 * happen on bdev until thaw_bdev() is called.
221 * If a superblock is found on this device, we take the s_umount semaphore 219 * If a superblock is found on this device, we take the s_umount semaphore
222 * on it to make sure nobody unmounts until the snapshot creation is done. 220 * on it to make sure nobody unmounts until the snapshot creation is done.
223 * The reference counter (bd_fsfreeze_count) guarantees that only the last 221 * The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev)
232 int error = 0; 230 int error = 0;
233 231
234 mutex_lock(&bdev->bd_fsfreeze_mutex); 232 mutex_lock(&bdev->bd_fsfreeze_mutex);
235 if (bdev->bd_fsfreeze_count > 0) { 233 if (++bdev->bd_fsfreeze_count > 1) {
236 bdev->bd_fsfreeze_count++; 234 /*
235 * We don't even need to grab a reference - the first call
236 * to freeze_bdev grab an active reference and only the last
237 * thaw_bdev drops it.
238 */
237 sb = get_super(bdev); 239 sb = get_super(bdev);
240 drop_super(sb);
238 mutex_unlock(&bdev->bd_fsfreeze_mutex); 241 mutex_unlock(&bdev->bd_fsfreeze_mutex);
239 return sb; 242 return sb;
240 } 243 }
241 bdev->bd_fsfreeze_count++; 244
242 245 sb = get_active_super(bdev);
243 down(&bdev->bd_mount_sem); 246 if (!sb)
244 sb = get_super(bdev); 247 goto out;
245 if (sb && !(sb->s_flags & MS_RDONLY)) { 248 if (sb->s_flags & MS_RDONLY) {
246 sb->s_frozen = SB_FREEZE_WRITE; 249 deactivate_locked_super(sb);
247 smp_wmb(); 250 mutex_unlock(&bdev->bd_fsfreeze_mutex);
248 251 return sb;
249 sync_filesystem(sb); 252 }
250 253
251 sb->s_frozen = SB_FREEZE_TRANS; 254 sb->s_frozen = SB_FREEZE_WRITE;
252 smp_wmb(); 255 smp_wmb();
253 256
254 sync_blockdev(sb->s_bdev); 257 sync_filesystem(sb);
255 258
256 if (sb->s_op->freeze_fs) { 259 sb->s_frozen = SB_FREEZE_TRANS;
257 error = sb->s_op->freeze_fs(sb); 260 smp_wmb();
258 if (error) { 261
259 printk(KERN_ERR 262 sync_blockdev(sb->s_bdev);
260 "VFS:Filesystem freeze failed\n"); 263
261 sb->s_frozen = SB_UNFROZEN; 264 if (sb->s_op->freeze_fs) {
262 drop_super(sb); 265 error = sb->s_op->freeze_fs(sb);
263 up(&bdev->bd_mount_sem); 266 if (error) {
264 bdev->bd_fsfreeze_count--; 267 printk(KERN_ERR
265 mutex_unlock(&bdev->bd_fsfreeze_mutex); 268 "VFS:Filesystem freeze failed\n");
266 return ERR_PTR(error); 269 sb->s_frozen = SB_UNFROZEN;
267 } 270 deactivate_locked_super(sb);
271 bdev->bd_fsfreeze_count--;
272 mutex_unlock(&bdev->bd_fsfreeze_mutex);
273 return ERR_PTR(error);
268 } 274 }
269 } 275 }
276 up_write(&sb->s_umount);
270 277
278 out:
271 sync_blockdev(bdev); 279 sync_blockdev(bdev);
272 mutex_unlock(&bdev->bd_fsfreeze_mutex); 280 mutex_unlock(&bdev->bd_fsfreeze_mutex);
273 281 return sb; /* thaw_bdev releases s->s_umount */
274 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
275} 282}
276EXPORT_SYMBOL(freeze_bdev); 283EXPORT_SYMBOL(freeze_bdev);
277 284
@@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
284 */ 291 */
285int thaw_bdev(struct block_device *bdev, struct super_block *sb) 292int thaw_bdev(struct block_device *bdev, struct super_block *sb)
286{ 293{
287 int error = 0; 294 int error = -EINVAL;
288 295
289 mutex_lock(&bdev->bd_fsfreeze_mutex); 296 mutex_lock(&bdev->bd_fsfreeze_mutex);
290 if (!bdev->bd_fsfreeze_count) { 297 if (!bdev->bd_fsfreeze_count)
291 mutex_unlock(&bdev->bd_fsfreeze_mutex); 298 goto out_unlock;
292 return -EINVAL; 299
293 } 300 error = 0;
294 301 if (--bdev->bd_fsfreeze_count > 0)
295 bdev->bd_fsfreeze_count--; 302 goto out_unlock;
296 if (bdev->bd_fsfreeze_count > 0) { 303
297 if (sb) 304 if (!sb)
298 drop_super(sb); 305 goto out_unlock;
299 mutex_unlock(&bdev->bd_fsfreeze_mutex); 306
300 return 0; 307 BUG_ON(sb->s_bdev != bdev);
301 } 308 down_write(&sb->s_umount);
302 309 if (sb->s_flags & MS_RDONLY)
303 if (sb) { 310 goto out_deactivate;
304 BUG_ON(sb->s_bdev != bdev); 311
305 if (!(sb->s_flags & MS_RDONLY)) { 312 if (sb->s_op->unfreeze_fs) {
306 if (sb->s_op->unfreeze_fs) { 313 error = sb->s_op->unfreeze_fs(sb);
307 error = sb->s_op->unfreeze_fs(sb); 314 if (error) {
308 if (error) { 315 printk(KERN_ERR
309 printk(KERN_ERR 316 "VFS:Filesystem thaw failed\n");
310 "VFS:Filesystem thaw failed\n"); 317 sb->s_frozen = SB_FREEZE_TRANS;
311 sb->s_frozen = SB_FREEZE_TRANS; 318 bdev->bd_fsfreeze_count++;
312 bdev->bd_fsfreeze_count++; 319 mutex_unlock(&bdev->bd_fsfreeze_mutex);
313 mutex_unlock(&bdev->bd_fsfreeze_mutex); 320 return error;
314 return error;
315 }
316 }
317 sb->s_frozen = SB_UNFROZEN;
318 smp_wmb();
319 wake_up(&sb->s_wait_unfrozen);
320 } 321 }
321 drop_super(sb);
322 } 322 }
323 323
324 up(&bdev->bd_mount_sem); 324 sb->s_frozen = SB_UNFROZEN;
325 smp_wmb();
326 wake_up(&sb->s_wait_unfrozen);
327
328out_deactivate:
329 if (sb)
330 deactivate_locked_super(sb);
331out_unlock:
325 mutex_unlock(&bdev->bd_fsfreeze_mutex); 332 mutex_unlock(&bdev->bd_fsfreeze_mutex);
326 return 0; 333 return 0;
327} 334}
@@ -430,7 +437,6 @@ static void init_once(void *foo)
430 437
431 memset(bdev, 0, sizeof(*bdev)); 438 memset(bdev, 0, sizeof(*bdev));
432 mutex_init(&bdev->bd_mutex); 439 mutex_init(&bdev->bd_mutex);
433 sema_init(&bdev->bd_mount_sem, 1);
434 INIT_LIST_HEAD(&bdev->bd_inodes); 440 INIT_LIST_HEAD(&bdev->bd_inodes);
435 INIT_LIST_HEAD(&bdev->bd_list); 441 INIT_LIST_HEAD(&bdev->bd_list);
436#ifdef CONFIG_SYSFS 442#ifdef CONFIG_SYSFS
@@ -1114,7 +1120,7 @@ EXPORT_SYMBOL(revalidate_disk);
1114int check_disk_change(struct block_device *bdev) 1120int check_disk_change(struct block_device *bdev)
1115{ 1121{
1116 struct gendisk *disk = bdev->bd_disk; 1122 struct gendisk *disk = bdev->bd_disk;
1117 struct block_device_operations * bdops = disk->fops; 1123 const struct block_device_operations *bdops = disk->fops;
1118 1124
1119 if (!bdops->media_changed) 1125 if (!bdops->media_changed)
1120 return 0; 1126 return 0;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..69b355ae7f49 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
27#include "btrfs_inode.h" 27#include "btrfs_inode.h"
28#include "xattr.h" 28#include "xattr.h"
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_BTRFS_POSIX_ACL
31 31
32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
33{ 33{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
313 .set = btrfs_xattr_acl_access_set, 313 .set = btrfs_xattr_acl_access_set,
314}; 314};
315 315
316#else /* CONFIG_FS_POSIX_ACL */ 316#else /* CONFIG_BTRFS_POSIX_ACL */
317 317
318int btrfs_acl_chmod(struct inode *inode) 318int btrfs_acl_chmod(struct inode *inode)
319{ 319{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
325 return 0; 325 return 0;
326} 326}
327 327
328#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_BTRFS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449ab..282ca085c2fb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
48 /* number of things on the pending list */ 48 /* number of things on the pending list */
49 atomic_t num_pending; 49 atomic_t num_pending;
50 50
51 /* reference counter for this struct */
52 atomic_t refs;
53
51 unsigned long sequence; 54 unsigned long sequence;
52 55
53 /* protects the pending list. */ 56 /* protects the pending list. */
@@ -71,7 +74,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
71 unsigned long flags; 74 unsigned long flags;
72 spin_lock_irqsave(&worker->workers->lock, flags); 75 spin_lock_irqsave(&worker->workers->lock, flags);
73 worker->idle = 1; 76 worker->idle = 1;
74 list_move(&worker->worker_list, &worker->workers->idle_list); 77
78 /* the list may be empty if the worker is just starting */
79 if (!list_empty(&worker->worker_list)) {
80 list_move(&worker->worker_list,
81 &worker->workers->idle_list);
82 }
75 spin_unlock_irqrestore(&worker->workers->lock, flags); 83 spin_unlock_irqrestore(&worker->workers->lock, flags);
76 } 84 }
77} 85}
@@ -87,23 +95,49 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
87 unsigned long flags; 95 unsigned long flags;
88 spin_lock_irqsave(&worker->workers->lock, flags); 96 spin_lock_irqsave(&worker->workers->lock, flags);
89 worker->idle = 0; 97 worker->idle = 0;
90 list_move_tail(&worker->worker_list, 98
91 &worker->workers->worker_list); 99 if (!list_empty(&worker->worker_list)) {
100 list_move_tail(&worker->worker_list,
101 &worker->workers->worker_list);
102 }
92 spin_unlock_irqrestore(&worker->workers->lock, flags); 103 spin_unlock_irqrestore(&worker->workers->lock, flags);
93 } 104 }
94} 105}
95 106
96static noinline int run_ordered_completions(struct btrfs_workers *workers, 107static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
97 struct btrfs_work *work)
98{ 108{
109 struct btrfs_workers *workers = worker->workers;
99 unsigned long flags; 110 unsigned long flags;
100 111
112 rmb();
113 if (!workers->atomic_start_pending)
114 return;
115
116 spin_lock_irqsave(&workers->lock, flags);
117 if (!workers->atomic_start_pending)
118 goto out;
119
120 workers->atomic_start_pending = 0;
121 if (workers->num_workers >= workers->max_workers)
122 goto out;
123
124 spin_unlock_irqrestore(&workers->lock, flags);
125 btrfs_start_workers(workers, 1);
126 return;
127
128out:
129 spin_unlock_irqrestore(&workers->lock, flags);
130}
131
132static noinline int run_ordered_completions(struct btrfs_workers *workers,
133 struct btrfs_work *work)
134{
101 if (!workers->ordered) 135 if (!workers->ordered)
102 return 0; 136 return 0;
103 137
104 set_bit(WORK_DONE_BIT, &work->flags); 138 set_bit(WORK_DONE_BIT, &work->flags);
105 139
106 spin_lock_irqsave(&workers->lock, flags); 140 spin_lock(&workers->order_lock);
107 141
108 while (1) { 142 while (1) {
109 if (!list_empty(&workers->prio_order_list)) { 143 if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +160,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
126 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 160 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
127 break; 161 break;
128 162
129 spin_unlock_irqrestore(&workers->lock, flags); 163 spin_unlock(&workers->order_lock);
130 164
131 work->ordered_func(work); 165 work->ordered_func(work);
132 166
133 /* now take the lock again and call the freeing code */ 167 /* now take the lock again and call the freeing code */
134 spin_lock_irqsave(&workers->lock, flags); 168 spin_lock(&workers->order_lock);
135 list_del(&work->order_list); 169 list_del(&work->order_list);
136 work->ordered_free(work); 170 work->ordered_free(work);
137 } 171 }
138 172
139 spin_unlock_irqrestore(&workers->lock, flags); 173 spin_unlock(&workers->order_lock);
140 return 0; 174 return 0;
141} 175}
142 176
177static void put_worker(struct btrfs_worker_thread *worker)
178{
179 if (atomic_dec_and_test(&worker->refs))
180 kfree(worker);
181}
182
183static int try_worker_shutdown(struct btrfs_worker_thread *worker)
184{
185 int freeit = 0;
186
187 spin_lock_irq(&worker->lock);
188 spin_lock(&worker->workers->lock);
189 if (worker->workers->num_workers > 1 &&
190 worker->idle &&
191 !worker->working &&
192 !list_empty(&worker->worker_list) &&
193 list_empty(&worker->prio_pending) &&
194 list_empty(&worker->pending) &&
195 atomic_read(&worker->num_pending) == 0) {
196 freeit = 1;
197 list_del_init(&worker->worker_list);
198 worker->workers->num_workers--;
199 }
200 spin_unlock(&worker->workers->lock);
201 spin_unlock_irq(&worker->lock);
202
203 if (freeit)
204 put_worker(worker);
205 return freeit;
206}
207
208static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
209 struct list_head *prio_head,
210 struct list_head *head)
211{
212 struct btrfs_work *work = NULL;
213 struct list_head *cur = NULL;
214
215 if(!list_empty(prio_head))
216 cur = prio_head->next;
217
218 smp_mb();
219 if (!list_empty(&worker->prio_pending))
220 goto refill;
221
222 if (!list_empty(head))
223 cur = head->next;
224
225 if (cur)
226 goto out;
227
228refill:
229 spin_lock_irq(&worker->lock);
230 list_splice_tail_init(&worker->prio_pending, prio_head);
231 list_splice_tail_init(&worker->pending, head);
232
233 if (!list_empty(prio_head))
234 cur = prio_head->next;
235 else if (!list_empty(head))
236 cur = head->next;
237 spin_unlock_irq(&worker->lock);
238
239 if (!cur)
240 goto out_fail;
241
242out:
243 work = list_entry(cur, struct btrfs_work, list);
244
245out_fail:
246 return work;
247}
248
143/* 249/*
144 * main loop for servicing work items 250 * main loop for servicing work items
145 */ 251 */
146static int worker_loop(void *arg) 252static int worker_loop(void *arg)
147{ 253{
148 struct btrfs_worker_thread *worker = arg; 254 struct btrfs_worker_thread *worker = arg;
149 struct list_head *cur; 255 struct list_head head;
256 struct list_head prio_head;
150 struct btrfs_work *work; 257 struct btrfs_work *work;
258
259 INIT_LIST_HEAD(&head);
260 INIT_LIST_HEAD(&prio_head);
261
151 do { 262 do {
152 spin_lock_irq(&worker->lock); 263again:
153again_locked:
154 while (1) { 264 while (1) {
155 if (!list_empty(&worker->prio_pending)) 265
156 cur = worker->prio_pending.next; 266
157 else if (!list_empty(&worker->pending)) 267 work = get_next_work(worker, &prio_head, &head);
158 cur = worker->pending.next; 268 if (!work)
159 else
160 break; 269 break;
161 270
162 work = list_entry(cur, struct btrfs_work, list);
163 list_del(&work->list); 271 list_del(&work->list);
164 clear_bit(WORK_QUEUED_BIT, &work->flags); 272 clear_bit(WORK_QUEUED_BIT, &work->flags);
165 273
166 work->worker = worker; 274 work->worker = worker;
167 spin_unlock_irq(&worker->lock);
168 275
169 work->func(work); 276 work->func(work);
170 277
@@ -175,9 +282,13 @@ again_locked:
175 */ 282 */
176 run_ordered_completions(worker->workers, work); 283 run_ordered_completions(worker->workers, work);
177 284
178 spin_lock_irq(&worker->lock); 285 check_pending_worker_creates(worker);
179 check_idle_worker(worker); 286
180 } 287 }
288
289 spin_lock_irq(&worker->lock);
290 check_idle_worker(worker);
291
181 if (freezing(current)) { 292 if (freezing(current)) {
182 worker->working = 0; 293 worker->working = 0;
183 spin_unlock_irq(&worker->lock); 294 spin_unlock_irq(&worker->lock);
@@ -216,8 +327,10 @@ again_locked:
216 spin_lock_irq(&worker->lock); 327 spin_lock_irq(&worker->lock);
217 set_current_state(TASK_INTERRUPTIBLE); 328 set_current_state(TASK_INTERRUPTIBLE);
218 if (!list_empty(&worker->pending) || 329 if (!list_empty(&worker->pending) ||
219 !list_empty(&worker->prio_pending)) 330 !list_empty(&worker->prio_pending)) {
220 goto again_locked; 331 spin_unlock_irq(&worker->lock);
332 goto again;
333 }
221 334
222 /* 335 /*
223 * this makes sure we get a wakeup when someone 336 * this makes sure we get a wakeup when someone
@@ -226,8 +339,13 @@ again_locked:
226 worker->working = 0; 339 worker->working = 0;
227 spin_unlock_irq(&worker->lock); 340 spin_unlock_irq(&worker->lock);
228 341
229 if (!kthread_should_stop()) 342 if (!kthread_should_stop()) {
230 schedule(); 343 schedule_timeout(HZ * 120);
344 if (!worker->working &&
345 try_worker_shutdown(worker)) {
346 return 0;
347 }
348 }
231 } 349 }
232 __set_current_state(TASK_RUNNING); 350 __set_current_state(TASK_RUNNING);
233 } 351 }
@@ -242,16 +360,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
242{ 360{
243 struct list_head *cur; 361 struct list_head *cur;
244 struct btrfs_worker_thread *worker; 362 struct btrfs_worker_thread *worker;
363 int can_stop;
245 364
365 spin_lock_irq(&workers->lock);
246 list_splice_init(&workers->idle_list, &workers->worker_list); 366 list_splice_init(&workers->idle_list, &workers->worker_list);
247 while (!list_empty(&workers->worker_list)) { 367 while (!list_empty(&workers->worker_list)) {
248 cur = workers->worker_list.next; 368 cur = workers->worker_list.next;
249 worker = list_entry(cur, struct btrfs_worker_thread, 369 worker = list_entry(cur, struct btrfs_worker_thread,
250 worker_list); 370 worker_list);
251 kthread_stop(worker->task); 371
252 list_del(&worker->worker_list); 372 atomic_inc(&worker->refs);
253 kfree(worker); 373 workers->num_workers -= 1;
374 if (!list_empty(&worker->worker_list)) {
375 list_del_init(&worker->worker_list);
376 put_worker(worker);
377 can_stop = 1;
378 } else
379 can_stop = 0;
380 spin_unlock_irq(&workers->lock);
381 if (can_stop)
382 kthread_stop(worker->task);
383 spin_lock_irq(&workers->lock);
384 put_worker(worker);
254 } 385 }
386 spin_unlock_irq(&workers->lock);
255 return 0; 387 return 0;
256} 388}
257 389
@@ -266,10 +398,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
266 INIT_LIST_HEAD(&workers->order_list); 398 INIT_LIST_HEAD(&workers->order_list);
267 INIT_LIST_HEAD(&workers->prio_order_list); 399 INIT_LIST_HEAD(&workers->prio_order_list);
268 spin_lock_init(&workers->lock); 400 spin_lock_init(&workers->lock);
401 spin_lock_init(&workers->order_lock);
269 workers->max_workers = max; 402 workers->max_workers = max;
270 workers->idle_thresh = 32; 403 workers->idle_thresh = 32;
271 workers->name = name; 404 workers->name = name;
272 workers->ordered = 0; 405 workers->ordered = 0;
406 workers->atomic_start_pending = 0;
407 workers->atomic_worker_start = 0;
273} 408}
274 409
275/* 410/*
@@ -293,7 +428,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
293 INIT_LIST_HEAD(&worker->prio_pending); 428 INIT_LIST_HEAD(&worker->prio_pending);
294 INIT_LIST_HEAD(&worker->worker_list); 429 INIT_LIST_HEAD(&worker->worker_list);
295 spin_lock_init(&worker->lock); 430 spin_lock_init(&worker->lock);
431
296 atomic_set(&worker->num_pending, 0); 432 atomic_set(&worker->num_pending, 0);
433 atomic_set(&worker->refs, 1);
297 worker->workers = workers; 434 worker->workers = workers;
298 worker->task = kthread_run(worker_loop, worker, 435 worker->task = kthread_run(worker_loop, worker,
299 "btrfs-%s-%d", workers->name, 436 "btrfs-%s-%d", workers->name,
@@ -303,7 +440,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
303 kfree(worker); 440 kfree(worker);
304 goto fail; 441 goto fail;
305 } 442 }
306
307 spin_lock_irq(&workers->lock); 443 spin_lock_irq(&workers->lock);
308 list_add_tail(&worker->worker_list, &workers->idle_list); 444 list_add_tail(&worker->worker_list, &workers->idle_list);
309 worker->idle = 1; 445 worker->idle = 1;
@@ -350,7 +486,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
350 */ 486 */
351 next = workers->worker_list.next; 487 next = workers->worker_list.next;
352 worker = list_entry(next, struct btrfs_worker_thread, worker_list); 488 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
353 atomic_inc(&worker->num_pending);
354 worker->sequence++; 489 worker->sequence++;
355 490
356 if (worker->sequence % workers->idle_thresh == 0) 491 if (worker->sequence % workers->idle_thresh == 0)
@@ -367,28 +502,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
367{ 502{
368 struct btrfs_worker_thread *worker; 503 struct btrfs_worker_thread *worker;
369 unsigned long flags; 504 unsigned long flags;
505 struct list_head *fallback;
370 506
371again: 507again:
372 spin_lock_irqsave(&workers->lock, flags); 508 spin_lock_irqsave(&workers->lock, flags);
373 worker = next_worker(workers); 509 worker = next_worker(workers);
374 spin_unlock_irqrestore(&workers->lock, flags);
375 510
376 if (!worker) { 511 if (!worker) {
377 spin_lock_irqsave(&workers->lock, flags);
378 if (workers->num_workers >= workers->max_workers) { 512 if (workers->num_workers >= workers->max_workers) {
379 struct list_head *fallback = NULL; 513 goto fallback;
380 /* 514 } else if (workers->atomic_worker_start) {
381 * we have failed to find any workers, just 515 workers->atomic_start_pending = 1;
382 * return the force one 516 goto fallback;
383 */
384 if (!list_empty(&workers->worker_list))
385 fallback = workers->worker_list.next;
386 if (!list_empty(&workers->idle_list))
387 fallback = workers->idle_list.next;
388 BUG_ON(!fallback);
389 worker = list_entry(fallback,
390 struct btrfs_worker_thread, worker_list);
391 spin_unlock_irqrestore(&workers->lock, flags);
392 } else { 517 } else {
393 spin_unlock_irqrestore(&workers->lock, flags); 518 spin_unlock_irqrestore(&workers->lock, flags);
394 /* we're below the limit, start another worker */ 519 /* we're below the limit, start another worker */
@@ -396,6 +521,28 @@ again:
396 goto again; 521 goto again;
397 } 522 }
398 } 523 }
524 goto found;
525
526fallback:
527 fallback = NULL;
528 /*
529 * we have failed to find any workers, just
530 * return the first one we can find.
531 */
532 if (!list_empty(&workers->worker_list))
533 fallback = workers->worker_list.next;
534 if (!list_empty(&workers->idle_list))
535 fallback = workers->idle_list.next;
536 BUG_ON(!fallback);
537 worker = list_entry(fallback,
538 struct btrfs_worker_thread, worker_list);
539found:
540 /*
541 * this makes sure the worker doesn't exit before it is placed
542 * onto a busy/idle list
543 */
544 atomic_inc(&worker->num_pending);
545 spin_unlock_irqrestore(&workers->lock, flags);
399 return worker; 546 return worker;
400} 547}
401 548
@@ -427,7 +574,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
427 spin_lock(&worker->workers->lock); 574 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 575 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 576 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 577 &worker->workers->worker_list);
431 spin_unlock(&worker->workers->lock); 578 spin_unlock(&worker->workers->lock);
432 } 579 }
433 if (!worker->working) { 580 if (!worker->working) {
@@ -435,9 +582,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
435 worker->working = 1; 582 worker->working = 1;
436 } 583 }
437 584
438 spin_unlock_irqrestore(&worker->lock, flags);
439 if (wake) 585 if (wake)
440 wake_up_process(worker->task); 586 wake_up_process(worker->task);
587 spin_unlock_irqrestore(&worker->lock, flags);
441out: 588out:
442 589
443 return 0; 590 return 0;
@@ -463,14 +610,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
463 610
464 worker = find_worker(workers); 611 worker = find_worker(workers);
465 if (workers->ordered) { 612 if (workers->ordered) {
466 spin_lock_irqsave(&workers->lock, flags); 613 /*
614 * you're not allowed to do ordered queues from an
615 * interrupt handler
616 */
617 spin_lock(&workers->order_lock);
467 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { 618 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
468 list_add_tail(&work->order_list, 619 list_add_tail(&work->order_list,
469 &workers->prio_order_list); 620 &workers->prio_order_list);
470 } else { 621 } else {
471 list_add_tail(&work->order_list, &workers->order_list); 622 list_add_tail(&work->order_list, &workers->order_list);
472 } 623 }
473 spin_unlock_irqrestore(&workers->lock, flags); 624 spin_unlock(&workers->order_lock);
474 } else { 625 } else {
475 INIT_LIST_HEAD(&work->order_list); 626 INIT_LIST_HEAD(&work->order_list);
476 } 627 }
@@ -481,7 +632,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
481 list_add_tail(&work->list, &worker->prio_pending); 632 list_add_tail(&work->list, &worker->prio_pending);
482 else 633 else
483 list_add_tail(&work->list, &worker->pending); 634 list_add_tail(&work->list, &worker->pending);
484 atomic_inc(&worker->num_pending);
485 check_busy_worker(worker); 635 check_busy_worker(worker);
486 636
487 /* 637 /*
@@ -492,10 +642,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
492 wake = 1; 642 wake = 1;
493 worker->working = 1; 643 worker->working = 1;
494 644
495 spin_unlock_irqrestore(&worker->lock, flags);
496
497 if (wake) 645 if (wake)
498 wake_up_process(worker->task); 646 wake_up_process(worker->task);
647 spin_unlock_irqrestore(&worker->lock, flags);
648
499out: 649out:
500 return 0; 650 return 0;
501} 651}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db6..fc089b95ec14 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -73,6 +73,15 @@ struct btrfs_workers {
73 /* force completions in the order they were queued */ 73 /* force completions in the order they were queued */
74 int ordered; 74 int ordered;
75 75
76 /* more workers required, but in an interrupt handler */
77 int atomic_start_pending;
78
79 /*
80 * are we allowed to sleep while starting workers or are we required
81 * to start them at a later time?
82 */
83 int atomic_worker_start;
84
76 /* list with all the work threads. The workers on the idle thread 85 /* list with all the work threads. The workers on the idle thread
77 * may be actively servicing jobs, but they haven't yet hit the 86 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above. 87 * idle thresh limit above.
@@ -90,6 +99,9 @@ struct btrfs_workers {
90 /* lock for finding the next worker thread to queue on */ 99 /* lock for finding the next worker thread to queue on */
91 spinlock_t lock; 100 spinlock_t lock;
92 101
102 /* lock for the ordered lists */
103 spinlock_t order_lock;
104
93 /* extra name for this worker, used for current->name */ 105 /* extra name for this worker, used for current->name */
94 char *name; 106 char *name;
95}; 107};
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0e..a54d354cefcb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,14 @@ struct btrfs_inode {
128 u64 last_unlink_trans; 128 u64 last_unlink_trans;
129 129
130 /* 130 /*
131 * These two counters are for delalloc metadata reservations. We keep
132 * track of how many extents we've accounted for vs how many extents we
133 * have.
134 */
135 int delalloc_reserved_extents;
136 int delalloc_extents;
137
138 /*
131 * ordered_data_close is set by truncate when a file that used 139 * ordered_data_close is set by truncate when a file that used
132 * to have good data has been truncated to zero. When it is set 140 * to have good data has been truncated to zero. When it is set
133 * the btrfs file release call will add this inode to the 141 * the btrfs file release call will add this inode to the
@@ -138,6 +146,7 @@ struct btrfs_inode {
138 * of these. 146 * of these.
139 */ 147 */
140 unsigned ordered_data_close:1; 148 unsigned ordered_data_close:1;
149 unsigned dummy_inode:1;
141 150
142 struct inode vfs_inode; 151 struct inode vfs_inode;
143}; 152};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a37..a11a32058b50 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
506 */ 506 */
507 set_page_extent_mapped(page); 507 set_page_extent_mapped(page);
508 lock_extent(tree, last_offset, end, GFP_NOFS); 508 lock_extent(tree, last_offset, end, GFP_NOFS);
509 spin_lock(&em_tree->lock); 509 read_lock(&em_tree->lock);
510 em = lookup_extent_mapping(em_tree, last_offset, 510 em = lookup_extent_mapping(em_tree, last_offset,
511 PAGE_CACHE_SIZE); 511 PAGE_CACHE_SIZE);
512 spin_unlock(&em_tree->lock); 512 read_unlock(&em_tree->lock);
513 513
514 if (!em || last_offset < em->start || 514 if (!em || last_offset < em->start ||
515 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || 515 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
593 em_tree = &BTRFS_I(inode)->extent_tree; 593 em_tree = &BTRFS_I(inode)->extent_tree;
594 594
595 /* we need the actual starting offset of this extent in the file */ 595 /* we need the actual starting offset of this extent in the file */
596 spin_lock(&em_tree->lock); 596 read_lock(&em_tree->lock);
597 em = lookup_extent_mapping(em_tree, 597 em = lookup_extent_mapping(em_tree,
598 page_offset(bio->bi_io_vec->bv_page), 598 page_offset(bio->bi_io_vec->bv_page),
599 PAGE_CACHE_SIZE); 599 PAGE_CACHE_SIZE);
600 spin_unlock(&em_tree->lock); 600 read_unlock(&em_tree->lock);
601 601
602 compressed_len = em->block_len; 602 compressed_len = em->block_len;
603 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 603 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc0512d3a..ec96f3a6d536 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2853 int split; 2853 int split;
2854 int num_doubles = 0; 2854 int num_doubles = 0;
2855 2855
2856 l = path->nodes[0];
2857 slot = path->slots[0];
2858 if (extend && data_size + btrfs_item_size_nr(l, slot) +
2859 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
2860 return -EOVERFLOW;
2861
2856 /* first try to make some room by pushing left and right */ 2862 /* first try to make some room by pushing left and right */
2857 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2863 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2858 wret = push_leaf_right(trans, root, path, data_size, 0); 2864 wret = push_leaf_right(trans, root, path, data_size, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84ca..dd8ced9814c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
114 */ 114 */
115#define BTRFS_DEV_ITEMS_OBJECTID 1ULL 115#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
116 116
117#define BTRFS_BTREE_INODE_OBJECTID 1
118
119#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
120
117/* 121/*
118 * we can actually store much bigger names, but lets not confuse the rest 122 * we can actually store much bigger names, but lets not confuse the rest
119 * of linux 123 * of linux
@@ -670,18 +674,20 @@ struct btrfs_space_info {
670 u64 bytes_reserved; /* total bytes the allocator has reserved for 674 u64 bytes_reserved; /* total bytes the allocator has reserved for
671 current allocations */ 675 current allocations */
672 u64 bytes_readonly; /* total bytes that are read only */ 676 u64 bytes_readonly; /* total bytes that are read only */
673 677 u64 bytes_super; /* total bytes reserved for the super blocks */
674 /* delalloc accounting */ 678 u64 bytes_root; /* the number of bytes needed to commit a
675 u64 bytes_delalloc; /* number of bytes reserved for allocation, 679 transaction */
676 this space is not necessarily reserved yet
677 by the allocator */
678 u64 bytes_may_use; /* number of bytes that may be used for 680 u64 bytes_may_use; /* number of bytes that may be used for
679 delalloc */ 681 delalloc/allocations */
682 u64 bytes_delalloc; /* number of bytes currently reserved for
683 delayed allocation */
680 684
681 int full; /* indicates that we cannot allocate any more 685 int full; /* indicates that we cannot allocate any more
682 chunks for this space */ 686 chunks for this space */
683 int force_alloc; /* set if we need to force a chunk alloc for 687 int force_alloc; /* set if we need to force a chunk alloc for
684 this space */ 688 this space */
689 int force_delalloc; /* make people start doing filemap_flush until
690 we're under a threshold */
685 691
686 struct list_head list; 692 struct list_head list;
687 693
@@ -690,6 +696,9 @@ struct btrfs_space_info {
690 spinlock_t lock; 696 spinlock_t lock;
691 struct rw_semaphore groups_sem; 697 struct rw_semaphore groups_sem;
692 atomic_t caching_threads; 698 atomic_t caching_threads;
699
700 int allocating_chunk;
701 wait_queue_head_t wait;
693}; 702};
694 703
695/* 704/*
@@ -726,6 +735,15 @@ enum btrfs_caching_type {
726 BTRFS_CACHE_FINISHED = 2, 735 BTRFS_CACHE_FINISHED = 2,
727}; 736};
728 737
738struct btrfs_caching_control {
739 struct list_head list;
740 struct mutex mutex;
741 wait_queue_head_t wait;
742 struct btrfs_block_group_cache *block_group;
743 u64 progress;
744 atomic_t count;
745};
746
729struct btrfs_block_group_cache { 747struct btrfs_block_group_cache {
730 struct btrfs_key key; 748 struct btrfs_key key;
731 struct btrfs_block_group_item item; 749 struct btrfs_block_group_item item;
@@ -733,6 +751,7 @@ struct btrfs_block_group_cache {
733 spinlock_t lock; 751 spinlock_t lock;
734 u64 pinned; 752 u64 pinned;
735 u64 reserved; 753 u64 reserved;
754 u64 bytes_super;
736 u64 flags; 755 u64 flags;
737 u64 sectorsize; 756 u64 sectorsize;
738 int extents_thresh; 757 int extents_thresh;
@@ -742,8 +761,9 @@ struct btrfs_block_group_cache {
742 int dirty; 761 int dirty;
743 762
744 /* cache tracking stuff */ 763 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached; 764 int cached;
765 struct btrfs_caching_control *caching_ctl;
766 u64 last_byte_to_unpin;
747 767
748 struct btrfs_space_info *space_info; 768 struct btrfs_space_info *space_info;
749 769
@@ -782,13 +802,16 @@ struct btrfs_fs_info {
782 802
783 /* the log root tree is a directory of all the other log roots */ 803 /* the log root tree is a directory of all the other log roots */
784 struct btrfs_root *log_root_tree; 804 struct btrfs_root *log_root_tree;
805
806 spinlock_t fs_roots_radix_lock;
785 struct radix_tree_root fs_roots_radix; 807 struct radix_tree_root fs_roots_radix;
786 808
787 /* block group cache stuff */ 809 /* block group cache stuff */
788 spinlock_t block_group_cache_lock; 810 spinlock_t block_group_cache_lock;
789 struct rb_root block_group_cache_tree; 811 struct rb_root block_group_cache_tree;
790 812
791 struct extent_io_tree pinned_extents; 813 struct extent_io_tree freed_extents[2];
814 struct extent_io_tree *pinned_extents;
792 815
793 /* logical->physical extent mapping */ 816 /* logical->physical extent mapping */
794 struct btrfs_mapping_tree mapping_tree; 817 struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +845,7 @@ struct btrfs_fs_info {
822 struct mutex transaction_kthread_mutex; 845 struct mutex transaction_kthread_mutex;
823 struct mutex cleaner_mutex; 846 struct mutex cleaner_mutex;
824 struct mutex chunk_mutex; 847 struct mutex chunk_mutex;
825 struct mutex drop_mutex;
826 struct mutex volume_mutex; 848 struct mutex volume_mutex;
827 struct mutex tree_reloc_mutex;
828 struct rw_semaphore extent_commit_sem;
829
830 /* 849 /*
831 * this protects the ordered operations list only while we are 850 * this protects the ordered operations list only while we are
832 * processing all of the entries on it. This way we make 851 * processing all of the entries on it. This way we make
@@ -835,10 +854,16 @@ struct btrfs_fs_info {
835 * before jumping into the main commit. 854 * before jumping into the main commit.
836 */ 855 */
837 struct mutex ordered_operations_mutex; 856 struct mutex ordered_operations_mutex;
857 struct rw_semaphore extent_commit_sem;
858
859 struct rw_semaphore subvol_sem;
860
861 struct srcu_struct subvol_srcu;
838 862
839 struct list_head trans_list; 863 struct list_head trans_list;
840 struct list_head hashers; 864 struct list_head hashers;
841 struct list_head dead_roots; 865 struct list_head dead_roots;
866 struct list_head caching_block_groups;
842 867
843 atomic_t nr_async_submits; 868 atomic_t nr_async_submits;
844 atomic_t async_submit_draining; 869 atomic_t async_submit_draining;
@@ -996,10 +1021,12 @@ struct btrfs_root {
996 u32 stripesize; 1021 u32 stripesize;
997 1022
998 u32 type; 1023 u32 type;
999 u64 highest_inode; 1024
1000 u64 last_inode_alloc; 1025 u64 highest_objectid;
1001 int ref_cows; 1026 int ref_cows;
1002 int track_dirty; 1027 int track_dirty;
1028 int in_radix;
1029
1003 u64 defrag_trans_start; 1030 u64 defrag_trans_start;
1004 struct btrfs_key defrag_progress; 1031 struct btrfs_key defrag_progress;
1005 struct btrfs_key defrag_max; 1032 struct btrfs_key defrag_max;
@@ -1920,8 +1947,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1920int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1947int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root, unsigned long count); 1948 struct btrfs_root *root, unsigned long count);
1922int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1949int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1923int btrfs_update_pinned_extents(struct btrfs_root *root, 1950int btrfs_pin_extent(struct btrfs_root *root,
1924 u64 bytenr, u64 num, int pin); 1951 u64 bytenr, u64 num, int reserved);
1925int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1952int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1926 struct btrfs_root *root, struct extent_buffer *leaf); 1953 struct btrfs_root *root, struct extent_buffer *leaf);
1927int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1954int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +1998,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
1971 u64 root_objectid, u64 owner, u64 offset); 1998 u64 root_objectid, u64 owner, u64 offset);
1972 1999
1973int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2000int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2001int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2002 struct btrfs_root *root);
1974int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2003int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1975 struct btrfs_root *root, 2004 struct btrfs_root *root);
1976 struct extent_io_tree *unpin);
1977int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2005int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1978 struct btrfs_root *root, 2006 struct btrfs_root *root,
1979 u64 bytenr, u64 num_bytes, u64 parent, 2007 u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2012,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1984int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); 2012int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1985int btrfs_free_block_groups(struct btrfs_fs_info *info); 2013int btrfs_free_block_groups(struct btrfs_fs_info *info);
1986int btrfs_read_block_groups(struct btrfs_root *root); 2014int btrfs_read_block_groups(struct btrfs_root *root);
2015int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
1987int btrfs_make_block_group(struct btrfs_trans_handle *trans, 2016int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, u64 bytes_used, 2017 struct btrfs_root *root, u64 bytes_used,
1989 u64 type, u64 chunk_objectid, u64 chunk_offset, 2018 u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -1997,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1997void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2026void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1998void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2027void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
1999 2028
2000int btrfs_check_metadata_free_space(struct btrfs_root *root); 2029int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
2030int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
2031int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2032 struct inode *inode, int num_items);
2033int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2034 struct inode *inode, int num_items);
2001int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2035int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2002 u64 bytes); 2036 u64 bytes);
2003void btrfs_free_reserved_data_space(struct btrfs_root *root, 2037void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2006,7 +2040,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2006 u64 bytes); 2040 u64 bytes);
2007void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2041void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2008 u64 bytes); 2042 u64 bytes);
2009void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
2010/* ctree.c */ 2043/* ctree.c */
2011int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2044int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2012 int level, int *slot); 2045 int level, int *slot);
@@ -2100,12 +2133,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2100 struct extent_buffer *parent); 2133 struct extent_buffer *parent);
2101/* root-item.c */ 2134/* root-item.c */
2102int btrfs_find_root_ref(struct btrfs_root *tree_root, 2135int btrfs_find_root_ref(struct btrfs_root *tree_root,
2103 struct btrfs_path *path, 2136 struct btrfs_path *path,
2104 u64 root_id, u64 ref_id); 2137 u64 root_id, u64 ref_id);
2105int btrfs_add_root_ref(struct btrfs_trans_handle *trans, 2138int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
2106 struct btrfs_root *tree_root, 2139 struct btrfs_root *tree_root,
2107 u64 root_id, u8 type, u64 ref_id, 2140 u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
2108 u64 dirid, u64 sequence, 2141 const char *name, int name_len);
2142int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
2143 struct btrfs_root *tree_root,
2144 u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
2109 const char *name, int name_len); 2145 const char *name, int name_len);
2110int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2146int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2111 struct btrfs_key *key); 2147 struct btrfs_key *key);
@@ -2120,6 +2156,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2120int btrfs_search_root(struct btrfs_root *root, u64 search_start, 2156int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2121 u64 *found_objectid); 2157 u64 *found_objectid);
2122int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 2158int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2159int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2123int btrfs_set_root_node(struct btrfs_root_item *item, 2160int btrfs_set_root_node(struct btrfs_root_item *item,
2124 struct extent_buffer *node); 2161 struct extent_buffer *node);
2125/* dir-item.c */ 2162/* dir-item.c */
@@ -2138,6 +2175,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
2138 struct btrfs_path *path, u64 dir, 2175 struct btrfs_path *path, u64 dir,
2139 u64 objectid, const char *name, int name_len, 2176 u64 objectid, const char *name, int name_len,
2140 int mod); 2177 int mod);
2178struct btrfs_dir_item *
2179btrfs_search_dir_index_item(struct btrfs_root *root,
2180 struct btrfs_path *path, u64 dirid,
2181 const char *name, int name_len);
2141struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, 2182struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
2142 struct btrfs_path *path, 2183 struct btrfs_path *path,
2143 const char *name, int name_len); 2184 const char *name, int name_len);
@@ -2160,6 +2201,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
2160 struct btrfs_root *root, u64 offset); 2201 struct btrfs_root *root, u64 offset);
2161int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, 2202int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
2162 struct btrfs_root *root, u64 offset); 2203 struct btrfs_root *root, u64 offset);
2204int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
2163 2205
2164/* inode-map.c */ 2206/* inode-map.c */
2165int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, 2207int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2274,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2232int btrfs_add_link(struct btrfs_trans_handle *trans, 2274int btrfs_add_link(struct btrfs_trans_handle *trans,
2233 struct inode *parent_inode, struct inode *inode, 2275 struct inode *parent_inode, struct inode *inode,
2234 const char *name, int name_len, int add_backref, u64 index); 2276 const char *name, int name_len, int add_backref, u64 index);
2277int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2278 struct btrfs_root *root,
2279 struct inode *dir, u64 objectid,
2280 const char *name, int name_len);
2235int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 2281int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2236 struct btrfs_root *root, 2282 struct btrfs_root *root,
2237 struct inode *inode, u64 new_size, 2283 struct inode *inode, u64 new_size,
@@ -2242,7 +2288,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2242int btrfs_writepages(struct address_space *mapping, 2288int btrfs_writepages(struct address_space *mapping,
2243 struct writeback_control *wbc); 2289 struct writeback_control *wbc);
2244int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2290int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *new_root, struct dentry *dentry, 2291 struct btrfs_root *new_root,
2246 u64 new_dirid, u64 alloc_hint); 2292 u64 new_dirid, u64 alloc_hint);
2247int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 2293int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2248 size_t size, struct bio *bio, unsigned long bio_flags); 2294 size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2304,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
2258void btrfs_dirty_inode(struct inode *inode); 2304void btrfs_dirty_inode(struct inode *inode);
2259struct inode *btrfs_alloc_inode(struct super_block *sb); 2305struct inode *btrfs_alloc_inode(struct super_block *sb);
2260void btrfs_destroy_inode(struct inode *inode); 2306void btrfs_destroy_inode(struct inode *inode);
2307void btrfs_drop_inode(struct inode *inode);
2261int btrfs_init_cachep(void); 2308int btrfs_init_cachep(void);
2262void btrfs_destroy_cachep(void); 2309void btrfs_destroy_cachep(void);
2263long btrfs_ioctl_trans_end(struct file *file); 2310long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2322,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2275int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2322int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2276void btrfs_orphan_cleanup(struct btrfs_root *root); 2323void btrfs_orphan_cleanup(struct btrfs_root *root);
2277int btrfs_cont_expand(struct inode *inode, loff_t size); 2324int btrfs_cont_expand(struct inode *inode, loff_t size);
2325int btrfs_invalidate_inodes(struct btrfs_root *root);
2326extern struct dentry_operations btrfs_dentry_operations;
2278 2327
2279/* ioctl.c */ 2328/* ioctl.c */
2280long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2329long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2286,11 +2335,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2286int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2335int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2287 int skip_pinned); 2336 int skip_pinned);
2288int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2337int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2289extern struct file_operations btrfs_file_operations; 2338extern const struct file_operations btrfs_file_operations;
2290int btrfs_drop_extents(struct btrfs_trans_handle *trans, 2339int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2291 struct btrfs_root *root, struct inode *inode, 2340 struct btrfs_root *root, struct inode *inode,
2292 u64 start, u64 end, u64 locked_end, 2341 u64 start, u64 end, u64 locked_end,
2293 u64 inline_limit, u64 *hint_block); 2342 u64 inline_limit, u64 *hint_block, int drop_cache);
2294int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2343int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2295 struct btrfs_root *root, 2344 struct btrfs_root *root,
2296 struct inode *inode, u64 start, u64 end); 2345 struct inode *inode, u64 start, u64 end);
@@ -2317,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2317int btrfs_sync_fs(struct super_block *sb, int wait); 2366int btrfs_sync_fs(struct super_block *sb, int wait);
2318 2367
2319/* acl.c */ 2368/* acl.c */
2320#ifdef CONFIG_FS_POSIX_ACL 2369#ifdef CONFIG_BTRFS_POSIX_ACL
2321int btrfs_check_acl(struct inode *inode, int mask); 2370int btrfs_check_acl(struct inode *inode, int mask);
2322#else 2371#else
2323#define btrfs_check_acl NULL 2372#define btrfs_check_acl NULL
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00c..f3a6075519cc 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
281 return btrfs_match_dir_item_name(root, path, name, name_len); 281 return btrfs_match_dir_item_name(root, path, name, name_len);
282} 282}
283 283
284struct btrfs_dir_item *
285btrfs_search_dir_index_item(struct btrfs_root *root,
286 struct btrfs_path *path, u64 dirid,
287 const char *name, int name_len)
288{
289 struct extent_buffer *leaf;
290 struct btrfs_dir_item *di;
291 struct btrfs_key key;
292 u32 nritems;
293 int ret;
294
295 key.objectid = dirid;
296 key.type = BTRFS_DIR_INDEX_KEY;
297 key.offset = 0;
298
299 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
300 if (ret < 0)
301 return ERR_PTR(ret);
302
303 leaf = path->nodes[0];
304 nritems = btrfs_header_nritems(leaf);
305
306 while (1) {
307 if (path->slots[0] >= nritems) {
308 ret = btrfs_next_leaf(root, path);
309 if (ret < 0)
310 return ERR_PTR(ret);
311 if (ret > 0)
312 break;
313 leaf = path->nodes[0];
314 nritems = btrfs_header_nritems(leaf);
315 continue;
316 }
317
318 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
319 if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
320 break;
321
322 di = btrfs_match_dir_item_name(root, path, name, name_len);
323 if (di)
324 return di;
325
326 path->slots[0]++;
327 }
328 return NULL;
329}
330
284struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, 331struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
285 struct btrfs_root *root, 332 struct btrfs_root *root,
286 struct btrfs_path *path, u64 dir, 333 struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8b8192790011..af0435f79fa6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
41 41
42static struct extent_io_ops btree_extent_io_ops; 42static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 43static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root);
44 45
45static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); 46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
46 47
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
123 struct extent_map *em; 124 struct extent_map *em;
124 int ret; 125 int ret;
125 126
126 spin_lock(&em_tree->lock); 127 read_lock(&em_tree->lock);
127 em = lookup_extent_mapping(em_tree, start, len); 128 em = lookup_extent_mapping(em_tree, start, len);
128 if (em) { 129 if (em) {
129 em->bdev = 130 em->bdev =
130 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 131 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
131 spin_unlock(&em_tree->lock); 132 read_unlock(&em_tree->lock);
132 goto out; 133 goto out;
133 } 134 }
134 spin_unlock(&em_tree->lock); 135 read_unlock(&em_tree->lock);
135 136
136 em = alloc_extent_map(GFP_NOFS); 137 em = alloc_extent_map(GFP_NOFS);
137 if (!em) { 138 if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
144 em->block_start = 0; 145 em->block_start = 0;
145 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 146 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
146 147
147 spin_lock(&em_tree->lock); 148 write_lock(&em_tree->lock);
148 ret = add_extent_mapping(em_tree, em); 149 ret = add_extent_mapping(em_tree, em);
149 if (ret == -EEXIST) { 150 if (ret == -EEXIST) {
150 u64 failed_start = em->start; 151 u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
163 free_extent_map(em); 164 free_extent_map(em);
164 em = NULL; 165 em = NULL;
165 } 166 }
166 spin_unlock(&em_tree->lock); 167 write_unlock(&em_tree->lock);
167 168
168 if (ret) 169 if (ret)
169 em = ERR_PTR(ret); 170 em = ERR_PTR(ret);
@@ -772,7 +773,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
772 } 773 }
773} 774}
774 775
775static struct address_space_operations btree_aops = { 776static const struct address_space_operations btree_aops = {
776 .readpage = btree_readpage, 777 .readpage = btree_readpage,
777 .writepage = btree_writepage, 778 .writepage = btree_writepage,
778 .writepages = btree_writepages, 779 .writepages = btree_writepages,
@@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
821 822
822int btrfs_write_tree_block(struct extent_buffer *buf) 823int btrfs_write_tree_block(struct extent_buffer *buf)
823{ 824{
824 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, 825 return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
825 buf->start + buf->len - 1, WB_SYNC_ALL); 826 buf->start + buf->len - 1);
826} 827}
827 828
828int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
829{ 830{
830 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, 831 return filemap_fdatawait_range(buf->first_page->mapping,
831 buf->start, buf->start + buf->len - 1); 832 buf->start, buf->start + buf->len - 1);
832} 833}
833 834
834struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
895 root->fs_info = fs_info; 896 root->fs_info = fs_info;
896 root->objectid = objectid; 897 root->objectid = objectid;
897 root->last_trans = 0; 898 root->last_trans = 0;
898 root->highest_inode = 0; 899 root->highest_objectid = 0;
899 root->last_inode_alloc = 0;
900 root->name = NULL; 900 root->name = NULL;
901 root->in_sysfs = 0; 901 root->in_sysfs = 0;
902 root->inode_tree.rb_node = NULL; 902 root->inode_tree.rb_node = NULL;
@@ -952,14 +952,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
952 root, fs_info, objectid); 952 root, fs_info, objectid);
953 ret = btrfs_find_last_root(tree_root, objectid, 953 ret = btrfs_find_last_root(tree_root, objectid,
954 &root->root_item, &root->root_key); 954 &root->root_item, &root->root_key);
955 if (ret > 0)
956 return -ENOENT;
955 BUG_ON(ret); 957 BUG_ON(ret);
956 958
957 generation = btrfs_root_generation(&root->root_item); 959 generation = btrfs_root_generation(&root->root_item);
958 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 960 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
959 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 961 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
960 blocksize, generation); 962 blocksize, generation);
961 root->commit_root = btrfs_root_node(root);
962 BUG_ON(!root->node); 963 BUG_ON(!root->node);
964 root->commit_root = btrfs_root_node(root);
963 return 0; 965 return 0;
964} 966}
965 967
@@ -1095,7 +1097,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1095 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1097 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1096 struct btrfs_path *path; 1098 struct btrfs_path *path;
1097 struct extent_buffer *l; 1099 struct extent_buffer *l;
1098 u64 highest_inode;
1099 u64 generation; 1100 u64 generation;
1100 u32 blocksize; 1101 u32 blocksize;
1101 int ret = 0; 1102 int ret = 0;
@@ -1110,7 +1111,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1110 kfree(root); 1111 kfree(root);
1111 return ERR_PTR(ret); 1112 return ERR_PTR(ret);
1112 } 1113 }
1113 goto insert; 1114 goto out;
1114 } 1115 }
1115 1116
1116 __setup_root(tree_root->nodesize, tree_root->leafsize, 1117 __setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1121,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1120 path = btrfs_alloc_path(); 1121 path = btrfs_alloc_path();
1121 BUG_ON(!path); 1122 BUG_ON(!path);
1122 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1123 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1123 if (ret != 0) { 1124 if (ret == 0) {
1124 if (ret > 0) 1125 l = path->nodes[0];
1125 ret = -ENOENT; 1126 read_extent_buffer(l, &root->root_item,
1126 goto out; 1127 btrfs_item_ptr_offset(l, path->slots[0]),
1128 sizeof(root->root_item));
1129 memcpy(&root->root_key, location, sizeof(*location));
1127 } 1130 }
1128 l = path->nodes[0];
1129 read_extent_buffer(l, &root->root_item,
1130 btrfs_item_ptr_offset(l, path->slots[0]),
1131 sizeof(root->root_item));
1132 memcpy(&root->root_key, location, sizeof(*location));
1133 ret = 0;
1134out:
1135 btrfs_release_path(root, path);
1136 btrfs_free_path(path); 1131 btrfs_free_path(path);
1137 if (ret) { 1132 if (ret) {
1138 kfree(root); 1133 if (ret > 0)
1134 ret = -ENOENT;
1139 return ERR_PTR(ret); 1135 return ERR_PTR(ret);
1140 } 1136 }
1137
1141 generation = btrfs_root_generation(&root->root_item); 1138 generation = btrfs_root_generation(&root->root_item);
1142 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1139 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1143 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1140 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1144 blocksize, generation); 1141 blocksize, generation);
1145 root->commit_root = btrfs_root_node(root); 1142 root->commit_root = btrfs_root_node(root);
1146 BUG_ON(!root->node); 1143 BUG_ON(!root->node);
1147insert: 1144out:
1148 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1145 if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
1149 root->ref_cows = 1; 1146 root->ref_cows = 1;
1150 ret = btrfs_find_highest_inode(root, &highest_inode); 1147
1151 if (ret == 0) {
1152 root->highest_inode = highest_inode;
1153 root->last_inode_alloc = highest_inode;
1154 }
1155 }
1156 return root; 1148 return root;
1157} 1149}
1158 1150
@@ -1187,39 +1179,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1187 return fs_info->dev_root; 1179 return fs_info->dev_root;
1188 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) 1180 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1189 return fs_info->csum_root; 1181 return fs_info->csum_root;
1190 1182again:
1183 spin_lock(&fs_info->fs_roots_radix_lock);
1191 root = radix_tree_lookup(&fs_info->fs_roots_radix, 1184 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1192 (unsigned long)location->objectid); 1185 (unsigned long)location->objectid);
1186 spin_unlock(&fs_info->fs_roots_radix_lock);
1193 if (root) 1187 if (root)
1194 return root; 1188 return root;
1195 1189
1190 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1191 if (ret == 0)
1192 ret = -ENOENT;
1193 if (ret < 0)
1194 return ERR_PTR(ret);
1195
1196 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1196 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1197 if (IS_ERR(root)) 1197 if (IS_ERR(root))
1198 return root; 1198 return root;
1199 1199
1200 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1200 set_anon_super(&root->anon_super, NULL); 1201 set_anon_super(&root->anon_super, NULL);
1201 1202
1203 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1204 if (ret)
1205 goto fail;
1206
1207 spin_lock(&fs_info->fs_roots_radix_lock);
1202 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1208 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1203 (unsigned long)root->root_key.objectid, 1209 (unsigned long)root->root_key.objectid,
1204 root); 1210 root);
1211 if (ret == 0)
1212 root->in_radix = 1;
1213 spin_unlock(&fs_info->fs_roots_radix_lock);
1214 radix_tree_preload_end();
1205 if (ret) { 1215 if (ret) {
1206 free_extent_buffer(root->node); 1216 if (ret == -EEXIST) {
1207 kfree(root); 1217 free_fs_root(root);
1208 return ERR_PTR(ret); 1218 goto again;
1219 }
1220 goto fail;
1209 } 1221 }
1210 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 1222
1211 ret = btrfs_find_dead_roots(fs_info->tree_root, 1223 ret = btrfs_find_dead_roots(fs_info->tree_root,
1212 root->root_key.objectid); 1224 root->root_key.objectid);
1213 BUG_ON(ret); 1225 WARN_ON(ret);
1226
1227 if (!(fs_info->sb->s_flags & MS_RDONLY))
1214 btrfs_orphan_cleanup(root); 1228 btrfs_orphan_cleanup(root);
1215 } 1229
1216 return root; 1230 return root;
1231fail:
1232 free_fs_root(root);
1233 return ERR_PTR(ret);
1217} 1234}
1218 1235
1219struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, 1236struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1220 struct btrfs_key *location, 1237 struct btrfs_key *location,
1221 const char *name, int namelen) 1238 const char *name, int namelen)
1222{ 1239{
1240 return btrfs_read_fs_root_no_name(fs_info, location);
1241#if 0
1223 struct btrfs_root *root; 1242 struct btrfs_root *root;
1224 int ret; 1243 int ret;
1225 1244
@@ -1236,7 +1255,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1236 kfree(root); 1255 kfree(root);
1237 return ERR_PTR(ret); 1256 return ERR_PTR(ret);
1238 } 1257 }
1239#if 0 1258
1240 ret = btrfs_sysfs_add_root(root); 1259 ret = btrfs_sysfs_add_root(root);
1241 if (ret) { 1260 if (ret) {
1242 free_extent_buffer(root->node); 1261 free_extent_buffer(root->node);
@@ -1244,9 +1263,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1244 kfree(root); 1263 kfree(root);
1245 return ERR_PTR(ret); 1264 return ERR_PTR(ret);
1246 } 1265 }
1247#endif
1248 root->in_sysfs = 1; 1266 root->in_sysfs = 1;
1249 return root; 1267 return root;
1268#endif
1250} 1269}
1251 1270
1252static int btrfs_congested_fn(void *congested_data, int bdi_bits) 1271static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1344,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1325 offset = page_offset(page); 1344 offset = page_offset(page);
1326 1345
1327 em_tree = &BTRFS_I(inode)->extent_tree; 1346 em_tree = &BTRFS_I(inode)->extent_tree;
1328 spin_lock(&em_tree->lock); 1347 read_lock(&em_tree->lock);
1329 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); 1348 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1330 spin_unlock(&em_tree->lock); 1349 read_unlock(&em_tree->lock);
1331 if (!em) { 1350 if (!em) {
1332 __unplug_io_fn(bdi, page); 1351 __unplug_io_fn(bdi, page);
1333 return; 1352 return;
@@ -1360,8 +1379,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1360 1379
1361 err = bdi_register(bdi, NULL, "btrfs-%d", 1380 err = bdi_register(bdi, NULL, "btrfs-%d",
1362 atomic_inc_return(&btrfs_bdi_num)); 1381 atomic_inc_return(&btrfs_bdi_num));
1363 if (err) 1382 if (err) {
1383 bdi_destroy(bdi);
1364 return err; 1384 return err;
1385 }
1365 1386
1366 bdi->ra_pages = default_backing_dev_info.ra_pages; 1387 bdi->ra_pages = default_backing_dev_info.ra_pages;
1367 bdi->unplug_io_fn = btrfs_unplug_io_fn; 1388 bdi->unplug_io_fn = btrfs_unplug_io_fn;
@@ -1451,9 +1472,12 @@ static int cleaner_kthread(void *arg)
1451 break; 1472 break;
1452 1473
1453 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1474 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1454 mutex_lock(&root->fs_info->cleaner_mutex); 1475
1455 btrfs_clean_old_snapshots(root); 1476 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1456 mutex_unlock(&root->fs_info->cleaner_mutex); 1477 mutex_trylock(&root->fs_info->cleaner_mutex)) {
1478 btrfs_clean_old_snapshots(root);
1479 mutex_unlock(&root->fs_info->cleaner_mutex);
1480 }
1457 1481
1458 if (freezing(current)) { 1482 if (freezing(current)) {
1459 refrigerator(); 1483 refrigerator();
@@ -1558,15 +1582,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1558 err = -ENOMEM; 1582 err = -ENOMEM;
1559 goto fail; 1583 goto fail;
1560 } 1584 }
1561 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); 1585
1586 ret = init_srcu_struct(&fs_info->subvol_srcu);
1587 if (ret) {
1588 err = ret;
1589 goto fail;
1590 }
1591
1592 ret = setup_bdi(fs_info, &fs_info->bdi);
1593 if (ret) {
1594 err = ret;
1595 goto fail_srcu;
1596 }
1597
1598 fs_info->btree_inode = new_inode(sb);
1599 if (!fs_info->btree_inode) {
1600 err = -ENOMEM;
1601 goto fail_bdi;
1602 }
1603
1604 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1562 INIT_LIST_HEAD(&fs_info->trans_list); 1605 INIT_LIST_HEAD(&fs_info->trans_list);
1563 INIT_LIST_HEAD(&fs_info->dead_roots); 1606 INIT_LIST_HEAD(&fs_info->dead_roots);
1564 INIT_LIST_HEAD(&fs_info->hashers); 1607 INIT_LIST_HEAD(&fs_info->hashers);
1565 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1608 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1566 INIT_LIST_HEAD(&fs_info->ordered_operations); 1609 INIT_LIST_HEAD(&fs_info->ordered_operations);
1610 INIT_LIST_HEAD(&fs_info->caching_block_groups);
1567 spin_lock_init(&fs_info->delalloc_lock); 1611 spin_lock_init(&fs_info->delalloc_lock);
1568 spin_lock_init(&fs_info->new_trans_lock); 1612 spin_lock_init(&fs_info->new_trans_lock);
1569 spin_lock_init(&fs_info->ref_cache_lock); 1613 spin_lock_init(&fs_info->ref_cache_lock);
1614 spin_lock_init(&fs_info->fs_roots_radix_lock);
1570 1615
1571 init_completion(&fs_info->kobj_unregister); 1616 init_completion(&fs_info->kobj_unregister);
1572 fs_info->tree_root = tree_root; 1617 fs_info->tree_root = tree_root;
@@ -1585,12 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1585 fs_info->sb = sb; 1630 fs_info->sb = sb;
1586 fs_info->max_extent = (u64)-1; 1631 fs_info->max_extent = (u64)-1;
1587 fs_info->max_inline = 8192 * 1024; 1632 fs_info->max_inline = 8192 * 1024;
1588 if (setup_bdi(fs_info, &fs_info->bdi)) 1633 fs_info->metadata_ratio = 0;
1589 goto fail_bdi;
1590 fs_info->btree_inode = new_inode(sb);
1591 fs_info->btree_inode->i_ino = 1;
1592 fs_info->btree_inode->i_nlink = 1;
1593 fs_info->metadata_ratio = 8;
1594 1634
1595 fs_info->thread_pool_size = min_t(unsigned long, 1635 fs_info->thread_pool_size = min_t(unsigned long,
1596 num_online_cpus() + 2, 8); 1636 num_online_cpus() + 2, 8);
@@ -1602,6 +1642,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1602 sb->s_blocksize_bits = blksize_bits(4096); 1642 sb->s_blocksize_bits = blksize_bits(4096);
1603 sb->s_bdi = &fs_info->bdi; 1643 sb->s_bdi = &fs_info->bdi;
1604 1644
1645 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1646 fs_info->btree_inode->i_nlink = 1;
1605 /* 1647 /*
1606 * we set the i_size on the btree inode to the max possible int. 1648 * we set the i_size on the btree inode to the max possible int.
1607 * the real end of the address space is determined by all of 1649 * the real end of the address space is determined by all of
@@ -1620,28 +1662,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1620 1662
1621 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; 1663 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1622 1664
1665 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1666 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1667 sizeof(struct btrfs_key));
1668 BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
1669 insert_inode_hash(fs_info->btree_inode);
1670
1623 spin_lock_init(&fs_info->block_group_cache_lock); 1671 spin_lock_init(&fs_info->block_group_cache_lock);
1624 fs_info->block_group_cache_tree.rb_node = NULL; 1672 fs_info->block_group_cache_tree.rb_node = NULL;
1625 1673
1626 extent_io_tree_init(&fs_info->pinned_extents, 1674 extent_io_tree_init(&fs_info->freed_extents[0],
1627 fs_info->btree_inode->i_mapping, GFP_NOFS); 1675 fs_info->btree_inode->i_mapping, GFP_NOFS);
1676 extent_io_tree_init(&fs_info->freed_extents[1],
1677 fs_info->btree_inode->i_mapping, GFP_NOFS);
1678 fs_info->pinned_extents = &fs_info->freed_extents[0];
1628 fs_info->do_barriers = 1; 1679 fs_info->do_barriers = 1;
1629 1680
1630 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1631 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1632 sizeof(struct btrfs_key));
1633 insert_inode_hash(fs_info->btree_inode);
1634 1681
1635 mutex_init(&fs_info->trans_mutex); 1682 mutex_init(&fs_info->trans_mutex);
1636 mutex_init(&fs_info->ordered_operations_mutex); 1683 mutex_init(&fs_info->ordered_operations_mutex);
1637 mutex_init(&fs_info->tree_log_mutex); 1684 mutex_init(&fs_info->tree_log_mutex);
1638 mutex_init(&fs_info->drop_mutex);
1639 mutex_init(&fs_info->chunk_mutex); 1685 mutex_init(&fs_info->chunk_mutex);
1640 mutex_init(&fs_info->transaction_kthread_mutex); 1686 mutex_init(&fs_info->transaction_kthread_mutex);
1641 mutex_init(&fs_info->cleaner_mutex); 1687 mutex_init(&fs_info->cleaner_mutex);
1642 mutex_init(&fs_info->volume_mutex); 1688 mutex_init(&fs_info->volume_mutex);
1643 mutex_init(&fs_info->tree_reloc_mutex);
1644 init_rwsem(&fs_info->extent_commit_sem); 1689 init_rwsem(&fs_info->extent_commit_sem);
1690 init_rwsem(&fs_info->subvol_sem);
1645 1691
1646 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 1692 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1647 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 1693 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1700,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1700 err = -EINVAL; 1746 err = -EINVAL;
1701 goto fail_iput; 1747 goto fail_iput;
1702 } 1748 }
1703 1749printk("thread pool is %d\n", fs_info->thread_pool_size);
1704 /* 1750 /*
1705 * we need to start all the end_io workers up front because the 1751 * we need to start all the end_io workers up front because the
1706 * queue work function gets called at interrupt time, and so it 1752 * queue work function gets called at interrupt time, and so it
@@ -1745,20 +1791,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1745 fs_info->endio_workers.idle_thresh = 4; 1791 fs_info->endio_workers.idle_thresh = 4;
1746 fs_info->endio_meta_workers.idle_thresh = 4; 1792 fs_info->endio_meta_workers.idle_thresh = 4;
1747 1793
1748 fs_info->endio_write_workers.idle_thresh = 64; 1794 fs_info->endio_write_workers.idle_thresh = 2;
1749 fs_info->endio_meta_write_workers.idle_thresh = 64; 1795 fs_info->endio_meta_write_workers.idle_thresh = 2;
1796
1797 fs_info->endio_workers.atomic_worker_start = 1;
1798 fs_info->endio_meta_workers.atomic_worker_start = 1;
1799 fs_info->endio_write_workers.atomic_worker_start = 1;
1800 fs_info->endio_meta_write_workers.atomic_worker_start = 1;
1750 1801
1751 btrfs_start_workers(&fs_info->workers, 1); 1802 btrfs_start_workers(&fs_info->workers, 1);
1752 btrfs_start_workers(&fs_info->submit_workers, 1); 1803 btrfs_start_workers(&fs_info->submit_workers, 1);
1753 btrfs_start_workers(&fs_info->delalloc_workers, 1); 1804 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1754 btrfs_start_workers(&fs_info->fixup_workers, 1); 1805 btrfs_start_workers(&fs_info->fixup_workers, 1);
1755 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1806 btrfs_start_workers(&fs_info->endio_workers, 1);
1756 btrfs_start_workers(&fs_info->endio_meta_workers, 1807 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1757 fs_info->thread_pool_size); 1808 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1758 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1809 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1759 fs_info->thread_pool_size);
1760 btrfs_start_workers(&fs_info->endio_write_workers,
1761 fs_info->thread_pool_size);
1762 1810
1763 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1811 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1764 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1812 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1918,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1918 } 1966 }
1919 } 1967 }
1920 1968
1969 ret = btrfs_find_orphan_roots(tree_root);
1970 BUG_ON(ret);
1971
1921 if (!(sb->s_flags & MS_RDONLY)) { 1972 if (!(sb->s_flags & MS_RDONLY)) {
1922 ret = btrfs_recover_relocation(tree_root); 1973 ret = btrfs_recover_relocation(tree_root);
1923 BUG_ON(ret); 1974 BUG_ON(ret);
@@ -1977,6 +2028,8 @@ fail_iput:
1977 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2028 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1978fail_bdi: 2029fail_bdi:
1979 bdi_destroy(&fs_info->bdi); 2030 bdi_destroy(&fs_info->bdi);
2031fail_srcu:
2032 cleanup_srcu_struct(&fs_info->subvol_srcu);
1980fail: 2033fail:
1981 kfree(extent_root); 2034 kfree(extent_root);
1982 kfree(tree_root); 2035 kfree(tree_root);
@@ -2236,20 +2289,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
2236 2289
2237int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2290int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2238{ 2291{
2239 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2292 spin_lock(&fs_info->fs_roots_radix_lock);
2240 radix_tree_delete(&fs_info->fs_roots_radix, 2293 radix_tree_delete(&fs_info->fs_roots_radix,
2241 (unsigned long)root->root_key.objectid); 2294 (unsigned long)root->root_key.objectid);
2295 spin_unlock(&fs_info->fs_roots_radix_lock);
2296
2297 if (btrfs_root_refs(&root->root_item) == 0)
2298 synchronize_srcu(&fs_info->subvol_srcu);
2299
2300 free_fs_root(root);
2301 return 0;
2302}
2303
2304static void free_fs_root(struct btrfs_root *root)
2305{
2306 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2242 if (root->anon_super.s_dev) { 2307 if (root->anon_super.s_dev) {
2243 down_write(&root->anon_super.s_umount); 2308 down_write(&root->anon_super.s_umount);
2244 kill_anon_super(&root->anon_super); 2309 kill_anon_super(&root->anon_super);
2245 } 2310 }
2246 if (root->node) 2311 free_extent_buffer(root->node);
2247 free_extent_buffer(root->node); 2312 free_extent_buffer(root->commit_root);
2248 if (root->commit_root)
2249 free_extent_buffer(root->commit_root);
2250 kfree(root->name); 2313 kfree(root->name);
2251 kfree(root); 2314 kfree(root);
2252 return 0;
2253} 2315}
2254 2316
2255static int del_fs_roots(struct btrfs_fs_info *fs_info) 2317static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2258,6 +2320,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
2258 struct btrfs_root *gang[8]; 2320 struct btrfs_root *gang[8];
2259 int i; 2321 int i;
2260 2322
2323 while (!list_empty(&fs_info->dead_roots)) {
2324 gang[0] = list_entry(fs_info->dead_roots.next,
2325 struct btrfs_root, root_list);
2326 list_del(&gang[0]->root_list);
2327
2328 if (gang[0]->in_radix) {
2329 btrfs_free_fs_root(fs_info, gang[0]);
2330 } else {
2331 free_extent_buffer(gang[0]->node);
2332 free_extent_buffer(gang[0]->commit_root);
2333 kfree(gang[0]);
2334 }
2335 }
2336
2261 while (1) { 2337 while (1) {
2262 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 2338 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2263 (void **)gang, 0, 2339 (void **)gang, 0,
@@ -2287,9 +2363,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2287 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2363 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2288 for (i = 0; i < ret; i++) { 2364 for (i = 0; i < ret; i++) {
2289 root_objectid = gang[i]->root_key.objectid; 2365 root_objectid = gang[i]->root_key.objectid;
2290 ret = btrfs_find_dead_roots(fs_info->tree_root,
2291 root_objectid);
2292 BUG_ON(ret);
2293 btrfs_orphan_cleanup(gang[i]); 2366 btrfs_orphan_cleanup(gang[i]);
2294 } 2367 }
2295 root_objectid++; 2368 root_objectid++;
@@ -2359,7 +2432,6 @@ int close_ctree(struct btrfs_root *root)
2359 free_extent_buffer(root->fs_info->csum_root->commit_root); 2432 free_extent_buffer(root->fs_info->csum_root->commit_root);
2360 2433
2361 btrfs_free_block_groups(root->fs_info); 2434 btrfs_free_block_groups(root->fs_info);
2362 btrfs_free_pinned_extents(root->fs_info);
2363 2435
2364 del_fs_roots(fs_info); 2436 del_fs_roots(fs_info);
2365 2437
@@ -2378,6 +2450,7 @@ int close_ctree(struct btrfs_root *root)
2378 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2450 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2379 2451
2380 bdi_destroy(&fs_info->bdi); 2452 bdi_destroy(&fs_info->bdi);
2453 cleanup_srcu_struct(&fs_info->subvol_srcu);
2381 2454
2382 kfree(fs_info->extent_root); 2455 kfree(fs_info->extent_root);
2383 kfree(fs_info->tree_root); 2456 kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4e..ba5c3fd5ab8c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 29 type = FILEID_BTRFS_WITHOUT_PARENT;
30 30
31 fid->objectid = BTRFS_I(inode)->location.objectid; 31 fid->objectid = inode->i_ino;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid; 32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation; 33 fid->gen = inode->i_generation;
34 34
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
60} 60}
61 61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, 62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation) 63 u64 root_objectid, u32 generation,
64 int check_generation)
64{ 65{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
65 struct btrfs_root *root; 67 struct btrfs_root *root;
68 struct dentry *dentry;
66 struct inode *inode; 69 struct inode *inode;
67 struct btrfs_key key; 70 struct btrfs_key key;
71 int index;
72 int err = 0;
73
74 if (objectid < BTRFS_FIRST_FREE_OBJECTID)
75 return ERR_PTR(-ESTALE);
68 76
69 key.objectid = root_objectid; 77 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 78 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1; 79 key.offset = (u64)-1;
72 80
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); 81 index = srcu_read_lock(&fs_info->subvol_srcu);
74 if (IS_ERR(root)) 82
75 return ERR_CAST(root); 83 root = btrfs_read_fs_root_no_name(fs_info, &key);
84 if (IS_ERR(root)) {
85 err = PTR_ERR(root);
86 goto fail;
87 }
88
89 if (btrfs_root_refs(&root->root_item) == 0) {
90 err = -ENOENT;
91 goto fail;
92 }
76 93
77 key.objectid = objectid; 94 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 95 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0; 96 key.offset = 0;
80 97
81 inode = btrfs_iget(sb, &key, root); 98 inode = btrfs_iget(sb, &key, root);
82 if (IS_ERR(inode)) 99 if (IS_ERR(inode)) {
83 return (void *)inode; 100 err = PTR_ERR(inode);
101 goto fail;
102 }
103
104 srcu_read_unlock(&fs_info->subvol_srcu, index);
84 105
85 if (generation != inode->i_generation) { 106 if (check_generation && generation != inode->i_generation) {
86 iput(inode); 107 iput(inode);
87 return ERR_PTR(-ESTALE); 108 return ERR_PTR(-ESTALE);
88 } 109 }
89 110
90 return d_obtain_alias(inode); 111 dentry = d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err);
91} 118}
92 119
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, 120static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
111 objectid = fid->parent_objectid; 138 objectid = fid->parent_objectid;
112 generation = fid->parent_gen; 139 generation = fid->parent_gen;
113 140
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation); 141 return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
115} 142}
116 143
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, 144static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
133 root_objectid = fid->root_objectid; 160 root_objectid = fid->root_objectid;
134 generation = fid->gen; 161 generation = fid->gen;
135 162
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation); 163 return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
137} 164}
138 165
139static struct dentry *btrfs_get_parent(struct dentry *child) 166static struct dentry *btrfs_get_parent(struct dentry *child)
140{ 167{
141 struct inode *dir = child->d_inode; 168 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
142 struct btrfs_root *root = BTRFS_I(dir)->root; 170 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path; 171 struct btrfs_path *path;
145 struct extent_buffer *leaf; 172 struct extent_buffer *leaf;
146 int slot; 173 struct btrfs_root_ref *ref;
147 u64 objectid; 174 struct btrfs_key key;
175 struct btrfs_key found_key;
148 int ret; 176 int ret;
149 177
150 path = btrfs_alloc_path(); 178 path = btrfs_alloc_path();
151 179
152 key.objectid = dir->i_ino; 180 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 181 key.objectid = root->root_key.objectid;
154 key.offset = (u64)-1; 182 key.type = BTRFS_ROOT_BACKREF_KEY;
183 key.offset = (u64)-1;
184 root = root->fs_info->tree_root;
185 } else {
186 key.objectid = dir->i_ino;
187 key.type = BTRFS_INODE_REF_KEY;
188 key.offset = (u64)-1;
189 }
155 190
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 191 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) { 192 if (ret < 0)
158 /* Error */ 193 goto fail;
159 btrfs_free_path(path); 194
160 return ERR_PTR(ret); 195 BUG_ON(ret == 0);
196 if (path->slots[0] == 0) {
197 ret = -ENOENT;
198 goto fail;
161 } 199 }
200
201 path->slots[0]--;
162 leaf = path->nodes[0]; 202 leaf = path->nodes[0];
163 slot = path->slots[0]; 203
164 if (ret) { 204 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
165 /* btrfs_search_slot() returns the slot where we'd want to 205 if (found_key.objectid != key.objectid || found_key.type != key.type) {
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. 206 ret = -ENOENT;
167 The _real_ backref, telling us what the parent inode 207 goto fail;
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 } 208 }
177 209
178 btrfs_item_key_to_cpu(leaf, &key, slot); 210 if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
211 ref = btrfs_item_ptr(leaf, path->slots[0],
212 struct btrfs_root_ref);
213 key.objectid = btrfs_root_ref_dirid(leaf, ref);
214 } else {
215 key.objectid = found_key.offset;
216 }
179 btrfs_free_path(path); 217 btrfs_free_path(path);
180 218
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) 219 if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
182 return ERR_PTR(-EINVAL); 220 return btrfs_get_dentry(root->fs_info->sb, key.objectid,
183 221 found_key.offset, 0, 0);
184 objectid = key.offset; 222 }
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189 223
190 /* Build a new key for the inode item */ 224 key.type = BTRFS_INODE_ITEM_KEY;
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0; 225 key.offset = 0;
194 226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); 227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry;
230fail:
231 btrfs_free_path(path);
232 return ERR_PTR(ret);
196} 233}
197 234
198const struct export_operations btrfs_export_ops = { 235const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 535f85ba104f..359a754c782c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,12 +32,12 @@
32#include "locking.h" 32#include "locking.h"
33#include "free-space-cache.h" 33#include "free-space-cache.h"
34 34
35static int update_reserved_extents(struct btrfs_root *root,
36 u64 bytenr, u64 num, int reserve);
37static int update_block_group(struct btrfs_trans_handle *trans, 35static int update_block_group(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 36 struct btrfs_root *root,
39 u64 bytenr, u64 num_bytes, int alloc, 37 u64 bytenr, u64 num_bytes, int alloc,
40 int mark_free); 38 int mark_free);
39static int update_reserved_extents(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 42 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
57 u64 parent, u64 root_objectid, 57 u64 parent, u64 root_objectid,
58 u64 flags, struct btrfs_disk_key *key, 58 u64 flags, struct btrfs_disk_key *key,
59 int level, struct btrfs_key *ins); 59 int level, struct btrfs_key *ins);
60
61static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
63 u64 flags, int force); 62 u64 flags, int force);
63static int pin_down_bytes(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 struct btrfs_path *path,
66 u64 bytenr, u64 num_bytes,
67 int is_data, int reserved,
68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
64 73
65static noinline int 74static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
153 return ret; 162 return ret;
154} 163}
155 164
156/* 165static int add_excluded_extent(struct btrfs_root *root,
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't 166 u64 start, u64 num_bytes)
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{ 167{
164 u64 start, end, last = 0; 168 u64 end = start + num_bytes - 1;
165 int ret; 169 set_extent_bits(&root->fs_info->freed_extents[0],
170 start, end, EXTENT_UPTODATE, GFP_NOFS);
171 set_extent_bits(&root->fs_info->freed_extents[1],
172 start, end, EXTENT_UPTODATE, GFP_NOFS);
173 return 0;
174}
166 175
167 while (1) { 176static void free_excluded_extents(struct btrfs_root *root,
168 ret = find_first_extent_bit(&info->pinned_extents, last, 177 struct btrfs_block_group_cache *cache)
169 &start, &end, 178{
170 EXTENT_LOCKED|EXTENT_DIRTY); 179 u64 start, end;
171 if (ret)
172 break;
173 180
174 clear_extent_bits(&info->pinned_extents, start, end, 181 start = cache->key.objectid;
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); 182 end = start + cache->key.offset - 1;
176 last = end+1; 183
177 } 184 clear_extent_bits(&root->fs_info->freed_extents[0],
185 start, end, EXTENT_UPTODATE, GFP_NOFS);
186 clear_extent_bits(&root->fs_info->freed_extents[1],
187 start, end, EXTENT_UPTODATE, GFP_NOFS);
178} 188}
179 189
180static int remove_sb_from_cache(struct btrfs_root *root, 190static int exclude_super_stripes(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache) 191 struct btrfs_block_group_cache *cache)
182{ 192{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr; 193 u64 bytenr;
185 u64 *logical; 194 u64 *logical;
186 int stripe_len; 195 int stripe_len;
@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
192 cache->key.objectid, bytenr, 201 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len); 202 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret); 203 BUG_ON(ret);
204
195 while (nr--) { 205 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents, 206 cache->bytes_super += stripe_len;
197 logical[nr], 207 ret = add_excluded_extent(root, logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS); 208 stripe_len);
209 BUG_ON(ret);
199 } 210 }
211
200 kfree(logical); 212 kfree(logical);
201 } 213 }
202
203 return 0; 214 return 0;
204} 215}
205 216
217static struct btrfs_caching_control *
218get_caching_control(struct btrfs_block_group_cache *cache)
219{
220 struct btrfs_caching_control *ctl;
221
222 spin_lock(&cache->lock);
223 if (cache->cached != BTRFS_CACHE_STARTED) {
224 spin_unlock(&cache->lock);
225 return NULL;
226 }
227
228 ctl = cache->caching_ctl;
229 atomic_inc(&ctl->count);
230 spin_unlock(&cache->lock);
231 return ctl;
232}
233
234static void put_caching_control(struct btrfs_caching_control *ctl)
235{
236 if (atomic_dec_and_test(&ctl->count))
237 kfree(ctl);
238}
239
206/* 240/*
207 * this is only called by cache_block_group, since we could have freed extents 241 * this is only called by cache_block_group, since we could have freed extents
208 * we need to check the pinned_extents for any extents that can't be used yet 242 * we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
215 int ret; 249 int ret;
216 250
217 while (start < end) { 251 while (start < end) {
218 ret = find_first_extent_bit(&info->pinned_extents, start, 252 ret = find_first_extent_bit(info->pinned_extents, start,
219 &extent_start, &extent_end, 253 &extent_start, &extent_end,
220 EXTENT_DIRTY|EXTENT_LOCKED); 254 EXTENT_DIRTY | EXTENT_UPTODATE);
221 if (ret) 255 if (ret)
222 break; 256 break;
223 257
@@ -249,22 +283,27 @@ static int caching_kthread(void *data)
249{ 283{
250 struct btrfs_block_group_cache *block_group = data; 284 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info; 285 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0; 286 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
287 struct btrfs_root *extent_root = fs_info->extent_root;
253 struct btrfs_path *path; 288 struct btrfs_path *path;
254 int ret = 0;
255 struct btrfs_key key;
256 struct extent_buffer *leaf; 289 struct extent_buffer *leaf;
257 int slot; 290 struct btrfs_key key;
258 u64 total_found = 0; 291 u64 total_found = 0;
259 292 u64 last = 0;
260 BUG_ON(!fs_info); 293 u32 nritems;
294 int ret = 0;
261 295
262 path = btrfs_alloc_path(); 296 path = btrfs_alloc_path();
263 if (!path) 297 if (!path)
264 return -ENOMEM; 298 return -ENOMEM;
265 299
266 atomic_inc(&block_group->space_info->caching_threads); 300 exclude_super_stripes(extent_root, block_group);
301 spin_lock(&block_group->space_info->lock);
302 block_group->space_info->bytes_super += block_group->bytes_super;
303 spin_unlock(&block_group->space_info->lock);
304
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 305 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
306
268 /* 307 /*
269 * We don't want to deadlock with somebody trying to allocate a new 308 * We don't want to deadlock with somebody trying to allocate a new
270 * extent for the extent root while also trying to search the extent 309 * extent for the extent root while also trying to search the extent
@@ -277,74 +316,64 @@ static int caching_kthread(void *data)
277 316
278 key.objectid = last; 317 key.objectid = last;
279 key.offset = 0; 318 key.offset = 0;
280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 319 key.type = BTRFS_EXTENT_ITEM_KEY;
281again: 320again:
321 mutex_lock(&caching_ctl->mutex);
282 /* need to make sure the commit_root doesn't disappear */ 322 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem); 323 down_read(&fs_info->extent_commit_sem);
284 324
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 325 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
286 if (ret < 0) 326 if (ret < 0)
287 goto err; 327 goto err;
288 328
329 leaf = path->nodes[0];
330 nritems = btrfs_header_nritems(leaf);
331
289 while (1) { 332 while (1) {
290 smp_mb(); 333 smp_mb();
291 if (block_group->fs_info->closing > 1) { 334 if (fs_info->closing > 1) {
292 last = (u64)-1; 335 last = (u64)-1;
293 break; 336 break;
294 } 337 }
295 338
296 leaf = path->nodes[0]; 339 if (path->slots[0] < nritems) {
297 slot = path->slots[0]; 340 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
298 if (slot >= btrfs_header_nritems(leaf)) { 341 } else {
299 ret = btrfs_next_leaf(fs_info->extent_root, path); 342 ret = find_next_key(path, 0, &key);
300 if (ret < 0) 343 if (ret)
301 goto err;
302 else if (ret)
303 break; 344 break;
304 345
305 if (need_resched() || 346 caching_ctl->progress = last;
306 btrfs_transaction_in_commit(fs_info)) { 347 btrfs_release_path(extent_root, path);
307 leaf = path->nodes[0]; 348 up_read(&fs_info->extent_commit_sem);
308 349 mutex_unlock(&caching_ctl->mutex);
309 /* this shouldn't happen, but if the 350 if (btrfs_transaction_in_commit(fs_info))
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1); 351 schedule_timeout(1);
323 goto again; 352 else
324 } 353 cond_resched();
354 goto again;
355 }
325 356
357 if (key.objectid < block_group->key.objectid) {
358 path->slots[0]++;
326 continue; 359 continue;
327 } 360 }
328 btrfs_item_key_to_cpu(leaf, &key, slot);
329 if (key.objectid < block_group->key.objectid)
330 goto next;
331 361
332 if (key.objectid >= block_group->key.objectid + 362 if (key.objectid >= block_group->key.objectid +
333 block_group->key.offset) 363 block_group->key.offset)
334 break; 364 break;
335 365
336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 366 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
337 total_found += add_new_free_space(block_group, 367 total_found += add_new_free_space(block_group,
338 fs_info, last, 368 fs_info, last,
339 key.objectid); 369 key.objectid);
340 last = key.objectid + key.offset; 370 last = key.objectid + key.offset;
341 }
342 371
343 if (total_found > (1024 * 1024 * 2)) { 372 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0; 373 total_found = 0;
345 wake_up(&block_group->caching_q); 374 wake_up(&caching_ctl->wait);
375 }
346 } 376 }
347next:
348 path->slots[0]++; 377 path->slots[0]++;
349 } 378 }
350 ret = 0; 379 ret = 0;
@@ -352,33 +381,65 @@ next:
352 total_found += add_new_free_space(block_group, fs_info, last, 381 total_found += add_new_free_space(block_group, fs_info, last,
353 block_group->key.objectid + 382 block_group->key.objectid +
354 block_group->key.offset); 383 block_group->key.offset);
384 caching_ctl->progress = (u64)-1;
355 385
356 spin_lock(&block_group->lock); 386 spin_lock(&block_group->lock);
387 block_group->caching_ctl = NULL;
357 block_group->cached = BTRFS_CACHE_FINISHED; 388 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock); 389 spin_unlock(&block_group->lock);
359 390
360err: 391err:
361 btrfs_free_path(path); 392 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem); 393 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365 394
395 free_excluded_extents(extent_root, block_group);
396
397 mutex_unlock(&caching_ctl->mutex);
398 wake_up(&caching_ctl->wait);
399
400 put_caching_control(caching_ctl);
401 atomic_dec(&block_group->space_info->caching_threads);
366 return 0; 402 return 0;
367} 403}
368 404
369static int cache_block_group(struct btrfs_block_group_cache *cache) 405static int cache_block_group(struct btrfs_block_group_cache *cache)
370{ 406{
407 struct btrfs_fs_info *fs_info = cache->fs_info;
408 struct btrfs_caching_control *caching_ctl;
371 struct task_struct *tsk; 409 struct task_struct *tsk;
372 int ret = 0; 410 int ret = 0;
373 411
412 smp_mb();
413 if (cache->cached != BTRFS_CACHE_NO)
414 return 0;
415
416 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
417 BUG_ON(!caching_ctl);
418
419 INIT_LIST_HEAD(&caching_ctl->list);
420 mutex_init(&caching_ctl->mutex);
421 init_waitqueue_head(&caching_ctl->wait);
422 caching_ctl->block_group = cache;
423 caching_ctl->progress = cache->key.objectid;
424 /* one for caching kthread, one for caching block group list */
425 atomic_set(&caching_ctl->count, 2);
426
374 spin_lock(&cache->lock); 427 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) { 428 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock); 429 spin_unlock(&cache->lock);
377 return ret; 430 kfree(caching_ctl);
431 return 0;
378 } 432 }
433 cache->caching_ctl = caching_ctl;
379 cache->cached = BTRFS_CACHE_STARTED; 434 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock); 435 spin_unlock(&cache->lock);
381 436
437 down_write(&fs_info->extent_commit_sem);
438 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
439 up_write(&fs_info->extent_commit_sem);
440
441 atomic_inc(&cache->space_info->caching_threads);
442
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 443 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
383 cache->key.objectid); 444 cache->key.objectid);
384 if (IS_ERR(tsk)) { 445 if (IS_ERR(tsk)) {
@@ -1657,7 +1718,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1657 parent, ref_root, flags, 1718 parent, ref_root, flags,
1658 ref->objectid, ref->offset, 1719 ref->objectid, ref->offset,
1659 &ins, node->ref_mod); 1720 &ins, node->ref_mod);
1660 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1661 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1721 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1662 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1722 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1663 node->num_bytes, parent, 1723 node->num_bytes, parent,
@@ -1783,7 +1843,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1783 extent_op->flags_to_set, 1843 extent_op->flags_to_set,
1784 &extent_op->key, 1844 &extent_op->key,
1785 ref->level, &ins); 1845 ref->level, &ins);
1786 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1787 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1846 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1788 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1847 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1789 node->num_bytes, parent, ref_root, 1848 node->num_bytes, parent, ref_root,
@@ -1818,16 +1877,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1818 BUG_ON(extent_op); 1877 BUG_ON(extent_op);
1819 head = btrfs_delayed_node_to_head(node); 1878 head = btrfs_delayed_node_to_head(node);
1820 if (insert_reserved) { 1879 if (insert_reserved) {
1880 int mark_free = 0;
1881 struct extent_buffer *must_clean = NULL;
1882
1883 ret = pin_down_bytes(trans, root, NULL,
1884 node->bytenr, node->num_bytes,
1885 head->is_data, 1, &must_clean);
1886 if (ret > 0)
1887 mark_free = 1;
1888
1889 if (must_clean) {
1890 clean_tree_block(NULL, root, must_clean);
1891 btrfs_tree_unlock(must_clean);
1892 free_extent_buffer(must_clean);
1893 }
1821 if (head->is_data) { 1894 if (head->is_data) {
1822 ret = btrfs_del_csums(trans, root, 1895 ret = btrfs_del_csums(trans, root,
1823 node->bytenr, 1896 node->bytenr,
1824 node->num_bytes); 1897 node->num_bytes);
1825 BUG_ON(ret); 1898 BUG_ON(ret);
1826 } 1899 }
1827 btrfs_update_pinned_extents(root, node->bytenr, 1900 if (mark_free) {
1828 node->num_bytes, 1); 1901 ret = btrfs_free_reserved_extent(root,
1829 update_reserved_extents(root, node->bytenr, 1902 node->bytenr,
1830 node->num_bytes, 0); 1903 node->num_bytes);
1904 BUG_ON(ret);
1905 }
1831 } 1906 }
1832 mutex_unlock(&head->mutex); 1907 mutex_unlock(&head->mutex);
1833 return 0; 1908 return 0;
@@ -2692,60 +2767,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2692 alloc_target); 2767 alloc_target);
2693} 2768}
2694 2769
2770static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2771{
2772 u64 num_bytes;
2773 int level;
2774
2775 level = BTRFS_MAX_LEVEL - 2;
2776 /*
2777 * NOTE: these calculations are absolutely the worst possible case.
2778 * This assumes that _every_ item we insert will require a new leaf, and
2779 * that the tree has grown to its maximum level size.
2780 */
2781
2782 /*
2783 * for every item we insert we could insert both an extent item and a
2784 * extent ref item. Then for ever item we insert, we will need to cow
2785 * both the original leaf, plus the leaf to the left and right of it.
2786 *
2787 * Unless we are talking about the extent root, then we just want the
2788 * number of items * 2, since we just need the extent item plus its ref.
2789 */
2790 if (root == root->fs_info->extent_root)
2791 num_bytes = num_items * 2;
2792 else
2793 num_bytes = (num_items + (2 * num_items)) * 3;
2794
2795 /*
2796 * num_bytes is total number of leaves we could need times the leaf
2797 * size, and then for every leaf we could end up cow'ing 2 nodes per
2798 * level, down to the leaf level.
2799 */
2800 num_bytes = (num_bytes * root->leafsize) +
2801 (num_bytes * (level * 2)) * root->nodesize;
2802
2803 return num_bytes;
2804}
2805
2695/* 2806/*
2696 * for now this just makes sure we have at least 5% of our metadata space free 2807 * Unreserve metadata space for delalloc. If we have less reserved credits than
2697 * for use. 2808 * we have extents, this function does nothing.
2698 */ 2809 */
2699int btrfs_check_metadata_free_space(struct btrfs_root *root) 2810int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2811 struct inode *inode, int num_items)
2700{ 2812{
2701 struct btrfs_fs_info *info = root->fs_info; 2813 struct btrfs_fs_info *info = root->fs_info;
2702 struct btrfs_space_info *meta_sinfo; 2814 struct btrfs_space_info *meta_sinfo;
2703 u64 alloc_target, thresh; 2815 u64 num_bytes;
2704 int committed = 0, ret; 2816 u64 alloc_target;
2817 bool bug = false;
2705 2818
2706 /* get the space info for where the metadata will live */ 2819 /* get the space info for where the metadata will live */
2707 alloc_target = btrfs_get_alloc_profile(root, 0); 2820 alloc_target = btrfs_get_alloc_profile(root, 0);
2708 meta_sinfo = __find_space_info(info, alloc_target); 2821 meta_sinfo = __find_space_info(info, alloc_target);
2709 2822
2710again: 2823 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2824 num_items);
2825
2711 spin_lock(&meta_sinfo->lock); 2826 spin_lock(&meta_sinfo->lock);
2712 if (!meta_sinfo->full) 2827 if (BTRFS_I(inode)->delalloc_reserved_extents <=
2713 thresh = meta_sinfo->total_bytes * 80; 2828 BTRFS_I(inode)->delalloc_extents) {
2714 else 2829 spin_unlock(&meta_sinfo->lock);
2715 thresh = meta_sinfo->total_bytes * 95; 2830 return 0;
2831 }
2832
2833 BTRFS_I(inode)->delalloc_reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2843
2844 BUG_ON(bug);
2845
2846 return 0;
2847}
2716 2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2717 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
2718 2866
2719 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867static int maybe_allocate_chunk(struct btrfs_root *root,
2720 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { 2868 struct btrfs_space_info *info)
2721 struct btrfs_trans_handle *trans; 2869{
2722 if (!meta_sinfo->full) { 2870 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2723 meta_sinfo->force_alloc = 1; 2871 struct btrfs_trans_handle *trans;
2724 spin_unlock(&meta_sinfo->lock); 2872 bool wait = false;
2873 int ret = 0;
2874 u64 min_metadata;
2875 u64 free_space;
2725 2876
2726 trans = btrfs_start_transaction(root, 1); 2877 free_space = btrfs_super_total_bytes(disk_super);
2727 if (!trans) 2878 /*
2728 return -ENOMEM; 2879 * we allow the metadata to grow to a max of either 5gb or 5% of the
2880 * space in the volume.
2881 */
2882 min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2883 div64_u64(free_space * 5, 100));
2884 if (info->total_bytes >= min_metadata) {
2885 spin_unlock(&info->lock);
2886 return 0;
2887 }
2729 2888
2730 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2889 if (info->full) {
2731 2 * 1024 * 1024, alloc_target, 0); 2890 spin_unlock(&info->lock);
2732 btrfs_end_transaction(trans, root); 2891 return 0;
2892 }
2893
2894 if (!info->allocating_chunk) {
2895 info->force_alloc = 1;
2896 info->allocating_chunk = 1;
2897 init_waitqueue_head(&info->wait);
2898 } else {
2899 wait = true;
2900 }
2901
2902 spin_unlock(&info->lock);
2903
2904 if (wait) {
2905 wait_event(info->wait,
2906 !info->allocating_chunk);
2907 return 1;
2908 }
2909
2910 trans = btrfs_start_transaction(root, 1);
2911 if (!trans) {
2912 ret = -ENOMEM;
2913 goto out;
2914 }
2915
2916 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2917 4096 + 2 * 1024 * 1024,
2918 info->flags, 0);
2919 btrfs_end_transaction(trans, root);
2920 if (ret)
2921 goto out;
2922out:
2923 spin_lock(&info->lock);
2924 info->allocating_chunk = 0;
2925 spin_unlock(&info->lock);
2926 wake_up(&info->wait);
2927
2928 if (ret)
2929 return 0;
2930 return 1;
2931}
2932
2933/*
2934 * Reserve metadata space for delalloc.
2935 */
2936int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2937 struct inode *inode, int num_items)
2938{
2939 struct btrfs_fs_info *info = root->fs_info;
2940 struct btrfs_space_info *meta_sinfo;
2941 u64 num_bytes;
2942 u64 used;
2943 u64 alloc_target;
2944 int flushed = 0;
2945 int force_delalloc;
2946
2947 /* get the space info for where the metadata will live */
2948 alloc_target = btrfs_get_alloc_profile(root, 0);
2949 meta_sinfo = __find_space_info(info, alloc_target);
2950
2951 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2952 num_items);
2953again:
2954 spin_lock(&meta_sinfo->lock);
2955
2956 force_delalloc = meta_sinfo->force_delalloc;
2957
2958 if (unlikely(!meta_sinfo->bytes_root))
2959 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2960
2961 if (!flushed)
2962 meta_sinfo->bytes_delalloc += num_bytes;
2963
2964 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2965 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2966 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2967 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2968
2969 if (used > meta_sinfo->total_bytes) {
2970 flushed++;
2971
2972 if (flushed == 1) {
2973 if (maybe_allocate_chunk(root, meta_sinfo))
2974 goto again;
2975 flushed++;
2976 } else {
2977 spin_unlock(&meta_sinfo->lock);
2978 }
2979
2980 if (flushed == 2) {
2981 filemap_flush(inode->i_mapping);
2982 goto again;
2983 } else if (flushed == 3) {
2984 btrfs_start_delalloc_inodes(root);
2985 btrfs_wait_ordered_extents(root, 0);
2733 goto again; 2986 goto again;
2734 } 2987 }
2988 spin_lock(&meta_sinfo->lock);
2989 meta_sinfo->bytes_delalloc -= num_bytes;
2735 spin_unlock(&meta_sinfo->lock); 2990 spin_unlock(&meta_sinfo->lock);
2991 printk(KERN_ERR "enospc, has %d, reserved %d\n",
2992 BTRFS_I(inode)->delalloc_extents,
2993 BTRFS_I(inode)->delalloc_reserved_extents);
2994 dump_space_info(meta_sinfo, 0, 0);
2995 return -ENOSPC;
2996 }
2736 2997
2737 if (!committed) { 2998 BTRFS_I(inode)->delalloc_reserved_extents++;
2738 committed = 1; 2999 check_force_delalloc(meta_sinfo);
2739 trans = btrfs_join_transaction(root, 1); 3000 spin_unlock(&meta_sinfo->lock);
2740 if (!trans) 3001
2741 return -ENOMEM; 3002 if (!flushed && force_delalloc)
2742 ret = btrfs_commit_transaction(trans, root); 3003 filemap_flush(inode->i_mapping);
2743 if (ret) 3004
2744 return ret; 3005 return 0;
3006}
3007
3008/*
3009 * unreserve num_items number of items worth of metadata space. This needs to
3010 * be paired with btrfs_reserve_metadata_space.
3011 *
3012 * NOTE: if you have the option, run this _AFTER_ you do a
3013 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3014 * oprations which will result in more used metadata, so we want to make sure we
3015 * can do that without issue.
3016 */
3017int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3018{
3019 struct btrfs_fs_info *info = root->fs_info;
3020 struct btrfs_space_info *meta_sinfo;
3021 u64 num_bytes;
3022 u64 alloc_target;
3023 bool bug = false;
3024
3025 /* get the space info for where the metadata will live */
3026 alloc_target = btrfs_get_alloc_profile(root, 0);
3027 meta_sinfo = __find_space_info(info, alloc_target);
3028
3029 num_bytes = calculate_bytes_needed(root, num_items);
3030
3031 spin_lock(&meta_sinfo->lock);
3032 if (meta_sinfo->bytes_may_use < num_bytes) {
3033 bug = true;
3034 meta_sinfo->bytes_may_use = 0;
3035 } else {
3036 meta_sinfo->bytes_may_use -= num_bytes;
3037 }
3038 spin_unlock(&meta_sinfo->lock);
3039
3040 BUG_ON(bug);
3041
3042 return 0;
3043}
3044
3045/*
3046 * Reserve some metadata space for use. We'll calculate the worste case number
3047 * of bytes that would be needed to modify num_items number of items. If we
3048 * have space, fantastic, if not, you get -ENOSPC. Please call
3049 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3050 * items you reserved, since whatever metadata you needed should have already
3051 * been allocated.
3052 *
3053 * This will commit the transaction to make more space if we don't have enough
3054 * metadata space. THe only time we don't do this is if we're reserving space
3055 * inside of a transaction, then we will just return -ENOSPC and it is the
3056 * callers responsibility to handle it properly.
3057 */
3058int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3059{
3060 struct btrfs_fs_info *info = root->fs_info;
3061 struct btrfs_space_info *meta_sinfo;
3062 u64 num_bytes;
3063 u64 used;
3064 u64 alloc_target;
3065 int retries = 0;
3066
3067 /* get the space info for where the metadata will live */
3068 alloc_target = btrfs_get_alloc_profile(root, 0);
3069 meta_sinfo = __find_space_info(info, alloc_target);
3070
3071 num_bytes = calculate_bytes_needed(root, num_items);
3072again:
3073 spin_lock(&meta_sinfo->lock);
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!retries)
3079 meta_sinfo->bytes_may_use += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 retries++;
3088 if (retries == 1) {
3089 if (maybe_allocate_chunk(root, meta_sinfo))
3090 goto again;
3091 retries++;
3092 } else {
3093 spin_unlock(&meta_sinfo->lock);
3094 }
3095
3096 if (retries == 2) {
3097 btrfs_start_delalloc_inodes(root);
3098 btrfs_wait_ordered_extents(root, 0);
2745 goto again; 3099 goto again;
2746 } 3100 }
3101 spin_lock(&meta_sinfo->lock);
3102 meta_sinfo->bytes_may_use -= num_bytes;
3103 spin_unlock(&meta_sinfo->lock);
3104
3105 dump_space_info(meta_sinfo, 0, 0);
2747 return -ENOSPC; 3106 return -ENOSPC;
2748 } 3107 }
3108
3109 check_force_delalloc(meta_sinfo);
2749 spin_unlock(&meta_sinfo->lock); 3110 spin_unlock(&meta_sinfo->lock);
2750 3111
2751 return 0; 3112 return 0;
@@ -2765,13 +3126,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2765 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3126 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2766 3127
2767 data_sinfo = BTRFS_I(inode)->space_info; 3128 data_sinfo = BTRFS_I(inode)->space_info;
3129 if (!data_sinfo)
3130 goto alloc;
3131
2768again: 3132again:
2769 /* make sure we have enough space to handle the data first */ 3133 /* make sure we have enough space to handle the data first */
2770 spin_lock(&data_sinfo->lock); 3134 spin_lock(&data_sinfo->lock);
2771 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3135 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2772 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3136 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2773 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3137 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2774 data_sinfo->bytes_may_use < bytes) { 3138 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
2775 struct btrfs_trans_handle *trans; 3139 struct btrfs_trans_handle *trans;
2776 3140
2777 /* 3141 /*
@@ -2783,7 +3147,7 @@ again:
2783 3147
2784 data_sinfo->force_alloc = 1; 3148 data_sinfo->force_alloc = 1;
2785 spin_unlock(&data_sinfo->lock); 3149 spin_unlock(&data_sinfo->lock);
2786 3150alloc:
2787 alloc_target = btrfs_get_alloc_profile(root, 1); 3151 alloc_target = btrfs_get_alloc_profile(root, 1);
2788 trans = btrfs_start_transaction(root, 1); 3152 trans = btrfs_start_transaction(root, 1);
2789 if (!trans) 3153 if (!trans)
@@ -2795,12 +3159,17 @@ again:
2795 btrfs_end_transaction(trans, root); 3159 btrfs_end_transaction(trans, root);
2796 if (ret) 3160 if (ret)
2797 return ret; 3161 return ret;
3162
3163 if (!data_sinfo) {
3164 btrfs_set_inode_space_info(root, inode);
3165 data_sinfo = BTRFS_I(inode)->space_info;
3166 }
2798 goto again; 3167 goto again;
2799 } 3168 }
2800 spin_unlock(&data_sinfo->lock); 3169 spin_unlock(&data_sinfo->lock);
2801 3170
2802 /* commit the current transaction and try again */ 3171 /* commit the current transaction and try again */
2803 if (!committed) { 3172 if (!committed && !root->fs_info->open_ioctl_trans) {
2804 committed = 1; 3173 committed = 1;
2805 trans = btrfs_join_transaction(root, 1); 3174 trans = btrfs_join_transaction(root, 1);
2806 if (!trans) 3175 if (!trans)
@@ -2828,7 +3197,7 @@ again:
2828 BTRFS_I(inode)->reserved_bytes += bytes; 3197 BTRFS_I(inode)->reserved_bytes += bytes;
2829 spin_unlock(&data_sinfo->lock); 3198 spin_unlock(&data_sinfo->lock);
2830 3199
2831 return btrfs_check_metadata_free_space(root); 3200 return 0;
2832} 3201}
2833 3202
2834/* 3203/*
@@ -2927,17 +3296,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2927 BUG_ON(!space_info); 3296 BUG_ON(!space_info);
2928 3297
2929 spin_lock(&space_info->lock); 3298 spin_lock(&space_info->lock);
2930 if (space_info->force_alloc) { 3299 if (space_info->force_alloc)
2931 force = 1; 3300 force = 1;
2932 space_info->force_alloc = 0;
2933 }
2934 if (space_info->full) { 3301 if (space_info->full) {
2935 spin_unlock(&space_info->lock); 3302 spin_unlock(&space_info->lock);
2936 goto out; 3303 goto out;
2937 } 3304 }
2938 3305
2939 thresh = space_info->total_bytes - space_info->bytes_readonly; 3306 thresh = space_info->total_bytes - space_info->bytes_readonly;
2940 thresh = div_factor(thresh, 6); 3307 thresh = div_factor(thresh, 8);
2941 if (!force && 3308 if (!force &&
2942 (space_info->bytes_used + space_info->bytes_pinned + 3309 (space_info->bytes_used + space_info->bytes_pinned +
2943 space_info->bytes_reserved + alloc_bytes) < thresh) { 3310 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -2951,7 +3318,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2951 * we keep a reasonable number of metadata chunks allocated in the 3318 * we keep a reasonable number of metadata chunks allocated in the
2952 * FS as well. 3319 * FS as well.
2953 */ 3320 */
2954 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3321 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2955 fs_info->data_chunk_allocations++; 3322 fs_info->data_chunk_allocations++;
2956 if (!(fs_info->data_chunk_allocations % 3323 if (!(fs_info->data_chunk_allocations %
2957 fs_info->metadata_ratio)) 3324 fs_info->metadata_ratio))
@@ -2959,8 +3326,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2959 } 3326 }
2960 3327
2961 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3328 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3329 spin_lock(&space_info->lock);
2962 if (ret) 3330 if (ret)
2963 space_info->full = 1; 3331 space_info->full = 1;
3332 space_info->force_alloc = 0;
3333 spin_unlock(&space_info->lock);
2964out: 3334out:
2965 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3335 mutex_unlock(&extent_root->fs_info->chunk_mutex);
2966 return ret; 3336 return ret;
@@ -3009,10 +3379,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3009 num_bytes = min(total, cache->key.offset - byte_in_group); 3379 num_bytes = min(total, cache->key.offset - byte_in_group);
3010 if (alloc) { 3380 if (alloc) {
3011 old_val += num_bytes; 3381 old_val += num_bytes;
3382 btrfs_set_block_group_used(&cache->item, old_val);
3383 cache->reserved -= num_bytes;
3012 cache->space_info->bytes_used += num_bytes; 3384 cache->space_info->bytes_used += num_bytes;
3385 cache->space_info->bytes_reserved -= num_bytes;
3013 if (cache->ro) 3386 if (cache->ro)
3014 cache->space_info->bytes_readonly -= num_bytes; 3387 cache->space_info->bytes_readonly -= num_bytes;
3015 btrfs_set_block_group_used(&cache->item, old_val);
3016 spin_unlock(&cache->lock); 3388 spin_unlock(&cache->lock);
3017 spin_unlock(&cache->space_info->lock); 3389 spin_unlock(&cache->space_info->lock);
3018 } else { 3390 } else {
@@ -3057,127 +3429,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3057 return bytenr; 3429 return bytenr;
3058} 3430}
3059 3431
3060int btrfs_update_pinned_extents(struct btrfs_root *root, 3432/*
3061 u64 bytenr, u64 num, int pin) 3433 * this function must be called within transaction
3434 */
3435int btrfs_pin_extent(struct btrfs_root *root,
3436 u64 bytenr, u64 num_bytes, int reserved)
3062{ 3437{
3063 u64 len;
3064 struct btrfs_block_group_cache *cache;
3065 struct btrfs_fs_info *fs_info = root->fs_info; 3438 struct btrfs_fs_info *fs_info = root->fs_info;
3439 struct btrfs_block_group_cache *cache;
3066 3440
3067 if (pin) 3441 cache = btrfs_lookup_block_group(fs_info, bytenr);
3068 set_extent_dirty(&fs_info->pinned_extents, 3442 BUG_ON(!cache);
3069 bytenr, bytenr + num - 1, GFP_NOFS);
3070
3071 while (num > 0) {
3072 cache = btrfs_lookup_block_group(fs_info, bytenr);
3073 BUG_ON(!cache);
3074 len = min(num, cache->key.offset -
3075 (bytenr - cache->key.objectid));
3076 if (pin) {
3077 spin_lock(&cache->space_info->lock);
3078 spin_lock(&cache->lock);
3079 cache->pinned += len;
3080 cache->space_info->bytes_pinned += len;
3081 spin_unlock(&cache->lock);
3082 spin_unlock(&cache->space_info->lock);
3083 fs_info->total_pinned += len;
3084 } else {
3085 int unpin = 0;
3086 3443
3087 /* 3444 spin_lock(&cache->space_info->lock);
3088 * in order to not race with the block group caching, we 3445 spin_lock(&cache->lock);
3089 * only want to unpin the extent if we are cached. If 3446 cache->pinned += num_bytes;
3090 * we aren't cached, we want to start async caching this 3447 cache->space_info->bytes_pinned += num_bytes;
3091 * block group so we can free the extent the next time 3448 if (reserved) {
3092 * around. 3449 cache->reserved -= num_bytes;
3093 */ 3450 cache->space_info->bytes_reserved -= num_bytes;
3094 spin_lock(&cache->space_info->lock); 3451 }
3095 spin_lock(&cache->lock); 3452 spin_unlock(&cache->lock);
3096 unpin = (cache->cached == BTRFS_CACHE_FINISHED); 3453 spin_unlock(&cache->space_info->lock);
3097 if (likely(unpin)) {
3098 cache->pinned -= len;
3099 cache->space_info->bytes_pinned -= len;
3100 fs_info->total_pinned -= len;
3101 }
3102 spin_unlock(&cache->lock);
3103 spin_unlock(&cache->space_info->lock);
3104 3454
3105 if (likely(unpin)) 3455 btrfs_put_block_group(cache);
3106 clear_extent_dirty(&fs_info->pinned_extents,
3107 bytenr, bytenr + len -1,
3108 GFP_NOFS);
3109 else
3110 cache_block_group(cache);
3111 3456
3112 if (unpin) 3457 set_extent_dirty(fs_info->pinned_extents,
3113 btrfs_add_free_space(cache, bytenr, len); 3458 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3114 } 3459 return 0;
3115 btrfs_put_block_group(cache); 3460}
3116 bytenr += len; 3461
3117 num -= len; 3462static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3463 u64 num_bytes, int reserve)
3464{
3465 spin_lock(&cache->space_info->lock);
3466 spin_lock(&cache->lock);
3467 if (reserve) {
3468 cache->reserved += num_bytes;
3469 cache->space_info->bytes_reserved += num_bytes;
3470 } else {
3471 cache->reserved -= num_bytes;
3472 cache->space_info->bytes_reserved -= num_bytes;
3118 } 3473 }
3474 spin_unlock(&cache->lock);
3475 spin_unlock(&cache->space_info->lock);
3119 return 0; 3476 return 0;
3120} 3477}
3121 3478
3122static int update_reserved_extents(struct btrfs_root *root, 3479int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3123 u64 bytenr, u64 num, int reserve) 3480 struct btrfs_root *root)
3124{ 3481{
3125 u64 len;
3126 struct btrfs_block_group_cache *cache;
3127 struct btrfs_fs_info *fs_info = root->fs_info; 3482 struct btrfs_fs_info *fs_info = root->fs_info;
3483 struct btrfs_caching_control *next;
3484 struct btrfs_caching_control *caching_ctl;
3485 struct btrfs_block_group_cache *cache;
3128 3486
3129 while (num > 0) { 3487 down_write(&fs_info->extent_commit_sem);
3130 cache = btrfs_lookup_block_group(fs_info, bytenr);
3131 BUG_ON(!cache);
3132 len = min(num, cache->key.offset -
3133 (bytenr - cache->key.objectid));
3134 3488
3135 spin_lock(&cache->space_info->lock); 3489 list_for_each_entry_safe(caching_ctl, next,
3136 spin_lock(&cache->lock); 3490 &fs_info->caching_block_groups, list) {
3137 if (reserve) { 3491 cache = caching_ctl->block_group;
3138 cache->reserved += len; 3492 if (block_group_cache_done(cache)) {
3139 cache->space_info->bytes_reserved += len; 3493 cache->last_byte_to_unpin = (u64)-1;
3494 list_del_init(&caching_ctl->list);
3495 put_caching_control(caching_ctl);
3140 } else { 3496 } else {
3141 cache->reserved -= len; 3497 cache->last_byte_to_unpin = caching_ctl->progress;
3142 cache->space_info->bytes_reserved -= len;
3143 } 3498 }
3144 spin_unlock(&cache->lock);
3145 spin_unlock(&cache->space_info->lock);
3146 btrfs_put_block_group(cache);
3147 bytenr += len;
3148 num -= len;
3149 } 3499 }
3500
3501 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3502 fs_info->pinned_extents = &fs_info->freed_extents[1];
3503 else
3504 fs_info->pinned_extents = &fs_info->freed_extents[0];
3505
3506 up_write(&fs_info->extent_commit_sem);
3150 return 0; 3507 return 0;
3151} 3508}
3152 3509
3153int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) 3510static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3154{ 3511{
3155 u64 last = 0; 3512 struct btrfs_fs_info *fs_info = root->fs_info;
3156 u64 start; 3513 struct btrfs_block_group_cache *cache = NULL;
3157 u64 end; 3514 u64 len;
3158 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3159 int ret;
3160 3515
3161 while (1) { 3516 while (start <= end) {
3162 ret = find_first_extent_bit(pinned_extents, last, 3517 if (!cache ||
3163 &start, &end, EXTENT_DIRTY); 3518 start >= cache->key.objectid + cache->key.offset) {
3164 if (ret) 3519 if (cache)
3165 break; 3520 btrfs_put_block_group(cache);
3521 cache = btrfs_lookup_block_group(fs_info, start);
3522 BUG_ON(!cache);
3523 }
3524
3525 len = cache->key.objectid + cache->key.offset - start;
3526 len = min(len, end + 1 - start);
3527
3528 if (start < cache->last_byte_to_unpin) {
3529 len = min(len, cache->last_byte_to_unpin - start);
3530 btrfs_add_free_space(cache, start, len);
3531 }
3166 3532
3167 set_extent_dirty(copy, start, end, GFP_NOFS); 3533 spin_lock(&cache->space_info->lock);
3168 last = end + 1; 3534 spin_lock(&cache->lock);
3535 cache->pinned -= len;
3536 cache->space_info->bytes_pinned -= len;
3537 spin_unlock(&cache->lock);
3538 spin_unlock(&cache->space_info->lock);
3539
3540 start += len;
3169 } 3541 }
3542
3543 if (cache)
3544 btrfs_put_block_group(cache);
3170 return 0; 3545 return 0;
3171} 3546}
3172 3547
3173int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 3548int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3174 struct btrfs_root *root, 3549 struct btrfs_root *root)
3175 struct extent_io_tree *unpin)
3176{ 3550{
3551 struct btrfs_fs_info *fs_info = root->fs_info;
3552 struct extent_io_tree *unpin;
3177 u64 start; 3553 u64 start;
3178 u64 end; 3554 u64 end;
3179 int ret; 3555 int ret;
3180 3556
3557 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3558 unpin = &fs_info->freed_extents[1];
3559 else
3560 unpin = &fs_info->freed_extents[0];
3561
3181 while (1) { 3562 while (1) {
3182 ret = find_first_extent_bit(unpin, 0, &start, &end, 3563 ret = find_first_extent_bit(unpin, 0, &start, &end,
3183 EXTENT_DIRTY); 3564 EXTENT_DIRTY);
@@ -3186,10 +3567,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3186 3567
3187 ret = btrfs_discard_extent(root, start, end + 1 - start); 3568 ret = btrfs_discard_extent(root, start, end + 1 - start);
3188 3569
3189 /* unlocks the pinned mutex */
3190 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
3191 clear_extent_dirty(unpin, start, end, GFP_NOFS); 3570 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3192 3571 unpin_extent_range(root, start, end);
3193 cond_resched(); 3572 cond_resched();
3194 } 3573 }
3195 3574
@@ -3199,7 +3578,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3199static int pin_down_bytes(struct btrfs_trans_handle *trans, 3578static int pin_down_bytes(struct btrfs_trans_handle *trans,
3200 struct btrfs_root *root, 3579 struct btrfs_root *root,
3201 struct btrfs_path *path, 3580 struct btrfs_path *path,
3202 u64 bytenr, u64 num_bytes, int is_data, 3581 u64 bytenr, u64 num_bytes,
3582 int is_data, int reserved,
3203 struct extent_buffer **must_clean) 3583 struct extent_buffer **must_clean)
3204{ 3584{
3205 int err = 0; 3585 int err = 0;
@@ -3231,15 +3611,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3231 } 3611 }
3232 free_extent_buffer(buf); 3612 free_extent_buffer(buf);
3233pinit: 3613pinit:
3234 btrfs_set_path_blocking(path); 3614 if (path)
3615 btrfs_set_path_blocking(path);
3235 /* unlocks the pinned mutex */ 3616 /* unlocks the pinned mutex */
3236 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3617 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3237 3618
3238 BUG_ON(err < 0); 3619 BUG_ON(err < 0);
3239 return 0; 3620 return 0;
3240} 3621}
3241 3622
3242
3243static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3623static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3244 struct btrfs_root *root, 3624 struct btrfs_root *root,
3245 u64 bytenr, u64 num_bytes, u64 parent, 3625 u64 bytenr, u64 num_bytes, u64 parent,
@@ -3413,7 +3793,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3413 } 3793 }
3414 3794
3415 ret = pin_down_bytes(trans, root, path, bytenr, 3795 ret = pin_down_bytes(trans, root, path, bytenr,
3416 num_bytes, is_data, &must_clean); 3796 num_bytes, is_data, 0, &must_clean);
3417 if (ret > 0) 3797 if (ret > 0)
3418 mark_free = 1; 3798 mark_free = 1;
3419 BUG_ON(ret < 0); 3799 BUG_ON(ret < 0);
@@ -3544,8 +3924,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3544 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 3924 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3545 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 3925 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3546 /* unlocks the pinned mutex */ 3926 /* unlocks the pinned mutex */
3547 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3927 btrfs_pin_extent(root, bytenr, num_bytes, 1);
3548 update_reserved_extents(root, bytenr, num_bytes, 0);
3549 ret = 0; 3928 ret = 0;
3550 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 3929 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3551 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 3930 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3585,19 +3964,33 @@ static noinline int
3585wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 3964wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3586 u64 num_bytes) 3965 u64 num_bytes)
3587{ 3966{
3967 struct btrfs_caching_control *caching_ctl;
3588 DEFINE_WAIT(wait); 3968 DEFINE_WAIT(wait);
3589 3969
3590 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); 3970 caching_ctl = get_caching_control(cache);
3591 3971 if (!caching_ctl)
3592 if (block_group_cache_done(cache)) {
3593 finish_wait(&cache->caching_q, &wait);
3594 return 0; 3972 return 0;
3595 }
3596 schedule();
3597 finish_wait(&cache->caching_q, &wait);
3598 3973
3599 wait_event(cache->caching_q, block_group_cache_done(cache) || 3974 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
3600 (cache->free_space >= num_bytes)); 3975 (cache->free_space >= num_bytes));
3976
3977 put_caching_control(caching_ctl);
3978 return 0;
3979}
3980
3981static noinline int
3982wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3983{
3984 struct btrfs_caching_control *caching_ctl;
3985 DEFINE_WAIT(wait);
3986
3987 caching_ctl = get_caching_control(cache);
3988 if (!caching_ctl)
3989 return 0;
3990
3991 wait_event(caching_ctl->wait, block_group_cache_done(cache));
3992
3993 put_caching_control(caching_ctl);
3601 return 0; 3994 return 0;
3602} 3995}
3603 3996
@@ -3635,6 +4028,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3635 int last_ptr_loop = 0; 4028 int last_ptr_loop = 0;
3636 int loop = 0; 4029 int loop = 0;
3637 bool found_uncached_bg = false; 4030 bool found_uncached_bg = false;
4031 bool failed_cluster_refill = false;
3638 4032
3639 WARN_ON(num_bytes < root->sectorsize); 4033 WARN_ON(num_bytes < root->sectorsize);
3640 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4034 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3732,7 +4126,16 @@ have_block_group:
3732 if (unlikely(block_group->ro)) 4126 if (unlikely(block_group->ro))
3733 goto loop; 4127 goto loop;
3734 4128
3735 if (last_ptr) { 4129 /*
4130 * Ok we want to try and use the cluster allocator, so lets look
4131 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
4132 * have tried the cluster allocator plenty of times at this
4133 * point and not have found anything, so we are likely way too
4134 * fragmented for the clustering stuff to find anything, so lets
4135 * just skip it and let the allocator find whatever block it can
4136 * find
4137 */
4138 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
3736 /* 4139 /*
3737 * the refill lock keeps out other 4140 * the refill lock keeps out other
3738 * people trying to start a new cluster 4141 * people trying to start a new cluster
@@ -3807,9 +4210,11 @@ refill_cluster:
3807 spin_unlock(&last_ptr->refill_lock); 4210 spin_unlock(&last_ptr->refill_lock);
3808 goto checks; 4211 goto checks;
3809 } 4212 }
3810 } else if (!cached && loop > LOOP_CACHING_NOWAIT) { 4213 } else if (!cached && loop > LOOP_CACHING_NOWAIT
4214 && !failed_cluster_refill) {
3811 spin_unlock(&last_ptr->refill_lock); 4215 spin_unlock(&last_ptr->refill_lock);
3812 4216
4217 failed_cluster_refill = true;
3813 wait_block_group_cache_progress(block_group, 4218 wait_block_group_cache_progress(block_group,
3814 num_bytes + empty_cluster + empty_size); 4219 num_bytes + empty_cluster + empty_size);
3815 goto have_block_group; 4220 goto have_block_group;
@@ -3821,13 +4226,9 @@ refill_cluster:
3821 * cluster. Free the cluster we've been trying 4226 * cluster. Free the cluster we've been trying
3822 * to use, and go to the next block group 4227 * to use, and go to the next block group
3823 */ 4228 */
3824 if (loop < LOOP_NO_EMPTY_SIZE) { 4229 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3825 btrfs_return_cluster_to_free_space(NULL,
3826 last_ptr);
3827 spin_unlock(&last_ptr->refill_lock);
3828 goto loop;
3829 }
3830 spin_unlock(&last_ptr->refill_lock); 4230 spin_unlock(&last_ptr->refill_lock);
4231 goto loop;
3831 } 4232 }
3832 4233
3833 offset = btrfs_find_space_for_alloc(block_group, search_start, 4234 offset = btrfs_find_space_for_alloc(block_group, search_start,
@@ -3881,9 +4282,12 @@ checks:
3881 search_start - offset); 4282 search_start - offset);
3882 BUG_ON(offset > search_start); 4283 BUG_ON(offset > search_start);
3883 4284
4285 update_reserved_extents(block_group, num_bytes, 1);
4286
3884 /* we are all good, lets return */ 4287 /* we are all good, lets return */
3885 break; 4288 break;
3886loop: 4289loop:
4290 failed_cluster_refill = false;
3887 btrfs_put_block_group(block_group); 4291 btrfs_put_block_group(block_group);
3888 } 4292 }
3889 up_read(&space_info->groups_sem); 4293 up_read(&space_info->groups_sem);
@@ -3941,21 +4345,32 @@ loop:
3941 return ret; 4345 return ret;
3942} 4346}
3943 4347
3944static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4348static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4349 int dump_block_groups)
3945{ 4350{
3946 struct btrfs_block_group_cache *cache; 4351 struct btrfs_block_group_cache *cache;
3947 4352
4353 spin_lock(&info->lock);
3948 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4354 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3949 (unsigned long long)(info->total_bytes - info->bytes_used - 4355 (unsigned long long)(info->total_bytes - info->bytes_used -
3950 info->bytes_pinned - info->bytes_reserved), 4356 info->bytes_pinned - info->bytes_reserved -
4357 info->bytes_super),
3951 (info->full) ? "" : "not "); 4358 (info->full) ? "" : "not ");
3952 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4359 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3953 " may_use=%llu, used=%llu\n", 4360 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4361 "\n",
3954 (unsigned long long)info->total_bytes, 4362 (unsigned long long)info->total_bytes,
3955 (unsigned long long)info->bytes_pinned, 4363 (unsigned long long)info->bytes_pinned,
3956 (unsigned long long)info->bytes_delalloc, 4364 (unsigned long long)info->bytes_delalloc,
3957 (unsigned long long)info->bytes_may_use, 4365 (unsigned long long)info->bytes_may_use,
3958 (unsigned long long)info->bytes_used); 4366 (unsigned long long)info->bytes_used,
4367 (unsigned long long)info->bytes_root,
4368 (unsigned long long)info->bytes_super,
4369 (unsigned long long)info->bytes_reserved);
4370 spin_unlock(&info->lock);
4371
4372 if (!dump_block_groups)
4373 return;
3959 4374
3960 down_read(&info->groups_sem); 4375 down_read(&info->groups_sem);
3961 list_for_each_entry(cache, &info->block_groups, list) { 4376 list_for_each_entry(cache, &info->block_groups, list) {
@@ -3973,12 +4388,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3973 up_read(&info->groups_sem); 4388 up_read(&info->groups_sem);
3974} 4389}
3975 4390
3976static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, 4391int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3977 struct btrfs_root *root, 4392 struct btrfs_root *root,
3978 u64 num_bytes, u64 min_alloc_size, 4393 u64 num_bytes, u64 min_alloc_size,
3979 u64 empty_size, u64 hint_byte, 4394 u64 empty_size, u64 hint_byte,
3980 u64 search_end, struct btrfs_key *ins, 4395 u64 search_end, struct btrfs_key *ins,
3981 u64 data) 4396 u64 data)
3982{ 4397{
3983 int ret; 4398 int ret;
3984 u64 search_start = 0; 4399 u64 search_start = 0;
@@ -4023,7 +4438,7 @@ again:
4023 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4438 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4024 "wanted %llu\n", (unsigned long long)data, 4439 "wanted %llu\n", (unsigned long long)data,
4025 (unsigned long long)num_bytes); 4440 (unsigned long long)num_bytes);
4026 dump_space_info(sinfo, num_bytes); 4441 dump_space_info(sinfo, num_bytes, 1);
4027 } 4442 }
4028 4443
4029 return ret; 4444 return ret;
@@ -4044,25 +4459,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4044 ret = btrfs_discard_extent(root, start, len); 4459 ret = btrfs_discard_extent(root, start, len);
4045 4460
4046 btrfs_add_free_space(cache, start, len); 4461 btrfs_add_free_space(cache, start, len);
4462 update_reserved_extents(cache, len, 0);
4047 btrfs_put_block_group(cache); 4463 btrfs_put_block_group(cache);
4048 update_reserved_extents(root, start, len, 0);
4049
4050 return ret;
4051}
4052
4053int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4054 struct btrfs_root *root,
4055 u64 num_bytes, u64 min_alloc_size,
4056 u64 empty_size, u64 hint_byte,
4057 u64 search_end, struct btrfs_key *ins,
4058 u64 data)
4059{
4060 int ret;
4061 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
4062 empty_size, hint_byte, search_end, ins,
4063 data);
4064 if (!ret)
4065 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4066 4464
4067 return ret; 4465 return ret;
4068} 4466}
@@ -4223,15 +4621,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4223{ 4621{
4224 int ret; 4622 int ret;
4225 struct btrfs_block_group_cache *block_group; 4623 struct btrfs_block_group_cache *block_group;
4624 struct btrfs_caching_control *caching_ctl;
4625 u64 start = ins->objectid;
4626 u64 num_bytes = ins->offset;
4226 4627
4227 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4628 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4228 cache_block_group(block_group); 4629 cache_block_group(block_group);
4229 wait_event(block_group->caching_q, 4630 caching_ctl = get_caching_control(block_group);
4230 block_group_cache_done(block_group));
4231 4631
4232 ret = btrfs_remove_free_space(block_group, ins->objectid, 4632 if (!caching_ctl) {
4233 ins->offset); 4633 BUG_ON(!block_group_cache_done(block_group));
4234 BUG_ON(ret); 4634 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4635 BUG_ON(ret);
4636 } else {
4637 mutex_lock(&caching_ctl->mutex);
4638
4639 if (start >= caching_ctl->progress) {
4640 ret = add_excluded_extent(root, start, num_bytes);
4641 BUG_ON(ret);
4642 } else if (start + num_bytes <= caching_ctl->progress) {
4643 ret = btrfs_remove_free_space(block_group,
4644 start, num_bytes);
4645 BUG_ON(ret);
4646 } else {
4647 num_bytes = caching_ctl->progress - start;
4648 ret = btrfs_remove_free_space(block_group,
4649 start, num_bytes);
4650 BUG_ON(ret);
4651
4652 start = caching_ctl->progress;
4653 num_bytes = ins->objectid + ins->offset -
4654 caching_ctl->progress;
4655 ret = add_excluded_extent(root, start, num_bytes);
4656 BUG_ON(ret);
4657 }
4658
4659 mutex_unlock(&caching_ctl->mutex);
4660 put_caching_control(caching_ctl);
4661 }
4662
4663 update_reserved_extents(block_group, ins->offset, 1);
4235 btrfs_put_block_group(block_group); 4664 btrfs_put_block_group(block_group);
4236 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 4665 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4237 0, owner, offset, ins, 1); 4666 0, owner, offset, ins, 1);
@@ -4255,9 +4684,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4255 int ret; 4684 int ret;
4256 u64 flags = 0; 4685 u64 flags = 0;
4257 4686
4258 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4687 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4259 empty_size, hint_byte, search_end, 4688 empty_size, hint_byte, search_end,
4260 ins, 0); 4689 ins, 0);
4261 if (ret) 4690 if (ret)
4262 return ret; 4691 return ret;
4263 4692
@@ -4268,7 +4697,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4268 } else 4697 } else
4269 BUG_ON(parent > 0); 4698 BUG_ON(parent > 0);
4270 4699
4271 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4272 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4700 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4273 struct btrfs_delayed_extent_op *extent_op; 4701 struct btrfs_delayed_extent_op *extent_op;
4274 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 4702 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4347,452 +4775,99 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4347 return buf; 4775 return buf;
4348} 4776}
4349 4777
4350#if 0 4778struct walk_control {
4351int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4779 u64 refs[BTRFS_MAX_LEVEL];
4352 struct btrfs_root *root, struct extent_buffer *leaf) 4780 u64 flags[BTRFS_MAX_LEVEL];
4353{ 4781 struct btrfs_key update_progress;
4354 u64 disk_bytenr; 4782 int stage;
4355 u64 num_bytes; 4783 int level;
4356 struct btrfs_key key; 4784 int shared_level;
4357 struct btrfs_file_extent_item *fi; 4785 int update_ref;
4358 u32 nritems; 4786 int keep_locks;
4359 int i; 4787 int reada_slot;
4360 int ret; 4788 int reada_count;
4361 4789};
4362 BUG_ON(!btrfs_is_leaf(leaf));
4363 nritems = btrfs_header_nritems(leaf);
4364
4365 for (i = 0; i < nritems; i++) {
4366 cond_resched();
4367 btrfs_item_key_to_cpu(leaf, &key, i);
4368
4369 /* only extents have references, skip everything else */
4370 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4371 continue;
4372
4373 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4374
4375 /* inline extents live in the btree, they don't have refs */
4376 if (btrfs_file_extent_type(leaf, fi) ==
4377 BTRFS_FILE_EXTENT_INLINE)
4378 continue;
4379
4380 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4381
4382 /* holes don't have refs */
4383 if (disk_bytenr == 0)
4384 continue;
4385
4386 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4387 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
4388 leaf->start, 0, key.objectid, 0);
4389 BUG_ON(ret);
4390 }
4391 return 0;
4392}
4393
4394static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
4395 struct btrfs_root *root,
4396 struct btrfs_leaf_ref *ref)
4397{
4398 int i;
4399 int ret;
4400 struct btrfs_extent_info *info;
4401 struct refsort *sorted;
4402
4403 if (ref->nritems == 0)
4404 return 0;
4405
4406 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
4407 for (i = 0; i < ref->nritems; i++) {
4408 sorted[i].bytenr = ref->extents[i].bytenr;
4409 sorted[i].slot = i;
4410 }
4411 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
4412
4413 /*
4414 * the items in the ref were sorted when the ref was inserted
4415 * into the ref cache, so this is already in order
4416 */
4417 for (i = 0; i < ref->nritems; i++) {
4418 info = ref->extents + sorted[i].slot;
4419 ret = btrfs_free_extent(trans, root, info->bytenr,
4420 info->num_bytes, ref->bytenr,
4421 ref->owner, ref->generation,
4422 info->objectid, 0);
4423
4424 atomic_inc(&root->fs_info->throttle_gen);
4425 wake_up(&root->fs_info->transaction_throttle);
4426 cond_resched();
4427
4428 BUG_ON(ret);
4429 info++;
4430 }
4431
4432 kfree(sorted);
4433 return 0;
4434}
4435
4436
4437static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
4438 struct btrfs_root *root, u64 start,
4439 u64 len, u32 *refs)
4440{
4441 int ret;
4442
4443 ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
4444 BUG_ON(ret);
4445
4446#if 0 /* some debugging code in case we see problems here */
4447 /* if the refs count is one, it won't get increased again. But
4448 * if the ref count is > 1, someone may be decreasing it at
4449 * the same time we are.
4450 */
4451 if (*refs != 1) {
4452 struct extent_buffer *eb = NULL;
4453 eb = btrfs_find_create_tree_block(root, start, len);
4454 if (eb)
4455 btrfs_tree_lock(eb);
4456
4457 mutex_lock(&root->fs_info->alloc_mutex);
4458 ret = lookup_extent_ref(NULL, root, start, len, refs);
4459 BUG_ON(ret);
4460 mutex_unlock(&root->fs_info->alloc_mutex);
4461
4462 if (eb) {
4463 btrfs_tree_unlock(eb);
4464 free_extent_buffer(eb);
4465 }
4466 if (*refs == 1) {
4467 printk(KERN_ERR "btrfs block %llu went down to one "
4468 "during drop_snap\n", (unsigned long long)start);
4469 }
4470
4471 }
4472#endif
4473
4474 cond_resched();
4475 return ret;
4476}
4477 4790
4791#define DROP_REFERENCE 1
4792#define UPDATE_BACKREF 2
4478 4793
4479/* 4794static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4480 * this is used while deleting old snapshots, and it drops the refs 4795 struct btrfs_root *root,
4481 * on a whole subtree starting from a level 1 node. 4796 struct walk_control *wc,
4482 * 4797 struct btrfs_path *path)
4483 * The idea is to sort all the leaf pointers, and then drop the
4484 * ref on all the leaves in order. Most of the time the leaves
4485 * will have ref cache entries, so no leaf IOs will be required to
4486 * find the extents they have references on.
4487 *
4488 * For each leaf, any references it has are also dropped in order
4489 *
4490 * This ends up dropping the references in something close to optimal
4491 * order for reading and modifying the extent allocation tree.
4492 */
4493static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4494 struct btrfs_root *root,
4495 struct btrfs_path *path)
4496{ 4798{
4497 u64 bytenr; 4799 u64 bytenr;
4498 u64 root_owner; 4800 u64 generation;
4499 u64 root_gen; 4801 u64 refs;
4500 struct extent_buffer *eb = path->nodes[1]; 4802 u64 last = 0;
4501 struct extent_buffer *leaf; 4803 u32 nritems;
4502 struct btrfs_leaf_ref *ref; 4804 u32 blocksize;
4503 struct refsort *sorted = NULL; 4805 struct btrfs_key key;
4504 int nritems = btrfs_header_nritems(eb); 4806 struct extent_buffer *eb;
4505 int ret; 4807 int ret;
4506 int i; 4808 int slot;
4507 int refi = 0; 4809 int nread = 0;
4508 int slot = path->slots[1];
4509 u32 blocksize = btrfs_level_size(root, 0);
4510 u32 refs;
4511
4512 if (nritems == 0)
4513 goto out;
4514
4515 root_owner = btrfs_header_owner(eb);
4516 root_gen = btrfs_header_generation(eb);
4517 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
4518 4810
4519 /* 4811 if (path->slots[wc->level] < wc->reada_slot) {
4520 * step one, sort all the leaf pointers so we don't scribble 4812 wc->reada_count = wc->reada_count * 2 / 3;
4521 * randomly into the extent allocation tree 4813 wc->reada_count = max(wc->reada_count, 2);
4522 */ 4814 } else {
4523 for (i = slot; i < nritems; i++) { 4815 wc->reada_count = wc->reada_count * 3 / 2;
4524 sorted[refi].bytenr = btrfs_node_blockptr(eb, i); 4816 wc->reada_count = min_t(int, wc->reada_count,
4525 sorted[refi].slot = i; 4817 BTRFS_NODEPTRS_PER_BLOCK(root));
4526 refi++;
4527 } 4818 }
4528 4819
4529 /* 4820 eb = path->nodes[wc->level];
4530 * nritems won't be zero, but if we're picking up drop_snapshot 4821 nritems = btrfs_header_nritems(eb);
4531 * after a crash, slot might be > 0, so double check things 4822 blocksize = btrfs_level_size(root, wc->level - 1);
4532 * just in case.
4533 */
4534 if (refi == 0)
4535 goto out;
4536 4823
4537 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); 4824 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4825 if (nread >= wc->reada_count)
4826 break;
4538 4827
4539 /* 4828 cond_resched();
4540 * the first loop frees everything the leaves point to 4829 bytenr = btrfs_node_blockptr(eb, slot);
4541 */ 4830 generation = btrfs_node_ptr_generation(eb, slot);
4542 for (i = 0; i < refi; i++) {
4543 u64 ptr_gen;
4544 4831
4545 bytenr = sorted[i].bytenr; 4832 if (slot == path->slots[wc->level])
4833 goto reada;
4546 4834
4547 /* 4835 if (wc->stage == UPDATE_BACKREF &&
4548 * check the reference count on this leaf. If it is > 1 4836 generation <= root->root_key.offset)
4549 * we just decrement it below and don't update any
4550 * of the refs the leaf points to.
4551 */
4552 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4553 blocksize, &refs);
4554 BUG_ON(ret);
4555 if (refs != 1)
4556 continue; 4837 continue;
4557 4838
4558 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); 4839 if (wc->stage == DROP_REFERENCE) {
4559 4840 ret = btrfs_lookup_extent_info(trans, root,
4560 /* 4841 bytenr, blocksize,
4561 * the leaf only had one reference, which means the 4842 &refs, NULL);
4562 * only thing pointing to this leaf is the snapshot
4563 * we're deleting. It isn't possible for the reference
4564 * count to increase again later
4565 *
4566 * The reference cache is checked for the leaf,
4567 * and if found we'll be able to drop any refs held by
4568 * the leaf without needing to read it in.
4569 */
4570 ref = btrfs_lookup_leaf_ref(root, bytenr);
4571 if (ref && ref->generation != ptr_gen) {
4572 btrfs_free_leaf_ref(root, ref);
4573 ref = NULL;
4574 }
4575 if (ref) {
4576 ret = cache_drop_leaf_ref(trans, root, ref);
4577 BUG_ON(ret);
4578 btrfs_remove_leaf_ref(root, ref);
4579 btrfs_free_leaf_ref(root, ref);
4580 } else {
4581 /*
4582 * the leaf wasn't in the reference cache, so
4583 * we have to read it.
4584 */
4585 leaf = read_tree_block(root, bytenr, blocksize,
4586 ptr_gen);
4587 ret = btrfs_drop_leaf_ref(trans, root, leaf);
4588 BUG_ON(ret); 4843 BUG_ON(ret);
4589 free_extent_buffer(leaf); 4844 BUG_ON(refs == 0);
4590 } 4845 if (refs == 1)
4591 atomic_inc(&root->fs_info->throttle_gen); 4846 goto reada;
4592 wake_up(&root->fs_info->transaction_throttle);
4593 cond_resched();
4594 }
4595
4596 /*
4597 * run through the loop again to free the refs on the leaves.
4598 * This is faster than doing it in the loop above because
4599 * the leaves are likely to be clustered together. We end up
4600 * working in nice chunks on the extent allocation tree.
4601 */
4602 for (i = 0; i < refi; i++) {
4603 bytenr = sorted[i].bytenr;
4604 ret = btrfs_free_extent(trans, root, bytenr,
4605 blocksize, eb->start,
4606 root_owner, root_gen, 0, 1);
4607 BUG_ON(ret);
4608
4609 atomic_inc(&root->fs_info->throttle_gen);
4610 wake_up(&root->fs_info->transaction_throttle);
4611 cond_resched();
4612 }
4613out:
4614 kfree(sorted);
4615
4616 /*
4617 * update the path to show we've processed the entire level 1
4618 * node. This will get saved into the root's drop_snapshot_progress
4619 * field so these drops are not repeated again if this transaction
4620 * commits.
4621 */
4622 path->slots[1] = nritems;
4623 return 0;
4624}
4625
4626/*
4627 * helper function for drop_snapshot, this walks down the tree dropping ref
4628 * counts as it goes.
4629 */
4630static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4631 struct btrfs_root *root,
4632 struct btrfs_path *path, int *level)
4633{
4634 u64 root_owner;
4635 u64 root_gen;
4636 u64 bytenr;
4637 u64 ptr_gen;
4638 struct extent_buffer *next;
4639 struct extent_buffer *cur;
4640 struct extent_buffer *parent;
4641 u32 blocksize;
4642 int ret;
4643 u32 refs;
4644
4645 WARN_ON(*level < 0);
4646 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4647 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4648 path->nodes[*level]->len, &refs);
4649 BUG_ON(ret);
4650 if (refs > 1)
4651 goto out;
4652
4653 /*
4654 * walk down to the last node level and free all the leaves
4655 */
4656 while (*level >= 0) {
4657 WARN_ON(*level < 0);
4658 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4659 cur = path->nodes[*level];
4660 4847
4661 if (btrfs_header_level(cur) != *level) 4848 if (!wc->update_ref ||
4662 WARN_ON(1); 4849 generation <= root->root_key.offset)
4663 4850 continue;
4664 if (path->slots[*level] >= 4851 btrfs_node_key_to_cpu(eb, &key, slot);
4665 btrfs_header_nritems(cur)) 4852 ret = btrfs_comp_cpu_keys(&key,
4666 break; 4853 &wc->update_progress);
4667 4854 if (ret < 0)
4668 /* the new code goes down to level 1 and does all the 4855 continue;
4669 * leaves pointed to that node in bulk. So, this check
4670 * for level 0 will always be false.
4671 *
4672 * But, the disk format allows the drop_snapshot_progress
4673 * field in the root to leave things in a state where
4674 * a leaf will need cleaning up here. If someone crashes
4675 * with the old code and then boots with the new code,
4676 * we might find a leaf here.
4677 */
4678 if (*level == 0) {
4679 ret = btrfs_drop_leaf_ref(trans, root, cur);
4680 BUG_ON(ret);
4681 break;
4682 } 4856 }
4683 4857reada:
4684 /* 4858 ret = readahead_tree_block(root, bytenr, blocksize,
4685 * once we get to level one, process the whole node 4859 generation);
4686 * at once, including everything below it. 4860 if (ret)
4687 */
4688 if (*level == 1) {
4689 ret = drop_level_one_refs(trans, root, path);
4690 BUG_ON(ret);
4691 break; 4861 break;
4692 } 4862 last = bytenr + blocksize;
4693 4863 nread++;
4694 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4695 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4696 blocksize = btrfs_level_size(root, *level - 1);
4697
4698 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4699 blocksize, &refs);
4700 BUG_ON(ret);
4701
4702 /*
4703 * if there is more than one reference, we don't need
4704 * to read that node to drop any references it has. We
4705 * just drop the ref we hold on that node and move on to the
4706 * next slot in this level.
4707 */
4708 if (refs != 1) {
4709 parent = path->nodes[*level];
4710 root_owner = btrfs_header_owner(parent);
4711 root_gen = btrfs_header_generation(parent);
4712 path->slots[*level]++;
4713
4714 ret = btrfs_free_extent(trans, root, bytenr,
4715 blocksize, parent->start,
4716 root_owner, root_gen,
4717 *level - 1, 1);
4718 BUG_ON(ret);
4719
4720 atomic_inc(&root->fs_info->throttle_gen);
4721 wake_up(&root->fs_info->transaction_throttle);
4722 cond_resched();
4723
4724 continue;
4725 }
4726
4727 /*
4728 * we need to keep freeing things in the next level down.
4729 * read the block and loop around to process it
4730 */
4731 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4732 WARN_ON(*level <= 0);
4733 if (path->nodes[*level-1])
4734 free_extent_buffer(path->nodes[*level-1]);
4735 path->nodes[*level-1] = next;
4736 *level = btrfs_header_level(next);
4737 path->slots[*level] = 0;
4738 cond_resched();
4739 } 4864 }
4740out: 4865 wc->reada_slot = slot;
4741 WARN_ON(*level < 0);
4742 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4743
4744 if (path->nodes[*level] == root->node) {
4745 parent = path->nodes[*level];
4746 bytenr = path->nodes[*level]->start;
4747 } else {
4748 parent = path->nodes[*level + 1];
4749 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4750 }
4751
4752 blocksize = btrfs_level_size(root, *level);
4753 root_owner = btrfs_header_owner(parent);
4754 root_gen = btrfs_header_generation(parent);
4755
4756 /*
4757 * cleanup and free the reference on the last node
4758 * we processed
4759 */
4760 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4761 parent->start, root_owner, root_gen,
4762 *level, 1);
4763 free_extent_buffer(path->nodes[*level]);
4764 path->nodes[*level] = NULL;
4765
4766 *level += 1;
4767 BUG_ON(ret);
4768
4769 cond_resched();
4770 return 0;
4771} 4866}
4772#endif
4773
4774struct walk_control {
4775 u64 refs[BTRFS_MAX_LEVEL];
4776 u64 flags[BTRFS_MAX_LEVEL];
4777 struct btrfs_key update_progress;
4778 int stage;
4779 int level;
4780 int shared_level;
4781 int update_ref;
4782 int keep_locks;
4783};
4784
4785#define DROP_REFERENCE 1
4786#define UPDATE_BACKREF 2
4787 4867
4788/* 4868/*
4789 * hepler to process tree block while walking down the tree. 4869 * hepler to process tree block while walking down the tree.
4790 * 4870 *
4791 * when wc->stage == DROP_REFERENCE, this function checks
4792 * reference count of the block. if the block is shared and
4793 * we need update back refs for the subtree rooted at the
4794 * block, this function changes wc->stage to UPDATE_BACKREF
4795 *
4796 * when wc->stage == UPDATE_BACKREF, this function updates 4871 * when wc->stage == UPDATE_BACKREF, this function updates
4797 * back refs for pointers in the block. 4872 * back refs for pointers in the block.
4798 * 4873 *
@@ -4805,7 +4880,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4805{ 4880{
4806 int level = wc->level; 4881 int level = wc->level;
4807 struct extent_buffer *eb = path->nodes[level]; 4882 struct extent_buffer *eb = path->nodes[level];
4808 struct btrfs_key key;
4809 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 4883 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4810 int ret; 4884 int ret;
4811 4885
@@ -4828,21 +4902,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4828 BUG_ON(wc->refs[level] == 0); 4902 BUG_ON(wc->refs[level] == 0);
4829 } 4903 }
4830 4904
4831 if (wc->stage == DROP_REFERENCE &&
4832 wc->update_ref && wc->refs[level] > 1) {
4833 BUG_ON(eb == root->node);
4834 BUG_ON(path->slots[level] > 0);
4835 if (level == 0)
4836 btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4837 else
4838 btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4839 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4840 btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4841 wc->stage = UPDATE_BACKREF;
4842 wc->shared_level = level;
4843 }
4844 }
4845
4846 if (wc->stage == DROP_REFERENCE) { 4905 if (wc->stage == DROP_REFERENCE) {
4847 if (wc->refs[level] > 1) 4906 if (wc->refs[level] > 1)
4848 return 1; 4907 return 1;
@@ -4879,6 +4938,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4879} 4938}
4880 4939
4881/* 4940/*
4941 * hepler to process tree block pointer.
4942 *
4943 * when wc->stage == DROP_REFERENCE, this function checks
4944 * reference count of the block pointed to. if the block
4945 * is shared and we need update back refs for the subtree
4946 * rooted at the block, this function changes wc->stage to
4947 * UPDATE_BACKREF. if the block is shared and there is no
4948 * need to update back, this function drops the reference
4949 * to the block.
4950 *
4951 * NOTE: return value 1 means we should stop walking down.
4952 */
4953static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4954 struct btrfs_root *root,
4955 struct btrfs_path *path,
4956 struct walk_control *wc)
4957{
4958 u64 bytenr;
4959 u64 generation;
4960 u64 parent;
4961 u32 blocksize;
4962 struct btrfs_key key;
4963 struct extent_buffer *next;
4964 int level = wc->level;
4965 int reada = 0;
4966 int ret = 0;
4967
4968 generation = btrfs_node_ptr_generation(path->nodes[level],
4969 path->slots[level]);
4970 /*
4971 * if the lower level block was created before the snapshot
4972 * was created, we know there is no need to update back refs
4973 * for the subtree
4974 */
4975 if (wc->stage == UPDATE_BACKREF &&
4976 generation <= root->root_key.offset)
4977 return 1;
4978
4979 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4980 blocksize = btrfs_level_size(root, level - 1);
4981
4982 next = btrfs_find_tree_block(root, bytenr, blocksize);
4983 if (!next) {
4984 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
4985 reada = 1;
4986 }
4987 btrfs_tree_lock(next);
4988 btrfs_set_lock_blocking(next);
4989
4990 if (wc->stage == DROP_REFERENCE) {
4991 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4992 &wc->refs[level - 1],
4993 &wc->flags[level - 1]);
4994 BUG_ON(ret);
4995 BUG_ON(wc->refs[level - 1] == 0);
4996
4997 if (wc->refs[level - 1] > 1) {
4998 if (!wc->update_ref ||
4999 generation <= root->root_key.offset)
5000 goto skip;
5001
5002 btrfs_node_key_to_cpu(path->nodes[level], &key,
5003 path->slots[level]);
5004 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5005 if (ret < 0)
5006 goto skip;
5007
5008 wc->stage = UPDATE_BACKREF;
5009 wc->shared_level = level - 1;
5010 }
5011 }
5012
5013 if (!btrfs_buffer_uptodate(next, generation)) {
5014 btrfs_tree_unlock(next);
5015 free_extent_buffer(next);
5016 next = NULL;
5017 }
5018
5019 if (!next) {
5020 if (reada && level == 1)
5021 reada_walk_down(trans, root, wc, path);
5022 next = read_tree_block(root, bytenr, blocksize, generation);
5023 btrfs_tree_lock(next);
5024 btrfs_set_lock_blocking(next);
5025 }
5026
5027 level--;
5028 BUG_ON(level != btrfs_header_level(next));
5029 path->nodes[level] = next;
5030 path->slots[level] = 0;
5031 path->locks[level] = 1;
5032 wc->level = level;
5033 if (wc->level == 1)
5034 wc->reada_slot = 0;
5035 return 0;
5036skip:
5037 wc->refs[level - 1] = 0;
5038 wc->flags[level - 1] = 0;
5039
5040 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5041 parent = path->nodes[level]->start;
5042 } else {
5043 BUG_ON(root->root_key.objectid !=
5044 btrfs_header_owner(path->nodes[level]));
5045 parent = 0;
5046 }
5047
5048 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5049 root->root_key.objectid, level - 1, 0);
5050 BUG_ON(ret);
5051
5052 btrfs_tree_unlock(next);
5053 free_extent_buffer(next);
5054 return 1;
5055}
5056
5057/*
4882 * hepler to process tree block while walking up the tree. 5058 * hepler to process tree block while walking up the tree.
4883 * 5059 *
4884 * when wc->stage == DROP_REFERENCE, this function drops 5060 * when wc->stage == DROP_REFERENCE, this function drops
@@ -4905,7 +5081,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4905 if (level < wc->shared_level) 5081 if (level < wc->shared_level)
4906 goto out; 5082 goto out;
4907 5083
4908 BUG_ON(wc->refs[level] <= 1);
4909 ret = find_next_key(path, level + 1, &wc->update_progress); 5084 ret = find_next_key(path, level + 1, &wc->update_progress);
4910 if (ret > 0) 5085 if (ret > 0)
4911 wc->update_ref = 0; 5086 wc->update_ref = 0;
@@ -4936,8 +5111,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4936 path->locks[level] = 0; 5111 path->locks[level] = 0;
4937 return 1; 5112 return 1;
4938 } 5113 }
4939 } else {
4940 BUG_ON(level != 0);
4941 } 5114 }
4942 } 5115 }
4943 5116
@@ -4990,17 +5163,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4990 struct btrfs_path *path, 5163 struct btrfs_path *path,
4991 struct walk_control *wc) 5164 struct walk_control *wc)
4992{ 5165{
4993 struct extent_buffer *next;
4994 struct extent_buffer *cur;
4995 u64 bytenr;
4996 u64 ptr_gen;
4997 u32 blocksize;
4998 int level = wc->level; 5166 int level = wc->level;
4999 int ret; 5167 int ret;
5000 5168
5001 while (level >= 0) { 5169 while (level >= 0) {
5002 cur = path->nodes[level]; 5170 if (path->slots[level] >=
5003 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); 5171 btrfs_header_nritems(path->nodes[level]))
5172 break;
5004 5173
5005 ret = walk_down_proc(trans, root, path, wc); 5174 ret = walk_down_proc(trans, root, path, wc);
5006 if (ret > 0) 5175 if (ret > 0)
@@ -5009,20 +5178,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5009 if (level == 0) 5178 if (level == 0)
5010 break; 5179 break;
5011 5180
5012 bytenr = btrfs_node_blockptr(cur, path->slots[level]); 5181 ret = do_walk_down(trans, root, path, wc);
5013 blocksize = btrfs_level_size(root, level - 1); 5182 if (ret > 0) {
5014 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); 5183 path->slots[level]++;
5015 5184 continue;
5016 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 5185 }
5017 btrfs_tree_lock(next); 5186 level = wc->level;
5018 btrfs_set_lock_blocking(next);
5019
5020 level--;
5021 BUG_ON(level != btrfs_header_level(next));
5022 path->nodes[level] = next;
5023 path->slots[level] = 0;
5024 path->locks[level] = 1;
5025 wc->level = level;
5026 } 5187 }
5027 return 0; 5188 return 0;
5028} 5189}
@@ -5112,9 +5273,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5112 err = ret; 5273 err = ret;
5113 goto out; 5274 goto out;
5114 } 5275 }
5115 btrfs_node_key_to_cpu(path->nodes[level], &key, 5276 WARN_ON(ret > 0);
5116 path->slots[level]);
5117 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
5118 5277
5119 /* 5278 /*
5120 * unlock our path, this is safe because only this 5279 * unlock our path, this is safe because only this
@@ -5149,6 +5308,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5149 wc->stage = DROP_REFERENCE; 5308 wc->stage = DROP_REFERENCE;
5150 wc->update_ref = update_ref; 5309 wc->update_ref = update_ref;
5151 wc->keep_locks = 0; 5310 wc->keep_locks = 0;
5311 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5152 5312
5153 while (1) { 5313 while (1) {
5154 ret = walk_down_tree(trans, root, path, wc); 5314 ret = walk_down_tree(trans, root, path, wc);
@@ -5201,9 +5361,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5201 ret = btrfs_del_root(trans, tree_root, &root->root_key); 5361 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5202 BUG_ON(ret); 5362 BUG_ON(ret);
5203 5363
5204 free_extent_buffer(root->node); 5364 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5205 free_extent_buffer(root->commit_root); 5365 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5206 kfree(root); 5366 NULL, NULL);
5367 BUG_ON(ret < 0);
5368 if (ret > 0) {
5369 ret = btrfs_del_orphan_item(trans, tree_root,
5370 root->root_key.objectid);
5371 BUG_ON(ret);
5372 }
5373 }
5374
5375 if (root->in_radix) {
5376 btrfs_free_fs_root(tree_root->fs_info, root);
5377 } else {
5378 free_extent_buffer(root->node);
5379 free_extent_buffer(root->commit_root);
5380 kfree(root);
5381 }
5207out: 5382out:
5208 btrfs_end_transaction(trans, tree_root); 5383 btrfs_end_transaction(trans, tree_root);
5209 kfree(wc); 5384 kfree(wc);
@@ -5255,6 +5430,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5255 wc->stage = DROP_REFERENCE; 5430 wc->stage = DROP_REFERENCE;
5256 wc->update_ref = 0; 5431 wc->update_ref = 0;
5257 wc->keep_locks = 1; 5432 wc->keep_locks = 1;
5433 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5258 5434
5259 while (1) { 5435 while (1) {
5260 wret = walk_down_tree(trans, root, path, wc); 5436 wret = walk_down_tree(trans, root, path, wc);
@@ -5397,9 +5573,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
5397 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5573 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5398 while (1) { 5574 while (1) {
5399 int ret; 5575 int ret;
5400 spin_lock(&em_tree->lock); 5576 write_lock(&em_tree->lock);
5401 ret = add_extent_mapping(em_tree, em); 5577 ret = add_extent_mapping(em_tree, em);
5402 spin_unlock(&em_tree->lock); 5578 write_unlock(&em_tree->lock);
5403 if (ret != -EEXIST) { 5579 if (ret != -EEXIST) {
5404 free_extent_map(em); 5580 free_extent_map(em);
5405 break; 5581 break;
@@ -6842,287 +7018,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6842 return 0; 7018 return 0;
6843} 7019}
6844 7020
6845#if 0 7021/*
6846static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 7022 * checks to see if its even possible to relocate this block group.
6847 struct btrfs_root *root, 7023 *
6848 u64 objectid, u64 size) 7024 * @return - -1 if it's not a good idea to relocate this block group, 0 if its
6849{ 7025 * ok to go ahead and try.
6850 struct btrfs_path *path; 7026 */
6851 struct btrfs_inode_item *item; 7027int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6852 struct extent_buffer *leaf;
6853 int ret;
6854
6855 path = btrfs_alloc_path();
6856 if (!path)
6857 return -ENOMEM;
6858
6859 path->leave_spinning = 1;
6860 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6861 if (ret)
6862 goto out;
6863
6864 leaf = path->nodes[0];
6865 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6866 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6867 btrfs_set_inode_generation(leaf, item, 1);
6868 btrfs_set_inode_size(leaf, item, size);
6869 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6870 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6871 btrfs_mark_buffer_dirty(leaf);
6872 btrfs_release_path(root, path);
6873out:
6874 btrfs_free_path(path);
6875 return ret;
6876}
6877
6878static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6879 struct btrfs_block_group_cache *group)
6880{ 7028{
6881 struct inode *inode = NULL; 7029 struct btrfs_block_group_cache *block_group;
6882 struct btrfs_trans_handle *trans; 7030 struct btrfs_space_info *space_info;
6883 struct btrfs_root *root; 7031 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6884 struct btrfs_key root_key; 7032 struct btrfs_device *device;
6885 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 7033 int full = 0;
6886 int err = 0; 7034 int ret = 0;
6887 7035
6888 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 7036 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
6889 root_key.type = BTRFS_ROOT_ITEM_KEY;
6890 root_key.offset = (u64)-1;
6891 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6892 if (IS_ERR(root))
6893 return ERR_CAST(root);
6894 7037
6895 trans = btrfs_start_transaction(root, 1); 7038 /* odd, couldn't find the block group, leave it alone */
6896 BUG_ON(!trans); 7039 if (!block_group)
7040 return -1;
6897 7041
6898 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 7042 /* no bytes used, we're good */
6899 if (err) 7043 if (!btrfs_block_group_used(&block_group->item))
6900 goto out; 7044 goto out;
6901 7045
6902 err = __insert_orphan_inode(trans, root, objectid, group->key.offset); 7046 space_info = block_group->space_info;
6903 BUG_ON(err); 7047 spin_lock(&space_info->lock);
6904
6905 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6906 group->key.offset, 0, group->key.offset,
6907 0, 0, 0);
6908 BUG_ON(err);
6909
6910 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
6911 if (inode->i_state & I_NEW) {
6912 BTRFS_I(inode)->root = root;
6913 BTRFS_I(inode)->location.objectid = objectid;
6914 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6915 BTRFS_I(inode)->location.offset = 0;
6916 btrfs_read_locked_inode(inode);
6917 unlock_new_inode(inode);
6918 BUG_ON(is_bad_inode(inode));
6919 } else {
6920 BUG_ON(1);
6921 }
6922 BTRFS_I(inode)->index_cnt = group->key.objectid;
6923
6924 err = btrfs_orphan_add(trans, inode);
6925out:
6926 btrfs_end_transaction(trans, root);
6927 if (err) {
6928 if (inode)
6929 iput(inode);
6930 inode = ERR_PTR(err);
6931 }
6932 return inode;
6933}
6934
6935int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6936{
6937
6938 struct btrfs_ordered_sum *sums;
6939 struct btrfs_sector_sum *sector_sum;
6940 struct btrfs_ordered_extent *ordered;
6941 struct btrfs_root *root = BTRFS_I(inode)->root;
6942 struct list_head list;
6943 size_t offset;
6944 int ret;
6945 u64 disk_bytenr;
6946
6947 INIT_LIST_HEAD(&list);
6948
6949 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
6950 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6951
6952 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
6953 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
6954 disk_bytenr + len - 1, &list);
6955
6956 while (!list_empty(&list)) {
6957 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
6958 list_del_init(&sums->list);
6959
6960 sector_sum = sums->sums;
6961 sums->bytenr = ordered->start;
6962 7048
6963 offset = 0; 7049 full = space_info->full;
6964 while (offset < sums->len) {
6965 sector_sum->bytenr += ordered->start - disk_bytenr;
6966 sector_sum++;
6967 offset += root->sectorsize;
6968 }
6969 7050
6970 btrfs_add_ordered_sum(inode, ordered, sums); 7051 /*
7052 * if this is the last block group we have in this space, we can't
7053 * relocate it unless we're able to allocate a new chunk below.
7054 *
7055 * Otherwise, we need to make sure we have room in the space to handle
7056 * all of the extents from this block group. If we can, we're good
7057 */
7058 if ((space_info->total_bytes != block_group->key.offset) &&
7059 (space_info->bytes_used + space_info->bytes_reserved +
7060 space_info->bytes_pinned + space_info->bytes_readonly +
7061 btrfs_block_group_used(&block_group->item) <
7062 space_info->total_bytes)) {
7063 spin_unlock(&space_info->lock);
7064 goto out;
6971 } 7065 }
6972 btrfs_put_ordered_extent(ordered); 7066 spin_unlock(&space_info->lock);
6973 return 0;
6974}
6975
6976int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
6977{
6978 struct btrfs_trans_handle *trans;
6979 struct btrfs_path *path;
6980 struct btrfs_fs_info *info = root->fs_info;
6981 struct extent_buffer *leaf;
6982 struct inode *reloc_inode;
6983 struct btrfs_block_group_cache *block_group;
6984 struct btrfs_key key;
6985 u64 skipped;
6986 u64 cur_byte;
6987 u64 total_found;
6988 u32 nritems;
6989 int ret;
6990 int progress;
6991 int pass = 0;
6992
6993 root = root->fs_info->extent_root;
6994
6995 block_group = btrfs_lookup_block_group(info, group_start);
6996 BUG_ON(!block_group);
6997
6998 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
6999 (unsigned long long)block_group->key.objectid,
7000 (unsigned long long)block_group->flags);
7001
7002 path = btrfs_alloc_path();
7003 BUG_ON(!path);
7004
7005 reloc_inode = create_reloc_inode(info, block_group);
7006 BUG_ON(IS_ERR(reloc_inode));
7007
7008 __alloc_chunk_for_shrink(root, block_group, 1);
7009 set_block_group_readonly(block_group);
7010
7011 btrfs_start_delalloc_inodes(info->tree_root);
7012 btrfs_wait_ordered_extents(info->tree_root, 0);
7013again:
7014 skipped = 0;
7015 total_found = 0;
7016 progress = 0;
7017 key.objectid = block_group->key.objectid;
7018 key.offset = 0;
7019 key.type = 0;
7020 cur_byte = key.objectid;
7021
7022 trans = btrfs_start_transaction(info->tree_root, 1);
7023 btrfs_commit_transaction(trans, info->tree_root);
7024 7067
7025 mutex_lock(&root->fs_info->cleaner_mutex); 7068 /*
7026 btrfs_clean_old_snapshots(info->tree_root); 7069 * ok we don't have enough space, but maybe we have free space on our
7027 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 7070 * devices to allocate new chunks for relocation, so loop through our
7028 mutex_unlock(&root->fs_info->cleaner_mutex); 7071 * alloc devices and guess if we have enough space. However, if we
7072 * were marked as full, then we know there aren't enough chunks, and we
7073 * can just return.
7074 */
7075 ret = -1;
7076 if (full)
7077 goto out;
7029 7078
7030 trans = btrfs_start_transaction(info->tree_root, 1); 7079 mutex_lock(&root->fs_info->chunk_mutex);
7031 btrfs_commit_transaction(trans, info->tree_root); 7080 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7081 u64 min_free = btrfs_block_group_used(&block_group->item);
7082 u64 dev_offset, max_avail;
7032 7083
7033 while (1) { 7084 /*
7034 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7085 * check to make sure we can actually find a chunk with enough
7035 if (ret < 0) 7086 * space to fit our block group in.
7036 goto out; 7087 */
7037next: 7088 if (device->total_bytes > device->bytes_used + min_free) {
7038 leaf = path->nodes[0]; 7089 ret = find_free_dev_extent(NULL, device, min_free,
7039 nritems = btrfs_header_nritems(leaf); 7090 &dev_offset, &max_avail);
7040 if (path->slots[0] >= nritems) { 7091 if (!ret)
7041 ret = btrfs_next_leaf(root, path);
7042 if (ret < 0)
7043 goto out;
7044 if (ret == 1) {
7045 ret = 0;
7046 break; 7092 break;
7047 } 7093 ret = -1;
7048 leaf = path->nodes[0];
7049 nritems = btrfs_header_nritems(leaf);
7050 }
7051
7052 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7053
7054 if (key.objectid >= block_group->key.objectid +
7055 block_group->key.offset)
7056 break;
7057
7058 if (progress && need_resched()) {
7059 btrfs_release_path(root, path);
7060 cond_resched();
7061 progress = 0;
7062 continue;
7063 }
7064 progress = 1;
7065
7066 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
7067 key.objectid + key.offset <= cur_byte) {
7068 path->slots[0]++;
7069 goto next;
7070 }
7071
7072 total_found++;
7073 cur_byte = key.objectid + key.offset;
7074 btrfs_release_path(root, path);
7075
7076 __alloc_chunk_for_shrink(root, block_group, 0);
7077 ret = relocate_one_extent(root, path, &key, block_group,
7078 reloc_inode, pass);
7079 BUG_ON(ret < 0);
7080 if (ret > 0)
7081 skipped++;
7082
7083 key.objectid = cur_byte;
7084 key.type = 0;
7085 key.offset = 0;
7086 }
7087
7088 btrfs_release_path(root, path);
7089
7090 if (pass == 0) {
7091 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
7092 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
7093 }
7094
7095 if (total_found > 0) {
7096 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
7097 (unsigned long long)total_found, pass);
7098 pass++;
7099 if (total_found == skipped && pass > 2) {
7100 iput(reloc_inode);
7101 reloc_inode = create_reloc_inode(info, block_group);
7102 pass = 0;
7103 } 7094 }
7104 goto again;
7105 } 7095 }
7106 7096 mutex_unlock(&root->fs_info->chunk_mutex);
7107 /* delete reloc_inode */
7108 iput(reloc_inode);
7109
7110 /* unpin extents in this range */
7111 trans = btrfs_start_transaction(info->tree_root, 1);
7112 btrfs_commit_transaction(trans, info->tree_root);
7113
7114 spin_lock(&block_group->lock);
7115 WARN_ON(block_group->pinned > 0);
7116 WARN_ON(block_group->reserved > 0);
7117 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
7118 spin_unlock(&block_group->lock);
7119 btrfs_put_block_group(block_group);
7120 ret = 0;
7121out: 7097out:
7122 btrfs_free_path(path); 7098 btrfs_put_block_group(block_group);
7123 return ret; 7099 return ret;
7124} 7100}
7125#endif
7126 7101
7127static int find_first_block_group(struct btrfs_root *root, 7102static int find_first_block_group(struct btrfs_root *root,
7128 struct btrfs_path *path, struct btrfs_key *key) 7103 struct btrfs_path *path, struct btrfs_key *key)
@@ -7165,8 +7140,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7165{ 7140{
7166 struct btrfs_block_group_cache *block_group; 7141 struct btrfs_block_group_cache *block_group;
7167 struct btrfs_space_info *space_info; 7142 struct btrfs_space_info *space_info;
7143 struct btrfs_caching_control *caching_ctl;
7168 struct rb_node *n; 7144 struct rb_node *n;
7169 7145
7146 down_write(&info->extent_commit_sem);
7147 while (!list_empty(&info->caching_block_groups)) {
7148 caching_ctl = list_entry(info->caching_block_groups.next,
7149 struct btrfs_caching_control, list);
7150 list_del(&caching_ctl->list);
7151 put_caching_control(caching_ctl);
7152 }
7153 up_write(&info->extent_commit_sem);
7154
7170 spin_lock(&info->block_group_cache_lock); 7155 spin_lock(&info->block_group_cache_lock);
7171 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7156 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7172 block_group = rb_entry(n, struct btrfs_block_group_cache, 7157 block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7180,8 +7165,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7180 up_write(&block_group->space_info->groups_sem); 7165 up_write(&block_group->space_info->groups_sem);
7181 7166
7182 if (block_group->cached == BTRFS_CACHE_STARTED) 7167 if (block_group->cached == BTRFS_CACHE_STARTED)
7183 wait_event(block_group->caching_q, 7168 wait_block_group_cache_done(block_group);
7184 block_group_cache_done(block_group));
7185 7169
7186 btrfs_remove_free_space_cache(block_group); 7170 btrfs_remove_free_space_cache(block_group);
7187 7171
@@ -7251,7 +7235,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7251 spin_lock_init(&cache->lock); 7235 spin_lock_init(&cache->lock);
7252 spin_lock_init(&cache->tree_lock); 7236 spin_lock_init(&cache->tree_lock);
7253 cache->fs_info = info; 7237 cache->fs_info = info;
7254 init_waitqueue_head(&cache->caching_q);
7255 INIT_LIST_HEAD(&cache->list); 7238 INIT_LIST_HEAD(&cache->list);
7256 INIT_LIST_HEAD(&cache->cluster_list); 7239 INIT_LIST_HEAD(&cache->cluster_list);
7257 7240
@@ -7273,8 +7256,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7273 cache->flags = btrfs_block_group_flags(&cache->item); 7256 cache->flags = btrfs_block_group_flags(&cache->item);
7274 cache->sectorsize = root->sectorsize; 7257 cache->sectorsize = root->sectorsize;
7275 7258
7276 remove_sb_from_cache(root, cache);
7277
7278 /* 7259 /*
7279 * check for two cases, either we are full, and therefore 7260 * check for two cases, either we are full, and therefore
7280 * don't need to bother with the caching work since we won't 7261 * don't need to bother with the caching work since we won't
@@ -7283,13 +7264,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7283 * time, particularly in the full case. 7264 * time, particularly in the full case.
7284 */ 7265 */
7285 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 7266 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7267 exclude_super_stripes(root, cache);
7268 cache->last_byte_to_unpin = (u64)-1;
7286 cache->cached = BTRFS_CACHE_FINISHED; 7269 cache->cached = BTRFS_CACHE_FINISHED;
7270 free_excluded_extents(root, cache);
7287 } else if (btrfs_block_group_used(&cache->item) == 0) { 7271 } else if (btrfs_block_group_used(&cache->item) == 0) {
7272 exclude_super_stripes(root, cache);
7273 cache->last_byte_to_unpin = (u64)-1;
7288 cache->cached = BTRFS_CACHE_FINISHED; 7274 cache->cached = BTRFS_CACHE_FINISHED;
7289 add_new_free_space(cache, root->fs_info, 7275 add_new_free_space(cache, root->fs_info,
7290 found_key.objectid, 7276 found_key.objectid,
7291 found_key.objectid + 7277 found_key.objectid +
7292 found_key.offset); 7278 found_key.offset);
7279 free_excluded_extents(root, cache);
7293 } 7280 }
7294 7281
7295 ret = update_space_info(info, cache->flags, found_key.offset, 7282 ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7297,6 +7284,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7297 &space_info); 7284 &space_info);
7298 BUG_ON(ret); 7285 BUG_ON(ret);
7299 cache->space_info = space_info; 7286 cache->space_info = space_info;
7287 spin_lock(&cache->space_info->lock);
7288 cache->space_info->bytes_super += cache->bytes_super;
7289 spin_unlock(&cache->space_info->lock);
7290
7300 down_write(&space_info->groups_sem); 7291 down_write(&space_info->groups_sem);
7301 list_add_tail(&cache->list, &space_info->block_groups); 7292 list_add_tail(&cache->list, &space_info->block_groups);
7302 up_write(&space_info->groups_sem); 7293 up_write(&space_info->groups_sem);
@@ -7346,7 +7337,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7346 atomic_set(&cache->count, 1); 7337 atomic_set(&cache->count, 1);
7347 spin_lock_init(&cache->lock); 7338 spin_lock_init(&cache->lock);
7348 spin_lock_init(&cache->tree_lock); 7339 spin_lock_init(&cache->tree_lock);
7349 init_waitqueue_head(&cache->caching_q);
7350 INIT_LIST_HEAD(&cache->list); 7340 INIT_LIST_HEAD(&cache->list);
7351 INIT_LIST_HEAD(&cache->cluster_list); 7341 INIT_LIST_HEAD(&cache->cluster_list);
7352 7342
@@ -7355,15 +7345,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7355 cache->flags = type; 7345 cache->flags = type;
7356 btrfs_set_block_group_flags(&cache->item, type); 7346 btrfs_set_block_group_flags(&cache->item, type);
7357 7347
7348 cache->last_byte_to_unpin = (u64)-1;
7358 cache->cached = BTRFS_CACHE_FINISHED; 7349 cache->cached = BTRFS_CACHE_FINISHED;
7359 remove_sb_from_cache(root, cache); 7350 exclude_super_stripes(root, cache);
7360 7351
7361 add_new_free_space(cache, root->fs_info, chunk_offset, 7352 add_new_free_space(cache, root->fs_info, chunk_offset,
7362 chunk_offset + size); 7353 chunk_offset + size);
7363 7354
7355 free_excluded_extents(root, cache);
7356
7364 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7357 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7365 &cache->space_info); 7358 &cache->space_info);
7366 BUG_ON(ret); 7359 BUG_ON(ret);
7360
7361 spin_lock(&cache->space_info->lock);
7362 cache->space_info->bytes_super += cache->bytes_super;
7363 spin_unlock(&cache->space_info->lock);
7364
7367 down_write(&cache->space_info->groups_sem); 7365 down_write(&cache->space_info->groups_sem);
7368 list_add_tail(&cache->list, &cache->space_info->block_groups); 7366 list_add_tail(&cache->list, &cache->space_info->block_groups);
7369 up_write(&cache->space_info->groups_sem); 7367 up_write(&cache->space_info->groups_sem);
@@ -7429,8 +7427,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7429 up_write(&block_group->space_info->groups_sem); 7427 up_write(&block_group->space_info->groups_sem);
7430 7428
7431 if (block_group->cached == BTRFS_CACHE_STARTED) 7429 if (block_group->cached == BTRFS_CACHE_STARTED)
7432 wait_event(block_group->caching_q, 7430 wait_block_group_cache_done(block_group);
7433 block_group_cache_done(block_group));
7434 7431
7435 btrfs_remove_free_space_cache(block_group); 7432 btrfs_remove_free_space_cache(block_group);
7436 7433
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f587..de1793ba004a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
382 state->start = start;
383 state->end = end;
384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
368 if (bits & EXTENT_DIRTY) 388 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1; 389 tree->dirty_bytes += end - start + 1;
370 set_state_cb(tree, state, bits);
371 state->state |= bits; 390 state->state |= bits;
372 state->start = start;
373 state->end = end;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
376 struct extent_state *found; 393 struct extent_state *found;
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -471,10 +500,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 * bits were already set, or zero if none of the bits were already set. 500 * bits were already set, or zero if none of the bits were already set.
472 */ 501 */
473int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 502int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
474 int bits, int wake, int delete, gfp_t mask) 503 int bits, int wake, int delete,
504 struct extent_state **cached_state,
505 gfp_t mask)
475{ 506{
476 struct extent_state *state; 507 struct extent_state *state;
508 struct extent_state *cached;
477 struct extent_state *prealloc = NULL; 509 struct extent_state *prealloc = NULL;
510 struct rb_node *next_node;
478 struct rb_node *node; 511 struct rb_node *node;
479 u64 last_end; 512 u64 last_end;
480 int err; 513 int err;
@@ -488,6 +521,17 @@ again:
488 } 521 }
489 522
490 spin_lock(&tree->lock); 523 spin_lock(&tree->lock);
524 if (cached_state) {
525 cached = *cached_state;
526 *cached_state = NULL;
527 cached_state = NULL;
528 if (cached && cached->tree && cached->start == start) {
529 atomic_dec(&cached->refs);
530 state = cached;
531 goto hit_next;
532 }
533 free_extent_state(cached);
534 }
491 /* 535 /*
492 * this search will find the extents that end after 536 * this search will find the extents that end after
493 * our range starts 537 * our range starts
@@ -496,6 +540,7 @@ again:
496 if (!node) 540 if (!node)
497 goto out; 541 goto out;
498 state = rb_entry(node, struct extent_state, rb_node); 542 state = rb_entry(node, struct extent_state, rb_node);
543hit_next:
499 if (state->start > end) 544 if (state->start > end)
500 goto out; 545 goto out;
501 WARN_ON(state->end < start); 546 WARN_ON(state->end < start);
@@ -526,13 +571,11 @@ again:
526 if (err) 571 if (err)
527 goto out; 572 goto out;
528 if (state->end <= end) { 573 if (state->end <= end) {
529 set |= clear_state_bit(tree, state, bits, 574 set |= clear_state_bit(tree, state, bits, wake,
530 wake, delete); 575 delete);
531 if (last_end == (u64)-1) 576 if (last_end == (u64)-1)
532 goto out; 577 goto out;
533 start = last_end + 1; 578 start = last_end + 1;
534 } else {
535 start = state->start;
536 } 579 }
537 goto search_again; 580 goto search_again;
538 } 581 }
@@ -547,19 +590,30 @@ again:
547 prealloc = alloc_extent_state(GFP_ATOMIC); 590 prealloc = alloc_extent_state(GFP_ATOMIC);
548 err = split_state(tree, state, prealloc, end + 1); 591 err = split_state(tree, state, prealloc, end + 1);
549 BUG_ON(err == -EEXIST); 592 BUG_ON(err == -EEXIST);
550
551 if (wake) 593 if (wake)
552 wake_up(&state->wq); 594 wake_up(&state->wq);
553 set |= clear_state_bit(tree, prealloc, bits, 595
554 wake, delete); 596 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
597
555 prealloc = NULL; 598 prealloc = NULL;
556 goto out; 599 goto out;
557 } 600 }
558 601
602 if (state->end < end && prealloc && !need_resched())
603 next_node = rb_next(&state->rb_node);
604 else
605 next_node = NULL;
606
559 set |= clear_state_bit(tree, state, bits, wake, delete); 607 set |= clear_state_bit(tree, state, bits, wake, delete);
560 if (last_end == (u64)-1) 608 if (last_end == (u64)-1)
561 goto out; 609 goto out;
562 start = last_end + 1; 610 start = last_end + 1;
611 if (start <= end && next_node) {
612 state = rb_entry(next_node, struct extent_state,
613 rb_node);
614 if (state->start == start)
615 goto hit_next;
616 }
563 goto search_again; 617 goto search_again;
564 618
565out: 619out:
@@ -641,40 +695,59 @@ out:
641 return 0; 695 return 0;
642} 696}
643 697
644static void set_state_bits(struct extent_io_tree *tree, 698static int set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state, 699 struct extent_state *state,
646 int bits) 700 int bits)
647{ 701{
702 int ret;
703
704 ret = set_state_cb(tree, state, bits);
705 if (ret)
706 return ret;
707
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 708 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1; 709 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range; 710 tree->dirty_bytes += range;
651 } 711 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits; 712 state->state |= bits;
713
714 return 0;
715}
716
717static void cache_state(struct extent_state *state,
718 struct extent_state **cached_ptr)
719{
720 if (cached_ptr && !(*cached_ptr)) {
721 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
722 *cached_ptr = state;
723 atomic_inc(&state->refs);
724 }
725 }
654} 726}
655 727
656/* 728/*
657 * set some bits on a range in the tree. This may require allocations 729 * set some bits on a range in the tree. This may require allocations or
658 * or sleeping, so the gfp mask is used to indicate what is allowed. 730 * sleeping, so the gfp mask is used to indicate what is allowed.
659 * 731 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the 732 * If any of the exclusive bits are set, this will fail with -EEXIST if some
661 * range already has the desired bits set. The start of the existing 733 * part of the range already has the desired bits set. The start of the
662 * range is returned in failed_start in this case. 734 * existing range is returned in failed_start in this case.
663 * 735 *
664 * [start, end] is inclusive 736 * [start, end] is inclusive This takes the tree lock.
665 * This takes the tree lock.
666 */ 737 */
738
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 739static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start, 740 int bits, int exclusive_bits, u64 *failed_start,
741 struct extent_state **cached_state,
669 gfp_t mask) 742 gfp_t mask)
670{ 743{
671 struct extent_state *state; 744 struct extent_state *state;
672 struct extent_state *prealloc = NULL; 745 struct extent_state *prealloc = NULL;
673 struct rb_node *node; 746 struct rb_node *node;
674 int err = 0; 747 int err = 0;
675 int set;
676 u64 last_start; 748 u64 last_start;
677 u64 last_end; 749 u64 last_end;
750
678again: 751again:
679 if (!prealloc && (mask & __GFP_WAIT)) { 752 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask); 753 prealloc = alloc_extent_state(mask);
@@ -683,6 +756,13 @@ again:
683 } 756 }
684 757
685 spin_lock(&tree->lock); 758 spin_lock(&tree->lock);
759 if (cached_state && *cached_state) {
760 state = *cached_state;
761 if (state->start == start && state->tree) {
762 node = &state->rb_node;
763 goto hit_next;
764 }
765 }
686 /* 766 /*
687 * this search will find all the extents that end after 767 * this search will find all the extents that end after
688 * our range starts. 768 * our range starts.
@@ -694,8 +774,8 @@ again:
694 BUG_ON(err == -EEXIST); 774 BUG_ON(err == -EEXIST);
695 goto out; 775 goto out;
696 } 776 }
697
698 state = rb_entry(node, struct extent_state, rb_node); 777 state = rb_entry(node, struct extent_state, rb_node);
778hit_next:
699 last_start = state->start; 779 last_start = state->start;
700 last_end = state->end; 780 last_end = state->end;
701 781
@@ -706,17 +786,32 @@ again:
706 * Just lock what we found and keep going 786 * Just lock what we found and keep going
707 */ 787 */
708 if (state->start == start && state->end <= end) { 788 if (state->start == start && state->end <= end) {
709 set = state->state & bits; 789 struct rb_node *next_node;
710 if (set && exclusive) { 790 if (state->state & exclusive_bits) {
711 *failed_start = state->start; 791 *failed_start = state->start;
712 err = -EEXIST; 792 err = -EEXIST;
713 goto out; 793 goto out;
714 } 794 }
715 set_state_bits(tree, state, bits); 795
796 err = set_state_bits(tree, state, bits);
797 if (err)
798 goto out;
799
800 cache_state(state, cached_state);
716 merge_state(tree, state); 801 merge_state(tree, state);
717 if (last_end == (u64)-1) 802 if (last_end == (u64)-1)
718 goto out; 803 goto out;
804
719 start = last_end + 1; 805 start = last_end + 1;
806 if (start < end && prealloc && !need_resched()) {
807 next_node = rb_next(node);
808 if (next_node) {
809 state = rb_entry(next_node, struct extent_state,
810 rb_node);
811 if (state->start == start)
812 goto hit_next;
813 }
814 }
720 goto search_again; 815 goto search_again;
721 } 816 }
722 817
@@ -737,8 +832,7 @@ again:
737 * desired bit on it. 832 * desired bit on it.
738 */ 833 */
739 if (state->start < start) { 834 if (state->start < start) {
740 set = state->state & bits; 835 if (state->state & exclusive_bits) {
741 if (exclusive && set) {
742 *failed_start = start; 836 *failed_start = start;
743 err = -EEXIST; 837 err = -EEXIST;
744 goto out; 838 goto out;
@@ -749,13 +843,14 @@ again:
749 if (err) 843 if (err)
750 goto out; 844 goto out;
751 if (state->end <= end) { 845 if (state->end <= end) {
752 set_state_bits(tree, state, bits); 846 err = set_state_bits(tree, state, bits);
847 if (err)
848 goto out;
849 cache_state(state, cached_state);
753 merge_state(tree, state); 850 merge_state(tree, state);
754 if (last_end == (u64)-1) 851 if (last_end == (u64)-1)
755 goto out; 852 goto out;
756 start = last_end + 1; 853 start = last_end + 1;
757 } else {
758 start = state->start;
759 } 854 }
760 goto search_again; 855 goto search_again;
761 } 856 }
@@ -774,10 +869,13 @@ again:
774 this_end = last_start - 1; 869 this_end = last_start - 1;
775 err = insert_state(tree, prealloc, start, this_end, 870 err = insert_state(tree, prealloc, start, this_end,
776 bits); 871 bits);
777 prealloc = NULL;
778 BUG_ON(err == -EEXIST); 872 BUG_ON(err == -EEXIST);
779 if (err) 873 if (err) {
874 prealloc = NULL;
780 goto out; 875 goto out;
876 }
877 cache_state(prealloc, cached_state);
878 prealloc = NULL;
781 start = this_end + 1; 879 start = this_end + 1;
782 goto search_again; 880 goto search_again;
783 } 881 }
@@ -788,8 +886,7 @@ again:
788 * on the first half 886 * on the first half
789 */ 887 */
790 if (state->start <= end && state->end > end) { 888 if (state->start <= end && state->end > end) {
791 set = state->state & bits; 889 if (state->state & exclusive_bits) {
792 if (exclusive && set) {
793 *failed_start = start; 890 *failed_start = start;
794 err = -EEXIST; 891 err = -EEXIST;
795 goto out; 892 goto out;
@@ -797,7 +894,12 @@ again:
797 err = split_state(tree, state, prealloc, end + 1); 894 err = split_state(tree, state, prealloc, end + 1);
798 BUG_ON(err == -EEXIST); 895 BUG_ON(err == -EEXIST);
799 896
800 set_state_bits(tree, prealloc, bits); 897 err = set_state_bits(tree, prealloc, bits);
898 if (err) {
899 prealloc = NULL;
900 goto out;
901 }
902 cache_state(prealloc, cached_state);
801 merge_state(tree, prealloc); 903 merge_state(tree, prealloc);
802 prealloc = NULL; 904 prealloc = NULL;
803 goto out; 905 goto out;
@@ -826,86 +928,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
826 gfp_t mask) 928 gfp_t mask)
827{ 929{
828 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 930 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
829 mask); 931 NULL, mask);
830}
831
832int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
833 gfp_t mask)
834{
835 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
836} 932}
837 933
838int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 934int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
839 int bits, gfp_t mask) 935 int bits, gfp_t mask)
840{ 936{
841 return set_extent_bit(tree, start, end, bits, 0, NULL, 937 return set_extent_bit(tree, start, end, bits, 0, NULL,
842 mask); 938 NULL, mask);
843} 939}
844 940
845int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 941int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
846 int bits, gfp_t mask) 942 int bits, gfp_t mask)
847{ 943{
848 return clear_extent_bit(tree, start, end, bits, 0, 0, mask); 944 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
849} 945}
850 946
851int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 947int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
852 gfp_t mask) 948 gfp_t mask)
853{ 949{
854 return set_extent_bit(tree, start, end, 950 return set_extent_bit(tree, start, end,
855 EXTENT_DELALLOC | EXTENT_DIRTY, 951 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
856 0, NULL, mask); 952 0, NULL, NULL, mask);
857} 953}
858 954
859int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 955int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
860 gfp_t mask) 956 gfp_t mask)
861{ 957{
862 return clear_extent_bit(tree, start, end, 958 return clear_extent_bit(tree, start, end,
863 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); 959 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
864} 960 NULL, mask);
865
866int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
867 gfp_t mask)
868{
869 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
870} 961}
871 962
872int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 963int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
873 gfp_t mask) 964 gfp_t mask)
874{ 965{
875 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 966 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
876 mask); 967 NULL, mask);
877} 968}
878 969
879static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 970static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
880 gfp_t mask) 971 gfp_t mask)
881{ 972{
882 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); 973 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
974 NULL, mask);
883} 975}
884 976
885int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 977int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
886 gfp_t mask) 978 gfp_t mask)
887{ 979{
888 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 980 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
889 mask); 981 NULL, mask);
890} 982}
891 983
892static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 984static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
893 u64 end, gfp_t mask) 985 u64 end, gfp_t mask)
894{ 986{
895 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); 987 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
896} 988 NULL, mask);
897
898static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask)
900{
901 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
902 0, NULL, mask);
903}
904
905static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
906 u64 end, gfp_t mask)
907{
908 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
909} 989}
910 990
911int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 991int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +997,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
917 * either insert or lock state struct between start and end use mask to tell 997 * either insert or lock state struct between start and end use mask to tell
918 * us if waiting is desired. 998 * us if waiting is desired.
919 */ 999 */
920int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1000int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1001 int bits, struct extent_state **cached_state, gfp_t mask)
921{ 1002{
922 int err; 1003 int err;
923 u64 failed_start; 1004 u64 failed_start;
924 while (1) { 1005 while (1) {
925 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 1006 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
926 &failed_start, mask); 1007 EXTENT_LOCKED, &failed_start,
1008 cached_state, mask);
927 if (err == -EEXIST && (mask & __GFP_WAIT)) { 1009 if (err == -EEXIST && (mask & __GFP_WAIT)) {
928 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1010 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
929 start = failed_start; 1011 start = failed_start;
@@ -935,27 +1017,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
935 return err; 1017 return err;
936} 1018}
937 1019
1020int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1021{
1022 return lock_extent_bits(tree, start, end, 0, NULL, mask);
1023}
1024
938int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1025int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
939 gfp_t mask) 1026 gfp_t mask)
940{ 1027{
941 int err; 1028 int err;
942 u64 failed_start; 1029 u64 failed_start;
943 1030
944 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 1031 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
945 &failed_start, mask); 1032 &failed_start, NULL, mask);
946 if (err == -EEXIST) { 1033 if (err == -EEXIST) {
947 if (failed_start > start) 1034 if (failed_start > start)
948 clear_extent_bit(tree, start, failed_start - 1, 1035 clear_extent_bit(tree, start, failed_start - 1,
949 EXTENT_LOCKED, 1, 0, mask); 1036 EXTENT_LOCKED, 1, 0, NULL, mask);
950 return 0; 1037 return 0;
951 } 1038 }
952 return 1; 1039 return 1;
953} 1040}
954 1041
1042int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1043 struct extent_state **cached, gfp_t mask)
1044{
1045 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1046 mask);
1047}
1048
955int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1049int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
956 gfp_t mask) 1050 gfp_t mask)
957{ 1051{
958 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); 1052 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1053 mask);
959} 1054}
960 1055
961/* 1056/*
@@ -974,7 +1069,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
974 page_cache_release(page); 1069 page_cache_release(page);
975 index++; 1070 index++;
976 } 1071 }
977 set_extent_dirty(tree, start, end, GFP_NOFS);
978 return 0; 1072 return 0;
979} 1073}
980 1074
@@ -994,7 +1088,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
994 page_cache_release(page); 1088 page_cache_release(page);
995 index++; 1089 index++;
996 } 1090 }
997 set_extent_writeback(tree, start, end, GFP_NOFS);
998 return 0; 1091 return 0;
999} 1092}
1000 1093
@@ -1232,6 +1325,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
1232 u64 delalloc_start; 1325 u64 delalloc_start;
1233 u64 delalloc_end; 1326 u64 delalloc_end;
1234 u64 found; 1327 u64 found;
1328 struct extent_state *cached_state = NULL;
1235 int ret; 1329 int ret;
1236 int loops = 0; 1330 int loops = 0;
1237 1331
@@ -1269,6 +1363,7 @@ again:
1269 /* some of the pages are gone, lets avoid looping by 1363 /* some of the pages are gone, lets avoid looping by
1270 * shortening the size of the delalloc range we're searching 1364 * shortening the size of the delalloc range we're searching
1271 */ 1365 */
1366 free_extent_state(cached_state);
1272 if (!loops) { 1367 if (!loops) {
1273 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1368 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1274 max_bytes = PAGE_CACHE_SIZE - offset; 1369 max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1377,21 @@ again:
1282 BUG_ON(ret); 1377 BUG_ON(ret);
1283 1378
1284 /* step three, lock the state bits for the whole range */ 1379 /* step three, lock the state bits for the whole range */
1285 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); 1380 lock_extent_bits(tree, delalloc_start, delalloc_end,
1381 0, &cached_state, GFP_NOFS);
1286 1382
1287 /* then test to make sure it is all still delalloc */ 1383 /* then test to make sure it is all still delalloc */
1288 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1384 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1289 EXTENT_DELALLOC, 1); 1385 EXTENT_DELALLOC, 1, cached_state);
1290 if (!ret) { 1386 if (!ret) {
1291 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); 1387 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1388 &cached_state, GFP_NOFS);
1292 __unlock_for_delalloc(inode, locked_page, 1389 __unlock_for_delalloc(inode, locked_page,
1293 delalloc_start, delalloc_end); 1390 delalloc_start, delalloc_end);
1294 cond_resched(); 1391 cond_resched();
1295 goto again; 1392 goto again;
1296 } 1393 }
1394 free_extent_state(cached_state);
1297 *start = delalloc_start; 1395 *start = delalloc_start;
1298 *end = delalloc_end; 1396 *end = delalloc_end;
1299out_failed: 1397out_failed:
@@ -1307,7 +1405,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1307 int clear_unlock, 1405 int clear_unlock,
1308 int clear_delalloc, int clear_dirty, 1406 int clear_delalloc, int clear_dirty,
1309 int set_writeback, 1407 int set_writeback,
1310 int end_writeback) 1408 int end_writeback,
1409 int set_private2)
1311{ 1410{
1312 int ret; 1411 int ret;
1313 struct page *pages[16]; 1412 struct page *pages[16];
@@ -1325,8 +1424,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1325 if (clear_delalloc) 1424 if (clear_delalloc)
1326 clear_bits |= EXTENT_DELALLOC; 1425 clear_bits |= EXTENT_DELALLOC;
1327 1426
1328 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); 1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1329 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) 1428 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
1429 set_private2))
1330 return 0; 1430 return 0;
1331 1431
1332 while (nr_pages > 0) { 1432 while (nr_pages > 0) {
@@ -1334,6 +1434,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1334 min_t(unsigned long, 1434 min_t(unsigned long,
1335 nr_pages, ARRAY_SIZE(pages)), pages); 1435 nr_pages, ARRAY_SIZE(pages)), pages);
1336 for (i = 0; i < ret; i++) { 1436 for (i = 0; i < ret; i++) {
1437
1438 if (set_private2)
1439 SetPagePrivate2(pages[i]);
1440
1337 if (pages[i] == locked_page) { 1441 if (pages[i] == locked_page) {
1338 page_cache_release(pages[i]); 1442 page_cache_release(pages[i]);
1339 continue; 1443 continue;
@@ -1476,14 +1580,17 @@ out:
1476 * range is found set. 1580 * range is found set.
1477 */ 1581 */
1478int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1582int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1479 int bits, int filled) 1583 int bits, int filled, struct extent_state *cached)
1480{ 1584{
1481 struct extent_state *state = NULL; 1585 struct extent_state *state = NULL;
1482 struct rb_node *node; 1586 struct rb_node *node;
1483 int bitset = 0; 1587 int bitset = 0;
1484 1588
1485 spin_lock(&tree->lock); 1589 spin_lock(&tree->lock);
1486 node = tree_search(tree, start); 1590 if (cached && cached->tree && cached->start == start)
1591 node = &cached->rb_node;
1592 else
1593 node = tree_search(tree, start);
1487 while (node && start <= end) { 1594 while (node && start <= end) {
1488 state = rb_entry(node, struct extent_state, rb_node); 1595 state = rb_entry(node, struct extent_state, rb_node);
1489 1596
@@ -1503,6 +1610,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1503 bitset = 0; 1610 bitset = 0;
1504 break; 1611 break;
1505 } 1612 }
1613
1614 if (state->end == (u64)-1)
1615 break;
1616
1506 start = state->end + 1; 1617 start = state->end + 1;
1507 if (start > end) 1618 if (start > end)
1508 break; 1619 break;
@@ -1526,7 +1637,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
1526{ 1637{
1527 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1638 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1528 u64 end = start + PAGE_CACHE_SIZE - 1; 1639 u64 end = start + PAGE_CACHE_SIZE - 1;
1529 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) 1640 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1530 SetPageUptodate(page); 1641 SetPageUptodate(page);
1531 return 0; 1642 return 0;
1532} 1643}
@@ -1540,7 +1651,7 @@ static int check_page_locked(struct extent_io_tree *tree,
1540{ 1651{
1541 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1652 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1542 u64 end = start + PAGE_CACHE_SIZE - 1; 1653 u64 end = start + PAGE_CACHE_SIZE - 1;
1543 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) 1654 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1544 unlock_page(page); 1655 unlock_page(page);
1545 return 0; 1656 return 0;
1546} 1657}
@@ -1552,10 +1663,7 @@ static int check_page_locked(struct extent_io_tree *tree,
1552static int check_page_writeback(struct extent_io_tree *tree, 1663static int check_page_writeback(struct extent_io_tree *tree,
1553 struct page *page) 1664 struct page *page)
1554{ 1665{
1555 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1666 end_page_writeback(page);
1556 u64 end = start + PAGE_CACHE_SIZE - 1;
1557 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1558 end_page_writeback(page);
1559 return 0; 1667 return 0;
1560} 1668}
1561 1669
@@ -1613,13 +1721,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1613 } 1721 }
1614 1722
1615 if (!uptodate) { 1723 if (!uptodate) {
1616 clear_extent_uptodate(tree, start, end, GFP_ATOMIC); 1724 clear_extent_uptodate(tree, start, end, GFP_NOFS);
1617 ClearPageUptodate(page); 1725 ClearPageUptodate(page);
1618 SetPageError(page); 1726 SetPageError(page);
1619 } 1727 }
1620 1728
1621 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1622
1623 if (whole_page) 1729 if (whole_page)
1624 end_page_writeback(page); 1730 end_page_writeback(page);
1625 else 1731 else
@@ -1983,7 +2089,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
1983 continue; 2089 continue;
1984 } 2090 }
1985 /* the get_extent function already copied into the page */ 2091 /* the get_extent function already copied into the page */
1986 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { 2092 if (test_range_bit(tree, cur, cur_end,
2093 EXTENT_UPTODATE, 1, NULL)) {
1987 check_page_uptodate(tree, page); 2094 check_page_uptodate(tree, page);
1988 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2095 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1989 cur = cur + iosize; 2096 cur = cur + iosize;
@@ -2078,6 +2185,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2078 u64 iosize; 2185 u64 iosize;
2079 u64 unlock_start; 2186 u64 unlock_start;
2080 sector_t sector; 2187 sector_t sector;
2188 struct extent_state *cached_state = NULL;
2081 struct extent_map *em; 2189 struct extent_map *em;
2082 struct block_device *bdev; 2190 struct block_device *bdev;
2083 int ret; 2191 int ret;
@@ -2124,6 +2232,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2124 delalloc_end = 0; 2232 delalloc_end = 0;
2125 page_started = 0; 2233 page_started = 0;
2126 if (!epd->extent_locked) { 2234 if (!epd->extent_locked) {
2235 u64 delalloc_to_write = 0;
2127 /* 2236 /*
2128 * make sure the wbc mapping index is at least updated 2237 * make sure the wbc mapping index is at least updated
2129 * to this page. 2238 * to this page.
@@ -2143,8 +2252,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2143 tree->ops->fill_delalloc(inode, page, delalloc_start, 2252 tree->ops->fill_delalloc(inode, page, delalloc_start,
2144 delalloc_end, &page_started, 2253 delalloc_end, &page_started,
2145 &nr_written); 2254 &nr_written);
2255 /*
2256 * delalloc_end is already one less than the total
2257 * length, so we don't subtract one from
2258 * PAGE_CACHE_SIZE
2259 */
2260 delalloc_to_write += (delalloc_end - delalloc_start +
2261 PAGE_CACHE_SIZE) >>
2262 PAGE_CACHE_SHIFT;
2146 delalloc_start = delalloc_end + 1; 2263 delalloc_start = delalloc_end + 1;
2147 } 2264 }
2265 if (wbc->nr_to_write < delalloc_to_write) {
2266 int thresh = 8192;
2267
2268 if (delalloc_to_write < thresh * 2)
2269 thresh = delalloc_to_write;
2270 wbc->nr_to_write = min_t(u64, delalloc_to_write,
2271 thresh);
2272 }
2148 2273
2149 /* did the fill delalloc function already unlock and start 2274 /* did the fill delalloc function already unlock and start
2150 * the IO? 2275 * the IO?
@@ -2160,15 +2285,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2160 goto done_unlocked; 2285 goto done_unlocked;
2161 } 2286 }
2162 } 2287 }
2163 lock_extent(tree, start, page_end, GFP_NOFS);
2164
2165 unlock_start = start;
2166
2167 if (tree->ops && tree->ops->writepage_start_hook) { 2288 if (tree->ops && tree->ops->writepage_start_hook) {
2168 ret = tree->ops->writepage_start_hook(page, start, 2289 ret = tree->ops->writepage_start_hook(page, start,
2169 page_end); 2290 page_end);
2170 if (ret == -EAGAIN) { 2291 if (ret == -EAGAIN) {
2171 unlock_extent(tree, start, page_end, GFP_NOFS);
2172 redirty_page_for_writepage(wbc, page); 2292 redirty_page_for_writepage(wbc, page);
2173 update_nr_written(page, wbc, nr_written); 2293 update_nr_written(page, wbc, nr_written);
2174 unlock_page(page); 2294 unlock_page(page);
@@ -2184,12 +2304,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2184 update_nr_written(page, wbc, nr_written + 1); 2304 update_nr_written(page, wbc, nr_written + 1);
2185 2305
2186 end = page_end; 2306 end = page_end;
2187 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2188 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2189
2190 if (last_byte <= start) { 2307 if (last_byte <= start) {
2191 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2192 unlock_extent(tree, start, page_end, GFP_NOFS);
2193 if (tree->ops && tree->ops->writepage_end_io_hook) 2308 if (tree->ops && tree->ops->writepage_end_io_hook)
2194 tree->ops->writepage_end_io_hook(page, start, 2309 tree->ops->writepage_end_io_hook(page, start,
2195 page_end, NULL, 1); 2310 page_end, NULL, 1);
@@ -2197,13 +2312,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2197 goto done; 2312 goto done;
2198 } 2313 }
2199 2314
2200 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2201 blocksize = inode->i_sb->s_blocksize; 2315 blocksize = inode->i_sb->s_blocksize;
2202 2316
2203 while (cur <= end) { 2317 while (cur <= end) {
2204 if (cur >= last_byte) { 2318 if (cur >= last_byte) {
2205 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2206 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2207 if (tree->ops && tree->ops->writepage_end_io_hook) 2319 if (tree->ops && tree->ops->writepage_end_io_hook)
2208 tree->ops->writepage_end_io_hook(page, cur, 2320 tree->ops->writepage_end_io_hook(page, cur,
2209 page_end, NULL, 1); 2321 page_end, NULL, 1);
@@ -2235,12 +2347,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2235 */ 2347 */
2236 if (compressed || block_start == EXTENT_MAP_HOLE || 2348 if (compressed || block_start == EXTENT_MAP_HOLE ||
2237 block_start == EXTENT_MAP_INLINE) { 2349 block_start == EXTENT_MAP_INLINE) {
2238 clear_extent_dirty(tree, cur,
2239 cur + iosize - 1, GFP_NOFS);
2240
2241 unlock_extent(tree, unlock_start, cur + iosize - 1,
2242 GFP_NOFS);
2243
2244 /* 2350 /*
2245 * end_io notification does not happen here for 2351 * end_io notification does not happen here for
2246 * compressed extents 2352 * compressed extents
@@ -2265,13 +2371,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2265 } 2371 }
2266 /* leave this out until we have a page_mkwrite call */ 2372 /* leave this out until we have a page_mkwrite call */
2267 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2373 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2268 EXTENT_DIRTY, 0)) { 2374 EXTENT_DIRTY, 0, NULL)) {
2269 cur = cur + iosize; 2375 cur = cur + iosize;
2270 pg_offset += iosize; 2376 pg_offset += iosize;
2271 continue; 2377 continue;
2272 } 2378 }
2273 2379
2274 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2275 if (tree->ops && tree->ops->writepage_io_hook) { 2380 if (tree->ops && tree->ops->writepage_io_hook) {
2276 ret = tree->ops->writepage_io_hook(page, cur, 2381 ret = tree->ops->writepage_io_hook(page, cur,
2277 cur + iosize - 1); 2382 cur + iosize - 1);
@@ -2309,12 +2414,12 @@ done:
2309 set_page_writeback(page); 2414 set_page_writeback(page);
2310 end_page_writeback(page); 2415 end_page_writeback(page);
2311 } 2416 }
2312 if (unlock_start <= page_end)
2313 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2314 unlock_page(page); 2417 unlock_page(page);
2315 2418
2316done_unlocked: 2419done_unlocked:
2317 2420
2421 /* drop our reference on any cached states */
2422 free_extent_state(cached_state);
2318 return 0; 2423 return 0;
2319} 2424}
2320 2425
@@ -2339,9 +2444,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2339 writepage_t writepage, void *data, 2444 writepage_t writepage, void *data,
2340 void (*flush_fn)(void *)) 2445 void (*flush_fn)(void *))
2341{ 2446{
2342 struct backing_dev_info *bdi = mapping->backing_dev_info;
2343 int ret = 0; 2447 int ret = 0;
2344 int done = 0; 2448 int done = 0;
2449 int nr_to_write_done = 0;
2345 struct pagevec pvec; 2450 struct pagevec pvec;
2346 int nr_pages; 2451 int nr_pages;
2347 pgoff_t index; 2452 pgoff_t index;
@@ -2361,7 +2466,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2361 scanned = 1; 2466 scanned = 1;
2362 } 2467 }
2363retry: 2468retry:
2364 while (!done && (index <= end) && 2469 while (!done && !nr_to_write_done && (index <= end) &&
2365 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2470 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2366 PAGECACHE_TAG_DIRTY, min(end - index, 2471 PAGECACHE_TAG_DIRTY, min(end - index,
2367 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2472 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2517,15 @@ retry:
2412 unlock_page(page); 2517 unlock_page(page);
2413 ret = 0; 2518 ret = 0;
2414 } 2519 }
2415 if (ret || wbc->nr_to_write <= 0) 2520 if (ret)
2416 done = 1;
2417 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2418 wbc->encountered_congestion = 1;
2419 done = 1; 2521 done = 1;
2420 } 2522
2523 /*
2524 * the filesystem may choose to bump up nr_to_write.
2525 * We have to make sure to honor the new nr_to_write
2526 * at any time
2527 */
2528 nr_to_write_done = wbc->nr_to_write <= 0;
2421 } 2529 }
2422 pagevec_release(&pvec); 2530 pagevec_release(&pvec);
2423 cond_resched(); 2531 cond_resched();
@@ -2604,10 +2712,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2604 return 0; 2712 return 0;
2605 2713
2606 lock_extent(tree, start, end, GFP_NOFS); 2714 lock_extent(tree, start, end, GFP_NOFS);
2607 wait_on_extent_writeback(tree, start, end); 2715 wait_on_page_writeback(page);
2608 clear_extent_bit(tree, start, end, 2716 clear_extent_bit(tree, start, end,
2609 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2717 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2610 1, 1, GFP_NOFS); 2718 1, 1, NULL, GFP_NOFS);
2611 return 0; 2719 return 0;
2612} 2720}
2613 2721
@@ -2687,7 +2795,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
2687 !isnew && !PageUptodate(page) && 2795 !isnew && !PageUptodate(page) &&
2688 (block_off_end > to || block_off_start < from) && 2796 (block_off_end > to || block_off_start < from) &&
2689 !test_range_bit(tree, block_start, cur_end, 2797 !test_range_bit(tree, block_start, cur_end,
2690 EXTENT_UPTODATE, 1)) { 2798 EXTENT_UPTODATE, 1, NULL)) {
2691 u64 sector; 2799 u64 sector;
2692 u64 extent_offset = block_start - em->start; 2800 u64 extent_offset = block_start - em->start;
2693 size_t iosize; 2801 size_t iosize;
@@ -2701,7 +2809,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
2701 */ 2809 */
2702 set_extent_bit(tree, block_start, 2810 set_extent_bit(tree, block_start,
2703 block_start + iosize - 1, 2811 block_start + iosize - 1,
2704 EXTENT_LOCKED, 0, NULL, GFP_NOFS); 2812 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2705 ret = submit_extent_page(READ, tree, page, 2813 ret = submit_extent_page(READ, tree, page,
2706 sector, iosize, page_offset, em->bdev, 2814 sector, iosize, page_offset, em->bdev,
2707 NULL, 1, 2815 NULL, 1,
@@ -2742,13 +2850,18 @@ int try_release_extent_state(struct extent_map_tree *map,
2742 int ret = 1; 2850 int ret = 1;
2743 2851
2744 if (test_range_bit(tree, start, end, 2852 if (test_range_bit(tree, start, end,
2745 EXTENT_IOBITS | EXTENT_ORDERED, 0)) 2853 EXTENT_IOBITS, 0, NULL))
2746 ret = 0; 2854 ret = 0;
2747 else { 2855 else {
2748 if ((mask & GFP_NOFS) == GFP_NOFS) 2856 if ((mask & GFP_NOFS) == GFP_NOFS)
2749 mask = GFP_NOFS; 2857 mask = GFP_NOFS;
2750 clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 2858 /*
2751 1, 1, mask); 2859 * at this point we can safely clear everything except the
2860 * locked bit and the nodatasum bit
2861 */
2862 clear_extent_bit(tree, start, end,
2863 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2864 0, 0, NULL, mask);
2752 } 2865 }
2753 return ret; 2866 return ret;
2754} 2867}
@@ -2771,29 +2884,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2771 u64 len; 2884 u64 len;
2772 while (start <= end) { 2885 while (start <= end) {
2773 len = end - start + 1; 2886 len = end - start + 1;
2774 spin_lock(&map->lock); 2887 write_lock(&map->lock);
2775 em = lookup_extent_mapping(map, start, len); 2888 em = lookup_extent_mapping(map, start, len);
2776 if (!em || IS_ERR(em)) { 2889 if (!em || IS_ERR(em)) {
2777 spin_unlock(&map->lock); 2890 write_unlock(&map->lock);
2778 break; 2891 break;
2779 } 2892 }
2780 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2893 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2781 em->start != start) { 2894 em->start != start) {
2782 spin_unlock(&map->lock); 2895 write_unlock(&map->lock);
2783 free_extent_map(em); 2896 free_extent_map(em);
2784 break; 2897 break;
2785 } 2898 }
2786 if (!test_range_bit(tree, em->start, 2899 if (!test_range_bit(tree, em->start,
2787 extent_map_end(em) - 1, 2900 extent_map_end(em) - 1,
2788 EXTENT_LOCKED | EXTENT_WRITEBACK | 2901 EXTENT_LOCKED | EXTENT_WRITEBACK,
2789 EXTENT_ORDERED, 2902 0, NULL)) {
2790 0)) {
2791 remove_extent_mapping(map, em); 2903 remove_extent_mapping(map, em);
2792 /* once for the rb tree */ 2904 /* once for the rb tree */
2793 free_extent_map(em); 2905 free_extent_map(em);
2794 } 2906 }
2795 start = extent_map_end(em); 2907 start = extent_map_end(em);
2796 spin_unlock(&map->lock); 2908 write_unlock(&map->lock);
2797 2909
2798 /* once for us */ 2910 /* once for us */
2799 free_extent_map(em); 2911 free_extent_map(em);
@@ -3203,7 +3315,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3203 int uptodate; 3315 int uptodate;
3204 unsigned long index; 3316 unsigned long index;
3205 3317
3206 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); 3318 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3207 if (ret) 3319 if (ret)
3208 return 1; 3320 return 1;
3209 while (start <= end) { 3321 while (start <= end) {
@@ -3233,7 +3345,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3233 return 1; 3345 return 1;
3234 3346
3235 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3347 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3236 EXTENT_UPTODATE, 1); 3348 EXTENT_UPTODATE, 1, NULL);
3237 if (ret) 3349 if (ret)
3238 return ret; 3350 return ret;
3239 3351
@@ -3269,7 +3381,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3269 return 0; 3381 return 0;
3270 3382
3271 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3383 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3272 EXTENT_UPTODATE, 1)) { 3384 EXTENT_UPTODATE, 1, NULL)) {
3273 return 0; 3385 return 0;
3274 } 3386 }
3275 3387
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3d..4794ec891fed 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,8 @@
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7) 14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8) 15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_NODATASUM (1 << 12)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21 19
22/* flags for bio submission */ 20/* flags for bio submission */
@@ -62,8 +60,13 @@ struct extent_io_ops {
62 struct extent_state *state, int uptodate); 60 struct extent_state *state, int uptodate);
63 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
64 unsigned long old, unsigned long bits); 62 unsigned long old, unsigned long bits);
65 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 63 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
66 unsigned long old, unsigned long bits); 64 unsigned long bits);
65 int (*merge_extent_hook)(struct inode *inode,
66 struct extent_state *new,
67 struct extent_state *other);
68 int (*split_extent_hook)(struct inode *inode,
69 struct extent_state *orig, u64 split);
67 int (*write_cache_pages_lock_hook)(struct page *page); 70 int (*write_cache_pages_lock_hook)(struct page *page);
68}; 71};
69 72
@@ -81,10 +84,14 @@ struct extent_state {
81 u64 start; 84 u64 start;
82 u64 end; /* inclusive */ 85 u64 end; /* inclusive */
83 struct rb_node rb_node; 86 struct rb_node rb_node;
87
88 /* ADD NEW ELEMENTS AFTER THIS */
84 struct extent_io_tree *tree; 89 struct extent_io_tree *tree;
85 wait_queue_head_t wq; 90 wait_queue_head_t wq;
86 atomic_t refs; 91 atomic_t refs;
87 unsigned long state; 92 unsigned long state;
93 u64 split_start;
94 u64 split_end;
88 95
89 /* for use by the FS */ 96 /* for use by the FS */
90 u64 private; 97 u64 private;
@@ -142,6 +149,8 @@ int try_release_extent_state(struct extent_map_tree *map,
142 struct extent_io_tree *tree, struct page *page, 149 struct extent_io_tree *tree, struct page *page,
143 gfp_t mask); 150 gfp_t mask);
144int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 151int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
152int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
153 int bits, struct extent_state **cached, gfp_t mask);
145int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 154int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
146int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 155int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
147 gfp_t mask); 156 gfp_t mask);
@@ -155,11 +164,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
155 u64 max_bytes, unsigned long bits); 164 u64 max_bytes, unsigned long bits);
156 165
157int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 166int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
158 int bits, int filled); 167 int bits, int filled, struct extent_state *cached_state);
159int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 168int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
160 int bits, gfp_t mask); 169 int bits, gfp_t mask);
161int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 170int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
162 int bits, int wake, int delete, gfp_t mask); 171 int bits, int wake, int delete, struct extent_state **cached,
172 gfp_t mask);
163int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 173int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
164 int bits, gfp_t mask); 174 int bits, gfp_t mask);
165int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 175int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -282,5 +292,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
282 int clear_unlock, 292 int clear_unlock,
283 int clear_delalloc, int clear_dirty, 293 int clear_delalloc, int clear_dirty,
284 int set_writeback, 294 int set_writeback,
285 int end_writeback); 295 int end_writeback,
296 int set_private2);
286#endif 297#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e6..2c726b7b9faa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
37{ 37{
38 tree->map.rb_node = NULL; 38 tree->map.rb_node = NULL;
39 spin_lock_init(&tree->lock); 39 rwlock_init(&tree->lock);
40} 40}
41 41
42/** 42/**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
198 return 0; 198 return 0;
199} 199}
200 200
201int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
202{
203 int ret = 0;
204 struct extent_map *merge = NULL;
205 struct rb_node *rb;
206 struct extent_map *em;
207
208 write_lock(&tree->lock);
209 em = lookup_extent_mapping(tree, start, len);
210
211 WARN_ON(em->start != start || !em);
212
213 if (!em)
214 goto out;
215
216 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
217
218 if (em->start != 0) {
219 rb = rb_prev(&em->rb_node);
220 if (rb)
221 merge = rb_entry(rb, struct extent_map, rb_node);
222 if (rb && mergable_maps(merge, em)) {
223 em->start = merge->start;
224 em->len += merge->len;
225 em->block_len += merge->block_len;
226 em->block_start = merge->block_start;
227 merge->in_tree = 0;
228 rb_erase(&merge->rb_node, &tree->map);
229 free_extent_map(merge);
230 }
231 }
232
233 rb = rb_next(&em->rb_node);
234 if (rb)
235 merge = rb_entry(rb, struct extent_map, rb_node);
236 if (rb && mergable_maps(em, merge)) {
237 em->len += merge->len;
238 em->block_len += merge->len;
239 rb_erase(&merge->rb_node, &tree->map);
240 merge->in_tree = 0;
241 free_extent_map(merge);
242 }
243
244 free_extent_map(em);
245out:
246 write_unlock(&tree->lock);
247 return ret;
248
249}
250
201/** 251/**
202 * add_extent_mapping - add new extent map to the extent tree 252 * add_extent_mapping - add new extent map to the extent tree
203 * @tree: tree to insert new map in 253 * @tree: tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
222 ret = -EEXIST; 272 ret = -EEXIST;
223 goto out; 273 goto out;
224 } 274 }
225 assert_spin_locked(&tree->lock);
226 rb = tree_insert(&tree->map, em->start, &em->rb_node); 275 rb = tree_insert(&tree->map, em->start, &em->rb_node);
227 if (rb) { 276 if (rb) {
228 ret = -EEXIST; 277 ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
285 struct rb_node *next = NULL; 334 struct rb_node *next = NULL;
286 u64 end = range_end(start, len); 335 u64 end = range_end(start, len);
287 336
288 assert_spin_locked(&tree->lock);
289 rb_node = __tree_search(&tree->map, start, &prev, &next); 337 rb_node = __tree_search(&tree->map, start, &prev, &next);
290 if (!rb_node && prev) { 338 if (!rb_node && prev) {
291 em = rb_entry(prev, struct extent_map, rb_node); 339 em = rb_entry(prev, struct extent_map, rb_node);
@@ -319,6 +367,54 @@ out:
319} 367}
320 368
321/** 369/**
370 * search_extent_mapping - find a nearby extent map
371 * @tree: tree to lookup in
372 * @start: byte offset to start the search
373 * @len: length of the lookup range
374 *
375 * Find and return the first extent_map struct in @tree that intersects the
376 * [start, len] range.
377 *
378 * If one can't be found, any nearby extent may be returned
379 */
380struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
381 u64 start, u64 len)
382{
383 struct extent_map *em;
384 struct rb_node *rb_node;
385 struct rb_node *prev = NULL;
386 struct rb_node *next = NULL;
387
388 rb_node = __tree_search(&tree->map, start, &prev, &next);
389 if (!rb_node && prev) {
390 em = rb_entry(prev, struct extent_map, rb_node);
391 goto found;
392 }
393 if (!rb_node && next) {
394 em = rb_entry(next, struct extent_map, rb_node);
395 goto found;
396 }
397 if (!rb_node) {
398 em = NULL;
399 goto out;
400 }
401 if (IS_ERR(rb_node)) {
402 em = ERR_PTR(PTR_ERR(rb_node));
403 goto out;
404 }
405 em = rb_entry(rb_node, struct extent_map, rb_node);
406 goto found;
407
408 em = NULL;
409 goto out;
410
411found:
412 atomic_inc(&em->refs);
413out:
414 return em;
415}
416
417/**
322 * remove_extent_mapping - removes an extent_map from the extent tree 418 * remove_extent_mapping - removes an extent_map from the extent tree
323 * @tree: extent tree to remove from 419 * @tree: extent tree to remove from
324 * @em: extent map beeing removed 420 * @em: extent map beeing removed
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
331 int ret = 0; 427 int ret = 0;
332 428
333 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 429 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
334 assert_spin_locked(&tree->lock);
335 rb_erase(&em->rb_node, &tree->map); 430 rb_erase(&em->rb_node, &tree->map);
336 em->in_tree = 0; 431 em->in_tree = 0;
337 return ret; 432 return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb0..ab6d74b6e647 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
31 31
32struct extent_map_tree { 32struct extent_map_tree {
33 struct rb_root map; 33 struct rb_root map;
34 spinlock_t lock; 34 rwlock_t lock;
35}; 35};
36 36
37static inline u64 extent_map_end(struct extent_map *em) 37static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em); 59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void); 60int __init extent_map_init(void);
61void extent_map_exit(void); 61void extent_map_exit(void);
62int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
63struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
64 u64 start, u64 len);
62#endif 65#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b833972273a..f19e1259a971 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
112 int err = 0; 112 int err = 0;
113 int i; 113 int i;
114 struct inode *inode = fdentry(file)->d_inode; 114 struct inode *inode = fdentry(file)->d_inode;
115 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
116 u64 hint_byte;
117 u64 num_bytes; 115 u64 num_bytes;
118 u64 start_pos; 116 u64 start_pos;
119 u64 end_of_last_block; 117 u64 end_of_last_block;
@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
125 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
126 124
127 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
128 129
129 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
130 trans = btrfs_join_transaction(root, 1);
131 if (!trans) {
132 err = -ENOMEM;
133 goto out_unlock;
134 }
135 btrfs_set_trans_block_group(trans, inode);
136 hint_byte = 0;
137
138 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
139
140 /* check for reserved extents on each page, we don't want
141 * to reset the delalloc bit on things that already have
142 * extents reserved.
143 */
144 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
145 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
146 struct page *p = pages[i]; 131 struct page *p = pages[i];
147 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
155 * at this time. 140 * at this time.
156 */ 141 */
157 } 142 }
158 err = btrfs_end_transaction(trans, root);
159out_unlock:
160 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
161 return err; 143 return err;
162} 144}
163 145
@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
189 if (!split2) 171 if (!split2)
190 split2 = alloc_extent_map(GFP_NOFS); 172 split2 = alloc_extent_map(GFP_NOFS);
191 173
192 spin_lock(&em_tree->lock); 174 write_lock(&em_tree->lock);
193 em = lookup_extent_mapping(em_tree, start, len); 175 em = lookup_extent_mapping(em_tree, start, len);
194 if (!em) { 176 if (!em) {
195 spin_unlock(&em_tree->lock); 177 write_unlock(&em_tree->lock);
196 break; 178 break;
197 } 179 }
198 flags = em->flags; 180 flags = em->flags;
199 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 181 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
200 spin_unlock(&em_tree->lock);
201 if (em->start <= start && 182 if (em->start <= start &&
202 (!testend || em->start + em->len >= start + len)) { 183 (!testend || em->start + em->len >= start + len)) {
203 free_extent_map(em); 184 free_extent_map(em);
185 write_unlock(&em_tree->lock);
204 break; 186 break;
205 } 187 }
206 if (start < em->start) { 188 if (start < em->start) {
@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
210 start = em->start + em->len; 192 start = em->start + em->len;
211 } 193 }
212 free_extent_map(em); 194 free_extent_map(em);
195 write_unlock(&em_tree->lock);
213 continue; 196 continue;
214 } 197 }
215 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 198 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
260 free_extent_map(split); 243 free_extent_map(split);
261 split = NULL; 244 split = NULL;
262 } 245 }
263 spin_unlock(&em_tree->lock); 246 write_unlock(&em_tree->lock);
264 247
265 /* once for us */ 248 /* once for us */
266 free_extent_map(em); 249 free_extent_map(em);
@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
289noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, 272noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, struct inode *inode, 273 struct btrfs_root *root, struct inode *inode,
291 u64 start, u64 end, u64 locked_end, 274 u64 start, u64 end, u64 locked_end,
292 u64 inline_limit, u64 *hint_byte) 275 u64 inline_limit, u64 *hint_byte, int drop_cache)
293{ 276{
294 u64 extent_end = 0; 277 u64 extent_end = 0;
295 u64 search_start = start; 278 u64 search_start = start;
@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
314 int ret; 297 int ret;
315 298
316 inline_limit = 0; 299 inline_limit = 0;
317 btrfs_drop_extent_cache(inode, start, end - 1, 0); 300 if (drop_cache)
301 btrfs_drop_extent_cache(inode, start, end - 1, 0);
318 302
319 path = btrfs_alloc_path(); 303 path = btrfs_alloc_path();
320 if (!path) 304 if (!path)
@@ -936,21 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
936 start_pos = pos; 920 start_pos = pos;
937 921
938 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 922 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
923
924 /* do the reserve before the mutex lock in case we have to do some
925 * flushing. We wouldn't deadlock, but this is more polite.
926 */
927 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
928 if (err)
929 goto out_nolock;
930
931 mutex_lock(&inode->i_mutex);
932
939 current->backing_dev_info = inode->i_mapping->backing_dev_info; 933 current->backing_dev_info = inode->i_mapping->backing_dev_info;
940 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 934 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
941 if (err) 935 if (err)
942 goto out_nolock; 936 goto out;
937
943 if (count == 0) 938 if (count == 0)
944 goto out_nolock; 939 goto out;
945 940
946 err = file_remove_suid(file); 941 err = file_remove_suid(file);
947 if (err) 942 if (err)
948 goto out_nolock; 943 goto out;
944
949 file_update_time(file); 945 file_update_time(file);
950 946
951 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 947 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
952 948
953 mutex_lock(&inode->i_mutex); 949 /* generic_write_checks can change our pos */
950 start_pos = pos;
951
954 BTRFS_I(inode)->sequence++; 952 BTRFS_I(inode)->sequence++;
955 first_index = pos >> PAGE_CACHE_SHIFT; 953 first_index = pos >> PAGE_CACHE_SHIFT;
956 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 954 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1024,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1024 } 1022 }
1025 1023
1026 if (will_write) { 1024 if (will_write) {
1027 btrfs_fdatawrite_range(inode->i_mapping, pos, 1025 filemap_fdatawrite_range(inode->i_mapping, pos,
1028 pos + write_bytes - 1, 1026 pos + write_bytes - 1);
1029 WB_SYNC_ALL);
1030 } else { 1027 } else {
1031 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1028 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1032 num_pages); 1029 num_pages);
@@ -1047,6 +1044,7 @@ out:
1047 mutex_unlock(&inode->i_mutex); 1044 mutex_unlock(&inode->i_mutex);
1048 if (ret) 1045 if (ret)
1049 err = ret; 1046 err = ret;
1047 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1050 1048
1051out_nolock: 1049out_nolock:
1052 kfree(pages); 1050 kfree(pages);
@@ -1203,7 +1201,7 @@ out:
1203 return ret > 0 ? EIO : ret; 1201 return ret > 0 ? EIO : ret;
1204} 1202}
1205 1203
1206static struct vm_operations_struct btrfs_file_vm_ops = { 1204static const struct vm_operations_struct btrfs_file_vm_ops = {
1207 .fault = filemap_fault, 1205 .fault = filemap_fault,
1208 .page_mkwrite = btrfs_page_mkwrite, 1206 .page_mkwrite = btrfs_page_mkwrite,
1209}; 1207};
@@ -1215,7 +1213,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1215 return 0; 1213 return 0;
1216} 1214}
1217 1215
1218struct file_operations btrfs_file_operations = { 1216const struct file_operations btrfs_file_operations = {
1219 .llseek = generic_file_llseek, 1217 .llseek = generic_file_llseek,
1220 .read = do_sync_read, 1218 .read = do_sync_read,
1221 .aio_read = generic_file_aio_read, 1219 .aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5edcee3a617f..5c2caad76212 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
259 259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) 260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{ 261{
262 u64 max_bytes, possible_bytes; 262 u64 max_bytes;
263 u64 bitmap_bytes;
264 u64 extent_bytes;
263 265
264 /* 266 /*
265 * The goal is to keep the total amount of memory used per 1gb of space 267 * The goal is to keep the total amount of memory used per 1gb of space
@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
269 max_bytes = MAX_CACHE_BYTES_PER_GIG * 271 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 272 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271 273
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + 274 /*
273 (sizeof(struct btrfs_free_space) * 275 * we want to account for 1 more bitmap than what we have so we can make
274 block_group->extents_thresh); 276 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
277 * we add more bitmaps.
278 */
279 bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
275 280
276 if (possible_bytes > max_bytes) { 281 if (bitmap_bytes >= max_bytes) {
277 int extent_bytes = max_bytes - 282 block_group->extents_thresh = 0;
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE); 283 return;
284 }
279 285
280 if (extent_bytes <= 0) { 286 /*
281 block_group->extents_thresh = 0; 287 * we want the extent entry threshold to always be at most 1/2 the maxw
282 return; 288 * bytes we can have, or whatever is less than that.
283 } 289 */
290 extent_bytes = max_bytes - bitmap_bytes;
291 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
284 292
285 block_group->extents_thresh = extent_bytes / 293 block_group->extents_thresh =
286 (sizeof(struct btrfs_free_space)); 294 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
287 }
288} 295}
289 296
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, 297static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps); 410 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404 411
405 info->offset = offset_to_bitmap(block_group, offset); 412 info->offset = offset_to_bitmap(block_group, offset);
413 info->bytes = 0;
406 link_free_space(block_group, info); 414 link_free_space(block_group, info);
407 block_group->total_bitmaps++; 415 block_group->total_bitmaps++;
408 416
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6b627c611808..72ce3c173d6a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
149 ptr = (unsigned long)(ref + 1); 149 ptr = (unsigned long)(ref + 1);
150 ret = 0; 150 ret = 0;
151 } else if (ret < 0) { 151 } else if (ret < 0) {
152 if (ret == -EOVERFLOW)
153 ret = -EMLINK;
152 goto out; 154 goto out;
153 } else { 155 } else {
154 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 156 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
177 179
178 ret = btrfs_insert_empty_item(trans, root, path, &key, 180 ret = btrfs_insert_empty_item(trans, root, path, &key,
179 sizeof(struct btrfs_inode_item)); 181 sizeof(struct btrfs_inode_item));
180 if (ret == 0 && objectid > root->highest_inode)
181 root->highest_inode = objectid;
182 return ret; 182 return ret;
183} 183}
184 184
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9abbced1123d..c56eb5909172 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
43 slot = path->slots[0] - 1; 43 slot = path->slots[0] - 1;
44 l = path->nodes[0]; 44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot); 45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid; 46 *objectid = max_t(u64, found_key.objectid,
47 BTRFS_FIRST_FREE_OBJECTID - 1);
47 } else { 48 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID; 49 *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
49 } 50 }
50 ret = 0; 51 ret = 0;
51error: 52error:
@@ -53,91 +54,27 @@ error:
53 return ret; 54 return ret;
54} 55}
55 56
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, 57int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 58 struct btrfs_root *root,
61 u64 dirid, u64 *objectid) 59 u64 dirid, u64 *objectid)
62{ 60{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret; 61 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex); 62 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 start_found = 0;
88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
89 if (ret < 0)
90 goto error;
91 63
92 while (1) { 64 if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
93 l = path->nodes[0]; 65 ret = btrfs_find_highest_inode(root, &root->highest_objectid);
94 slot = path->slots[0]; 66 if (ret)
95 if (slot >= btrfs_header_nritems(l)) { 67 goto out;
96 ret = btrfs_next_leaf(root, path); 68 }
97 if (ret == 0)
98 continue;
99 if (ret < 0)
100 goto error;
101 if (!start_found) {
102 *objectid = search_start;
103 start_found = 1;
104 goto found;
105 }
106 *objectid = last_ino > search_start ?
107 last_ino : search_start;
108 goto found;
109 }
110 btrfs_item_key_to_cpu(l, &key, slot);
111 if (key.objectid >= search_start) {
112 if (start_found) {
113 if (last_ino < search_start)
114 last_ino = search_start;
115 if (key.objectid > last_ino) {
116 *objectid = last_ino;
117 goto found;
118 }
119 } else if (key.objectid > search_start) {
120 *objectid = search_start;
121 goto found;
122 }
123 }
124 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
125 break;
126 69
127 start_found = 1; 70 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
128 last_ino = key.objectid + 1; 71 ret = -ENOSPC;
129 path->slots[0]++; 72 goto out;
130 } 73 }
131 BUG_ON(1); 74
132found: 75 *objectid = ++root->highest_objectid;
133 btrfs_release_path(root, path); 76 ret = 0;
134 btrfs_free_path(path); 77out:
135 BUG_ON(*objectid < search_start);
136 mutex_unlock(&root->objectid_mutex);
137 return 0;
138error:
139 btrfs_release_path(root, path);
140 btrfs_free_path(path);
141 mutex_unlock(&root->objectid_mutex); 78 mutex_unlock(&root->objectid_mutex);
142 return ret; 79 return ret;
143} 80}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59cba180fe83..112e5aa85892 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -55,14 +55,14 @@ struct btrfs_iget_args {
55 struct btrfs_root *root; 55 struct btrfs_root *root;
56}; 56};
57 57
58static struct inode_operations btrfs_dir_inode_operations; 58static const struct inode_operations btrfs_dir_inode_operations;
59static struct inode_operations btrfs_symlink_inode_operations; 59static const struct inode_operations btrfs_symlink_inode_operations;
60static struct inode_operations btrfs_dir_ro_inode_operations; 60static const struct inode_operations btrfs_dir_ro_inode_operations;
61static struct inode_operations btrfs_special_inode_operations; 61static const struct inode_operations btrfs_special_inode_operations;
62static struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static const struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
68static struct kmem_cache *btrfs_inode_cachep; 68static struct kmem_cache *btrfs_inode_cachep;
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
231 } 231 }
232 232
233 ret = btrfs_drop_extents(trans, root, inode, start, 233 ret = btrfs_drop_extents(trans, root, inode, start,
234 aligned_end, aligned_end, start, &hint_byte); 234 aligned_end, aligned_end, start,
235 &hint_byte, 1);
235 BUG_ON(ret); 236 BUG_ON(ret);
236 237
237 if (isize > actual_end) 238 if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
240 inline_len, compressed_size, 241 inline_len, compressed_size,
241 compressed_pages); 242 compressed_pages);
242 BUG_ON(ret); 243 BUG_ON(ret);
243 btrfs_drop_extent_cache(inode, start, aligned_end, 0); 244 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
244 return 0; 245 return 0;
245} 246}
246 247
@@ -425,7 +426,7 @@ again:
425 extent_clear_unlock_delalloc(inode, 426 extent_clear_unlock_delalloc(inode,
426 &BTRFS_I(inode)->io_tree, 427 &BTRFS_I(inode)->io_tree,
427 start, end, NULL, 1, 0, 428 start, end, NULL, 1, 0,
428 0, 1, 1, 1); 429 0, 1, 1, 1, 0);
429 ret = 0; 430 ret = 0;
430 goto free_pages_out; 431 goto free_pages_out;
431 } 432 }
@@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
611 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 612 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
612 613
613 while (1) { 614 while (1) {
614 spin_lock(&em_tree->lock); 615 write_lock(&em_tree->lock);
615 ret = add_extent_mapping(em_tree, em); 616 ret = add_extent_mapping(em_tree, em);
616 spin_unlock(&em_tree->lock); 617 write_unlock(&em_tree->lock);
617 if (ret != -EEXIST) { 618 if (ret != -EEXIST) {
618 free_extent_map(em); 619 free_extent_map(em);
619 break; 620 break;
@@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
640 async_extent->start, 641 async_extent->start,
641 async_extent->start + 642 async_extent->start +
642 async_extent->ram_size - 1, 643 async_extent->ram_size - 1,
643 NULL, 1, 1, 0, 1, 1, 0); 644 NULL, 1, 1, 0, 1, 1, 0, 0);
644 645
645 ret = btrfs_submit_compressed_write(inode, 646 ret = btrfs_submit_compressed_write(inode,
646 async_extent->start, 647 async_extent->start,
@@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
713 extent_clear_unlock_delalloc(inode, 714 extent_clear_unlock_delalloc(inode,
714 &BTRFS_I(inode)->io_tree, 715 &BTRFS_I(inode)->io_tree,
715 start, end, NULL, 1, 1, 716 start, end, NULL, 1, 1,
716 1, 1, 1, 1); 717 1, 1, 1, 1, 0);
717 *nr_written = *nr_written + 718 *nr_written = *nr_written +
718 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 719 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
719 *page_started = 1; 720 *page_started = 1;
@@ -725,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
725 BUG_ON(disk_num_bytes > 726 BUG_ON(disk_num_bytes >
726 btrfs_super_total_bytes(&root->fs_info->super_copy)); 727 btrfs_super_total_bytes(&root->fs_info->super_copy));
727 728
729
730 read_lock(&BTRFS_I(inode)->extent_tree.lock);
731 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
732 start, num_bytes);
733 if (em) {
734 alloc_hint = em->block_start;
735 free_extent_map(em);
736 }
737 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
728 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 738 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
729 739
730 while (disk_num_bytes > 0) { 740 while (disk_num_bytes > 0) {
@@ -737,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
737 em = alloc_extent_map(GFP_NOFS); 747 em = alloc_extent_map(GFP_NOFS);
738 em->start = start; 748 em->start = start;
739 em->orig_start = em->start; 749 em->orig_start = em->start;
740
741 ram_size = ins.offset; 750 ram_size = ins.offset;
742 em->len = ins.offset; 751 em->len = ins.offset;
743 752
@@ -747,9 +756,9 @@ static noinline int cow_file_range(struct inode *inode,
747 set_bit(EXTENT_FLAG_PINNED, &em->flags); 756 set_bit(EXTENT_FLAG_PINNED, &em->flags);
748 757
749 while (1) { 758 while (1) {
750 spin_lock(&em_tree->lock); 759 write_lock(&em_tree->lock);
751 ret = add_extent_mapping(em_tree, em); 760 ret = add_extent_mapping(em_tree, em);
752 spin_unlock(&em_tree->lock); 761 write_unlock(&em_tree->lock);
753 if (ret != -EEXIST) { 762 if (ret != -EEXIST) {
754 free_extent_map(em); 763 free_extent_map(em);
755 break; 764 break;
@@ -776,11 +785,14 @@ static noinline int cow_file_range(struct inode *inode,
776 /* we're not doing compressed IO, don't unlock the first 785 /* we're not doing compressed IO, don't unlock the first
777 * page (which the caller expects to stay locked), don't 786 * page (which the caller expects to stay locked), don't
778 * clear any dirty bits and don't set any writeback bits 787 * clear any dirty bits and don't set any writeback bits
788 *
789 * Do set the Private2 bit so we know this page was properly
790 * setup for writepage
779 */ 791 */
780 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 792 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
781 start, start + ram_size - 1, 793 start, start + ram_size - 1,
782 locked_page, unlock, 1, 794 locked_page, unlock, 1,
783 1, 0, 0, 0); 795 1, 0, 0, 0, 1);
784 disk_num_bytes -= cur_alloc_size; 796 disk_num_bytes -= cur_alloc_size;
785 num_bytes -= cur_alloc_size; 797 num_bytes -= cur_alloc_size;
786 alloc_hint = ins.objectid + ins.offset; 798 alloc_hint = ins.objectid + ins.offset;
@@ -853,7 +865,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
853 int limit = 10 * 1024 * 1042; 865 int limit = 10 * 1024 * 1042;
854 866
855 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
856 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 868 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
857 while (start < end) { 869 while (start < end) {
858 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
859 async_cow->inode = inode; 871 async_cow->inode = inode;
@@ -1080,9 +1092,9 @@ out_check:
1080 em->bdev = root->fs_info->fs_devices->latest_bdev; 1092 em->bdev = root->fs_info->fs_devices->latest_bdev;
1081 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1093 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1082 while (1) { 1094 while (1) {
1083 spin_lock(&em_tree->lock); 1095 write_lock(&em_tree->lock);
1084 ret = add_extent_mapping(em_tree, em); 1096 ret = add_extent_mapping(em_tree, em);
1085 spin_unlock(&em_tree->lock); 1097 write_unlock(&em_tree->lock);
1086 if (ret != -EEXIST) { 1098 if (ret != -EEXIST) {
1087 free_extent_map(em); 1099 free_extent_map(em);
1088 break; 1100 break;
@@ -1101,7 +1113,7 @@ out_check:
1101 1113
1102 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1114 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1103 cur_offset, cur_offset + num_bytes - 1, 1115 cur_offset, cur_offset + num_bytes - 1,
1104 locked_page, 1, 1, 1, 0, 0, 0); 1116 locked_page, 1, 1, 1, 0, 0, 0, 1);
1105 cur_offset = extent_end; 1117 cur_offset = extent_end;
1106 if (cur_offset > end) 1118 if (cur_offset > end)
1107 break; 1119 break;
@@ -1147,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1147 return ret; 1159 return ret;
1148} 1160}
1149 1161
1162static int btrfs_split_extent_hook(struct inode *inode,
1163 struct extent_state *orig, u64 split)
1164{
1165 struct btrfs_root *root = BTRFS_I(inode)->root;
1166 u64 size;
1167
1168 if (!(orig->state & EXTENT_DELALLOC))
1169 return 0;
1170
1171 size = orig->end - orig->start + 1;
1172 if (size > root->fs_info->max_extent) {
1173 u64 num_extents;
1174 u64 new_size;
1175
1176 new_size = orig->end - split + 1;
1177 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1178 root->fs_info->max_extent);
1179
1180 /*
1181 * if we break a large extent up then leave delalloc_extents be,
1182 * since we've already accounted for the large extent.
1183 */
1184 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1185 root->fs_info->max_extent) < num_extents)
1186 return 0;
1187 }
1188
1189 BTRFS_I(inode)->delalloc_extents++;
1190
1191 return 0;
1192}
1193
1194/*
1195 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1196 * extents so we can keep track of new extents that are just merged onto old
1197 * extents, such as when we are doing sequential writes, so we can properly
1198 * account for the metadata space we'll need.
1199 */
1200static int btrfs_merge_extent_hook(struct inode *inode,
1201 struct extent_state *new,
1202 struct extent_state *other)
1203{
1204 struct btrfs_root *root = BTRFS_I(inode)->root;
1205 u64 new_size, old_size;
1206 u64 num_extents;
1207
1208 /* not delalloc, ignore it */
1209 if (!(other->state & EXTENT_DELALLOC))
1210 return 0;
1211
1212 old_size = other->end - other->start + 1;
1213 if (new->start < other->start)
1214 new_size = other->end - new->start + 1;
1215 else
1216 new_size = new->end - other->start + 1;
1217
1218 /* we're not bigger than the max, unreserve the space and go */
1219 if (new_size <= root->fs_info->max_extent) {
1220 BTRFS_I(inode)->delalloc_extents--;
1221 return 0;
1222 }
1223
1224 /*
1225 * If we grew by another max_extent, just return, we want to keep that
1226 * reserved amount.
1227 */
1228 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1229 root->fs_info->max_extent);
1230 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1231 root->fs_info->max_extent) > num_extents)
1232 return 0;
1233
1234 BTRFS_I(inode)->delalloc_extents--;
1235
1236 return 0;
1237}
1238
1150/* 1239/*
1151 * extent_io.c set_bit_hook, used to track delayed allocation 1240 * extent_io.c set_bit_hook, used to track delayed allocation
1152 * bytes in this file, and to maintain the list of inodes that 1241 * bytes in this file, and to maintain the list of inodes that
@@ -1155,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1155static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1244static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1156 unsigned long old, unsigned long bits) 1245 unsigned long old, unsigned long bits)
1157{ 1246{
1247
1158 /* 1248 /*
1159 * set_bit and clear bit hooks normally require _irqsave/restore 1249 * set_bit and clear bit hooks normally require _irqsave/restore
1160 * but in this case, we are only testeing for the DELALLOC 1250 * but in this case, we are only testeing for the DELALLOC
@@ -1162,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1162 */ 1252 */
1163 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1253 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1164 struct btrfs_root *root = BTRFS_I(inode)->root; 1254 struct btrfs_root *root = BTRFS_I(inode)->root;
1255
1256 BTRFS_I(inode)->delalloc_extents++;
1165 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1257 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1166 spin_lock(&root->fs_info->delalloc_lock); 1258 spin_lock(&root->fs_info->delalloc_lock);
1167 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1259 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1178,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1178/* 1270/*
1179 * extent_io.c clear_bit_hook, see set_bit_hook for why 1271 * extent_io.c clear_bit_hook, see set_bit_hook for why
1180 */ 1272 */
1181static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1273static int btrfs_clear_bit_hook(struct inode *inode,
1182 unsigned long old, unsigned long bits) 1274 struct extent_state *state, unsigned long bits)
1183{ 1275{
1184 /* 1276 /*
1185 * set_bit and clear bit hooks normally require _irqsave/restore 1277 * set_bit and clear bit hooks normally require _irqsave/restore
1186 * but in this case, we are only testeing for the DELALLOC 1278 * but in this case, we are only testeing for the DELALLOC
1187 * bit, which is only set or cleared with irqs on 1279 * bit, which is only set or cleared with irqs on
1188 */ 1280 */
1189 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1281 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1190 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1191 1283
1284 BTRFS_I(inode)->delalloc_extents--;
1285 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1286
1192 spin_lock(&root->fs_info->delalloc_lock); 1287 spin_lock(&root->fs_info->delalloc_lock);
1193 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1288 if (state->end - state->start + 1 >
1289 root->fs_info->delalloc_bytes) {
1194 printk(KERN_INFO "btrfs warning: delalloc account " 1290 printk(KERN_INFO "btrfs warning: delalloc account "
1195 "%llu %llu\n", 1291 "%llu %llu\n",
1196 (unsigned long long)end - start + 1, 1292 (unsigned long long)
1293 state->end - state->start + 1,
1197 (unsigned long long) 1294 (unsigned long long)
1198 root->fs_info->delalloc_bytes); 1295 root->fs_info->delalloc_bytes);
1199 btrfs_delalloc_free_space(root, inode, (u64)-1); 1296 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1201,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1201 BTRFS_I(inode)->delalloc_bytes = 0; 1298 BTRFS_I(inode)->delalloc_bytes = 0;
1202 } else { 1299 } else {
1203 btrfs_delalloc_free_space(root, inode, 1300 btrfs_delalloc_free_space(root, inode,
1204 end - start + 1); 1301 state->end -
1205 root->fs_info->delalloc_bytes -= end - start + 1; 1302 state->start + 1);
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1303 root->fs_info->delalloc_bytes -= state->end -
1304 state->start + 1;
1305 BTRFS_I(inode)->delalloc_bytes -= state->end -
1306 state->start + 1;
1207 } 1307 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1308 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1309 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -1374,10 +1474,8 @@ again:
1374 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1474 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1375 1475
1376 /* already ordered? We're done */ 1476 /* already ordered? We're done */
1377 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 1477 if (PagePrivate2(page))
1378 EXTENT_ORDERED, 0)) {
1379 goto out; 1478 goto out;
1380 }
1381 1479
1382 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1480 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1383 if (ordered) { 1481 if (ordered) {
@@ -1413,11 +1511,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413 struct inode *inode = page->mapping->host; 1511 struct inode *inode = page->mapping->host;
1414 struct btrfs_writepage_fixup *fixup; 1512 struct btrfs_writepage_fixup *fixup;
1415 struct btrfs_root *root = BTRFS_I(inode)->root; 1513 struct btrfs_root *root = BTRFS_I(inode)->root;
1416 int ret;
1417 1514
1418 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, 1515 /* this page is properly in the ordered list */
1419 EXTENT_ORDERED, 0); 1516 if (TestClearPagePrivate2(page))
1420 if (ret)
1421 return 0; 1517 return 0;
1422 1518
1423 if (PageChecked(page)) 1519 if (PageChecked(page))
@@ -1455,9 +1551,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1455 BUG_ON(!path); 1551 BUG_ON(!path);
1456 1552
1457 path->leave_spinning = 1; 1553 path->leave_spinning = 1;
1554
1555 /*
1556 * we may be replacing one extent in the tree with another.
1557 * The new extent is pinned in the extent map, and we don't want
1558 * to drop it from the cache until it is completely in the btree.
1559 *
1560 * So, tell btrfs_drop_extents to leave this extent in the cache.
1561 * the caller is expected to unpin it and allow it to be merged
1562 * with the others.
1563 */
1458 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1564 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1459 file_pos + num_bytes, locked_end, 1565 file_pos + num_bytes, locked_end,
1460 file_pos, &hint); 1566 file_pos, &hint, 0);
1461 BUG_ON(ret); 1567 BUG_ON(ret);
1462 1568
1463 ins.objectid = inode->i_ino; 1569 ins.objectid = inode->i_ino;
@@ -1485,7 +1591,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1485 btrfs_mark_buffer_dirty(leaf); 1591 btrfs_mark_buffer_dirty(leaf);
1486 1592
1487 inode_add_bytes(inode, num_bytes); 1593 inode_add_bytes(inode, num_bytes);
1488 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1489 1594
1490 ins.objectid = disk_bytenr; 1595 ins.objectid = disk_bytenr;
1491 ins.offset = disk_num_bytes; 1596 ins.offset = disk_num_bytes;
@@ -1596,6 +1701,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1596 ordered_extent->len, 1701 ordered_extent->len,
1597 compressed, 0, 0, 1702 compressed, 0, 0,
1598 BTRFS_FILE_EXTENT_REG); 1703 BTRFS_FILE_EXTENT_REG);
1704 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1705 ordered_extent->file_offset,
1706 ordered_extent->len);
1599 BUG_ON(ret); 1707 BUG_ON(ret);
1600 } 1708 }
1601 unlock_extent(io_tree, ordered_extent->file_offset, 1709 unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1623,6 +1731,7 @@ nocow:
1623static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1731static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1624 struct extent_state *state, int uptodate) 1732 struct extent_state *state, int uptodate)
1625{ 1733{
1734 ClearPagePrivate2(page);
1626 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1735 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1627} 1736}
1628 1737
@@ -1669,13 +1778,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1669 failrec->last_mirror = 0; 1778 failrec->last_mirror = 0;
1670 failrec->bio_flags = 0; 1779 failrec->bio_flags = 0;
1671 1780
1672 spin_lock(&em_tree->lock); 1781 read_lock(&em_tree->lock);
1673 em = lookup_extent_mapping(em_tree, start, failrec->len); 1782 em = lookup_extent_mapping(em_tree, start, failrec->len);
1674 if (em->start > start || em->start + em->len < start) { 1783 if (em->start > start || em->start + em->len < start) {
1675 free_extent_map(em); 1784 free_extent_map(em);
1676 em = NULL; 1785 em = NULL;
1677 } 1786 }
1678 spin_unlock(&em_tree->lock); 1787 read_unlock(&em_tree->lock);
1679 1788
1680 if (!em || IS_ERR(em)) { 1789 if (!em || IS_ERR(em)) {
1681 kfree(failrec); 1790 kfree(failrec);
@@ -1794,7 +1903,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1794 return 0; 1903 return 0;
1795 1904
1796 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1905 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1797 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { 1906 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
1798 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 1907 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1799 GFP_NOFS); 1908 GFP_NOFS);
1800 return 0; 1909 return 0;
@@ -2352,6 +2461,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2352 return ret; 2461 return ret;
2353} 2462}
2354 2463
2464int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2465 struct btrfs_root *root,
2466 struct inode *dir, u64 objectid,
2467 const char *name, int name_len)
2468{
2469 struct btrfs_path *path;
2470 struct extent_buffer *leaf;
2471 struct btrfs_dir_item *di;
2472 struct btrfs_key key;
2473 u64 index;
2474 int ret;
2475
2476 path = btrfs_alloc_path();
2477 if (!path)
2478 return -ENOMEM;
2479
2480 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2481 name, name_len, -1);
2482 BUG_ON(!di || IS_ERR(di));
2483
2484 leaf = path->nodes[0];
2485 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2486 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2487 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2488 BUG_ON(ret);
2489 btrfs_release_path(root, path);
2490
2491 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
2492 objectid, root->root_key.objectid,
2493 dir->i_ino, &index, name, name_len);
2494 if (ret < 0) {
2495 BUG_ON(ret != -ENOENT);
2496 di = btrfs_search_dir_index_item(root, path, dir->i_ino,
2497 name, name_len);
2498 BUG_ON(!di || IS_ERR(di));
2499
2500 leaf = path->nodes[0];
2501 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2502 btrfs_release_path(root, path);
2503 index = key.offset;
2504 }
2505
2506 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2507 index, name, name_len, -1);
2508 BUG_ON(!di || IS_ERR(di));
2509
2510 leaf = path->nodes[0];
2511 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2512 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2513 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2514 BUG_ON(ret);
2515 btrfs_release_path(root, path);
2516
2517 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2519 ret = btrfs_update_inode(trans, root, dir);
2520 BUG_ON(ret);
2521 dir->i_sb->s_dirt = 1;
2522
2523 btrfs_free_path(path);
2524 return 0;
2525}
2526
2355static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 2527static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2356{ 2528{
2357 struct inode *inode = dentry->d_inode; 2529 struct inode *inode = dentry->d_inode;
@@ -2361,29 +2533,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2361 struct btrfs_trans_handle *trans; 2533 struct btrfs_trans_handle *trans;
2362 unsigned long nr = 0; 2534 unsigned long nr = 0;
2363 2535
2364 /*
2365 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2366 * the root of a subvolume or snapshot
2367 */
2368 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 2536 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2369 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 2537 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2370 return -ENOTEMPTY; 2538 return -ENOTEMPTY;
2371 }
2372 2539
2373 trans = btrfs_start_transaction(root, 1); 2540 trans = btrfs_start_transaction(root, 1);
2374 btrfs_set_trans_block_group(trans, dir); 2541 btrfs_set_trans_block_group(trans, dir);
2375 2542
2543 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2544 err = btrfs_unlink_subvol(trans, root, dir,
2545 BTRFS_I(inode)->location.objectid,
2546 dentry->d_name.name,
2547 dentry->d_name.len);
2548 goto out;
2549 }
2550
2376 err = btrfs_orphan_add(trans, inode); 2551 err = btrfs_orphan_add(trans, inode);
2377 if (err) 2552 if (err)
2378 goto fail_trans; 2553 goto out;
2379 2554
2380 /* now the directory is empty */ 2555 /* now the directory is empty */
2381 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2556 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2382 dentry->d_name.name, dentry->d_name.len); 2557 dentry->d_name.name, dentry->d_name.len);
2383 if (!err) 2558 if (!err)
2384 btrfs_i_size_write(inode, 0); 2559 btrfs_i_size_write(inode, 0);
2385 2560out:
2386fail_trans:
2387 nr = trans->blocks_used; 2561 nr = trans->blocks_used;
2388 ret = btrfs_end_transaction_throttle(trans, root); 2562 ret = btrfs_end_transaction_throttle(trans, root);
2389 btrfs_btree_balance_dirty(root, nr); 2563 btrfs_btree_balance_dirty(root, nr);
@@ -2864,7 +3038,12 @@ again:
2864 goto again; 3038 goto again;
2865 } 3039 }
2866 3040
2867 btrfs_set_extent_delalloc(inode, page_start, page_end); 3041 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3042 if (ret) {
3043 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3044 goto out_unlock;
3045 }
3046
2868 ret = 0; 3047 ret = 0;
2869 if (offset != PAGE_CACHE_SIZE) { 3048 if (offset != PAGE_CACHE_SIZE) {
2870 kaddr = kmap(page); 3049 kaddr = kmap(page);
@@ -2895,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2895 u64 last_byte; 3074 u64 last_byte;
2896 u64 cur_offset; 3075 u64 cur_offset;
2897 u64 hole_size; 3076 u64 hole_size;
2898 int err; 3077 int err = 0;
2899 3078
2900 if (size <= hole_start) 3079 if (size <= hole_start)
2901 return 0; 3080 return 0;
2902 3081
2903 err = btrfs_check_metadata_free_space(root);
2904 if (err)
2905 return err;
2906
2907 btrfs_truncate_page(inode->i_mapping, inode->i_size); 3082 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2908 3083
2909 while (1) { 3084 while (1) {
@@ -2935,15 +3110,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2935 cur_offset, 3110 cur_offset,
2936 cur_offset + hole_size, 3111 cur_offset + hole_size,
2937 block_end, 3112 block_end,
2938 cur_offset, &hint_byte); 3113 cur_offset, &hint_byte, 1);
2939 if (err) 3114 if (err)
2940 break; 3115 break;
3116
3117 err = btrfs_reserve_metadata_space(root, 1);
3118 if (err)
3119 break;
3120
2941 err = btrfs_insert_file_extent(trans, root, 3121 err = btrfs_insert_file_extent(trans, root,
2942 inode->i_ino, cur_offset, 0, 3122 inode->i_ino, cur_offset, 0,
2943 0, hole_size, 0, hole_size, 3123 0, hole_size, 0, hole_size,
2944 0, 0, 0); 3124 0, 0, 0);
2945 btrfs_drop_extent_cache(inode, hole_start, 3125 btrfs_drop_extent_cache(inode, hole_start,
2946 last_byte - 1, 0); 3126 last_byte - 1, 0);
3127 btrfs_unreserve_metadata_space(root, 1);
2947 } 3128 }
2948 free_extent_map(em); 3129 free_extent_map(em);
2949 cur_offset = last_byte; 3130 cur_offset = last_byte;
@@ -3003,6 +3184,11 @@ void btrfs_delete_inode(struct inode *inode)
3003 } 3184 }
3004 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3185 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3005 3186
3187 if (inode->i_nlink > 0) {
3188 BUG_ON(btrfs_root_refs(&root->root_item) != 0);
3189 goto no_delete;
3190 }
3191
3006 btrfs_i_size_write(inode, 0); 3192 btrfs_i_size_write(inode, 0);
3007 trans = btrfs_join_transaction(root, 1); 3193 trans = btrfs_join_transaction(root, 1);
3008 3194
@@ -3070,29 +3256,67 @@ out_err:
3070 * is kind of like crossing a mount point. 3256 * is kind of like crossing a mount point.
3071 */ 3257 */
3072static int fixup_tree_root_location(struct btrfs_root *root, 3258static int fixup_tree_root_location(struct btrfs_root *root,
3073 struct btrfs_key *location, 3259 struct inode *dir,
3074 struct btrfs_root **sub_root, 3260 struct dentry *dentry,
3075 struct dentry *dentry) 3261 struct btrfs_key *location,
3262 struct btrfs_root **sub_root)
3076{ 3263{
3077 struct btrfs_root_item *ri; 3264 struct btrfs_path *path;
3265 struct btrfs_root *new_root;
3266 struct btrfs_root_ref *ref;
3267 struct extent_buffer *leaf;
3268 int ret;
3269 int err = 0;
3078 3270
3079 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) 3271 path = btrfs_alloc_path();
3080 return 0; 3272 if (!path) {
3081 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) 3273 err = -ENOMEM;
3082 return 0; 3274 goto out;
3275 }
3083 3276
3084 *sub_root = btrfs_read_fs_root(root->fs_info, location, 3277 err = -ENOENT;
3085 dentry->d_name.name, 3278 ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
3086 dentry->d_name.len); 3279 BTRFS_I(dir)->root->root_key.objectid,
3087 if (IS_ERR(*sub_root)) 3280 location->objectid);
3088 return PTR_ERR(*sub_root); 3281 if (ret) {
3282 if (ret < 0)
3283 err = ret;
3284 goto out;
3285 }
3089 3286
3090 ri = &(*sub_root)->root_item; 3287 leaf = path->nodes[0];
3091 location->objectid = btrfs_root_dirid(ri); 3288 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3092 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3289 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
3093 location->offset = 0; 3290 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
3291 goto out;
3094 3292
3095 return 0; 3293 ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
3294 (unsigned long)(ref + 1),
3295 dentry->d_name.len);
3296 if (ret)
3297 goto out;
3298
3299 btrfs_release_path(root->fs_info->tree_root, path);
3300
3301 new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
3302 if (IS_ERR(new_root)) {
3303 err = PTR_ERR(new_root);
3304 goto out;
3305 }
3306
3307 if (btrfs_root_refs(&new_root->root_item) == 0) {
3308 err = -ENOENT;
3309 goto out;
3310 }
3311
3312 *sub_root = new_root;
3313 location->objectid = btrfs_root_dirid(&new_root->root_item);
3314 location->type = BTRFS_INODE_ITEM_KEY;
3315 location->offset = 0;
3316 err = 0;
3317out:
3318 btrfs_free_path(path);
3319 return err;
3096} 3320}
3097 3321
3098static void inode_tree_add(struct inode *inode) 3322static void inode_tree_add(struct inode *inode)
@@ -3101,11 +3325,13 @@ static void inode_tree_add(struct inode *inode)
3101 struct btrfs_inode *entry; 3325 struct btrfs_inode *entry;
3102 struct rb_node **p; 3326 struct rb_node **p;
3103 struct rb_node *parent; 3327 struct rb_node *parent;
3104
3105again: 3328again:
3106 p = &root->inode_tree.rb_node; 3329 p = &root->inode_tree.rb_node;
3107 parent = NULL; 3330 parent = NULL;
3108 3331
3332 if (hlist_unhashed(&inode->i_hash))
3333 return;
3334
3109 spin_lock(&root->inode_lock); 3335 spin_lock(&root->inode_lock);
3110 while (*p) { 3336 while (*p) {
3111 parent = *p; 3337 parent = *p;
@@ -3132,13 +3358,87 @@ again:
3132static void inode_tree_del(struct inode *inode) 3358static void inode_tree_del(struct inode *inode)
3133{ 3359{
3134 struct btrfs_root *root = BTRFS_I(inode)->root; 3360 struct btrfs_root *root = BTRFS_I(inode)->root;
3361 int empty = 0;
3135 3362
3136 spin_lock(&root->inode_lock); 3363 spin_lock(&root->inode_lock);
3137 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3364 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3138 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3365 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3139 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3366 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3367 empty = RB_EMPTY_ROOT(&root->inode_tree);
3368 }
3369 spin_unlock(&root->inode_lock);
3370
3371 if (empty && btrfs_root_refs(&root->root_item) == 0) {
3372 synchronize_srcu(&root->fs_info->subvol_srcu);
3373 spin_lock(&root->inode_lock);
3374 empty = RB_EMPTY_ROOT(&root->inode_tree);
3375 spin_unlock(&root->inode_lock);
3376 if (empty)
3377 btrfs_add_dead_root(root);
3378 }
3379}
3380
3381int btrfs_invalidate_inodes(struct btrfs_root *root)
3382{
3383 struct rb_node *node;
3384 struct rb_node *prev;
3385 struct btrfs_inode *entry;
3386 struct inode *inode;
3387 u64 objectid = 0;
3388
3389 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
3390
3391 spin_lock(&root->inode_lock);
3392again:
3393 node = root->inode_tree.rb_node;
3394 prev = NULL;
3395 while (node) {
3396 prev = node;
3397 entry = rb_entry(node, struct btrfs_inode, rb_node);
3398
3399 if (objectid < entry->vfs_inode.i_ino)
3400 node = node->rb_left;
3401 else if (objectid > entry->vfs_inode.i_ino)
3402 node = node->rb_right;
3403 else
3404 break;
3405 }
3406 if (!node) {
3407 while (prev) {
3408 entry = rb_entry(prev, struct btrfs_inode, rb_node);
3409 if (objectid <= entry->vfs_inode.i_ino) {
3410 node = prev;
3411 break;
3412 }
3413 prev = rb_next(prev);
3414 }
3415 }
3416 while (node) {
3417 entry = rb_entry(node, struct btrfs_inode, rb_node);
3418 objectid = entry->vfs_inode.i_ino + 1;
3419 inode = igrab(&entry->vfs_inode);
3420 if (inode) {
3421 spin_unlock(&root->inode_lock);
3422 if (atomic_read(&inode->i_count) > 1)
3423 d_prune_aliases(inode);
3424 /*
3425 * btrfs_drop_inode will remove it from
3426 * the inode cache when its usage count
3427 * hits zero.
3428 */
3429 iput(inode);
3430 cond_resched();
3431 spin_lock(&root->inode_lock);
3432 goto again;
3433 }
3434
3435 if (cond_resched_lock(&root->inode_lock))
3436 goto again;
3437
3438 node = rb_next(node);
3140 } 3439 }
3141 spin_unlock(&root->inode_lock); 3440 spin_unlock(&root->inode_lock);
3441 return 0;
3142} 3442}
3143 3443
3144static noinline void init_btrfs_i(struct inode *inode) 3444static noinline void init_btrfs_i(struct inode *inode)
@@ -3225,15 +3525,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3225 return inode; 3525 return inode;
3226} 3526}
3227 3527
3528static struct inode *new_simple_dir(struct super_block *s,
3529 struct btrfs_key *key,
3530 struct btrfs_root *root)
3531{
3532 struct inode *inode = new_inode(s);
3533
3534 if (!inode)
3535 return ERR_PTR(-ENOMEM);
3536
3537 init_btrfs_i(inode);
3538
3539 BTRFS_I(inode)->root = root;
3540 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3541 BTRFS_I(inode)->dummy_inode = 1;
3542
3543 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
3544 inode->i_op = &simple_dir_inode_operations;
3545 inode->i_fop = &simple_dir_operations;
3546 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
3547 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3548
3549 return inode;
3550}
3551
3228struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 3552struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3229{ 3553{
3230 struct inode *inode; 3554 struct inode *inode;
3231 struct btrfs_inode *bi = BTRFS_I(dir); 3555 struct btrfs_root *root = BTRFS_I(dir)->root;
3232 struct btrfs_root *root = bi->root;
3233 struct btrfs_root *sub_root = root; 3556 struct btrfs_root *sub_root = root;
3234 struct btrfs_key location; 3557 struct btrfs_key location;
3558 int index;
3235 int ret; 3559 int ret;
3236 3560
3561 dentry->d_op = &btrfs_dentry_operations;
3562
3237 if (dentry->d_name.len > BTRFS_NAME_LEN) 3563 if (dentry->d_name.len > BTRFS_NAME_LEN)
3238 return ERR_PTR(-ENAMETOOLONG); 3564 return ERR_PTR(-ENAMETOOLONG);
3239 3565
@@ -3242,29 +3568,50 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3242 if (ret < 0) 3568 if (ret < 0)
3243 return ERR_PTR(ret); 3569 return ERR_PTR(ret);
3244 3570
3245 inode = NULL; 3571 if (location.objectid == 0)
3246 if (location.objectid) { 3572 return NULL;
3247 ret = fixup_tree_root_location(root, &location, &sub_root, 3573
3248 dentry); 3574 if (location.type == BTRFS_INODE_ITEM_KEY) {
3249 if (ret < 0) 3575 inode = btrfs_iget(dir->i_sb, &location, root);
3250 return ERR_PTR(ret); 3576 return inode;
3251 if (ret > 0) 3577 }
3252 return ERR_PTR(-ENOENT); 3578
3579 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
3580
3581 index = srcu_read_lock(&root->fs_info->subvol_srcu);
3582 ret = fixup_tree_root_location(root, dir, dentry,
3583 &location, &sub_root);
3584 if (ret < 0) {
3585 if (ret != -ENOENT)
3586 inode = ERR_PTR(ret);
3587 else
3588 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3589 } else {
3253 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3590 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3254 if (IS_ERR(inode))
3255 return ERR_CAST(inode);
3256 } 3591 }
3592 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3593
3257 return inode; 3594 return inode;
3258} 3595}
3259 3596
3597static int btrfs_dentry_delete(struct dentry *dentry)
3598{
3599 struct btrfs_root *root;
3600
3601 if (!dentry->d_inode)
3602 return 0;
3603
3604 root = BTRFS_I(dentry->d_inode)->root;
3605 if (btrfs_root_refs(&root->root_item) == 0)
3606 return 1;
3607 return 0;
3608}
3609
3260static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 3610static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3261 struct nameidata *nd) 3611 struct nameidata *nd)
3262{ 3612{
3263 struct inode *inode; 3613 struct inode *inode;
3264 3614
3265 if (dentry->d_name.len > BTRFS_NAME_LEN)
3266 return ERR_PTR(-ENAMETOOLONG);
3267
3268 inode = btrfs_lookup_dentry(dir, dentry); 3615 inode = btrfs_lookup_dentry(dir, dentry);
3269 if (IS_ERR(inode)) 3616 if (IS_ERR(inode))
3270 return ERR_CAST(inode); 3617 return ERR_CAST(inode);
@@ -3603,9 +3950,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3603 if (ret != 0) 3950 if (ret != 0)
3604 goto fail; 3951 goto fail;
3605 3952
3606 if (objectid > root->highest_inode)
3607 root->highest_inode = objectid;
3608
3609 inode->i_uid = current_fsuid(); 3953 inode->i_uid = current_fsuid();
3610 3954
3611 if (dir && (dir->i_mode & S_ISGID)) { 3955 if (dir && (dir->i_mode & S_ISGID)) {
@@ -3673,26 +4017,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
3673 struct inode *parent_inode, struct inode *inode, 4017 struct inode *parent_inode, struct inode *inode,
3674 const char *name, int name_len, int add_backref, u64 index) 4018 const char *name, int name_len, int add_backref, u64 index)
3675{ 4019{
3676 int ret; 4020 int ret = 0;
3677 struct btrfs_key key; 4021 struct btrfs_key key;
3678 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4022 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3679 4023
3680 key.objectid = inode->i_ino; 4024 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
3681 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4025 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
3682 key.offset = 0; 4026 } else {
4027 key.objectid = inode->i_ino;
4028 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4029 key.offset = 0;
4030 }
4031
4032 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4033 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4034 key.objectid, root->root_key.objectid,
4035 parent_inode->i_ino,
4036 index, name, name_len);
4037 } else if (add_backref) {
4038 ret = btrfs_insert_inode_ref(trans, root,
4039 name, name_len, inode->i_ino,
4040 parent_inode->i_ino, index);
4041 }
3683 4042
3684 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3685 parent_inode->i_ino,
3686 &key, btrfs_inode_type(inode),
3687 index);
3688 if (ret == 0) { 4043 if (ret == 0) {
3689 if (add_backref) { 4044 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3690 ret = btrfs_insert_inode_ref(trans, root, 4045 parent_inode->i_ino, &key,
3691 name, name_len, 4046 btrfs_inode_type(inode), index);
3692 inode->i_ino, 4047 BUG_ON(ret);
3693 parent_inode->i_ino, 4048
3694 index);
3695 }
3696 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4049 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3697 name_len * 2); 4050 name_len * 2);
3698 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4051 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -3732,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3732 if (!new_valid_dev(rdev)) 4085 if (!new_valid_dev(rdev))
3733 return -EINVAL; 4086 return -EINVAL;
3734 4087
3735 err = btrfs_check_metadata_free_space(root); 4088 /*
4089 * 2 for inode item and ref
4090 * 2 for dir items
4091 * 1 for xattr if selinux is on
4092 */
4093 err = btrfs_reserve_metadata_space(root, 5);
3736 if (err) 4094 if (err)
3737 goto fail; 4095 return err;
3738 4096
3739 trans = btrfs_start_transaction(root, 1); 4097 trans = btrfs_start_transaction(root, 1);
4098 if (!trans)
4099 goto fail;
3740 btrfs_set_trans_block_group(trans, dir); 4100 btrfs_set_trans_block_group(trans, dir);
3741 4101
3742 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4102 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3774,6 +4134,7 @@ out_unlock:
3774 nr = trans->blocks_used; 4134 nr = trans->blocks_used;
3775 btrfs_end_transaction_throttle(trans, root); 4135 btrfs_end_transaction_throttle(trans, root);
3776fail: 4136fail:
4137 btrfs_unreserve_metadata_space(root, 5);
3777 if (drop_inode) { 4138 if (drop_inode) {
3778 inode_dec_link_count(inode); 4139 inode_dec_link_count(inode);
3779 iput(inode); 4140 iput(inode);
@@ -3794,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3794 u64 objectid; 4155 u64 objectid;
3795 u64 index = 0; 4156 u64 index = 0;
3796 4157
3797 err = btrfs_check_metadata_free_space(root); 4158 /*
4159 * 2 for inode item and ref
4160 * 2 for dir items
4161 * 1 for xattr if selinux is on
4162 */
4163 err = btrfs_reserve_metadata_space(root, 5);
3798 if (err) 4164 if (err)
3799 goto fail; 4165 return err;
4166
3800 trans = btrfs_start_transaction(root, 1); 4167 trans = btrfs_start_transaction(root, 1);
4168 if (!trans)
4169 goto fail;
3801 btrfs_set_trans_block_group(trans, dir); 4170 btrfs_set_trans_block_group(trans, dir);
3802 4171
3803 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4172 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3838,6 +4207,7 @@ out_unlock:
3838 nr = trans->blocks_used; 4207 nr = trans->blocks_used;
3839 btrfs_end_transaction_throttle(trans, root); 4208 btrfs_end_transaction_throttle(trans, root);
3840fail: 4209fail:
4210 btrfs_unreserve_metadata_space(root, 5);
3841 if (drop_inode) { 4211 if (drop_inode) {
3842 inode_dec_link_count(inode); 4212 inode_dec_link_count(inode);
3843 iput(inode); 4213 iput(inode);
@@ -3860,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3860 if (inode->i_nlink == 0) 4230 if (inode->i_nlink == 0)
3861 return -ENOENT; 4231 return -ENOENT;
3862 4232
3863 btrfs_inc_nlink(inode); 4233 /*
3864 err = btrfs_check_metadata_free_space(root); 4234 * 1 item for inode ref
4235 * 2 items for dir items
4236 */
4237 err = btrfs_reserve_metadata_space(root, 3);
3865 if (err) 4238 if (err)
3866 goto fail; 4239 return err;
4240
4241 btrfs_inc_nlink(inode);
4242
3867 err = btrfs_set_inode_index(dir, &index); 4243 err = btrfs_set_inode_index(dir, &index);
3868 if (err) 4244 if (err)
3869 goto fail; 4245 goto fail;
@@ -3875,20 +4251,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3875 4251
3876 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4252 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3877 4253
3878 if (err) 4254 if (err) {
3879 drop_inode = 1;
3880
3881 btrfs_update_inode_block_group(trans, dir);
3882 err = btrfs_update_inode(trans, root, inode);
3883
3884 if (err)
3885 drop_inode = 1; 4255 drop_inode = 1;
4256 } else {
4257 btrfs_update_inode_block_group(trans, dir);
4258 err = btrfs_update_inode(trans, root, inode);
4259 BUG_ON(err);
4260 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
4261 }
3886 4262
3887 nr = trans->blocks_used; 4263 nr = trans->blocks_used;
3888
3889 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3890 btrfs_end_transaction_throttle(trans, root); 4264 btrfs_end_transaction_throttle(trans, root);
3891fail: 4265fail:
4266 btrfs_unreserve_metadata_space(root, 3);
3892 if (drop_inode) { 4267 if (drop_inode) {
3893 inode_dec_link_count(inode); 4268 inode_dec_link_count(inode);
3894 iput(inode); 4269 iput(inode);
@@ -3908,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3908 u64 index = 0; 4283 u64 index = 0;
3909 unsigned long nr = 1; 4284 unsigned long nr = 1;
3910 4285
3911 err = btrfs_check_metadata_free_space(root); 4286 /*
4287 * 2 items for inode and ref
4288 * 2 items for dir items
4289 * 1 for xattr if selinux is on
4290 */
4291 err = btrfs_reserve_metadata_space(root, 5);
3912 if (err) 4292 if (err)
3913 goto out_unlock; 4293 return err;
3914 4294
3915 trans = btrfs_start_transaction(root, 1); 4295 trans = btrfs_start_transaction(root, 1);
3916 btrfs_set_trans_block_group(trans, dir); 4296 if (!trans) {
3917 4297 err = -ENOMEM;
3918 if (IS_ERR(trans)) {
3919 err = PTR_ERR(trans);
3920 goto out_unlock; 4298 goto out_unlock;
3921 } 4299 }
4300 btrfs_set_trans_block_group(trans, dir);
3922 4301
3923 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4302 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3924 if (err) { 4303 if (err) {
@@ -3967,6 +4346,7 @@ out_fail:
3967 btrfs_end_transaction_throttle(trans, root); 4346 btrfs_end_transaction_throttle(trans, root);
3968 4347
3969out_unlock: 4348out_unlock:
4349 btrfs_unreserve_metadata_space(root, 5);
3970 if (drop_on_err) 4350 if (drop_on_err)
3971 iput(inode); 4351 iput(inode);
3972 btrfs_btree_balance_dirty(root, nr); 4352 btrfs_btree_balance_dirty(root, nr);
@@ -4064,11 +4444,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4064 int compressed; 4444 int compressed;
4065 4445
4066again: 4446again:
4067 spin_lock(&em_tree->lock); 4447 read_lock(&em_tree->lock);
4068 em = lookup_extent_mapping(em_tree, start, len); 4448 em = lookup_extent_mapping(em_tree, start, len);
4069 if (em) 4449 if (em)
4070 em->bdev = root->fs_info->fs_devices->latest_bdev; 4450 em->bdev = root->fs_info->fs_devices->latest_bdev;
4071 spin_unlock(&em_tree->lock); 4451 read_unlock(&em_tree->lock);
4072 4452
4073 if (em) { 4453 if (em) {
4074 if (em->start > start || em->start + em->len <= start) 4454 if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4595,11 @@ again:
4215 map = kmap(page); 4595 map = kmap(page);
4216 read_extent_buffer(leaf, map + pg_offset, ptr, 4596 read_extent_buffer(leaf, map + pg_offset, ptr,
4217 copy_size); 4597 copy_size);
4598 if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
4599 memset(map + pg_offset + copy_size, 0,
4600 PAGE_CACHE_SIZE - pg_offset -
4601 copy_size);
4602 }
4218 kunmap(page); 4603 kunmap(page);
4219 } 4604 }
4220 flush_dcache_page(page); 4605 flush_dcache_page(page);
@@ -4259,7 +4644,7 @@ insert:
4259 } 4644 }
4260 4645
4261 err = 0; 4646 err = 0;
4262 spin_lock(&em_tree->lock); 4647 write_lock(&em_tree->lock);
4263 ret = add_extent_mapping(em_tree, em); 4648 ret = add_extent_mapping(em_tree, em);
4264 /* it is possible that someone inserted the extent into the tree 4649 /* it is possible that someone inserted the extent into the tree
4265 * while we had the lock dropped. It is also possible that 4650 * while we had the lock dropped. It is also possible that
@@ -4299,7 +4684,7 @@ insert:
4299 err = 0; 4684 err = 0;
4300 } 4685 }
4301 } 4686 }
4302 spin_unlock(&em_tree->lock); 4687 write_unlock(&em_tree->lock);
4303out: 4688out:
4304 if (path) 4689 if (path)
4305 btrfs_free_path(path); 4690 btrfs_free_path(path);
@@ -4398,13 +4783,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4398 u64 page_start = page_offset(page); 4783 u64 page_start = page_offset(page);
4399 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4784 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4400 4785
4786
4787 /*
4788 * we have the page locked, so new writeback can't start,
4789 * and the dirty bit won't be cleared while we are here.
4790 *
4791 * Wait for IO on this page so that we can safely clear
4792 * the PagePrivate2 bit and do ordered accounting
4793 */
4401 wait_on_page_writeback(page); 4794 wait_on_page_writeback(page);
4795
4402 tree = &BTRFS_I(page->mapping->host)->io_tree; 4796 tree = &BTRFS_I(page->mapping->host)->io_tree;
4403 if (offset) { 4797 if (offset) {
4404 btrfs_releasepage(page, GFP_NOFS); 4798 btrfs_releasepage(page, GFP_NOFS);
4405 return; 4799 return;
4406 } 4800 }
4407
4408 lock_extent(tree, page_start, page_end, GFP_NOFS); 4801 lock_extent(tree, page_start, page_end, GFP_NOFS);
4409 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4802 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4410 page_offset(page)); 4803 page_offset(page));
@@ -4415,16 +4808,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4415 */ 4808 */
4416 clear_extent_bit(tree, page_start, page_end, 4809 clear_extent_bit(tree, page_start, page_end,
4417 EXTENT_DIRTY | EXTENT_DELALLOC | 4810 EXTENT_DIRTY | EXTENT_DELALLOC |
4418 EXTENT_LOCKED, 1, 0, GFP_NOFS); 4811 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
4419 btrfs_finish_ordered_io(page->mapping->host, 4812 /*
4420 page_start, page_end); 4813 * whoever cleared the private bit is responsible
4814 * for the finish_ordered_io
4815 */
4816 if (TestClearPagePrivate2(page)) {
4817 btrfs_finish_ordered_io(page->mapping->host,
4818 page_start, page_end);
4819 }
4421 btrfs_put_ordered_extent(ordered); 4820 btrfs_put_ordered_extent(ordered);
4422 lock_extent(tree, page_start, page_end, GFP_NOFS); 4821 lock_extent(tree, page_start, page_end, GFP_NOFS);
4423 } 4822 }
4424 clear_extent_bit(tree, page_start, page_end, 4823 clear_extent_bit(tree, page_start, page_end,
4425 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4824 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
4426 EXTENT_ORDERED, 4825 1, 1, NULL, GFP_NOFS);
4427 1, 1, GFP_NOFS);
4428 __btrfs_releasepage(page, GFP_NOFS); 4826 __btrfs_releasepage(page, GFP_NOFS);
4429 4827
4430 ClearPageChecked(page); 4828 ClearPageChecked(page);
@@ -4473,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4473 goto out; 4871 goto out;
4474 } 4872 }
4475 4873
4874 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4875 if (ret) {
4876 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4877 ret = VM_FAULT_SIGBUS;
4878 goto out;
4879 }
4880
4476 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4881 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4477again: 4882again:
4478 lock_page(page); 4883 lock_page(page);
@@ -4504,7 +4909,23 @@ again:
4504 goto again; 4909 goto again;
4505 } 4910 }
4506 4911
4507 btrfs_set_extent_delalloc(inode, page_start, page_end); 4912 /*
4913 * XXX - page_mkwrite gets called every time the page is dirtied, even
4914 * if it was already dirty, so for space accounting reasons we need to
4915 * clear any delalloc bits for the range we are fixing to save. There
4916 * is probably a better way to do this, but for now keep consistent with
4917 * prepare_pages in the normal write path.
4918 */
4919 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
4920 EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
4921
4922 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
4923 if (ret) {
4924 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4925 ret = VM_FAULT_SIGBUS;
4926 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4927 goto out_unlock;
4928 }
4508 ret = 0; 4929 ret = 0;
4509 4930
4510 /* page is wholly or partially inside EOF */ 4931 /* page is wholly or partially inside EOF */
@@ -4521,11 +4942,15 @@ again:
4521 } 4942 }
4522 ClearPageChecked(page); 4943 ClearPageChecked(page);
4523 set_page_dirty(page); 4944 set_page_dirty(page);
4945 SetPageUptodate(page);
4524 4946
4525 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 4947 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4526 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4948 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4527 4949
4528out_unlock: 4950out_unlock:
4951 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4952 if (!ret)
4953 return VM_FAULT_LOCKED;
4529 unlock_page(page); 4954 unlock_page(page);
4530out: 4955out:
4531 return ret; 4956 return ret;
@@ -4594,11 +5019,11 @@ out:
4594 * create a new subvolume directory/inode (helper for the ioctl). 5019 * create a new subvolume directory/inode (helper for the ioctl).
4595 */ 5020 */
4596int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 5021int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4597 struct btrfs_root *new_root, struct dentry *dentry, 5022 struct btrfs_root *new_root,
4598 u64 new_dirid, u64 alloc_hint) 5023 u64 new_dirid, u64 alloc_hint)
4599{ 5024{
4600 struct inode *inode; 5025 struct inode *inode;
4601 int error; 5026 int err;
4602 u64 index = 0; 5027 u64 index = 0;
4603 5028
4604 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 5029 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4611,11 +5036,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4611 inode->i_nlink = 1; 5036 inode->i_nlink = 1;
4612 btrfs_i_size_write(inode, 0); 5037 btrfs_i_size_write(inode, 0);
4613 5038
4614 error = btrfs_update_inode(trans, new_root, inode); 5039 err = btrfs_update_inode(trans, new_root, inode);
4615 if (error) 5040 BUG_ON(err);
4616 return error;
4617 5041
4618 d_instantiate(dentry, inode); 5042 iput(inode);
4619 return 0; 5043 return 0;
4620} 5044}
4621 5045
@@ -4641,6 +5065,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4641 return NULL; 5065 return NULL;
4642 ei->last_trans = 0; 5066 ei->last_trans = 0;
4643 ei->logged_trans = 0; 5067 ei->logged_trans = 0;
5068 ei->delalloc_extents = 0;
5069 ei->delalloc_reserved_extents = 0;
4644 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5070 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4645 INIT_LIST_HEAD(&ei->i_orphan); 5071 INIT_LIST_HEAD(&ei->i_orphan);
4646 INIT_LIST_HEAD(&ei->ordered_operations); 5072 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4693,6 +5119,16 @@ void btrfs_destroy_inode(struct inode *inode)
4693 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5119 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4694} 5120}
4695 5121
5122void btrfs_drop_inode(struct inode *inode)
5123{
5124 struct btrfs_root *root = BTRFS_I(inode)->root;
5125
5126 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5127 generic_delete_inode(inode);
5128 else
5129 generic_drop_inode(inode);
5130}
5131
4696static void init_once(void *foo) 5132static void init_once(void *foo)
4697{ 5133{
4698 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 5134 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4761,31 +5197,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4761{ 5197{
4762 struct btrfs_trans_handle *trans; 5198 struct btrfs_trans_handle *trans;
4763 struct btrfs_root *root = BTRFS_I(old_dir)->root; 5199 struct btrfs_root *root = BTRFS_I(old_dir)->root;
5200 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
4764 struct inode *new_inode = new_dentry->d_inode; 5201 struct inode *new_inode = new_dentry->d_inode;
4765 struct inode *old_inode = old_dentry->d_inode; 5202 struct inode *old_inode = old_dentry->d_inode;
4766 struct timespec ctime = CURRENT_TIME; 5203 struct timespec ctime = CURRENT_TIME;
4767 u64 index = 0; 5204 u64 index = 0;
5205 u64 root_objectid;
4768 int ret; 5206 int ret;
4769 5207
4770 /* we're not allowed to rename between subvolumes */ 5208 if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
4771 if (BTRFS_I(old_inode)->root->root_key.objectid != 5209 return -EPERM;
4772 BTRFS_I(new_dir)->root->root_key.objectid) 5210
5211 /* we only allow rename subvolume link between subvolumes */
5212 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
4773 return -EXDEV; 5213 return -EXDEV;
4774 5214
5215 if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
5216 (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
5217 return -ENOTEMPTY;
5218
4775 if (S_ISDIR(old_inode->i_mode) && new_inode && 5219 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4776 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { 5220 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4777 return -ENOTEMPTY; 5221 return -ENOTEMPTY;
4778 }
4779 5222
4780 /* to rename a snapshot or subvolume, we need to juggle the 5223 /*
4781 * backrefs. This isn't coded yet 5224 * 2 items for dir items
5225 * 1 item for orphan entry
5226 * 1 item for ref
4782 */ 5227 */
4783 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5228 ret = btrfs_reserve_metadata_space(root, 4);
4784 return -EXDEV;
4785
4786 ret = btrfs_check_metadata_free_space(root);
4787 if (ret) 5229 if (ret)
4788 goto out_unlock; 5230 return ret;
4789 5231
4790 /* 5232 /*
4791 * we're using rename to replace one file with another. 5233 * we're using rename to replace one file with another.
@@ -4796,8 +5238,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4796 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 5238 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4797 filemap_flush(old_inode->i_mapping); 5239 filemap_flush(old_inode->i_mapping);
4798 5240
5241 /* close the racy window with snapshot create/destroy ioctl */
5242 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5243 down_read(&root->fs_info->subvol_sem);
5244
4799 trans = btrfs_start_transaction(root, 1); 5245 trans = btrfs_start_transaction(root, 1);
5246 btrfs_set_trans_block_group(trans, new_dir);
5247
5248 if (dest != root)
5249 btrfs_record_root_in_trans(trans, dest);
5250
5251 ret = btrfs_set_inode_index(new_dir, &index);
5252 if (ret)
5253 goto out_fail;
4800 5254
5255 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
5256 /* force full log commit if subvolume involved. */
5257 root->fs_info->last_trans_log_full_commit = trans->transid;
5258 } else {
5259 ret = btrfs_insert_inode_ref(trans, dest,
5260 new_dentry->d_name.name,
5261 new_dentry->d_name.len,
5262 old_inode->i_ino,
5263 new_dir->i_ino, index);
5264 if (ret)
5265 goto out_fail;
5266 /*
5267 * this is an ugly little race, but the rename is required
5268 * to make sure that if we crash, the inode is either at the
5269 * old name or the new one. pinning the log transaction lets
5270 * us make sure we don't allow a log commit to come in after
5271 * we unlink the name but before we add the new name back in.
5272 */
5273 btrfs_pin_log_trans(root);
5274 }
4801 /* 5275 /*
4802 * make sure the inode gets flushed if it is replacing 5276 * make sure the inode gets flushed if it is replacing
4803 * something. 5277 * something.
@@ -4807,18 +5281,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4807 btrfs_add_ordered_operation(trans, root, old_inode); 5281 btrfs_add_ordered_operation(trans, root, old_inode);
4808 } 5282 }
4809 5283
4810 /*
4811 * this is an ugly little race, but the rename is required to make
4812 * sure that if we crash, the inode is either at the old name
4813 * or the new one. pinning the log transaction lets us make sure
4814 * we don't allow a log commit to come in after we unlink the
4815 * name but before we add the new name back in.
4816 */
4817 btrfs_pin_log_trans(root);
4818
4819 btrfs_set_trans_block_group(trans, new_dir);
4820
4821 btrfs_inc_nlink(old_dentry->d_inode);
4822 old_dir->i_ctime = old_dir->i_mtime = ctime; 5284 old_dir->i_ctime = old_dir->i_mtime = ctime;
4823 new_dir->i_ctime = new_dir->i_mtime = ctime; 5285 new_dir->i_ctime = new_dir->i_mtime = ctime;
4824 old_inode->i_ctime = ctime; 5286 old_inode->i_ctime = ctime;
@@ -4826,47 +5288,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4826 if (old_dentry->d_parent != new_dentry->d_parent) 5288 if (old_dentry->d_parent != new_dentry->d_parent)
4827 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 5289 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4828 5290
4829 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 5291 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4830 old_dentry->d_name.name, 5292 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
4831 old_dentry->d_name.len); 5293 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
4832 if (ret) 5294 old_dentry->d_name.name,
4833 goto out_fail; 5295 old_dentry->d_name.len);
5296 } else {
5297 btrfs_inc_nlink(old_dentry->d_inode);
5298 ret = btrfs_unlink_inode(trans, root, old_dir,
5299 old_dentry->d_inode,
5300 old_dentry->d_name.name,
5301 old_dentry->d_name.len);
5302 }
5303 BUG_ON(ret);
4834 5304
4835 if (new_inode) { 5305 if (new_inode) {
4836 new_inode->i_ctime = CURRENT_TIME; 5306 new_inode->i_ctime = CURRENT_TIME;
4837 ret = btrfs_unlink_inode(trans, root, new_dir, 5307 if (unlikely(new_inode->i_ino ==
4838 new_dentry->d_inode, 5308 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4839 new_dentry->d_name.name, 5309 root_objectid = BTRFS_I(new_inode)->location.objectid;
4840 new_dentry->d_name.len); 5310 ret = btrfs_unlink_subvol(trans, dest, new_dir,
4841 if (ret) 5311 root_objectid,
4842 goto out_fail; 5312 new_dentry->d_name.name,
5313 new_dentry->d_name.len);
5314 BUG_ON(new_inode->i_nlink == 0);
5315 } else {
5316 ret = btrfs_unlink_inode(trans, dest, new_dir,
5317 new_dentry->d_inode,
5318 new_dentry->d_name.name,
5319 new_dentry->d_name.len);
5320 }
5321 BUG_ON(ret);
4843 if (new_inode->i_nlink == 0) { 5322 if (new_inode->i_nlink == 0) {
4844 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 5323 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4845 if (ret) 5324 BUG_ON(ret);
4846 goto out_fail;
4847 } 5325 }
4848
4849 } 5326 }
4850 ret = btrfs_set_inode_index(new_dir, &index);
4851 if (ret)
4852 goto out_fail;
4853 5327
4854 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, 5328 ret = btrfs_add_link(trans, new_dir, old_inode,
4855 old_inode, new_dentry->d_name.name, 5329 new_dentry->d_name.name,
4856 new_dentry->d_name.len, 1, index); 5330 new_dentry->d_name.len, 0, index);
4857 if (ret) 5331 BUG_ON(ret);
4858 goto out_fail;
4859 5332
4860 btrfs_log_new_name(trans, old_inode, old_dir, 5333 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
4861 new_dentry->d_parent); 5334 btrfs_log_new_name(trans, old_inode, old_dir,
5335 new_dentry->d_parent);
5336 btrfs_end_log_trans(root);
5337 }
4862out_fail: 5338out_fail:
4863
4864 /* this btrfs_end_log_trans just allows the current
4865 * log-sub transaction to complete
4866 */
4867 btrfs_end_log_trans(root);
4868 btrfs_end_transaction_throttle(trans, root); 5339 btrfs_end_transaction_throttle(trans, root);
4869out_unlock: 5340
5341 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5342 up_read(&root->fs_info->subvol_sem);
5343
5344 btrfs_unreserve_metadata_space(root, 4);
4870 return ret; 5345 return ret;
4871} 5346}
4872 5347
@@ -4938,11 +5413,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4938 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5413 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4939 return -ENAMETOOLONG; 5414 return -ENAMETOOLONG;
4940 5415
4941 err = btrfs_check_metadata_free_space(root); 5416 /*
5417 * 2 items for inode item and ref
5418 * 2 items for dir items
5419 * 1 item for xattr if selinux is on
5420 */
5421 err = btrfs_reserve_metadata_space(root, 5);
4942 if (err) 5422 if (err)
4943 goto out_fail; 5423 return err;
4944 5424
4945 trans = btrfs_start_transaction(root, 1); 5425 trans = btrfs_start_transaction(root, 1);
5426 if (!trans)
5427 goto out_fail;
4946 btrfs_set_trans_block_group(trans, dir); 5428 btrfs_set_trans_block_group(trans, dir);
4947 5429
4948 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5430 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5023,6 +5505,7 @@ out_unlock:
5023 nr = trans->blocks_used; 5505 nr = trans->blocks_used;
5024 btrfs_end_transaction_throttle(trans, root); 5506 btrfs_end_transaction_throttle(trans, root);
5025out_fail: 5507out_fail:
5508 btrfs_unreserve_metadata_space(root, 5);
5026 if (drop_inode) { 5509 if (drop_inode) {
5027 inode_dec_link_count(inode); 5510 inode_dec_link_count(inode);
5028 iput(inode); 5511 iput(inode);
@@ -5044,6 +5527,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5044 5527
5045 while (num_bytes > 0) { 5528 while (num_bytes > 0) {
5046 alloc_size = min(num_bytes, root->fs_info->max_extent); 5529 alloc_size = min(num_bytes, root->fs_info->max_extent);
5530
5531 ret = btrfs_reserve_metadata_space(root, 1);
5532 if (ret)
5533 goto out;
5534
5047 ret = btrfs_reserve_extent(trans, root, alloc_size, 5535 ret = btrfs_reserve_extent(trans, root, alloc_size,
5048 root->sectorsize, 0, alloc_hint, 5536 root->sectorsize, 0, alloc_hint,
5049 (u64)-1, &ins, 1); 5537 (u64)-1, &ins, 1);
@@ -5058,9 +5546,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5058 0, 0, 0, 5546 0, 0, 0,
5059 BTRFS_FILE_EXTENT_PREALLOC); 5547 BTRFS_FILE_EXTENT_PREALLOC);
5060 BUG_ON(ret); 5548 BUG_ON(ret);
5549 btrfs_drop_extent_cache(inode, cur_offset,
5550 cur_offset + ins.offset -1, 0);
5061 num_bytes -= ins.offset; 5551 num_bytes -= ins.offset;
5062 cur_offset += ins.offset; 5552 cur_offset += ins.offset;
5063 alloc_hint = ins.objectid + ins.offset; 5553 alloc_hint = ins.objectid + ins.offset;
5554 btrfs_unreserve_metadata_space(root, 1);
5064 } 5555 }
5065out: 5556out:
5066 if (cur_offset > start) { 5557 if (cur_offset > start) {
@@ -5201,7 +5692,7 @@ static int btrfs_permission(struct inode *inode, int mask)
5201 return generic_permission(inode, mask, btrfs_check_acl); 5692 return generic_permission(inode, mask, btrfs_check_acl);
5202} 5693}
5203 5694
5204static struct inode_operations btrfs_dir_inode_operations = { 5695static const struct inode_operations btrfs_dir_inode_operations = {
5205 .getattr = btrfs_getattr, 5696 .getattr = btrfs_getattr,
5206 .lookup = btrfs_lookup, 5697 .lookup = btrfs_lookup,
5207 .create = btrfs_create, 5698 .create = btrfs_create,
@@ -5219,11 +5710,12 @@ static struct inode_operations btrfs_dir_inode_operations = {
5219 .removexattr = btrfs_removexattr, 5710 .removexattr = btrfs_removexattr,
5220 .permission = btrfs_permission, 5711 .permission = btrfs_permission,
5221}; 5712};
5222static struct inode_operations btrfs_dir_ro_inode_operations = { 5713static const struct inode_operations btrfs_dir_ro_inode_operations = {
5223 .lookup = btrfs_lookup, 5714 .lookup = btrfs_lookup,
5224 .permission = btrfs_permission, 5715 .permission = btrfs_permission,
5225}; 5716};
5226static struct file_operations btrfs_dir_file_operations = { 5717
5718static const struct file_operations btrfs_dir_file_operations = {
5227 .llseek = generic_file_llseek, 5719 .llseek = generic_file_llseek,
5228 .read = generic_read_dir, 5720 .read = generic_read_dir,
5229 .readdir = btrfs_real_readdir, 5721 .readdir = btrfs_real_readdir,
@@ -5245,6 +5737,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5245 .readpage_io_failed_hook = btrfs_io_failed_hook, 5737 .readpage_io_failed_hook = btrfs_io_failed_hook,
5246 .set_bit_hook = btrfs_set_bit_hook, 5738 .set_bit_hook = btrfs_set_bit_hook,
5247 .clear_bit_hook = btrfs_clear_bit_hook, 5739 .clear_bit_hook = btrfs_clear_bit_hook,
5740 .merge_extent_hook = btrfs_merge_extent_hook,
5741 .split_extent_hook = btrfs_split_extent_hook,
5248}; 5742};
5249 5743
5250/* 5744/*
@@ -5259,7 +5753,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5259 * 5753 *
5260 * For now we're avoiding this by dropping bmap. 5754 * For now we're avoiding this by dropping bmap.
5261 */ 5755 */
5262static struct address_space_operations btrfs_aops = { 5756static const struct address_space_operations btrfs_aops = {
5263 .readpage = btrfs_readpage, 5757 .readpage = btrfs_readpage,
5264 .writepage = btrfs_writepage, 5758 .writepage = btrfs_writepage,
5265 .writepages = btrfs_writepages, 5759 .writepages = btrfs_writepages,
@@ -5269,16 +5763,17 @@ static struct address_space_operations btrfs_aops = {
5269 .invalidatepage = btrfs_invalidatepage, 5763 .invalidatepage = btrfs_invalidatepage,
5270 .releasepage = btrfs_releasepage, 5764 .releasepage = btrfs_releasepage,
5271 .set_page_dirty = btrfs_set_page_dirty, 5765 .set_page_dirty = btrfs_set_page_dirty,
5766 .error_remove_page = generic_error_remove_page,
5272}; 5767};
5273 5768
5274static struct address_space_operations btrfs_symlink_aops = { 5769static const struct address_space_operations btrfs_symlink_aops = {
5275 .readpage = btrfs_readpage, 5770 .readpage = btrfs_readpage,
5276 .writepage = btrfs_writepage, 5771 .writepage = btrfs_writepage,
5277 .invalidatepage = btrfs_invalidatepage, 5772 .invalidatepage = btrfs_invalidatepage,
5278 .releasepage = btrfs_releasepage, 5773 .releasepage = btrfs_releasepage,
5279}; 5774};
5280 5775
5281static struct inode_operations btrfs_file_inode_operations = { 5776static const struct inode_operations btrfs_file_inode_operations = {
5282 .truncate = btrfs_truncate, 5777 .truncate = btrfs_truncate,
5283 .getattr = btrfs_getattr, 5778 .getattr = btrfs_getattr,
5284 .setattr = btrfs_setattr, 5779 .setattr = btrfs_setattr,
@@ -5290,7 +5785,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5290 .fallocate = btrfs_fallocate, 5785 .fallocate = btrfs_fallocate,
5291 .fiemap = btrfs_fiemap, 5786 .fiemap = btrfs_fiemap,
5292}; 5787};
5293static struct inode_operations btrfs_special_inode_operations = { 5788static const struct inode_operations btrfs_special_inode_operations = {
5294 .getattr = btrfs_getattr, 5789 .getattr = btrfs_getattr,
5295 .setattr = btrfs_setattr, 5790 .setattr = btrfs_setattr,
5296 .permission = btrfs_permission, 5791 .permission = btrfs_permission,
@@ -5299,7 +5794,7 @@ static struct inode_operations btrfs_special_inode_operations = {
5299 .listxattr = btrfs_listxattr, 5794 .listxattr = btrfs_listxattr,
5300 .removexattr = btrfs_removexattr, 5795 .removexattr = btrfs_removexattr,
5301}; 5796};
5302static struct inode_operations btrfs_symlink_inode_operations = { 5797static const struct inode_operations btrfs_symlink_inode_operations = {
5303 .readlink = generic_readlink, 5798 .readlink = generic_readlink,
5304 .follow_link = page_follow_link_light, 5799 .follow_link = page_follow_link_light,
5305 .put_link = page_put_link, 5800 .put_link = page_put_link,
@@ -5309,3 +5804,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5309 .listxattr = btrfs_listxattr, 5804 .listxattr = btrfs_listxattr,
5310 .removexattr = btrfs_removexattr, 5805 .removexattr = btrfs_removexattr,
5311}; 5806};
5807
5808struct dentry_operations btrfs_dentry_operations = {
5809 .d_delete = btrfs_dentry_delete,
5810};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd88f25889f7..9a780c8d0ac8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
230 struct btrfs_root_item root_item; 230 struct btrfs_root_item root_item;
231 struct btrfs_inode_item *inode_item; 231 struct btrfs_inode_item *inode_item;
232 struct extent_buffer *leaf; 232 struct extent_buffer *leaf;
233 struct btrfs_root *new_root = root; 233 struct btrfs_root *new_root;
234 struct inode *dir; 234 struct inode *dir = dentry->d_parent->d_inode;
235 int ret; 235 int ret;
236 int err; 236 int err;
237 u64 objectid; 237 u64 objectid;
@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 goto fail_commit; 250 return ret;
245 251
246 trans = btrfs_start_transaction(root, 1); 252 trans = btrfs_start_transaction(root, 1);
247 BUG_ON(!trans); 253 BUG_ON(!trans);
@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
304 if (ret) 310 if (ret)
305 goto fail; 311 goto fail;
306 312
313 key.offset = (u64)-1;
314 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
315 BUG_ON(IS_ERR(new_root));
316
317 btrfs_record_root_in_trans(trans, new_root);
318
319 ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
320 BTRFS_I(dir)->block_group);
307 /* 321 /*
308 * insert the directory item 322 * insert the directory item
309 */ 323 */
310 key.offset = (u64)-1;
311 dir = dentry->d_parent->d_inode;
312 ret = btrfs_set_inode_index(dir, &index); 324 ret = btrfs_set_inode_index(dir, &index);
313 BUG_ON(ret); 325 BUG_ON(ret);
314 326
@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
322 ret = btrfs_update_inode(trans, root, dir); 334 ret = btrfs_update_inode(trans, root, dir);
323 BUG_ON(ret); 335 BUG_ON(ret);
324 336
325 /* add the backref first */
326 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 337 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
327 objectid, BTRFS_ROOT_BACKREF_KEY, 338 objectid, root->root_key.objectid,
328 root->root_key.objectid,
329 dir->i_ino, index, name, namelen); 339 dir->i_ino, index, name, namelen);
330 340
331 BUG_ON(ret); 341 BUG_ON(ret);
332 342
333 /* now add the forward ref */ 343 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
334 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
335 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
336 objectid,
337 dir->i_ino, index, name, namelen);
338
339 BUG_ON(ret);
340
341 ret = btrfs_commit_transaction(trans, root);
342 if (ret)
343 goto fail_commit;
344
345 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
346 BUG_ON(!new_root);
347
348 trans = btrfs_start_transaction(new_root, 1);
349 BUG_ON(!trans);
350
351 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
352 BTRFS_I(dir)->block_group);
353 if (ret)
354 goto fail;
355
356fail: 344fail:
357 nr = trans->blocks_used; 345 nr = trans->blocks_used;
358 err = btrfs_commit_transaction(trans, new_root); 346 err = btrfs_commit_transaction(trans, root);
359 if (err && !ret) 347 if (err && !ret)
360 ret = err; 348 ret = err;
361fail_commit: 349
350 btrfs_unreserve_metadata_space(root, 6);
362 btrfs_btree_balance_dirty(root, nr); 351 btrfs_btree_balance_dirty(root, nr);
363 return ret; 352 return ret;
364} 353}
@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
375 if (!root->ref_cows) 364 if (!root->ref_cows)
376 return -EINVAL; 365 return -EINVAL;
377 366
378 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
379 if (ret) 374 if (ret)
380 goto fail_unlock; 375 goto fail_unlock;
381 376
382 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
383 if (!pending_snapshot) { 378 if (!pending_snapshot) {
384 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
385 goto fail_unlock; 381 goto fail_unlock;
386 } 382 }
387 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
388 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
389 ret = -ENOMEM; 385 ret = -ENOMEM;
390 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
391 goto fail_unlock; 388 goto fail_unlock;
392 } 389 }
393 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
420 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup 417 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
421 * inside this filesystem so it's quite a bit simpler. 418 * inside this filesystem so it's quite a bit simpler.
422 */ 419 */
423static noinline int btrfs_mksubvol(struct path *parent, char *name, 420static noinline int btrfs_mksubvol(struct path *parent,
424 int mode, int namelen, 421 char *name, int namelen,
425 struct btrfs_root *snap_src) 422 struct btrfs_root *snap_src)
426{ 423{
424 struct inode *dir = parent->dentry->d_inode;
427 struct dentry *dentry; 425 struct dentry *dentry;
428 int error; 426 int error;
429 427
430 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); 428 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
431 429
432 dentry = lookup_one_len(name, parent->dentry, namelen); 430 dentry = lookup_one_len(name, parent->dentry, namelen);
433 error = PTR_ERR(dentry); 431 error = PTR_ERR(dentry);
@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
438 if (dentry->d_inode) 436 if (dentry->d_inode)
439 goto out_dput; 437 goto out_dput;
440 438
441 if (!IS_POSIXACL(parent->dentry->d_inode))
442 mode &= ~current_umask();
443
444 error = mnt_want_write(parent->mnt); 439 error = mnt_want_write(parent->mnt);
445 if (error) 440 if (error)
446 goto out_dput; 441 goto out_dput;
447 442
448 error = btrfs_may_create(parent->dentry->d_inode, dentry); 443 error = btrfs_may_create(dir, dentry);
449 if (error) 444 if (error)
450 goto out_drop_write; 445 goto out_drop_write;
451 446
452 /* 447 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
453 * Actually perform the low-level subvolume creation after all 448
454 * this VFS fuzz. 449 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
455 * 450 goto out_up_read;
456 * Eventually we want to pass in an inode under which we create this 451
457 * subvolume, but for now all are under the filesystem root.
458 *
459 * Also we should pass on the mode eventually to allow creating new
460 * subvolume with specific mode bits.
461 */
462 if (snap_src) { 452 if (snap_src) {
463 struct dentry *dir = dentry->d_parent; 453 error = create_snapshot(snap_src, dentry,
464 struct dentry *test = dir->d_parent; 454 name, namelen);
465 struct btrfs_path *path = btrfs_alloc_path();
466 int ret;
467 u64 test_oid;
468 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
469
470 test_oid = snap_src->root_key.objectid;
471
472 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
473 path, parent_oid, test_oid);
474 if (ret == 0)
475 goto create;
476 btrfs_release_path(snap_src->fs_info->tree_root, path);
477
478 /* we need to make sure we aren't creating a directory loop
479 * by taking a snapshot of something that has our current
480 * subvol in its directory tree. So, this loops through
481 * the dentries and checks the forward refs for each subvolume
482 * to see if is references the subvolume where we are
483 * placing this new snapshot.
484 */
485 while (1) {
486 if (!test ||
487 dir == snap_src->fs_info->sb->s_root ||
488 test == snap_src->fs_info->sb->s_root ||
489 test->d_inode->i_sb != snap_src->fs_info->sb) {
490 break;
491 }
492 if (S_ISLNK(test->d_inode->i_mode)) {
493 printk(KERN_INFO "Btrfs symlink in snapshot "
494 "path, failed\n");
495 error = -EMLINK;
496 btrfs_free_path(path);
497 goto out_drop_write;
498 }
499 test_oid =
500 BTRFS_I(test->d_inode)->root->root_key.objectid;
501 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
502 path, test_oid, parent_oid);
503 if (ret == 0) {
504 printk(KERN_INFO "Btrfs snapshot creation "
505 "failed, looping\n");
506 error = -EMLINK;
507 btrfs_free_path(path);
508 goto out_drop_write;
509 }
510 btrfs_release_path(snap_src->fs_info->tree_root, path);
511 test = test->d_parent;
512 }
513create:
514 btrfs_free_path(path);
515 error = create_snapshot(snap_src, dentry, name, namelen);
516 } else { 455 } else {
517 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, 456 error = create_subvol(BTRFS_I(dir)->root, dentry,
518 dentry, name, namelen); 457 name, namelen);
519 } 458 }
520 if (error) 459 if (!error)
521 goto out_drop_write; 460 fsnotify_mkdir(dir, dentry);
522 461out_up_read:
523 fsnotify_mkdir(parent->dentry->d_inode, dentry); 462 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
524out_drop_write: 463out_drop_write:
525 mnt_drop_write(parent->mnt); 464 mnt_drop_write(parent->mnt);
526out_dput: 465out_dput:
527 dput(dentry); 466 dput(dentry);
528out_unlock: 467out_unlock:
529 mutex_unlock(&parent->dentry->d_inode->i_mutex); 468 mutex_unlock(&dir->i_mutex);
530 return error; 469 return error;
531} 470}
532 471
533
534static int btrfs_defrag_file(struct file *file) 472static int btrfs_defrag_file(struct file *file)
535{ 473{
536 struct inode *inode = fdentry(file)->d_inode; 474 struct inode *inode = fdentry(file)->d_inode;
@@ -596,9 +534,8 @@ again:
596 clear_page_dirty_for_io(page); 534 clear_page_dirty_for_io(page);
597 535
598 btrfs_set_extent_delalloc(inode, page_start, page_end); 536 btrfs_set_extent_delalloc(inode, page_start, page_end);
599
600 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
601 set_page_dirty(page); 537 set_page_dirty(page);
538 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
602 unlock_page(page); 539 unlock_page(page);
603 page_cache_release(page); 540 page_cache_release(page);
604 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 541 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -609,7 +546,8 @@ out_unlock:
609 return 0; 546 return 0;
610} 547}
611 548
612static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) 549static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
550 void __user *arg)
613{ 551{
614 u64 new_size; 552 u64 new_size;
615 u64 old_size; 553 u64 old_size;
@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
718{ 656{
719 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 657 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
720 struct btrfs_ioctl_vol_args *vol_args; 658 struct btrfs_ioctl_vol_args *vol_args;
721 struct btrfs_dir_item *di;
722 struct btrfs_path *path;
723 struct file *src_file; 659 struct file *src_file;
724 u64 root_dirid;
725 int namelen; 660 int namelen;
726 int ret = 0; 661 int ret = 0;
727 662
@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
739 goto out; 674 goto out;
740 } 675 }
741 676
742 path = btrfs_alloc_path();
743 if (!path) {
744 ret = -ENOMEM;
745 goto out;
746 }
747
748 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
749 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
750 path, root_dirid,
751 vol_args->name, namelen, 0);
752 btrfs_free_path(path);
753
754 if (di && !IS_ERR(di)) {
755 ret = -EEXIST;
756 goto out;
757 }
758
759 if (IS_ERR(di)) {
760 ret = PTR_ERR(di);
761 goto out;
762 }
763
764 if (subvol) { 677 if (subvol) {
765 ret = btrfs_mksubvol(&file->f_path, vol_args->name, 678 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
766 file->f_path.dentry->d_inode->i_mode, 679 NULL);
767 namelen, NULL);
768 } else { 680 } else {
769 struct inode *src_inode; 681 struct inode *src_inode;
770 src_file = fget(vol_args->fd); 682 src_file = fget(vol_args->fd);
@@ -781,17 +693,156 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
781 fput(src_file); 693 fput(src_file);
782 goto out; 694 goto out;
783 } 695 }
784 ret = btrfs_mksubvol(&file->f_path, vol_args->name, 696 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
785 file->f_path.dentry->d_inode->i_mode, 697 BTRFS_I(src_inode)->root);
786 namelen, BTRFS_I(src_inode)->root);
787 fput(src_file); 698 fput(src_file);
788 } 699 }
789
790out: 700out:
791 kfree(vol_args); 701 kfree(vol_args);
792 return ret; 702 return ret;
793} 703}
794 704
705/*
706 * helper to check if the subvolume references other subvolumes
707 */
708static noinline int may_destroy_subvol(struct btrfs_root *root)
709{
710 struct btrfs_path *path;
711 struct btrfs_key key;
712 int ret;
713
714 path = btrfs_alloc_path();
715 if (!path)
716 return -ENOMEM;
717
718 key.objectid = root->root_key.objectid;
719 key.type = BTRFS_ROOT_REF_KEY;
720 key.offset = (u64)-1;
721
722 ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
723 &key, path, 0, 0);
724 if (ret < 0)
725 goto out;
726 BUG_ON(ret == 0);
727
728 ret = 0;
729 if (path->slots[0] > 0) {
730 path->slots[0]--;
731 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
732 if (key.objectid == root->root_key.objectid &&
733 key.type == BTRFS_ROOT_REF_KEY)
734 ret = -ENOTEMPTY;
735 }
736out:
737 btrfs_free_path(path);
738 return ret;
739}
740
741static noinline int btrfs_ioctl_snap_destroy(struct file *file,
742 void __user *arg)
743{
744 struct dentry *parent = fdentry(file);
745 struct dentry *dentry;
746 struct inode *dir = parent->d_inode;
747 struct inode *inode;
748 struct btrfs_root *root = BTRFS_I(dir)->root;
749 struct btrfs_root *dest = NULL;
750 struct btrfs_ioctl_vol_args *vol_args;
751 struct btrfs_trans_handle *trans;
752 int namelen;
753 int ret;
754 int err = 0;
755
756 if (!capable(CAP_SYS_ADMIN))
757 return -EPERM;
758
759 vol_args = memdup_user(arg, sizeof(*vol_args));
760 if (IS_ERR(vol_args))
761 return PTR_ERR(vol_args);
762
763 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
764 namelen = strlen(vol_args->name);
765 if (strchr(vol_args->name, '/') ||
766 strncmp(vol_args->name, "..", namelen) == 0) {
767 err = -EINVAL;
768 goto out;
769 }
770
771 err = mnt_want_write(file->f_path.mnt);
772 if (err)
773 goto out;
774
775 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
776 dentry = lookup_one_len(vol_args->name, parent, namelen);
777 if (IS_ERR(dentry)) {
778 err = PTR_ERR(dentry);
779 goto out_unlock_dir;
780 }
781
782 if (!dentry->d_inode) {
783 err = -ENOENT;
784 goto out_dput;
785 }
786
787 inode = dentry->d_inode;
788 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
789 err = -EINVAL;
790 goto out_dput;
791 }
792
793 dest = BTRFS_I(inode)->root;
794
795 mutex_lock(&inode->i_mutex);
796 err = d_invalidate(dentry);
797 if (err)
798 goto out_unlock;
799
800 down_write(&root->fs_info->subvol_sem);
801
802 err = may_destroy_subvol(dest);
803 if (err)
804 goto out_up_write;
805
806 trans = btrfs_start_transaction(root, 1);
807 ret = btrfs_unlink_subvol(trans, root, dir,
808 dest->root_key.objectid,
809 dentry->d_name.name,
810 dentry->d_name.len);
811 BUG_ON(ret);
812
813 btrfs_record_root_in_trans(trans, dest);
814
815 memset(&dest->root_item.drop_progress, 0,
816 sizeof(dest->root_item.drop_progress));
817 dest->root_item.drop_level = 0;
818 btrfs_set_root_refs(&dest->root_item, 0);
819
820 ret = btrfs_insert_orphan_item(trans,
821 root->fs_info->tree_root,
822 dest->root_key.objectid);
823 BUG_ON(ret);
824
825 ret = btrfs_commit_transaction(trans, root);
826 BUG_ON(ret);
827 inode->i_flags |= S_DEAD;
828out_up_write:
829 up_write(&root->fs_info->subvol_sem);
830out_unlock:
831 mutex_unlock(&inode->i_mutex);
832 if (!err) {
833 btrfs_invalidate_inodes(dest);
834 d_delete(dentry);
835 }
836out_dput:
837 dput(dentry);
838out_unlock_dir:
839 mutex_unlock(&dir->i_mutex);
840 mnt_drop_write(file->f_path.mnt);
841out:
842 kfree(vol_args);
843 return err;
844}
845
795static int btrfs_ioctl_defrag(struct file *file) 846static int btrfs_ioctl_defrag(struct file *file)
796{ 847{
797 struct inode *inode = fdentry(file)->d_inode; 848 struct inode *inode = fdentry(file)->d_inode;
@@ -865,8 +916,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
865 return ret; 916 return ret;
866} 917}
867 918
868static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 919static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
869 u64 off, u64 olen, u64 destoff) 920 u64 off, u64 olen, u64 destoff)
870{ 921{
871 struct inode *inode = fdentry(file)->d_inode; 922 struct inode *inode = fdentry(file)->d_inode;
872 struct btrfs_root *root = BTRFS_I(inode)->root; 923 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,7 +1027,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
976 1027
977 /* punch hole in destination first */ 1028 /* punch hole in destination first */
978 btrfs_drop_extents(trans, root, inode, off, off + len, 1029 btrfs_drop_extents(trans, root, inode, off, off + len,
979 off + len, 0, &hint_byte); 1030 off + len, 0, &hint_byte, 1);
980 1031
981 /* clone data */ 1032 /* clone data */
982 key.objectid = src->i_ino; 1033 key.objectid = src->i_ino;
@@ -1071,8 +1122,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1071 datao += off - key.offset; 1122 datao += off - key.offset;
1072 datal -= off - key.offset; 1123 datal -= off - key.offset;
1073 } 1124 }
1074 if (key.offset + datao + datal + key.offset > 1125 if (key.offset + datao + datal > off + len)
1075 off + len)
1076 datal = off + len - key.offset - datao; 1126 datal = off + len - key.offset - datao;
1077 /* disko == 0 means it's a hole */ 1127 /* disko == 0 means it's a hole */
1078 if (!disko) 1128 if (!disko)
@@ -1182,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1182 struct inode *inode = fdentry(file)->d_inode; 1232 struct inode *inode = fdentry(file)->d_inode;
1183 struct btrfs_root *root = BTRFS_I(inode)->root; 1233 struct btrfs_root *root = BTRFS_I(inode)->root;
1184 struct btrfs_trans_handle *trans; 1234 struct btrfs_trans_handle *trans;
1185 int ret = 0; 1235 int ret;
1186 1236
1237 ret = -EPERM;
1187 if (!capable(CAP_SYS_ADMIN)) 1238 if (!capable(CAP_SYS_ADMIN))
1188 return -EPERM; 1239 goto out;
1189 1240
1190 if (file->private_data) { 1241 ret = -EINPROGRESS;
1191 ret = -EINPROGRESS; 1242 if (file->private_data)
1192 goto out; 1243 goto out;
1193 }
1194 1244
1195 ret = mnt_want_write(file->f_path.mnt); 1245 ret = mnt_want_write(file->f_path.mnt);
1196 if (ret) 1246 if (ret)
@@ -1200,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1200 root->fs_info->open_ioctl_trans++; 1250 root->fs_info->open_ioctl_trans++;
1201 mutex_unlock(&root->fs_info->trans_mutex); 1251 mutex_unlock(&root->fs_info->trans_mutex);
1202 1252
1253 ret = -ENOMEM;
1203 trans = btrfs_start_ioctl_transaction(root, 0); 1254 trans = btrfs_start_ioctl_transaction(root, 0);
1204 if (trans) 1255 if (!trans)
1205 file->private_data = trans; 1256 goto out_drop;
1206 else 1257
1207 ret = -ENOMEM; 1258 file->private_data = trans;
1208 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1259 return 0;
1260
1261out_drop:
1262 mutex_lock(&root->fs_info->trans_mutex);
1263 root->fs_info->open_ioctl_trans--;
1264 mutex_unlock(&root->fs_info->trans_mutex);
1265 mnt_drop_write(file->f_path.mnt);
1209out: 1266out:
1210 return ret; 1267 return ret;
1211} 1268}
@@ -1221,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1221 struct inode *inode = fdentry(file)->d_inode; 1278 struct inode *inode = fdentry(file)->d_inode;
1222 struct btrfs_root *root = BTRFS_I(inode)->root; 1279 struct btrfs_root *root = BTRFS_I(inode)->root;
1223 struct btrfs_trans_handle *trans; 1280 struct btrfs_trans_handle *trans;
1224 int ret = 0;
1225 1281
1226 trans = file->private_data; 1282 trans = file->private_data;
1227 if (!trans) { 1283 if (!trans)
1228 ret = -EINVAL; 1284 return -EINVAL;
1229 goto out;
1230 }
1231 btrfs_end_transaction(trans, root);
1232 file->private_data = NULL; 1285 file->private_data = NULL;
1233 1286
1287 btrfs_end_transaction(trans, root);
1288
1234 mutex_lock(&root->fs_info->trans_mutex); 1289 mutex_lock(&root->fs_info->trans_mutex);
1235 root->fs_info->open_ioctl_trans--; 1290 root->fs_info->open_ioctl_trans--;
1236 mutex_unlock(&root->fs_info->trans_mutex); 1291 mutex_unlock(&root->fs_info->trans_mutex);
1237 1292
1238 mnt_drop_write(file->f_path.mnt); 1293 mnt_drop_write(file->f_path.mnt);
1239 1294 return 0;
1240out:
1241 return ret;
1242} 1295}
1243 1296
1244long btrfs_ioctl(struct file *file, unsigned int 1297long btrfs_ioctl(struct file *file, unsigned int
@@ -1258,6 +1311,8 @@ long btrfs_ioctl(struct file *file, unsigned int
1258 return btrfs_ioctl_snap_create(file, argp, 0); 1311 return btrfs_ioctl_snap_create(file, argp, 0);
1259 case BTRFS_IOC_SUBVOL_CREATE: 1312 case BTRFS_IOC_SUBVOL_CREATE:
1260 return btrfs_ioctl_snap_create(file, argp, 1); 1313 return btrfs_ioctl_snap_create(file, argp, 1);
1314 case BTRFS_IOC_SNAP_DESTROY:
1315 return btrfs_ioctl_snap_destroy(file, argp);
1261 case BTRFS_IOC_DEFRAG: 1316 case BTRFS_IOC_DEFRAG:
1262 return btrfs_ioctl_defrag(file); 1317 return btrfs_ioctl_defrag(file);
1263 case BTRFS_IOC_RESIZE: 1318 case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b320b103fa13..bc49914475eb 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
65 65
66#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ 66#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
67 struct btrfs_ioctl_vol_args) 67 struct btrfs_ioctl_vol_args)
68 68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args)
69#endif 70#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7b2f401e604e..897fba835f89 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
159 * 159 *
160 * len is the length of the extent 160 * len is the length of the extent
161 * 161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was 162 * The tree is given a single reference on the ordered extent that was
165 * inserted. 163 * inserted.
166 */ 164 */
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
181 entry->start = start; 179 entry->start = start;
182 entry->len = len; 180 entry->len = len;
183 entry->disk_len = disk_len; 181 entry->disk_len = disk_len;
182 entry->bytes_left = len;
184 entry->inode = inode; 183 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 184 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags); 185 set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
195 &entry->rb_node); 194 &entry->rb_node);
196 BUG_ON(node); 195 BUG_ON(node);
197 196
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list, 198 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents); 199 &BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
241 struct btrfs_ordered_inode_tree *tree; 237 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node; 238 struct rb_node *node;
243 struct btrfs_ordered_extent *entry; 239 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret; 240 int ret;
246 241
247 tree = &BTRFS_I(inode)->ordered_tree; 242 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex); 243 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset); 244 node = tree_search(tree, file_offset);
252 if (!node) { 245 if (!node) {
253 ret = 1; 246 ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
260 goto out; 253 goto out;
261 } 254 }
262 255
263 ret = test_range_bit(io_tree, entry->file_offset, 256 if (io_size > entry->bytes_left) {
264 entry->file_offset + entry->len - 1, 257 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
265 EXTENT_ORDERED, 0); 258 (unsigned long long)entry->bytes_left,
266 if (ret == 0) 259 (unsigned long long)io_size);
260 }
261 entry->bytes_left -= io_size;
262 if (entry->bytes_left == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 263 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
264 else
265 ret = 1;
268out: 266out:
269 mutex_unlock(&tree->mutex); 267 mutex_unlock(&tree->mutex);
270 return ret == 0; 268 return ret == 0;
@@ -460,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
460 * start IO on any dirty ones so the wait doesn't stall waiting 458 * start IO on any dirty ones so the wait doesn't stall waiting
461 * for pdflush to find them 459 * for pdflush to find them
462 */ 460 */
463 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 461 filemap_fdatawrite_range(inode->i_mapping, start, end);
464 if (wait) { 462 if (wait) {
465 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
466 &entry->flags)); 464 &entry->flags));
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
476 u64 orig_end; 474 u64 orig_end;
477 u64 wait_end; 475 u64 wait_end;
478 struct btrfs_ordered_extent *ordered; 476 struct btrfs_ordered_extent *ordered;
477 int found;
479 478
480 if (start + len < start) { 479 if (start + len < start) {
481 orig_end = INT_LIMIT(loff_t); 480 orig_end = INT_LIMIT(loff_t);
@@ -489,19 +488,18 @@ again:
489 /* start IO across the range first to instantiate any delalloc 488 /* start IO across the range first to instantiate any delalloc
490 * extents 489 * extents
491 */ 490 */
492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 491 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
493 492
494 /* The compression code will leave pages locked but return from 493 /* The compression code will leave pages locked but return from
495 * writepage without setting the page writeback. Starting again 494 * writepage without setting the page writeback. Starting again
496 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 495 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
497 */ 496 */
498 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
499 498
500 btrfs_wait_on_page_writeback_range(inode->i_mapping, 499 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
501 start >> PAGE_CACHE_SHIFT,
502 orig_end >> PAGE_CACHE_SHIFT);
503 500
504 end = orig_end; 501 end = orig_end;
502 found = 0;
505 while (1) { 503 while (1) {
506 ordered = btrfs_lookup_first_ordered_extent(inode, end); 504 ordered = btrfs_lookup_first_ordered_extent(inode, end);
507 if (!ordered) 505 if (!ordered)
@@ -514,6 +512,7 @@ again:
514 btrfs_put_ordered_extent(ordered); 512 btrfs_put_ordered_extent(ordered);
515 break; 513 break;
516 } 514 }
515 found++;
517 btrfs_start_ordered_extent(inode, ordered, 1); 516 btrfs_start_ordered_extent(inode, ordered, 1);
518 end = ordered->file_offset; 517 end = ordered->file_offset;
519 btrfs_put_ordered_extent(ordered); 518 btrfs_put_ordered_extent(ordered);
@@ -521,8 +520,8 @@ again:
521 break; 520 break;
522 end--; 521 end--;
523 } 522 }
524 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, 523 if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
525 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { 524 EXTENT_DELALLOC, 0, NULL)) {
526 schedule_timeout(1); 525 schedule_timeout(1);
527 goto again; 526 goto again;
528 } 527 }
@@ -613,7 +612,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
613 */ 612 */
614 if (test_range_bit(io_tree, disk_i_size, 613 if (test_range_bit(io_tree, disk_i_size,
615 ordered->file_offset + ordered->len - 1, 614 ordered->file_offset + ordered->len - 1,
616 EXTENT_DELALLOC, 0)) { 615 EXTENT_DELALLOC, 0, NULL)) {
617 goto out; 616 goto out;
618 } 617 }
619 /* 618 /*
@@ -664,7 +663,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
664 */ 663 */
665 if (i_size_test > entry_end(ordered) && 664 if (i_size_test > entry_end(ordered) &&
666 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, 665 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
667 EXTENT_DELALLOC, 0)) { 666 EXTENT_DELALLOC, 0, NULL)) {
668 new_i_size = min_t(u64, i_size_test, i_size_read(inode)); 667 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
669 } 668 }
670 BTRFS_I(inode)->disk_i_size = new_i_size; 669 BTRFS_I(inode)->disk_i_size = new_i_size;
@@ -715,89 +714,6 @@ out:
715} 714}
716 715
717 716
718/**
719 * taken from mm/filemap.c because it isn't exported
720 *
721 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
722 * @mapping: address space structure to write
723 * @start: offset in bytes where the range starts
724 * @end: offset in bytes where the range ends (inclusive)
725 * @sync_mode: enable synchronous operation
726 *
727 * Start writeback against all of a mapping's dirty pages that lie
728 * within the byte offsets <start, end> inclusive.
729 *
730 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
731 * opposed to a regular memory cleansing writeback. The difference between
732 * these two operations is that if a dirty page/buffer is encountered, it must
733 * be waited upon, and not just skipped over.
734 */
735int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
736 loff_t end, int sync_mode)
737{
738 struct writeback_control wbc = {
739 .sync_mode = sync_mode,
740 .nr_to_write = mapping->nrpages * 2,
741 .range_start = start,
742 .range_end = end,
743 };
744 return btrfs_writepages(mapping, &wbc);
745}
746
747/**
748 * taken from mm/filemap.c because it isn't exported
749 *
750 * wait_on_page_writeback_range - wait for writeback to complete
751 * @mapping: target address_space
752 * @start: beginning page index
753 * @end: ending page index
754 *
755 * Wait for writeback to complete against pages indexed by start->end
756 * inclusive
757 */
758int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
759 pgoff_t start, pgoff_t end)
760{
761 struct pagevec pvec;
762 int nr_pages;
763 int ret = 0;
764 pgoff_t index;
765
766 if (end < start)
767 return 0;
768
769 pagevec_init(&pvec, 0);
770 index = start;
771 while ((index <= end) &&
772 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
773 PAGECACHE_TAG_WRITEBACK,
774 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
775 unsigned i;
776
777 for (i = 0; i < nr_pages; i++) {
778 struct page *page = pvec.pages[i];
779
780 /* until radix tree lookup accepts end_index */
781 if (page->index > end)
782 continue;
783
784 wait_on_page_writeback(page);
785 if (PageError(page))
786 ret = -EIO;
787 }
788 pagevec_release(&pvec);
789 cond_resched();
790 }
791
792 /* Check for outstanding write errors */
793 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
794 ret = -ENOSPC;
795 if (test_and_clear_bit(AS_EIO, &mapping->flags))
796 ret = -EIO;
797
798 return ret;
799}
800
801/* 717/*
802 * add a given inode to the list of inodes that must be fully on 718 * add a given inode to the list of inodes that must be fully on
803 * disk before a transaction commit finishes. 719 * disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b01..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
85 /* extent length on disk */ 85 /* extent length on disk */
86 u64 disk_len; 86 u64 disk_len;
87 87
88 /* number of bytes that still need writing */
89 u64 bytes_left;
90
88 /* flags (described above) */ 91 /* flags (described above) */
89 unsigned long flags; 92 unsigned long flags;
90 93
@@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 3c0d52af4f80..79cba5fbc28e 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -65,3 +65,23 @@ out:
65 btrfs_free_path(path); 65 btrfs_free_path(path);
66 return ret; 66 return ret;
67} 67}
68
69int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
70{
71 struct btrfs_path *path;
72 struct btrfs_key key;
73 int ret;
74
75 key.objectid = BTRFS_ORPHAN_OBJECTID;
76 key.type = BTRFS_ORPHAN_ITEM_KEY;
77 key.offset = offset;
78
79 path = btrfs_alloc_path();
80 if (!path)
81 return -ENOMEM;
82
83 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
84
85 btrfs_free_path(path);
86 return ret;
87}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c04f7f212602..361ad323faac 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -121,6 +121,15 @@ struct inodevec {
121 int nr; 121 int nr;
122}; 122};
123 123
124#define MAX_EXTENTS 128
125
126struct file_extent_cluster {
127 u64 start;
128 u64 end;
129 u64 boundary[MAX_EXTENTS];
130 unsigned int nr;
131};
132
124struct reloc_control { 133struct reloc_control {
125 /* block group to relocate */ 134 /* block group to relocate */
126 struct btrfs_block_group_cache *block_group; 135 struct btrfs_block_group_cache *block_group;
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2180 struct reloc_control *rc) 2189 struct reloc_control *rc)
2181{ 2190{
2182 if (test_range_bit(&rc->processed_blocks, bytenr, 2191 if (test_range_bit(&rc->processed_blocks, bytenr,
2183 bytenr + blocksize - 1, EXTENT_DIRTY, 1)) 2192 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2184 return 1; 2193 return 1;
2185 return 0; 2194 return 0;
2186} 2195}
@@ -2529,56 +2538,94 @@ out:
2529} 2538}
2530 2539
2531static noinline_for_stack 2540static noinline_for_stack
2532int relocate_inode_pages(struct inode *inode, u64 start, u64 len) 2541int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2542 u64 block_start)
2543{
2544 struct btrfs_root *root = BTRFS_I(inode)->root;
2545 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2546 struct extent_map *em;
2547 int ret = 0;
2548
2549 em = alloc_extent_map(GFP_NOFS);
2550 if (!em)
2551 return -ENOMEM;
2552
2553 em->start = start;
2554 em->len = end + 1 - start;
2555 em->block_len = em->len;
2556 em->block_start = block_start;
2557 em->bdev = root->fs_info->fs_devices->latest_bdev;
2558 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2559
2560 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2561 while (1) {
2562 write_lock(&em_tree->lock);
2563 ret = add_extent_mapping(em_tree, em);
2564 write_unlock(&em_tree->lock);
2565 if (ret != -EEXIST) {
2566 free_extent_map(em);
2567 break;
2568 }
2569 btrfs_drop_extent_cache(inode, start, end, 0);
2570 }
2571 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2572 return ret;
2573}
2574
2575static int relocate_file_extent_cluster(struct inode *inode,
2576 struct file_extent_cluster *cluster)
2533{ 2577{
2534 u64 page_start; 2578 u64 page_start;
2535 u64 page_end; 2579 u64 page_end;
2536 unsigned long i; 2580 u64 offset = BTRFS_I(inode)->index_cnt;
2537 unsigned long first_index; 2581 unsigned long index;
2538 unsigned long last_index; 2582 unsigned long last_index;
2539 unsigned int total_read = 0; 2583 unsigned int dirty_page = 0;
2540 unsigned int total_dirty = 0;
2541 struct page *page; 2584 struct page *page;
2542 struct file_ra_state *ra; 2585 struct file_ra_state *ra;
2543 struct btrfs_ordered_extent *ordered; 2586 int nr = 0;
2544 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2545 int ret = 0; 2587 int ret = 0;
2546 2588
2589 if (!cluster->nr)
2590 return 0;
2591
2547 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2592 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2548 if (!ra) 2593 if (!ra)
2549 return -ENOMEM; 2594 return -ENOMEM;
2550 2595
2596 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2597 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2598
2551 mutex_lock(&inode->i_mutex); 2599 mutex_lock(&inode->i_mutex);
2552 first_index = start >> PAGE_CACHE_SHIFT;
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554 2600
2555 /* make sure the dirty trick played by the caller work */ 2601 i_size_write(inode, cluster->end + 1 - offset);
2556 while (1) { 2602 ret = setup_extent_mapping(inode, cluster->start - offset,
2557 ret = invalidate_inode_pages2_range(inode->i_mapping, 2603 cluster->end - offset, cluster->start);
2558 first_index, last_index);
2559 if (ret != -EBUSY)
2560 break;
2561 schedule_timeout(HZ/10);
2562 }
2563 if (ret) 2604 if (ret)
2564 goto out_unlock; 2605 goto out_unlock;
2565 2606
2566 file_ra_state_init(ra, inode->i_mapping); 2607 file_ra_state_init(ra, inode->i_mapping);
2567 2608
2568 for (i = first_index ; i <= last_index; i++) { 2609 WARN_ON(cluster->start != cluster->boundary[0]);
2569 if (total_read % ra->ra_pages == 0) { 2610 while (index <= last_index) {
2570 btrfs_force_ra(inode->i_mapping, ra, NULL, i, 2611 page = find_lock_page(inode->i_mapping, index);
2571 min(last_index, ra->ra_pages + i - 1));
2572 }
2573 total_read++;
2574again:
2575 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2576 BUG_ON(1);
2577 page = grab_cache_page(inode->i_mapping, i);
2578 if (!page) { 2612 if (!page) {
2579 ret = -ENOMEM; 2613 page_cache_sync_readahead(inode->i_mapping,
2580 goto out_unlock; 2614 ra, NULL, index,
2615 last_index + 1 - index);
2616 page = grab_cache_page(inode->i_mapping, index);
2617 if (!page) {
2618 ret = -ENOMEM;
2619 goto out_unlock;
2620 }
2621 }
2622
2623 if (PageReadahead(page)) {
2624 page_cache_async_readahead(inode->i_mapping,
2625 ra, NULL, page, index,
2626 last_index + 1 - index);
2581 } 2627 }
2628
2582 if (!PageUptodate(page)) { 2629 if (!PageUptodate(page)) {
2583 btrfs_readpage(NULL, page); 2630 btrfs_readpage(NULL, page);
2584 lock_page(page); 2631 lock_page(page);
@@ -2589,75 +2636,79 @@ again:
2589 goto out_unlock; 2636 goto out_unlock;
2590 } 2637 }
2591 } 2638 }
2592 wait_on_page_writeback(page);
2593 2639
2594 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2640 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2595 page_end = page_start + PAGE_CACHE_SIZE - 1; 2641 page_end = page_start + PAGE_CACHE_SIZE - 1;
2596 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 2642
2597 2643 lock_extent(&BTRFS_I(inode)->io_tree,
2598 ordered = btrfs_lookup_ordered_extent(inode, page_start); 2644 page_start, page_end, GFP_NOFS);
2599 if (ordered) { 2645
2600 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2601 unlock_page(page);
2602 page_cache_release(page);
2603 btrfs_start_ordered_extent(inode, ordered, 1);
2604 btrfs_put_ordered_extent(ordered);
2605 goto again;
2606 }
2607 set_page_extent_mapped(page); 2646 set_page_extent_mapped(page);
2608 2647
2609 if (i == first_index) 2648 if (nr < cluster->nr &&
2610 set_extent_bits(io_tree, page_start, page_end, 2649 page_start + offset == cluster->boundary[nr]) {
2650 set_extent_bits(&BTRFS_I(inode)->io_tree,
2651 page_start, page_end,
2611 EXTENT_BOUNDARY, GFP_NOFS); 2652 EXTENT_BOUNDARY, GFP_NOFS);
2653 nr++;
2654 }
2612 btrfs_set_extent_delalloc(inode, page_start, page_end); 2655 btrfs_set_extent_delalloc(inode, page_start, page_end);
2613 2656
2614 set_page_dirty(page); 2657 set_page_dirty(page);
2615 total_dirty++; 2658 dirty_page++;
2616 2659
2617 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 2660 unlock_extent(&BTRFS_I(inode)->io_tree,
2661 page_start, page_end, GFP_NOFS);
2618 unlock_page(page); 2662 unlock_page(page);
2619 page_cache_release(page); 2663 page_cache_release(page);
2664
2665 index++;
2666 if (nr < cluster->nr &&
2667 page_end + 1 + offset == cluster->boundary[nr]) {
2668 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2669 dirty_page);
2670 dirty_page = 0;
2671 }
2672 }
2673 if (dirty_page) {
2674 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2675 dirty_page);
2620 } 2676 }
2677 WARN_ON(nr != cluster->nr);
2621out_unlock: 2678out_unlock:
2622 mutex_unlock(&inode->i_mutex); 2679 mutex_unlock(&inode->i_mutex);
2623 kfree(ra); 2680 kfree(ra);
2624 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
2625 return ret; 2681 return ret;
2626} 2682}
2627 2683
2628static noinline_for_stack 2684static noinline_for_stack
2629int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) 2685int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2686 struct file_extent_cluster *cluster)
2630{ 2687{
2631 struct btrfs_root *root = BTRFS_I(inode)->root; 2688 int ret;
2632 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2633 struct extent_map *em;
2634 u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
2635 u64 end = start + extent_key->offset - 1;
2636
2637 em = alloc_extent_map(GFP_NOFS);
2638 em->start = start;
2639 em->len = extent_key->offset;
2640 em->block_len = extent_key->offset;
2641 em->block_start = extent_key->objectid;
2642 em->bdev = root->fs_info->fs_devices->latest_bdev;
2643 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2644 2689
2645 /* setup extent map to cheat btrfs_readpage */ 2690 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2646 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); 2691 ret = relocate_file_extent_cluster(inode, cluster);
2647 while (1) { 2692 if (ret)
2648 int ret; 2693 return ret;
2649 spin_lock(&em_tree->lock); 2694 cluster->nr = 0;
2650 ret = add_extent_mapping(em_tree, em);
2651 spin_unlock(&em_tree->lock);
2652 if (ret != -EEXIST) {
2653 free_extent_map(em);
2654 break;
2655 }
2656 btrfs_drop_extent_cache(inode, start, end, 0);
2657 } 2695 }
2658 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2659 2696
2660 return relocate_inode_pages(inode, start, extent_key->offset); 2697 if (!cluster->nr)
2698 cluster->start = extent_key->objectid;
2699 else
2700 BUG_ON(cluster->nr >= MAX_EXTENTS);
2701 cluster->end = extent_key->objectid + extent_key->offset - 1;
2702 cluster->boundary[cluster->nr] = extent_key->objectid;
2703 cluster->nr++;
2704
2705 if (cluster->nr >= MAX_EXTENTS) {
2706 ret = relocate_file_extent_cluster(inode, cluster);
2707 if (ret)
2708 return ret;
2709 cluster->nr = 0;
2710 }
2711 return 0;
2661} 2712}
2662 2713
2663#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2714#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
3203 return 0; 3254 return 0;
3204} 3255}
3205 3256
3257
3206static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3258static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3207{ 3259{
3208 struct rb_root blocks = RB_ROOT; 3260 struct rb_root blocks = RB_ROOT;
3209 struct btrfs_key key; 3261 struct btrfs_key key;
3262 struct file_extent_cluster *cluster;
3210 struct btrfs_trans_handle *trans = NULL; 3263 struct btrfs_trans_handle *trans = NULL;
3211 struct btrfs_path *path; 3264 struct btrfs_path *path;
3212 struct btrfs_extent_item *ei; 3265 struct btrfs_extent_item *ei;
@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3216 int ret; 3269 int ret;
3217 int err = 0; 3270 int err = 0;
3218 3271
3272 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3273 if (!cluster)
3274 return -ENOMEM;
3275
3219 path = btrfs_alloc_path(); 3276 path = btrfs_alloc_path();
3220 if (!path) 3277 if (!path)
3221 return -ENOMEM; 3278 return -ENOMEM;
3222 3279
3280 rc->extents_found = 0;
3281 rc->extents_skipped = 0;
3282
3223 rc->search_start = rc->block_group->key.objectid; 3283 rc->search_start = rc->block_group->key.objectid;
3224 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3284 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3225 GFP_NOFS); 3285 GFP_NOFS);
@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3306 } 3366 }
3307 3367
3308 nr = trans->blocks_used; 3368 nr = trans->blocks_used;
3309 btrfs_end_transaction_throttle(trans, rc->extent_root); 3369 btrfs_end_transaction(trans, rc->extent_root);
3310 trans = NULL; 3370 trans = NULL;
3311 btrfs_btree_balance_dirty(rc->extent_root, nr); 3371 btrfs_btree_balance_dirty(rc->extent_root, nr);
3312 3372
3313 if (rc->stage == MOVE_DATA_EXTENTS && 3373 if (rc->stage == MOVE_DATA_EXTENTS &&
3314 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3374 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3315 rc->found_file_extent = 1; 3375 rc->found_file_extent = 1;
3316 ret = relocate_data_extent(rc->data_inode, &key); 3376 ret = relocate_data_extent(rc->data_inode,
3377 &key, cluster);
3317 if (ret < 0) { 3378 if (ret < 0) {
3318 err = ret; 3379 err = ret;
3319 break; 3380 break;
@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3328 btrfs_btree_balance_dirty(rc->extent_root, nr); 3389 btrfs_btree_balance_dirty(rc->extent_root, nr);
3329 } 3390 }
3330 3391
3392 if (!err) {
3393 ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3394 if (ret < 0)
3395 err = ret;
3396 }
3397
3398 kfree(cluster);
3399
3331 rc->create_reloc_root = 0; 3400 rc->create_reloc_root = 0;
3332 smp_mb(); 3401 smp_mb();
3333 3402
@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3348} 3417}
3349 3418
3350static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3419static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3351 struct btrfs_root *root, 3420 struct btrfs_root *root, u64 objectid)
3352 u64 objectid, u64 size)
3353{ 3421{
3354 struct btrfs_path *path; 3422 struct btrfs_path *path;
3355 struct btrfs_inode_item *item; 3423 struct btrfs_inode_item *item;
@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3368 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3436 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3369 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 3437 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3370 btrfs_set_inode_generation(leaf, item, 1); 3438 btrfs_set_inode_generation(leaf, item, 1);
3371 btrfs_set_inode_size(leaf, item, size); 3439 btrfs_set_inode_size(leaf, item, 0);
3372 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3440 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3373 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3441 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3374 btrfs_mark_buffer_dirty(leaf); 3442 btrfs_mark_buffer_dirty(leaf);
@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3404 if (err) 3472 if (err)
3405 goto out; 3473 goto out;
3406 3474
3407 err = __insert_orphan_inode(trans, root, objectid, group->key.offset); 3475 err = __insert_orphan_inode(trans, root, objectid);
3408 BUG_ON(err);
3409
3410 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
3411 group->key.offset, 0, group->key.offset,
3412 0, 0, 0);
3413 BUG_ON(err); 3476 BUG_ON(err);
3414 3477
3415 key.objectid = objectid; 3478 key.objectid = objectid;
@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3475 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 3538 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3476 3539
3477 while (1) { 3540 while (1) {
3478 mutex_lock(&fs_info->cleaner_mutex);
3479 btrfs_clean_old_snapshots(fs_info->tree_root);
3480 mutex_unlock(&fs_info->cleaner_mutex);
3481
3482 rc->extents_found = 0; 3541 rc->extents_found = 0;
3483 rc->extents_skipped = 0; 3542 rc->extents_skipped = 0;
3484 3543
3544 mutex_lock(&fs_info->cleaner_mutex);
3545
3546 btrfs_clean_old_snapshots(fs_info->tree_root);
3485 ret = relocate_block_group(rc); 3547 ret = relocate_block_group(rc);
3548
3549 mutex_unlock(&fs_info->cleaner_mutex);
3486 if (ret < 0) { 3550 if (ret < 0) {
3487 err = ret; 3551 err = ret;
3488 break; 3552 break;
@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 } 3578 }
3515 } 3579 }
3516 3580
3517 filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, 3581 filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3518 rc->block_group->key.objectid, 3582 rc->block_group->key.objectid,
3519 rc->block_group->key.objectid + 3583 rc->block_group->key.objectid +
3520 rc->block_group->key.offset - 1); 3584 rc->block_group->key.offset - 1);
3521 3585
3522 WARN_ON(rc->block_group->pinned > 0); 3586 WARN_ON(rc->block_group->pinned > 0);
3523 WARN_ON(rc->block_group->reserved > 0); 3587 WARN_ON(rc->block_group->reserved > 0);
@@ -3530,6 +3594,26 @@ out:
3530 return err; 3594 return err;
3531} 3595}
3532 3596
3597static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3598{
3599 struct btrfs_trans_handle *trans;
3600 int ret;
3601
3602 trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3603
3604 memset(&root->root_item.drop_progress, 0,
3605 sizeof(root->root_item.drop_progress));
3606 root->root_item.drop_level = 0;
3607 btrfs_set_root_refs(&root->root_item, 0);
3608 ret = btrfs_update_root(trans, root->fs_info->tree_root,
3609 &root->root_key, &root->root_item);
3610 BUG_ON(ret);
3611
3612 ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3613 BUG_ON(ret);
3614 return 0;
3615}
3616
3533/* 3617/*
3534 * recover relocation interrupted by system crash. 3618 * recover relocation interrupted by system crash.
3535 * 3619 *
@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3589 fs_root = read_fs_root(root->fs_info, 3673 fs_root = read_fs_root(root->fs_info,
3590 reloc_root->root_key.offset); 3674 reloc_root->root_key.offset);
3591 if (IS_ERR(fs_root)) { 3675 if (IS_ERR(fs_root)) {
3592 err = PTR_ERR(fs_root); 3676 ret = PTR_ERR(fs_root);
3593 goto out; 3677 if (ret != -ENOENT) {
3678 err = ret;
3679 goto out;
3680 }
3681 mark_garbage_root(reloc_root);
3594 } 3682 }
3595 } 3683 }
3596 3684
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ddc6d61c55a..9351428f30e2 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
94 goto out; 94 goto out;
95 95
96 BUG_ON(ret == 0); 96 BUG_ON(ret == 0);
97 if (path->slots[0] == 0) {
98 ret = 1;
99 goto out;
100 }
97 l = path->nodes[0]; 101 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1; 102 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot); 103 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) { 104 if (found_key.objectid != objectid ||
105 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 106 ret = 1;
103 goto out; 107 goto out;
104 } 108 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), 109 if (item)
106 sizeof(*item)); 110 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
107 memcpy(key, &found_key, sizeof(found_key)); 111 sizeof(*item));
112 if (key)
113 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0; 114 ret = 0;
109out: 115out:
110 btrfs_free_path(path); 116 btrfs_free_path(path);
@@ -249,6 +255,59 @@ err:
249 return ret; 255 return ret;
250} 256}
251 257
258int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259{
260 struct extent_buffer *leaf;
261 struct btrfs_path *path;
262 struct btrfs_key key;
263 int err = 0;
264 int ret;
265
266 path = btrfs_alloc_path();
267 if (!path)
268 return -ENOMEM;
269
270 key.objectid = BTRFS_ORPHAN_OBJECTID;
271 key.type = BTRFS_ORPHAN_ITEM_KEY;
272 key.offset = 0;
273
274 while (1) {
275 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
276 if (ret < 0) {
277 err = ret;
278 break;
279 }
280
281 leaf = path->nodes[0];
282 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
283 ret = btrfs_next_leaf(tree_root, path);
284 if (ret < 0)
285 err = ret;
286 if (ret != 0)
287 break;
288 leaf = path->nodes[0];
289 }
290
291 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
292 btrfs_release_path(tree_root, path);
293
294 if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
295 key.type != BTRFS_ORPHAN_ITEM_KEY)
296 break;
297
298 ret = btrfs_find_dead_roots(tree_root, key.offset);
299 if (ret) {
300 err = ret;
301 break;
302 }
303
304 key.offset++;
305 }
306
307 btrfs_free_path(path);
308 return err;
309}
310
252/* drop the root item for 'key' from 'root' */ 311/* drop the root item for 'key' from 'root' */
253int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 312int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
254 struct btrfs_key *key) 313 struct btrfs_key *key)
@@ -278,31 +337,57 @@ out:
278 return ret; 337 return ret;
279} 338}
280 339
281#if 0 /* this will get used when snapshot deletion is implemented */
282int btrfs_del_root_ref(struct btrfs_trans_handle *trans, 340int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
283 struct btrfs_root *tree_root, 341 struct btrfs_root *tree_root,
284 u64 root_id, u8 type, u64 ref_id) 342 u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
343 const char *name, int name_len)
344
285{ 345{
346 struct btrfs_path *path;
347 struct btrfs_root_ref *ref;
348 struct extent_buffer *leaf;
286 struct btrfs_key key; 349 struct btrfs_key key;
350 unsigned long ptr;
351 int err = 0;
287 int ret; 352 int ret;
288 struct btrfs_path *path;
289 353
290 path = btrfs_alloc_path(); 354 path = btrfs_alloc_path();
355 if (!path)
356 return -ENOMEM;
291 357
292 key.objectid = root_id; 358 key.objectid = root_id;
293 key.type = type; 359 key.type = BTRFS_ROOT_BACKREF_KEY;
294 key.offset = ref_id; 360 key.offset = ref_id;
295 361again:
296 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 362 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
297 BUG_ON(ret); 363 BUG_ON(ret < 0);
298 364 if (ret == 0) {
299 ret = btrfs_del_item(trans, tree_root, path); 365 leaf = path->nodes[0];
300 BUG_ON(ret); 366 ref = btrfs_item_ptr(leaf, path->slots[0],
367 struct btrfs_root_ref);
368
369 WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
370 WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
371 ptr = (unsigned long)(ref + 1);
372 WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
373 *sequence = btrfs_root_ref_sequence(leaf, ref);
374
375 ret = btrfs_del_item(trans, tree_root, path);
376 BUG_ON(ret);
377 } else
378 err = -ENOENT;
379
380 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
381 btrfs_release_path(tree_root, path);
382 key.objectid = ref_id;
383 key.type = BTRFS_ROOT_REF_KEY;
384 key.offset = root_id;
385 goto again;
386 }
301 387
302 btrfs_free_path(path); 388 btrfs_free_path(path);
303 return ret; 389 return err;
304} 390}
305#endif
306 391
307int btrfs_find_root_ref(struct btrfs_root *tree_root, 392int btrfs_find_root_ref(struct btrfs_root *tree_root,
308 struct btrfs_path *path, 393 struct btrfs_path *path,
@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
319 return ret; 404 return ret;
320} 405}
321 406
322
323/* 407/*
324 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY 408 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
325 * or BTRFS_ROOT_BACKREF_KEY. 409 * or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
335 */ 419 */
336int btrfs_add_root_ref(struct btrfs_trans_handle *trans, 420int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_root *tree_root, 421 struct btrfs_root *tree_root,
338 u64 root_id, u8 type, u64 ref_id, 422 u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
339 u64 dirid, u64 sequence,
340 const char *name, int name_len) 423 const char *name, int name_len)
341{ 424{
342 struct btrfs_key key; 425 struct btrfs_key key;
@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
346 struct extent_buffer *leaf; 429 struct extent_buffer *leaf;
347 unsigned long ptr; 430 unsigned long ptr;
348 431
349
350 path = btrfs_alloc_path(); 432 path = btrfs_alloc_path();
433 if (!path)
434 return -ENOMEM;
351 435
352 key.objectid = root_id; 436 key.objectid = root_id;
353 key.type = type; 437 key.type = BTRFS_ROOT_BACKREF_KEY;
354 key.offset = ref_id; 438 key.offset = ref_id;
355 439again:
356 ret = btrfs_insert_empty_item(trans, tree_root, path, &key, 440 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
357 sizeof(*ref) + name_len); 441 sizeof(*ref) + name_len);
358 BUG_ON(ret); 442 BUG_ON(ret);
@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
366 write_extent_buffer(leaf, name, ptr, name_len); 450 write_extent_buffer(leaf, name, ptr, name_len);
367 btrfs_mark_buffer_dirty(leaf); 451 btrfs_mark_buffer_dirty(leaf);
368 452
453 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
454 btrfs_release_path(tree_root, path);
455 key.objectid = ref_id;
456 key.type = BTRFS_ROOT_REF_KEY;
457 key.offset = root_id;
458 goto again;
459 }
460
369 btrfs_free_path(path); 461 btrfs_free_path(path);
370 return ret; 462 return 0;
371} 463}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6d6d06cb6dfc..9de9b2236419 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,7 @@
51#include "export.h" 51#include "export.h"
52#include "compression.h" 52#include "compression.h"
53 53
54static struct super_operations btrfs_super_ops; 54static const struct super_operations btrfs_super_ops;
55 55
56static void btrfs_put_super(struct super_block *sb) 56static void btrfs_put_super(struct super_block *sb)
57{ 57{
@@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 344 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 345 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 346 sb->s_time_gran = 1;
347#ifdef CONFIG_BTRFS_POSIX_ACL
347 sb->s_flags |= MS_POSIXACL; 348 sb->s_flags |= MS_POSIXACL;
349#endif
348 350
349 tree_root = open_ctree(sb, fs_devices, (char *)data); 351 tree_root = open_ctree(sb, fs_devices, (char *)data);
350 352
@@ -675,7 +677,8 @@ static int btrfs_unfreeze(struct super_block *sb)
675 return 0; 677 return 0;
676} 678}
677 679
678static struct super_operations btrfs_super_ops = { 680static const struct super_operations btrfs_super_ops = {
681 .drop_inode = btrfs_drop_inode,
679 .delete_inode = btrfs_delete_inode, 682 .delete_inode = btrfs_delete_inode,
680 .put_super = btrfs_put_super, 683 .put_super = btrfs_put_super,
681 .sync_fs = btrfs_sync_fs, 684 .sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cdbb5022da52..0b8f36d4400a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
104{ 104{
105 if (root->ref_cows && root->last_trans < trans->transid) { 105 if (root->ref_cows && root->last_trans < trans->transid) {
106 WARN_ON(root == root->fs_info->extent_root); 106 WARN_ON(root == root->fs_info->extent_root);
107 WARN_ON(root->root_item.refs == 0);
108 WARN_ON(root->commit_root != root->node); 107 WARN_ON(root->commit_root != root->node);
109 108
110 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 109 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
187 h->alloc_exclude_start = 0; 186 h->alloc_exclude_start = 0;
188 h->delayed_ref_updates = 0; 187 h->delayed_ref_updates = 0;
189 188
189 if (!current->journal_info)
190 current->journal_info = h;
191
190 root->fs_info->running_transaction->use_count++; 192 root->fs_info->running_transaction->use_count++;
191 record_root_in_trans(h, root); 193 record_root_in_trans(h, root);
192 mutex_unlock(&root->fs_info->trans_mutex); 194 mutex_unlock(&root->fs_info->trans_mutex);
@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
318 wake_up(&cur_trans->writer_wait); 320 wake_up(&cur_trans->writer_wait);
319 put_transaction(cur_trans); 321 put_transaction(cur_trans);
320 mutex_unlock(&info->trans_mutex); 322 mutex_unlock(&info->trans_mutex);
323
324 if (current->journal_info == trans)
325 current->journal_info = NULL;
321 memset(trans, 0, sizeof(*trans)); 326 memset(trans, 0, sizeof(*trans));
322 kmem_cache_free(btrfs_trans_handle_cachep, trans); 327 kmem_cache_free(btrfs_trans_handle_cachep, trans);
323 328
@@ -720,7 +725,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
720 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 725 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
721 726
722 key.objectid = objectid; 727 key.objectid = objectid;
723 key.offset = 0; 728 /* record when the snapshot was created in key.offset */
729 key.offset = trans->transid;
724 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 730 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
725 731
726 old = btrfs_lock_root_node(root); 732 old = btrfs_lock_root_node(root);
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
743 memcpy(&pending->root_key, &key, sizeof(key)); 749 memcpy(&pending->root_key, &key, sizeof(key));
744fail: 750fail:
745 kfree(new_root_item); 751 kfree(new_root_item);
752 btrfs_unreserve_metadata_space(root, 6);
746 return ret; 753 return ret;
747} 754}
748 755
@@ -778,24 +785,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
778 ret = btrfs_update_inode(trans, parent_root, parent_inode); 785 ret = btrfs_update_inode(trans, parent_root, parent_inode);
779 BUG_ON(ret); 786 BUG_ON(ret);
780 787
781 /* add the backref first */
782 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 788 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
783 pending->root_key.objectid, 789 pending->root_key.objectid,
784 BTRFS_ROOT_BACKREF_KEY,
785 parent_root->root_key.objectid, 790 parent_root->root_key.objectid,
786 parent_inode->i_ino, index, pending->name, 791 parent_inode->i_ino, index, pending->name,
787 namelen); 792 namelen);
788 793
789 BUG_ON(ret); 794 BUG_ON(ret);
790 795
791 /* now add the forward ref */
792 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
793 parent_root->root_key.objectid,
794 BTRFS_ROOT_REF_KEY,
795 pending->root_key.objectid,
796 parent_inode->i_ino, index, pending->name,
797 namelen);
798
799 inode = btrfs_lookup_dentry(parent_inode, pending->dentry); 796 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
800 d_instantiate(pending->dentry, inode); 797 d_instantiate(pending->dentry, inode);
801fail: 798fail:
@@ -874,7 +871,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
874 unsigned long timeout = 1; 871 unsigned long timeout = 1;
875 struct btrfs_transaction *cur_trans; 872 struct btrfs_transaction *cur_trans;
876 struct btrfs_transaction *prev_trans = NULL; 873 struct btrfs_transaction *prev_trans = NULL;
877 struct extent_io_tree *pinned_copy;
878 DEFINE_WAIT(wait); 874 DEFINE_WAIT(wait);
879 int ret; 875 int ret;
880 int should_grow = 0; 876 int should_grow = 0;
@@ -915,13 +911,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
915 return 0; 911 return 0;
916 } 912 }
917 913
918 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
919 if (!pinned_copy)
920 return -ENOMEM;
921
922 extent_io_tree_init(pinned_copy,
923 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
924
925 trans->transaction->in_commit = 1; 914 trans->transaction->in_commit = 1;
926 trans->transaction->blocked = 1; 915 trans->transaction->blocked = 1;
927 if (cur_trans->list.prev != &root->fs_info->trans_list) { 916 if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -1019,6 +1008,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1019 ret = commit_cowonly_roots(trans, root); 1008 ret = commit_cowonly_roots(trans, root);
1020 BUG_ON(ret); 1009 BUG_ON(ret);
1021 1010
1011 btrfs_prepare_extent_commit(trans, root);
1012
1022 cur_trans = root->fs_info->running_transaction; 1013 cur_trans = root->fs_info->running_transaction;
1023 spin_lock(&root->fs_info->new_trans_lock); 1014 spin_lock(&root->fs_info->new_trans_lock);
1024 root->fs_info->running_transaction = NULL; 1015 root->fs_info->running_transaction = NULL;
@@ -1042,8 +1033,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1042 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1033 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1043 sizeof(root->fs_info->super_copy)); 1034 sizeof(root->fs_info->super_copy));
1044 1035
1045 btrfs_copy_pinned(root, pinned_copy);
1046
1047 trans->transaction->blocked = 0; 1036 trans->transaction->blocked = 0;
1048 1037
1049 wake_up(&root->fs_info->transaction_wait); 1038 wake_up(&root->fs_info->transaction_wait);
@@ -1059,8 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1059 */ 1048 */
1060 mutex_unlock(&root->fs_info->tree_log_mutex); 1049 mutex_unlock(&root->fs_info->tree_log_mutex);
1061 1050
1062 btrfs_finish_extent_commit(trans, root, pinned_copy); 1051 btrfs_finish_extent_commit(trans, root);
1063 kfree(pinned_copy);
1064 1052
1065 /* do the directory inserts of any pending snapshot creations */ 1053 /* do the directory inserts of any pending snapshot creations */
1066 finish_pending_snapshots(trans, root->fs_info); 1054 finish_pending_snapshots(trans, root->fs_info);
@@ -1078,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1078 1066
1079 mutex_unlock(&root->fs_info->trans_mutex); 1067 mutex_unlock(&root->fs_info->trans_mutex);
1080 1068
1069 if (current->journal_info == trans)
1070 current->journal_info = NULL;
1071
1081 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1072 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1082 return ret; 1073 return ret;
1083} 1074}
@@ -1096,8 +1087,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1096 1087
1097 while (!list_empty(&list)) { 1088 while (!list_empty(&list)) {
1098 root = list_entry(list.next, struct btrfs_root, root_list); 1089 root = list_entry(list.next, struct btrfs_root, root_list);
1099 list_del_init(&root->root_list); 1090 list_del(&root->root_list);
1100 btrfs_drop_snapshot(root, 0); 1091
1092 if (btrfs_header_backref_rev(root->node) <
1093 BTRFS_MIXED_BACKREF_REV)
1094 btrfs_drop_snapshot(root, 0);
1095 else
1096 btrfs_drop_snapshot(root, 1);
1101 } 1097 }
1102 return 0; 1098 return 0;
1103} 1099}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d91b0de7c502..7827841b55cb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -263,8 +263,8 @@ static int process_one_buffer(struct btrfs_root *log,
263 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
264{ 264{
265 if (wc->pin) 265 if (wc->pin)
266 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_pin_extent(log->fs_info->extent_root,
267 eb->start, eb->len, 1); 267 eb->start, eb->len, 0);
268 268
269 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
270 if (wc->write) 270 if (wc->write)
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
534 saved_nbytes = inode_get_bytes(inode); 534 saved_nbytes = inode_get_bytes(inode);
535 /* drop any overlapping extents */ 535 /* drop any overlapping extents */
536 ret = btrfs_drop_extents(trans, root, inode, 536 ret = btrfs_drop_extents(trans, root, inode,
537 start, extent_end, extent_end, start, &alloc_hint); 537 start, extent_end, extent_end, start, &alloc_hint, 1);
538 BUG_ON(ret); 538 BUG_ON(ret);
539 539
540 if (found_type == BTRFS_FILE_EXTENT_REG || 540 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -2605,7 +2605,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2605 extent); 2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent); 2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src, 2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);; 2608 extent);
2609 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2610 extent)) { 2610 extent)) {
2611 cs = 0; 2611 cs = 0;
@@ -2841,7 +2841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2841 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2841 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2842 break; 2842 break;
2843 2843
2844 if (parent == sb->s_root) 2844 if (IS_ROOT(parent))
2845 break; 2845 break;
2846 2846
2847 parent = parent->d_parent; 2847 parent = parent->d_parent;
@@ -2880,6 +2880,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2880 goto end_no_trans; 2880 goto end_no_trans;
2881 } 2881 }
2882 2882
2883 if (root != BTRFS_I(inode)->root ||
2884 btrfs_root_refs(&root->root_item) == 0) {
2885 ret = 1;
2886 goto end_no_trans;
2887 }
2888
2883 ret = check_parent_dirs_for_sync(trans, inode, parent, 2889 ret = check_parent_dirs_for_sync(trans, inode, parent,
2884 sb, last_committed); 2890 sb, last_committed);
2885 if (ret) 2891 if (ret)
@@ -2907,12 +2913,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2907 break; 2913 break;
2908 2914
2909 inode = parent->d_inode; 2915 inode = parent->d_inode;
2916 if (root != BTRFS_I(inode)->root)
2917 break;
2918
2910 if (BTRFS_I(inode)->generation > 2919 if (BTRFS_I(inode)->generation >
2911 root->fs_info->last_trans_committed) { 2920 root->fs_info->last_trans_committed) {
2912 ret = btrfs_log_inode(trans, root, inode, inode_only); 2921 ret = btrfs_log_inode(trans, root, inode, inode_only);
2913 BUG_ON(ret); 2922 BUG_ON(ret);
2914 } 2923 }
2915 if (parent == sb->s_root) 2924 if (IS_ROOT(parent))
2916 break; 2925 break;
2917 2926
2918 parent = parent->d_parent; 2927 parent = parent->d_parent;
@@ -2951,7 +2960,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2951 struct btrfs_key tmp_key; 2960 struct btrfs_key tmp_key;
2952 struct btrfs_root *log; 2961 struct btrfs_root *log;
2953 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2962 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2954 u64 highest_inode;
2955 struct walk_control wc = { 2963 struct walk_control wc = {
2956 .process_func = process_one_buffer, 2964 .process_func = process_one_buffer,
2957 .stage = 0, 2965 .stage = 0,
@@ -3010,11 +3018,6 @@ again:
3010 path); 3018 path);
3011 BUG_ON(ret); 3019 BUG_ON(ret);
3012 } 3020 }
3013 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
3014 if (ret == 0) {
3015 wc.replay_dest->highest_inode = highest_inode;
3016 wc.replay_dest->last_inode_alloc = highest_inode;
3017 }
3018 3021
3019 key.offset = found_key.offset - 1; 3022 key.offset = found_key.offset - 1;
3020 wc.replay_dest->log_root = NULL; 3023 wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cf405b0828d..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -276,7 +276,7 @@ loop_lock:
276 * is now congested. Back off and let other work structs 276 * is now congested. Back off and let other work structs
277 * run instead 277 * run instead
278 */ 278 */
279 if (pending && bdi_write_congested(bdi) && batch_run > 32 && 279 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
280 fs_info->fs_devices->open_devices > 1) { 280 fs_info->fs_devices->open_devices > 1) {
281 struct io_context *ioc; 281 struct io_context *ioc;
282 282
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
446 goto error; 446 goto error;
447 447
448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 448 device->name = kstrdup(orig_dev->name, GFP_NOFS);
449 if (!device->name) 449 if (!device->name) {
450 kfree(device);
450 goto error; 451 goto error;
452 }
451 453
452 device->devid = orig_dev->devid; 454 device->devid = orig_dev->devid;
453 device->work.func = pending_bios_fn; 455 device->work.func = pending_bios_fn;
@@ -719,10 +721,9 @@ error:
719 * called very infrequently and that a given device has a small number 721 * called very infrequently and that a given device has a small number
720 * of extents 722 * of extents
721 */ 723 */
722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 724int find_free_dev_extent(struct btrfs_trans_handle *trans,
723 struct btrfs_device *device, 725 struct btrfs_device *device, u64 num_bytes,
724 u64 num_bytes, u64 *start, 726 u64 *start, u64 *max_avail)
725 u64 *max_avail)
726{ 727{
727 struct btrfs_key key; 728 struct btrfs_key key;
728 struct btrfs_root *root = device->dev_root; 729 struct btrfs_root *root = device->dev_root;
@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1736 extent_root = root->fs_info->extent_root; 1737 extent_root = root->fs_info->extent_root;
1737 em_tree = &root->fs_info->mapping_tree.map_tree; 1738 em_tree = &root->fs_info->mapping_tree.map_tree;
1738 1739
1740 ret = btrfs_can_relocate(extent_root, chunk_offset);
1741 if (ret)
1742 return -ENOSPC;
1743
1739 /* step one, relocate all the extents inside this chunk */ 1744 /* step one, relocate all the extents inside this chunk */
1740 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1745 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1741 BUG_ON(ret); 1746 BUG_ON(ret);
@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1749 * step two, delete the device extents and the 1754 * step two, delete the device extents and the
1750 * chunk tree entries 1755 * chunk tree entries
1751 */ 1756 */
1752 spin_lock(&em_tree->lock); 1757 read_lock(&em_tree->lock);
1753 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1758 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1754 spin_unlock(&em_tree->lock); 1759 read_unlock(&em_tree->lock);
1755 1760
1756 BUG_ON(em->start > chunk_offset || 1761 BUG_ON(em->start > chunk_offset ||
1757 em->start + em->len < chunk_offset); 1762 em->start + em->len < chunk_offset);
@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1780 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1785 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1781 BUG_ON(ret); 1786 BUG_ON(ret);
1782 1787
1783 spin_lock(&em_tree->lock); 1788 write_lock(&em_tree->lock);
1784 remove_extent_mapping(em_tree, em); 1789 remove_extent_mapping(em_tree, em);
1785 spin_unlock(&em_tree->lock); 1790 write_unlock(&em_tree->lock);
1786 1791
1787 kfree(map); 1792 kfree(map);
1788 em->bdev = NULL; 1793 em->bdev = NULL;
@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1807 struct btrfs_key found_key; 1812 struct btrfs_key found_key;
1808 u64 chunk_tree = chunk_root->root_key.objectid; 1813 u64 chunk_tree = chunk_root->root_key.objectid;
1809 u64 chunk_type; 1814 u64 chunk_type;
1815 bool retried = false;
1816 int failed = 0;
1810 int ret; 1817 int ret;
1811 1818
1812 path = btrfs_alloc_path(); 1819 path = btrfs_alloc_path();
1813 if (!path) 1820 if (!path)
1814 return -ENOMEM; 1821 return -ENOMEM;
1815 1822
1823again:
1816 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1824 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1817 key.offset = (u64)-1; 1825 key.offset = (u64)-1;
1818 key.type = BTRFS_CHUNK_ITEM_KEY; 1826 key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1842 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1850 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1843 found_key.objectid, 1851 found_key.objectid,
1844 found_key.offset); 1852 found_key.offset);
1845 BUG_ON(ret); 1853 if (ret == -ENOSPC)
1854 failed++;
1855 else if (ret)
1856 BUG();
1846 } 1857 }
1847 1858
1848 if (found_key.offset == 0) 1859 if (found_key.offset == 0)
@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1850 key.offset = found_key.offset - 1; 1861 key.offset = found_key.offset - 1;
1851 } 1862 }
1852 ret = 0; 1863 ret = 0;
1864 if (failed && !retried) {
1865 failed = 0;
1866 retried = true;
1867 goto again;
1868 } else if (failed && retried) {
1869 WARN_ON(1);
1870 ret = -ENOSPC;
1871 }
1853error: 1872error:
1854 btrfs_free_path(path); 1873 btrfs_free_path(path);
1855 return ret; 1874 return ret;
@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
1894 continue; 1913 continue;
1895 1914
1896 ret = btrfs_shrink_device(device, old_size - size_to_free); 1915 ret = btrfs_shrink_device(device, old_size - size_to_free);
1916 if (ret == -ENOSPC)
1917 break;
1897 BUG_ON(ret); 1918 BUG_ON(ret);
1898 1919
1899 trans = btrfs_start_transaction(dev_root, 1); 1920 trans = btrfs_start_transaction(dev_root, 1);
@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
1938 chunk = btrfs_item_ptr(path->nodes[0], 1959 chunk = btrfs_item_ptr(path->nodes[0],
1939 path->slots[0], 1960 path->slots[0],
1940 struct btrfs_chunk); 1961 struct btrfs_chunk);
1941 key.offset = found_key.offset;
1942 /* chunk zero is special */ 1962 /* chunk zero is special */
1943 if (key.offset == 0) 1963 if (found_key.offset == 0)
1944 break; 1964 break;
1945 1965
1946 btrfs_release_path(chunk_root, path); 1966 btrfs_release_path(chunk_root, path);
@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
1948 chunk_root->root_key.objectid, 1968 chunk_root->root_key.objectid,
1949 found_key.objectid, 1969 found_key.objectid,
1950 found_key.offset); 1970 found_key.offset);
1951 BUG_ON(ret); 1971 BUG_ON(ret && ret != -ENOSPC);
1972 key.offset = found_key.offset - 1;
1952 } 1973 }
1953 ret = 0; 1974 ret = 0;
1954error: 1975error:
@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1974 u64 chunk_offset; 1995 u64 chunk_offset;
1975 int ret; 1996 int ret;
1976 int slot; 1997 int slot;
1998 int failed = 0;
1999 bool retried = false;
1977 struct extent_buffer *l; 2000 struct extent_buffer *l;
1978 struct btrfs_key key; 2001 struct btrfs_key key;
1979 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2002 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1980 u64 old_total = btrfs_super_total_bytes(super_copy); 2003 u64 old_total = btrfs_super_total_bytes(super_copy);
2004 u64 old_size = device->total_bytes;
1981 u64 diff = device->total_bytes - new_size; 2005 u64 diff = device->total_bytes - new_size;
1982 2006
1983 if (new_size >= device->total_bytes) 2007 if (new_size >= device->total_bytes)
@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1987 if (!path) 2011 if (!path)
1988 return -ENOMEM; 2012 return -ENOMEM;
1989 2013
1990 trans = btrfs_start_transaction(root, 1);
1991 if (!trans) {
1992 ret = -ENOMEM;
1993 goto done;
1994 }
1995
1996 path->reada = 2; 2014 path->reada = 2;
1997 2015
1998 lock_chunks(root); 2016 lock_chunks(root);
@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2001 if (device->writeable) 2019 if (device->writeable)
2002 device->fs_devices->total_rw_bytes -= diff; 2020 device->fs_devices->total_rw_bytes -= diff;
2003 unlock_chunks(root); 2021 unlock_chunks(root);
2004 btrfs_end_transaction(trans, root);
2005 2022
2023again:
2006 key.objectid = device->devid; 2024 key.objectid = device->devid;
2007 key.offset = (u64)-1; 2025 key.offset = (u64)-1;
2008 key.type = BTRFS_DEV_EXTENT_KEY; 2026 key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2017 goto done; 2035 goto done;
2018 if (ret) { 2036 if (ret) {
2019 ret = 0; 2037 ret = 0;
2038 btrfs_release_path(root, path);
2020 break; 2039 break;
2021 } 2040 }
2022 2041
@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2024 slot = path->slots[0]; 2043 slot = path->slots[0];
2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2044 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2026 2045
2027 if (key.objectid != device->devid) 2046 if (key.objectid != device->devid) {
2047 btrfs_release_path(root, path);
2028 break; 2048 break;
2049 }
2029 2050
2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2051 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2031 length = btrfs_dev_extent_length(l, dev_extent); 2052 length = btrfs_dev_extent_length(l, dev_extent);
2032 2053
2033 if (key.offset + length <= new_size) 2054 if (key.offset + length <= new_size) {
2055 btrfs_release_path(root, path);
2034 break; 2056 break;
2057 }
2035 2058
2036 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2059 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2037 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2060 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2040 2063
2041 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2064 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2042 chunk_offset); 2065 chunk_offset);
2043 if (ret) 2066 if (ret && ret != -ENOSPC)
2044 goto done; 2067 goto done;
2068 if (ret == -ENOSPC)
2069 failed++;
2070 key.offset -= 1;
2071 }
2072
2073 if (failed && !retried) {
2074 failed = 0;
2075 retried = true;
2076 goto again;
2077 } else if (failed && retried) {
2078 ret = -ENOSPC;
2079 lock_chunks(root);
2080
2081 device->total_bytes = old_size;
2082 if (device->writeable)
2083 device->fs_devices->total_rw_bytes += diff;
2084 unlock_chunks(root);
2085 goto done;
2045 } 2086 }
2046 2087
2047 /* Shrinking succeeded, else we would be at "done". */ 2088 /* Shrinking succeeded, else we would be at "done". */
@@ -2294,9 +2335,9 @@ again:
2294 em->block_len = em->len; 2335 em->block_len = em->len;
2295 2336
2296 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2337 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2297 spin_lock(&em_tree->lock); 2338 write_lock(&em_tree->lock);
2298 ret = add_extent_mapping(em_tree, em); 2339 ret = add_extent_mapping(em_tree, em);
2299 spin_unlock(&em_tree->lock); 2340 write_unlock(&em_tree->lock);
2300 BUG_ON(ret); 2341 BUG_ON(ret);
2301 free_extent_map(em); 2342 free_extent_map(em);
2302 2343
@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2491 int readonly = 0; 2532 int readonly = 0;
2492 int i; 2533 int i;
2493 2534
2494 spin_lock(&map_tree->map_tree.lock); 2535 read_lock(&map_tree->map_tree.lock);
2495 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2536 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2496 spin_unlock(&map_tree->map_tree.lock); 2537 read_unlock(&map_tree->map_tree.lock);
2497 if (!em) 2538 if (!em)
2498 return 1; 2539 return 1;
2499 2540
@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2518 struct extent_map *em; 2559 struct extent_map *em;
2519 2560
2520 while (1) { 2561 while (1) {
2521 spin_lock(&tree->map_tree.lock); 2562 write_lock(&tree->map_tree.lock);
2522 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2563 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2523 if (em) 2564 if (em)
2524 remove_extent_mapping(&tree->map_tree, em); 2565 remove_extent_mapping(&tree->map_tree, em);
2525 spin_unlock(&tree->map_tree.lock); 2566 write_unlock(&tree->map_tree.lock);
2526 if (!em) 2567 if (!em)
2527 break; 2568 break;
2528 kfree(em->bdev); 2569 kfree(em->bdev);
@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2540 struct extent_map_tree *em_tree = &map_tree->map_tree; 2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2541 int ret; 2582 int ret;
2542 2583
2543 spin_lock(&em_tree->lock); 2584 read_lock(&em_tree->lock);
2544 em = lookup_extent_mapping(em_tree, logical, len); 2585 em = lookup_extent_mapping(em_tree, logical, len);
2545 spin_unlock(&em_tree->lock); 2586 read_unlock(&em_tree->lock);
2546 BUG_ON(!em); 2587 BUG_ON(!em);
2547 2588
2548 BUG_ON(em->start > logical || em->start + em->len < logical); 2589 BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2645,9 @@ again:
2604 atomic_set(&multi->error, 0); 2645 atomic_set(&multi->error, 0);
2605 } 2646 }
2606 2647
2607 spin_lock(&em_tree->lock); 2648 read_lock(&em_tree->lock);
2608 em = lookup_extent_mapping(em_tree, logical, *length); 2649 em = lookup_extent_mapping(em_tree, logical, *length);
2609 spin_unlock(&em_tree->lock); 2650 read_unlock(&em_tree->lock);
2610 2651
2611 if (!em && unplug_page) 2652 if (!em && unplug_page)
2612 return 0; 2653 return 0;
@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2763 u64 stripe_nr; 2804 u64 stripe_nr;
2764 int i, j, nr = 0; 2805 int i, j, nr = 0;
2765 2806
2766 spin_lock(&em_tree->lock); 2807 read_lock(&em_tree->lock);
2767 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2808 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2768 spin_unlock(&em_tree->lock); 2809 read_unlock(&em_tree->lock);
2769 2810
2770 BUG_ON(!em || em->start != chunk_start); 2811 BUG_ON(!em || em->start != chunk_start);
2771 map = (struct map_lookup *)em->bdev; 2812 map = (struct map_lookup *)em->bdev;
@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3053 logical = key->offset; 3094 logical = key->offset;
3054 length = btrfs_chunk_length(leaf, chunk); 3095 length = btrfs_chunk_length(leaf, chunk);
3055 3096
3056 spin_lock(&map_tree->map_tree.lock); 3097 read_lock(&map_tree->map_tree.lock);
3057 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3098 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3058 spin_unlock(&map_tree->map_tree.lock); 3099 read_unlock(&map_tree->map_tree.lock);
3059 3100
3060 /* already mapped? */ 3101 /* already mapped? */
3061 if (em && em->start <= logical && em->start + em->len > logical) { 3102 if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3114 map->stripes[i].dev->in_fs_metadata = 1; 3155 map->stripes[i].dev->in_fs_metadata = 1;
3115 } 3156 }
3116 3157
3117 spin_lock(&map_tree->map_tree.lock); 3158 write_lock(&map_tree->map_tree.lock);
3118 ret = add_extent_mapping(&map_tree->map_tree, em); 3159 ret = add_extent_mapping(&map_tree->map_tree, em);
3119 spin_unlock(&map_tree->map_tree.lock); 3160 write_unlock(&map_tree->map_tree.lock);
3120 BUG_ON(ret); 3161 BUG_ON(ret);
3121 free_extent_map(em); 3162 free_extent_map(em);
3122 3163
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139a833f721..31b0fabdd2ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
181void btrfs_unlock_volumes(void); 181void btrfs_unlock_volumes(void);
182void btrfs_lock_volumes(void); 182void btrfs_lock_volumes(void);
183int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 183int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
184int find_free_dev_extent(struct btrfs_trans_handle *trans,
185 struct btrfs_device *device, u64 num_bytes,
186 u64 *start, u64 *max_avail);
184#endif 187#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b0fc93f95fd0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
260 * attributes are handled directly. 260 * attributes are handled directly.
261 */ 261 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 262struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_FS_POSIX_ACL 263#ifdef CONFIG_BTRFS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 264 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 265 &btrfs_xattr_acl_default_handler,
266#endif 266#endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 90a98865b0cc..6fa530256bfd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,6 +52,7 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54} 54}
55EXPORT_SYMBOL(init_buffer);
55 56
56static int sync_buffer(void *word) 57static int sync_buffer(void *word)
57{ 58{
@@ -80,6 +81,7 @@ void unlock_buffer(struct buffer_head *bh)
80 smp_mb__after_clear_bit(); 81 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock); 82 wake_up_bit(&bh->b_state, BH_Lock);
82} 83}
84EXPORT_SYMBOL(unlock_buffer);
83 85
84/* 86/*
85 * Block until a buffer comes unlocked. This doesn't stop it 87 * Block until a buffer comes unlocked. This doesn't stop it
@@ -90,6 +92,7 @@ void __wait_on_buffer(struct buffer_head * bh)
90{ 92{
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92} 94}
95EXPORT_SYMBOL(__wait_on_buffer);
93 96
94static void 97static void
95__clear_page_buffers(struct page *page) 98__clear_page_buffers(struct page *page)
@@ -144,6 +147,7 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
144 __end_buffer_read_notouch(bh, uptodate); 147 __end_buffer_read_notouch(bh, uptodate);
145 put_bh(bh); 148 put_bh(bh);
146} 149}
150EXPORT_SYMBOL(end_buffer_read_sync);
147 151
148void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 152void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
149{ 153{
@@ -164,6 +168,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
164 unlock_buffer(bh); 168 unlock_buffer(bh);
165 put_bh(bh); 169 put_bh(bh);
166} 170}
171EXPORT_SYMBOL(end_buffer_write_sync);
167 172
168/* 173/*
169 * Various filesystems appear to want __find_get_block to be non-blocking. 174 * Various filesystems appear to want __find_get_block to be non-blocking.
@@ -272,9 +277,10 @@ void invalidate_bdev(struct block_device *bdev)
272 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
273 invalidate_mapping_pages(mapping, 0, -1); 278 invalidate_mapping_pages(mapping, 0, -1);
274} 279}
280EXPORT_SYMBOL(invalidate_bdev);
275 281
276/* 282/*
277 * Kick pdflush then try to free up some ZONE_NORMAL memory. 283 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
278 */ 284 */
279static void free_more_memory(void) 285static void free_more_memory(void)
280{ 286{
@@ -410,6 +416,7 @@ still_busy:
410 local_irq_restore(flags); 416 local_irq_restore(flags);
411 return; 417 return;
412} 418}
419EXPORT_SYMBOL(end_buffer_async_write);
413 420
414/* 421/*
415 * If a page's buffers are under async readin (end_buffer_async_read 422 * If a page's buffers are under async readin (end_buffer_async_read
@@ -438,8 +445,8 @@ static void mark_buffer_async_read(struct buffer_head *bh)
438 set_buffer_async_read(bh); 445 set_buffer_async_read(bh);
439} 446}
440 447
441void mark_buffer_async_write_endio(struct buffer_head *bh, 448static void mark_buffer_async_write_endio(struct buffer_head *bh,
442 bh_end_io_t *handler) 449 bh_end_io_t *handler)
443{ 450{
444 bh->b_end_io = handler; 451 bh->b_end_io = handler;
445 set_buffer_async_write(bh); 452 set_buffer_async_write(bh);
@@ -553,7 +560,7 @@ repeat:
553 return err; 560 return err;
554} 561}
555 562
556void do_thaw_all(struct work_struct *work) 563static void do_thaw_all(struct work_struct *work)
557{ 564{
558 struct super_block *sb; 565 struct super_block *sb;
559 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
@@ -1172,6 +1179,7 @@ void mark_buffer_dirty(struct buffer_head *bh)
1172 } 1179 }
1173 } 1180 }
1174} 1181}
1182EXPORT_SYMBOL(mark_buffer_dirty);
1175 1183
1176/* 1184/*
1177 * Decrement a buffer_head's reference count. If all buffers against a page 1185 * Decrement a buffer_head's reference count. If all buffers against a page
@@ -1188,6 +1196,7 @@ void __brelse(struct buffer_head * buf)
1188 } 1196 }
1189 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1197 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1190} 1198}
1199EXPORT_SYMBOL(__brelse);
1191 1200
1192/* 1201/*
1193 * bforget() is like brelse(), except it discards any 1202 * bforget() is like brelse(), except it discards any
@@ -1206,6 +1215,7 @@ void __bforget(struct buffer_head *bh)
1206 } 1215 }
1207 __brelse(bh); 1216 __brelse(bh);
1208} 1217}
1218EXPORT_SYMBOL(__bforget);
1209 1219
1210static struct buffer_head *__bread_slow(struct buffer_head *bh) 1220static struct buffer_head *__bread_slow(struct buffer_head *bh)
1211{ 1221{
@@ -1699,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1699 /* 1709 /*
1700 * If it's a fully non-blocking write attempt and we cannot 1710 * If it's a fully non-blocking write attempt and we cannot
1701 * lock the buffer then redirty the page. Note that this can 1711 * lock the buffer then redirty the page. Note that this can
1702 * potentially cause a busy-wait loop from pdflush and kswapd 1712 * potentially cause a busy-wait loop from writeback threads
1703 * activity, but those code paths have their own higher-level 1713 * and kswapd activity, but those code paths have their own
1704 * throttling. 1714 * higher-level throttling.
1705 */ 1715 */
1706 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1716 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1707 lock_buffer(bh); 1717 lock_buffer(bh);
@@ -2218,6 +2228,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2218 } 2228 }
2219 return 0; 2229 return 0;
2220} 2230}
2231EXPORT_SYMBOL(block_read_full_page);
2221 2232
2222/* utility function for filesystems that need to do work on expanding 2233/* utility function for filesystems that need to do work on expanding
2223 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2234 * truncates. Uses filesystem pagecache writes to allow the filesystem to
@@ -2228,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
2228 struct address_space *mapping = inode->i_mapping; 2239 struct address_space *mapping = inode->i_mapping;
2229 struct page *page; 2240 struct page *page;
2230 void *fsdata; 2241 void *fsdata;
2231 unsigned long limit;
2232 int err; 2242 int err;
2233 2243
2234 err = -EFBIG; 2244 err = inode_newsize_ok(inode, size);
2235 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2245 if (err)
2236 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2237 send_sig(SIGXFSZ, current, 0);
2238 goto out;
2239 }
2240 if (size > inode->i_sb->s_maxbytes)
2241 goto out; 2246 goto out;
2242 2247
2243 err = pagecache_write_begin(NULL, mapping, size, 0, 2248 err = pagecache_write_begin(NULL, mapping, size, 0,
@@ -2252,6 +2257,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
2252out: 2257out:
2253 return err; 2258 return err;
2254} 2259}
2260EXPORT_SYMBOL(generic_cont_expand_simple);
2255 2261
2256static int cont_expand_zero(struct file *file, struct address_space *mapping, 2262static int cont_expand_zero(struct file *file, struct address_space *mapping,
2257 loff_t pos, loff_t *bytes) 2263 loff_t pos, loff_t *bytes)
@@ -2352,6 +2358,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2352out: 2358out:
2353 return err; 2359 return err;
2354} 2360}
2361EXPORT_SYMBOL(cont_write_begin);
2355 2362
2356int block_prepare_write(struct page *page, unsigned from, unsigned to, 2363int block_prepare_write(struct page *page, unsigned from, unsigned to,
2357 get_block_t *get_block) 2364 get_block_t *get_block)
@@ -2362,6 +2369,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
2362 ClearPageUptodate(page); 2369 ClearPageUptodate(page);
2363 return err; 2370 return err;
2364} 2371}
2372EXPORT_SYMBOL(block_prepare_write);
2365 2373
2366int block_commit_write(struct page *page, unsigned from, unsigned to) 2374int block_commit_write(struct page *page, unsigned from, unsigned to)
2367{ 2375{
@@ -2369,6 +2377,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2369 __block_commit_write(inode,page,from,to); 2377 __block_commit_write(inode,page,from,to);
2370 return 0; 2378 return 0;
2371} 2379}
2380EXPORT_SYMBOL(block_commit_write);
2372 2381
2373/* 2382/*
2374 * block_page_mkwrite() is not allowed to change the file size as it gets 2383 * block_page_mkwrite() is not allowed to change the file size as it gets
@@ -2426,6 +2435,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2426out: 2435out:
2427 return ret; 2436 return ret;
2428} 2437}
2438EXPORT_SYMBOL(block_page_mkwrite);
2429 2439
2430/* 2440/*
2431 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2441 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
@@ -2849,6 +2859,7 @@ unlock:
2849out: 2859out:
2850 return err; 2860 return err;
2851} 2861}
2862EXPORT_SYMBOL(block_truncate_page);
2852 2863
2853/* 2864/*
2854 * The generic ->writepage function for buffer-backed address_spaces 2865 * The generic ->writepage function for buffer-backed address_spaces
@@ -2890,6 +2901,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2890 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2901 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2891 return __block_write_full_page(inode, page, get_block, wbc, handler); 2902 return __block_write_full_page(inode, page, get_block, wbc, handler);
2892} 2903}
2904EXPORT_SYMBOL(block_write_full_page_endio);
2893 2905
2894/* 2906/*
2895 * The generic ->writepage function for buffer-backed address_spaces 2907 * The generic ->writepage function for buffer-backed address_spaces
@@ -2900,7 +2912,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2900 return block_write_full_page_endio(page, get_block, wbc, 2912 return block_write_full_page_endio(page, get_block, wbc,
2901 end_buffer_async_write); 2913 end_buffer_async_write);
2902} 2914}
2903 2915EXPORT_SYMBOL(block_write_full_page);
2904 2916
2905sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2917sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2906 get_block_t *get_block) 2918 get_block_t *get_block)
@@ -2913,6 +2925,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2913 get_block(inode, block, &tmp, 0); 2925 get_block(inode, block, &tmp, 0);
2914 return tmp.b_blocknr; 2926 return tmp.b_blocknr;
2915} 2927}
2928EXPORT_SYMBOL(generic_block_bmap);
2916 2929
2917static void end_bio_bh_io_sync(struct bio *bio, int err) 2930static void end_bio_bh_io_sync(struct bio *bio, int err)
2918{ 2931{
@@ -2982,6 +2995,7 @@ int submit_bh(int rw, struct buffer_head * bh)
2982 bio_put(bio); 2995 bio_put(bio);
2983 return ret; 2996 return ret;
2984} 2997}
2998EXPORT_SYMBOL(submit_bh);
2985 2999
2986/** 3000/**
2987 * ll_rw_block: low-level access to block devices (DEPRECATED) 3001 * ll_rw_block: low-level access to block devices (DEPRECATED)
@@ -3043,6 +3057,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3043 unlock_buffer(bh); 3057 unlock_buffer(bh);
3044 } 3058 }
3045} 3059}
3060EXPORT_SYMBOL(ll_rw_block);
3046 3061
3047/* 3062/*
3048 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3063 * For a data-integrity writeout, we need to wait upon any in-progress I/O
@@ -3071,6 +3086,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3071 } 3086 }
3072 return ret; 3087 return ret;
3073} 3088}
3089EXPORT_SYMBOL(sync_dirty_buffer);
3074 3090
3075/* 3091/*
3076 * try_to_free_buffers() checks if all the buffers on this particular page 3092 * try_to_free_buffers() checks if all the buffers on this particular page
@@ -3185,13 +3201,14 @@ void block_sync_page(struct page *page)
3185 if (mapping) 3201 if (mapping)
3186 blk_run_backing_dev(mapping->backing_dev_info, page); 3202 blk_run_backing_dev(mapping->backing_dev_info, page);
3187} 3203}
3204EXPORT_SYMBOL(block_sync_page);
3188 3205
3189/* 3206/*
3190 * There are no bdflush tunables left. But distributions are 3207 * There are no bdflush tunables left. But distributions are
3191 * still running obsolete flush daemons, so we terminate them here. 3208 * still running obsolete flush daemons, so we terminate them here.
3192 * 3209 *
3193 * Use of bdflush() is deprecated and will be removed in a future kernel. 3210 * Use of bdflush() is deprecated and will be removed in a future kernel.
3194 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3211 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3195 */ 3212 */
3196SYSCALL_DEFINE2(bdflush, int, func, long, data) 3213SYSCALL_DEFINE2(bdflush, int, func, long, data)
3197{ 3214{
@@ -3361,29 +3378,3 @@ void __init buffer_init(void)
3361 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3378 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3362 hotcpu_notifier(buffer_cpu_notify, 0); 3379 hotcpu_notifier(buffer_cpu_notify, 0);
3363} 3380}
3364
3365EXPORT_SYMBOL(__bforget);
3366EXPORT_SYMBOL(__brelse);
3367EXPORT_SYMBOL(__wait_on_buffer);
3368EXPORT_SYMBOL(block_commit_write);
3369EXPORT_SYMBOL(block_prepare_write);
3370EXPORT_SYMBOL(block_page_mkwrite);
3371EXPORT_SYMBOL(block_read_full_page);
3372EXPORT_SYMBOL(block_sync_page);
3373EXPORT_SYMBOL(block_truncate_page);
3374EXPORT_SYMBOL(block_write_full_page);
3375EXPORT_SYMBOL(block_write_full_page_endio);
3376EXPORT_SYMBOL(cont_write_begin);
3377EXPORT_SYMBOL(end_buffer_read_sync);
3378EXPORT_SYMBOL(end_buffer_write_sync);
3379EXPORT_SYMBOL(end_buffer_async_write);
3380EXPORT_SYMBOL(file_fsync);
3381EXPORT_SYMBOL(generic_block_bmap);
3382EXPORT_SYMBOL(generic_cont_expand_simple);
3383EXPORT_SYMBOL(init_buffer);
3384EXPORT_SYMBOL(invalidate_bdev);
3385EXPORT_SYMBOL(ll_rw_block);
3386EXPORT_SYMBOL(mark_buffer_dirty);
3387EXPORT_SYMBOL(submit_bh);
3388EXPORT_SYMBOL(sync_dirty_buffer);
3389EXPORT_SYMBOL(unlock_buffer);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cbc57f932d2..d6db933df2b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -264,7 +264,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
264{ 264{
265 struct char_device_struct *cd; 265 struct char_device_struct *cd;
266 struct cdev *cdev; 266 struct cdev *cdev;
267 char *s;
268 int err = -ENOMEM; 267 int err = -ENOMEM;
269 268
270 cd = __register_chrdev_region(major, baseminor, count, name); 269 cd = __register_chrdev_region(major, baseminor, count, name);
@@ -278,8 +277,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
278 cdev->owner = fops->owner; 277 cdev->owner = fops->owner;
279 cdev->ops = fops; 278 cdev->ops = fops;
280 kobject_set_name(&cdev->kobj, "%s", name); 279 kobject_set_name(&cdev->kobj, "%s", name);
281 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
282 *s = '!';
283 280
284 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); 281 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
285 if (err) 282 if (err)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 6994a0f54f02..80f352596807 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,7 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select SLOW_WORK
5 help 6 help
6 This is the client VFS module for the Common Internet File System 7 This is the client VFS module for the Common Internet File System
7 (CIFS) protocol which is the successor to the Server Message Block 8 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606912d8f2a8..fea9e898c4ba 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 143 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc));; 145 __func__, *devname, rc));
146 goto compose_mount_options_err; 146 goto compose_mount_options_err;
147 } 147 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 148 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -385,7 +385,7 @@ out_err:
385 goto out; 385 goto out;
386} 386}
387 387
388struct inode_operations cifs_dfs_referral_inode_operations = { 388const struct inode_operations cifs_dfs_referral_inode_operations = {
389 .follow_link = cifs_dfs_follow_mountpoint, 389 .follow_link = cifs_dfs_follow_mountpoint,
390}; 390};
391 391
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3610e9958b4c..9a5e4f5f3122 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA 52#ifdef CONFIG_CIFS_QUOTA
53static struct quotactl_ops cifs_quotactl_ops; 53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */ 54#endif /* QUOTA */
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
@@ -64,9 +64,6 @@ unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 64unsigned int extended_security = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 65/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 66unsigned int sign_CIFS_PDUs = 1;
67extern struct task_struct *oplockThread; /* remove sparse warning */
68struct task_struct *oplockThread = NULL;
69/* extern struct task_struct * dnotifyThread; remove sparse warning */
70static const struct super_operations cifs_super_ops; 67static const struct super_operations cifs_super_ops;
71unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 68unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
72module_param(CIFSMaxBufSize, int, 0); 69module_param(CIFSMaxBufSize, int, 0);
@@ -185,8 +182,7 @@ out_mount_failed:
185 cifs_sb->mountdata = NULL; 182 cifs_sb->mountdata = NULL;
186 } 183 }
187#endif 184#endif
188 if (cifs_sb->local_nls) 185 unload_nls(cifs_sb->local_nls);
189 unload_nls(cifs_sb->local_nls);
190 kfree(cifs_sb); 186 kfree(cifs_sb);
191 } 187 }
192 return rc; 188 return rc;
@@ -517,7 +513,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
517 return rc; 513 return rc;
518} 514}
519 515
520static struct quotactl_ops cifs_quotactl_ops = { 516static const struct quotactl_ops cifs_quotactl_ops = {
521 .set_xquota = cifs_xquota_set, 517 .set_xquota = cifs_xquota_set,
522 .get_xquota = cifs_xquota_get, 518 .get_xquota = cifs_xquota_get,
523 .set_xstate = cifs_xstate_set, 519 .set_xstate = cifs_xstate_set,
@@ -973,89 +969,12 @@ cifs_destroy_mids(void)
973 kmem_cache_destroy(cifs_oplock_cachep); 969 kmem_cache_destroy(cifs_oplock_cachep);
974} 970}
975 971
976static int cifs_oplock_thread(void *dummyarg)
977{
978 struct oplock_q_entry *oplock_item;
979 struct cifsTconInfo *pTcon;
980 struct inode *inode;
981 __u16 netfid;
982 int rc, waitrc = 0;
983
984 set_freezable();
985 do {
986 if (try_to_freeze())
987 continue;
988
989 spin_lock(&cifs_oplock_lock);
990 if (list_empty(&cifs_oplock_list)) {
991 spin_unlock(&cifs_oplock_lock);
992 set_current_state(TASK_INTERRUPTIBLE);
993 schedule_timeout(39*HZ);
994 } else {
995 oplock_item = list_entry(cifs_oplock_list.next,
996 struct oplock_q_entry, qhead);
997 cFYI(1, ("found oplock item to write out"));
998 pTcon = oplock_item->tcon;
999 inode = oplock_item->pinode;
1000 netfid = oplock_item->netfid;
1001 spin_unlock(&cifs_oplock_lock);
1002 DeleteOplockQEntry(oplock_item);
1003 /* can not grab inode sem here since it would
1004 deadlock when oplock received on delete
1005 since vfs_unlink holds the i_mutex across
1006 the call */
1007 /* mutex_lock(&inode->i_mutex);*/
1008 if (S_ISREG(inode->i_mode)) {
1009#ifdef CONFIG_CIFS_EXPERIMENTAL
1010 if (CIFS_I(inode)->clientCanCacheAll == 0)
1011 break_lease(inode, FMODE_READ);
1012 else if (CIFS_I(inode)->clientCanCacheRead == 0)
1013 break_lease(inode, FMODE_WRITE);
1014#endif
1015 rc = filemap_fdatawrite(inode->i_mapping);
1016 if (CIFS_I(inode)->clientCanCacheRead == 0) {
1017 waitrc = filemap_fdatawait(
1018 inode->i_mapping);
1019 invalidate_remote_inode(inode);
1020 }
1021 if (rc == 0)
1022 rc = waitrc;
1023 } else
1024 rc = 0;
1025 /* mutex_unlock(&inode->i_mutex);*/
1026 if (rc)
1027 CIFS_I(inode)->write_behind_rc = rc;
1028 cFYI(1, ("Oplock flush inode %p rc %d",
1029 inode, rc));
1030
1031 /* releasing stale oplock after recent reconnect
1032 of smb session using a now incorrect file
1033 handle is not a data integrity issue but do
1034 not bother sending an oplock release if session
1035 to server still is disconnected since oplock
1036 already released by the server in that case */
1037 if (!pTcon->need_reconnect) {
1038 rc = CIFSSMBLock(0, pTcon, netfid,
1039 0 /* len */ , 0 /* offset */, 0,
1040 0, LOCKING_ANDX_OPLOCK_RELEASE,
1041 false /* wait flag */);
1042 cFYI(1, ("Oplock release rc = %d", rc));
1043 }
1044 set_current_state(TASK_INTERRUPTIBLE);
1045 schedule_timeout(1); /* yield in case q were corrupt */
1046 }
1047 } while (!kthread_should_stop());
1048
1049 return 0;
1050}
1051
1052static int __init 972static int __init
1053init_cifs(void) 973init_cifs(void)
1054{ 974{
1055 int rc = 0; 975 int rc = 0;
1056 cifs_proc_init(); 976 cifs_proc_init();
1057 INIT_LIST_HEAD(&cifs_tcp_ses_list); 977 INIT_LIST_HEAD(&cifs_tcp_ses_list);
1058 INIT_LIST_HEAD(&cifs_oplock_list);
1059#ifdef CONFIG_CIFS_EXPERIMENTAL 978#ifdef CONFIG_CIFS_EXPERIMENTAL
1060 INIT_LIST_HEAD(&GlobalDnotifyReqList); 979 INIT_LIST_HEAD(&GlobalDnotifyReqList);
1061 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 980 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1084,7 +1003,6 @@ init_cifs(void)
1084 rwlock_init(&GlobalSMBSeslock); 1003 rwlock_init(&GlobalSMBSeslock);
1085 rwlock_init(&cifs_tcp_ses_lock); 1004 rwlock_init(&cifs_tcp_ses_lock);
1086 spin_lock_init(&GlobalMid_Lock); 1005 spin_lock_init(&GlobalMid_Lock);
1087 spin_lock_init(&cifs_oplock_lock);
1088 1006
1089 if (cifs_max_pending < 2) { 1007 if (cifs_max_pending < 2) {
1090 cifs_max_pending = 2; 1008 cifs_max_pending = 2;
@@ -1119,16 +1037,13 @@ init_cifs(void)
1119 if (rc) 1037 if (rc)
1120 goto out_unregister_key_type; 1038 goto out_unregister_key_type;
1121#endif 1039#endif
1122 oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); 1040 rc = slow_work_register_user();
1123 if (IS_ERR(oplockThread)) { 1041 if (rc)
1124 rc = PTR_ERR(oplockThread); 1042 goto out_unregister_resolver_key;
1125 cERROR(1, ("error %d create oplock thread", rc));
1126 goto out_unregister_dfs_key_type;
1127 }
1128 1043
1129 return 0; 1044 return 0;
1130 1045
1131 out_unregister_dfs_key_type: 1046 out_unregister_resolver_key:
1132#ifdef CONFIG_CIFS_DFS_UPCALL 1047#ifdef CONFIG_CIFS_DFS_UPCALL
1133 unregister_key_type(&key_type_dns_resolver); 1048 unregister_key_type(&key_type_dns_resolver);
1134 out_unregister_key_type: 1049 out_unregister_key_type:
@@ -1165,7 +1080,6 @@ exit_cifs(void)
1165 cifs_destroy_inodecache(); 1080 cifs_destroy_inodecache();
1166 cifs_destroy_mids(); 1081 cifs_destroy_mids();
1167 cifs_destroy_request_bufs(); 1082 cifs_destroy_request_bufs();
1168 kthread_stop(oplockThread);
1169} 1083}
1170 1084
1171MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); 1085MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 094325e3f714..ac2b24c192f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
67 67
68extern const struct inode_operations cifs_file_inode_ops; 68extern const struct inode_operations cifs_file_inode_ops;
69extern const struct inode_operations cifs_symlink_inode_ops; 69extern const struct inode_operations cifs_symlink_inode_ops;
70extern struct inode_operations cifs_dfs_referral_inode_operations; 70extern const struct inode_operations cifs_dfs_referral_inode_operations;
71 71
72 72
73/* Functions related to files and directories */ 73/* Functions related to files and directories */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6cfc81a32703..5d0fde18039c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slow-work.h>
21#include "cifs_fs_sb.h" 22#include "cifs_fs_sb.h"
22#include "cifsacl.h" 23#include "cifsacl.h"
23/* 24/*
@@ -346,14 +347,16 @@ struct cifsFileInfo {
346 /* lock scope id (0 if none) */ 347 /* lock scope id (0 if none) */
347 struct file *pfile; /* needed for writepage */ 348 struct file *pfile; /* needed for writepage */
348 struct inode *pInode; /* needed for oplock break */ 349 struct inode *pInode; /* needed for oplock break */
350 struct vfsmount *mnt;
349 struct mutex lock_mutex; 351 struct mutex lock_mutex;
350 struct list_head llist; /* list of byte range locks we have. */ 352 struct list_head llist; /* list of byte range locks we have. */
351 bool closePend:1; /* file is marked to close */ 353 bool closePend:1; /* file is marked to close */
352 bool invalidHandle:1; /* file closed via session abend */ 354 bool invalidHandle:1; /* file closed via session abend */
353 bool messageMode:1; /* for pipes: message vs byte mode */ 355 bool oplock_break_cancelled:1;
354 atomic_t count; /* reference count */ 356 atomic_t count; /* reference count */
355 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 357 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
356 struct cifs_search_info srch_inf; 358 struct cifs_search_info srch_inf;
359 struct slow_work oplock_break; /* slow_work job for oplock breaks */
357}; 360};
358 361
359/* Take a reference on the file private data */ 362/* Take a reference on the file private data */
@@ -365,8 +368,10 @@ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
365/* Release a reference on the file private data */ 368/* Release a reference on the file private data */
366static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file) 369static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
367{ 370{
368 if (atomic_dec_and_test(&cifs_file->count)) 371 if (atomic_dec_and_test(&cifs_file->count)) {
372 iput(cifs_file->pInode);
369 kfree(cifs_file); 373 kfree(cifs_file);
374 }
370} 375}
371 376
372/* 377/*
@@ -382,7 +387,6 @@ struct cifsInodeInfo {
382 unsigned long time; /* jiffies of last update/check of inode */ 387 unsigned long time; /* jiffies of last update/check of inode */
383 bool clientCanCacheRead:1; /* read oplock */ 388 bool clientCanCacheRead:1; /* read oplock */
384 bool clientCanCacheAll:1; /* read and writebehind oplock */ 389 bool clientCanCacheAll:1; /* read and writebehind oplock */
385 bool oplockPending:1;
386 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
387 u64 server_eof; /* current file size on server */ 391 u64 server_eof; /* current file size on server */
388 u64 uniqueid; /* server inode number */ 392 u64 uniqueid; /* server inode number */
@@ -585,9 +589,9 @@ require use of the stronger protocol */
585#define CIFSSEC_MUST_LANMAN 0x10010 589#define CIFSSEC_MUST_LANMAN 0x10010
586#define CIFSSEC_MUST_PLNTXT 0x20020 590#define CIFSSEC_MUST_PLNTXT 0x20020
587#ifdef CONFIG_CIFS_UPCALL 591#ifdef CONFIG_CIFS_UPCALL
588#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */ 592#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */
589#else 593#else
590#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */ 594#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
591#endif /* UPCALL */ 595#endif /* UPCALL */
592#else /* do not allow weak pw hash */ 596#else /* do not allow weak pw hash */
593#ifdef CONFIG_CIFS_UPCALL 597#ifdef CONFIG_CIFS_UPCALL
@@ -669,12 +673,6 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
669 */ 673 */
670GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 674GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
671 675
672/* Global list of oplocks */
673GLOBAL_EXTERN struct list_head cifs_oplock_list;
674
675/* Protects the cifs_oplock_list */
676GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
677
678/* Outstanding dir notify requests */ 676/* Outstanding dir notify requests */
679GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 677GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
680/* DirNotify response queue */ 678/* DirNotify response queue */
@@ -725,3 +723,4 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
725GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 723GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
726GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 724GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
727 725
726extern const struct slow_work_ops cifs_oplock_break_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index da8fbf565991..6928c24d1d42 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,18 +86,17 @@ extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
86 const int stage, 86 const int stage,
87 const struct nls_table *nls_cp); 87 const struct nls_table *nls_cp);
88extern __u16 GetNextMid(struct TCP_Server_Info *server); 88extern __u16 GetNextMid(struct TCP_Server_Info *server);
89extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
90 struct cifsTconInfo *);
91extern void DeleteOplockQEntry(struct oplock_q_entry *);
92extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
93extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 89extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
94extern u64 cifs_UnixTimeToNT(struct timespec); 90extern u64 cifs_UnixTimeToNT(struct timespec);
95extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 91extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
96 int offset); 92 int offset);
97 93
94extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
95 __u16 fileHandle, struct file *file,
96 struct vfsmount *mnt, unsigned int oflags);
98extern int cifs_posix_open(char *full_path, struct inode **pinode, 97extern int cifs_posix_open(char *full_path, struct inode **pinode,
99 struct super_block *sb, int mode, int oflags, 98 struct vfsmount *mnt, int mode, int oflags,
100 int *poplock, __u16 *pnetfid, int xid); 99 __u32 *poplock, __u16 *pnetfid, int xid);
101extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 100extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
102 FILE_UNIX_BASIC_INFO *info, 101 FILE_UNIX_BASIC_INFO *info,
103 struct cifs_sb_info *cifs_sb); 102 struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 301e307e1279..941441d3e386 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -94,6 +94,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
94 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 94 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
95 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 95 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
96 open_file->invalidHandle = true; 96 open_file->invalidHandle = true;
97 open_file->oplock_break_cancelled = true;
97 } 98 }
98 write_unlock(&GlobalSMBSeslock); 99 write_unlock(&GlobalSMBSeslock);
99 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 100 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d49682433c20..43003e0bef18 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1670,7 +1670,6 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1670 CIFSSMBTDis(xid, tcon); 1670 CIFSSMBTDis(xid, tcon);
1671 _FreeXid(xid); 1671 _FreeXid(xid);
1672 1672
1673 DeleteTconOplockQEntries(tcon);
1674 tconInfoFree(tcon); 1673 tconInfoFree(tcon);
1675 cifs_put_smb_ses(ses); 1674 cifs_put_smb_ses(ses);
1676} 1675}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a6424cfc0121..627a60a6c1b1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -24,6 +24,7 @@
24#include <linux/stat.h> 24#include <linux/stat.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h>
27#include "cifsfs.h" 28#include "cifsfs.h"
28#include "cifspdu.h" 29#include "cifspdu.h"
29#include "cifsglob.h" 30#include "cifsglob.h"
@@ -129,44 +130,45 @@ cifs_bp_rename_retry:
129 return full_path; 130 return full_path;
130} 131}
131 132
132static void 133struct cifsFileInfo *
133cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, 134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
134 struct cifsTconInfo *tcon, bool write_only) 135 struct file *file, struct vfsmount *mnt, unsigned int oflags)
135{ 136{
136 int oplock = 0; 137 int oplock = 0;
137 struct cifsFileInfo *pCifsFile; 138 struct cifsFileInfo *pCifsFile;
138 struct cifsInodeInfo *pCifsInode; 139 struct cifsInodeInfo *pCifsInode;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
139 141
140 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 142 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
141
142 if (pCifsFile == NULL) 143 if (pCifsFile == NULL)
143 return; 144 return pCifsFile;
144 145
145 if (oplockEnabled) 146 if (oplockEnabled)
146 oplock = REQ_OPLOCK; 147 oplock = REQ_OPLOCK;
147 148
148 pCifsFile->netfid = fileHandle; 149 pCifsFile->netfid = fileHandle;
149 pCifsFile->pid = current->tgid; 150 pCifsFile->pid = current->tgid;
150 pCifsFile->pInode = newinode; 151 pCifsFile->pInode = igrab(newinode);
152 pCifsFile->mnt = mnt;
153 pCifsFile->pfile = file;
151 pCifsFile->invalidHandle = false; 154 pCifsFile->invalidHandle = false;
152 pCifsFile->closePend = false; 155 pCifsFile->closePend = false;
153 mutex_init(&pCifsFile->fh_mutex); 156 mutex_init(&pCifsFile->fh_mutex);
154 mutex_init(&pCifsFile->lock_mutex); 157 mutex_init(&pCifsFile->lock_mutex);
155 INIT_LIST_HEAD(&pCifsFile->llist); 158 INIT_LIST_HEAD(&pCifsFile->llist);
156 atomic_set(&pCifsFile->count, 1); 159 atomic_set(&pCifsFile->count, 1);
160 slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
157 161
158 /* set the following in open now
159 pCifsFile->pfile = file; */
160 write_lock(&GlobalSMBSeslock); 162 write_lock(&GlobalSMBSeslock);
161 list_add(&pCifsFile->tlist, &tcon->openFileList); 163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
162 pCifsInode = CIFS_I(newinode); 164 pCifsInode = CIFS_I(newinode);
163 if (pCifsInode) { 165 if (pCifsInode) {
164 /* if readable file instance put first in list*/ 166 /* if readable file instance put first in list*/
165 if (write_only) 167 if (oflags & FMODE_READ)
168 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
169 else
166 list_add_tail(&pCifsFile->flist, 170 list_add_tail(&pCifsFile->flist,
167 &pCifsInode->openFileList); 171 &pCifsInode->openFileList);
168 else
169 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
170 172
171 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
172 pCifsInode->clientCanCacheAll = true; 174 pCifsInode->clientCanCacheAll = true;
@@ -176,18 +178,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
176 pCifsInode->clientCanCacheRead = true; 178 pCifsInode->clientCanCacheRead = true;
177 } 179 }
178 write_unlock(&GlobalSMBSeslock); 180 write_unlock(&GlobalSMBSeslock);
181
182 return pCifsFile;
179} 183}
180 184
181int cifs_posix_open(char *full_path, struct inode **pinode, 185int cifs_posix_open(char *full_path, struct inode **pinode,
182 struct super_block *sb, int mode, int oflags, 186 struct vfsmount *mnt, int mode, int oflags,
183 int *poplock, __u16 *pnetfid, int xid) 187 __u32 *poplock, __u16 *pnetfid, int xid)
184{ 188{
185 int rc; 189 int rc;
186 __u32 oplock;
187 bool write_only = false;
188 FILE_UNIX_BASIC_INFO *presp_data; 190 FILE_UNIX_BASIC_INFO *presp_data;
189 __u32 posix_flags = 0; 191 __u32 posix_flags = 0;
190 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
191 struct cifs_fattr fattr; 193 struct cifs_fattr fattr;
192 194
193 cFYI(1, ("posix open %s", full_path)); 195 cFYI(1, ("posix open %s", full_path));
@@ -223,12 +225,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
223 if (oflags & O_DIRECT) 225 if (oflags & O_DIRECT)
224 posix_flags |= SMB_O_DIRECT; 226 posix_flags |= SMB_O_DIRECT;
225 227
226 if (!(oflags & FMODE_READ))
227 write_only = true;
228
229 mode &= ~current_umask(); 228 mode &= ~current_umask();
230 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, 229 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
231 pnetfid, presp_data, &oplock, full_path, 230 pnetfid, presp_data, poplock, full_path,
232 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 231 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
233 CIFS_MOUNT_MAP_SPECIAL_CHR); 232 CIFS_MOUNT_MAP_SPECIAL_CHR);
234 if (rc) 233 if (rc)
@@ -244,7 +243,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
244 243
245 /* get new inode and set it up */ 244 /* get new inode and set it up */
246 if (*pinode == NULL) { 245 if (*pinode == NULL) {
247 *pinode = cifs_iget(sb, &fattr); 246 *pinode = cifs_iget(mnt->mnt_sb, &fattr);
248 if (!*pinode) { 247 if (!*pinode) {
249 rc = -ENOMEM; 248 rc = -ENOMEM;
250 goto posix_open_ret; 249 goto posix_open_ret;
@@ -253,7 +252,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
253 cifs_fattr_to_inode(*pinode, &fattr); 252 cifs_fattr_to_inode(*pinode, &fattr);
254 } 253 }
255 254
256 cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only); 255 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
257 256
258posix_open_ret: 257posix_open_ret:
259 kfree(presp_data); 258 kfree(presp_data);
@@ -280,7 +279,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
280 int rc = -ENOENT; 279 int rc = -ENOENT;
281 int xid; 280 int xid;
282 int create_options = CREATE_NOT_DIR; 281 int create_options = CREATE_NOT_DIR;
283 int oplock = 0; 282 __u32 oplock = 0;
284 int oflags; 283 int oflags;
285 bool posix_create = false; 284 bool posix_create = false;
286 /* 285 /*
@@ -298,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
298 FILE_ALL_INFO *buf = NULL; 297 FILE_ALL_INFO *buf = NULL;
299 struct inode *newinode = NULL; 298 struct inode *newinode = NULL;
300 int disposition = FILE_OVERWRITE_IF; 299 int disposition = FILE_OVERWRITE_IF;
301 bool write_only = false;
302 300
303 xid = GetXid(); 301 xid = GetXid();
304 302
@@ -323,7 +321,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
323 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 321 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
324 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 322 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
325 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 323 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
326 rc = cifs_posix_open(full_path, &newinode, inode->i_sb, 324 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
327 mode, oflags, &oplock, &fileHandle, xid); 325 mode, oflags, &oplock, &fileHandle, xid);
328 /* EIO could indicate that (posix open) operation is not 326 /* EIO could indicate that (posix open) operation is not
329 supported, despite what server claimed in capability 327 supported, despite what server claimed in capability
@@ -351,11 +349,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
351 desiredAccess = 0; 349 desiredAccess = 0;
352 if (oflags & FMODE_READ) 350 if (oflags & FMODE_READ)
353 desiredAccess |= GENERIC_READ; /* is this too little? */ 351 desiredAccess |= GENERIC_READ; /* is this too little? */
354 if (oflags & FMODE_WRITE) { 352 if (oflags & FMODE_WRITE)
355 desiredAccess |= GENERIC_WRITE; 353 desiredAccess |= GENERIC_WRITE;
356 if (!(oflags & FMODE_READ))
357 write_only = true;
358 }
359 354
360 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 355 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
361 disposition = FILE_CREATE; 356 disposition = FILE_CREATE;
@@ -470,8 +465,8 @@ cifs_create_set_dentry:
470 /* mknod case - do not leave file open */ 465 /* mknod case - do not leave file open */
471 CIFSSMBClose(xid, tcon, fileHandle); 466 CIFSSMBClose(xid, tcon, fileHandle);
472 } else if (!(posix_create) && (newinode)) { 467 } else if (!(posix_create) && (newinode)) {
473 cifs_fill_fileinfo(newinode, fileHandle, 468 cifs_new_fileinfo(newinode, fileHandle, NULL,
474 cifs_sb->tcon, write_only); 469 nd->path.mnt, oflags);
475 } 470 }
476cifs_create_out: 471cifs_create_out:
477 kfree(buf); 472 kfree(buf);
@@ -611,7 +606,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
611{ 606{
612 int xid; 607 int xid;
613 int rc = 0; /* to get around spurious gcc warning, set to zero here */ 608 int rc = 0; /* to get around spurious gcc warning, set to zero here */
614 int oplock = 0; 609 __u32 oplock = 0;
615 __u16 fileHandle = 0; 610 __u16 fileHandle = 0;
616 bool posix_open = false; 611 bool posix_open = false;
617 struct cifs_sb_info *cifs_sb; 612 struct cifs_sb_info *cifs_sb;
@@ -683,8 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
683 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 678 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
684 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 679 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
685 (nd->intent.open.flags & O_CREAT)) { 680 (nd->intent.open.flags & O_CREAT)) {
686 rc = cifs_posix_open(full_path, &newInode, 681 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
687 parent_dir_inode->i_sb,
688 nd->intent.open.create_mode, 682 nd->intent.open.create_mode,
689 nd->intent.open.flags, &oplock, 683 nd->intent.open.flags, &oplock,
690 &fileHandle, xid); 684 &fileHandle, xid);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa7beac8b80e..429337eb7afe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -30,6 +30,7 @@
30#include <linux/writeback.h> 30#include <linux/writeback.h>
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h>
33#include <asm/div64.h> 34#include <asm/div64.h>
34#include "cifsfs.h" 35#include "cifsfs.h"
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -39,27 +40,6 @@
39#include "cifs_debug.h" 40#include "cifs_debug.h"
40#include "cifs_fs_sb.h" 41#include "cifs_fs_sb.h"
41 42
42static inline struct cifsFileInfo *cifs_init_private(
43 struct cifsFileInfo *private_data, struct inode *inode,
44 struct file *file, __u16 netfid)
45{
46 memset(private_data, 0, sizeof(struct cifsFileInfo));
47 private_data->netfid = netfid;
48 private_data->pid = current->tgid;
49 mutex_init(&private_data->fh_mutex);
50 mutex_init(&private_data->lock_mutex);
51 INIT_LIST_HEAD(&private_data->llist);
52 private_data->pfile = file; /* needed for writepage */
53 private_data->pInode = inode;
54 private_data->invalidHandle = false;
55 private_data->closePend = false;
56 /* Initialize reference count to one. The private data is
57 freed on the release of the last reference */
58 atomic_set(&private_data->count, 1);
59
60 return private_data;
61}
62
63static inline int cifs_convert_flags(unsigned int flags) 43static inline int cifs_convert_flags(unsigned int flags)
64{ 44{
65 if ((flags & O_ACCMODE) == O_RDONLY) 45 if ((flags & O_ACCMODE) == O_RDONLY)
@@ -123,9 +103,11 @@ static inline int cifs_get_disposition(unsigned int flags)
123} 103}
124 104
125/* all arguments to this function must be checked for validity in caller */ 105/* all arguments to this function must be checked for validity in caller */
126static inline int cifs_posix_open_inode_helper(struct inode *inode, 106static inline int
127 struct file *file, struct cifsInodeInfo *pCifsInode, 107cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
128 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) 108 struct cifsInodeInfo *pCifsInode,
109 struct cifsFileInfo *pCifsFile, __u32 oplock,
110 u16 netfid)
129{ 111{
130 112
131 write_lock(&GlobalSMBSeslock); 113 write_lock(&GlobalSMBSeslock);
@@ -219,17 +201,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
219 struct timespec temp; 201 struct timespec temp;
220 int rc; 202 int rc;
221 203
222 /* want handles we can use to read with first
223 in the list so we do not have to walk the
224 list to search for one in write_begin */
225 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
226 list_add_tail(&pCifsFile->flist,
227 &pCifsInode->openFileList);
228 } else {
229 list_add(&pCifsFile->flist,
230 &pCifsInode->openFileList);
231 }
232 write_unlock(&GlobalSMBSeslock);
233 if (pCifsInode->clientCanCacheRead) { 204 if (pCifsInode->clientCanCacheRead) {
234 /* we have the inode open somewhere else 205 /* we have the inode open somewhere else
235 no need to discard cache data */ 206 no need to discard cache data */
@@ -279,7 +250,8 @@ client_can_cache:
279int cifs_open(struct inode *inode, struct file *file) 250int cifs_open(struct inode *inode, struct file *file)
280{ 251{
281 int rc = -EACCES; 252 int rc = -EACCES;
282 int xid, oplock; 253 int xid;
254 __u32 oplock;
283 struct cifs_sb_info *cifs_sb; 255 struct cifs_sb_info *cifs_sb;
284 struct cifsTconInfo *tcon; 256 struct cifsTconInfo *tcon;
285 struct cifsFileInfo *pCifsFile; 257 struct cifsFileInfo *pCifsFile;
@@ -324,7 +296,7 @@ int cifs_open(struct inode *inode, struct file *file)
324 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 296 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
325 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 297 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
326 /* can not refresh inode info since size could be stale */ 298 /* can not refresh inode info since size could be stale */
327 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 299 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
328 cifs_sb->mnt_file_mode /* ignored */, 300 cifs_sb->mnt_file_mode /* ignored */,
329 oflags, &oplock, &netfid, xid); 301 oflags, &oplock, &netfid, xid);
330 if (rc == 0) { 302 if (rc == 0) {
@@ -414,24 +386,17 @@ int cifs_open(struct inode *inode, struct file *file)
414 cFYI(1, ("cifs_open returned 0x%x", rc)); 386 cFYI(1, ("cifs_open returned 0x%x", rc));
415 goto out; 387 goto out;
416 } 388 }
417 file->private_data = 389
418 kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 390 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
391 file->f_flags);
392 file->private_data = pCifsFile;
419 if (file->private_data == NULL) { 393 if (file->private_data == NULL) {
420 rc = -ENOMEM; 394 rc = -ENOMEM;
421 goto out; 395 goto out;
422 } 396 }
423 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
424 write_lock(&GlobalSMBSeslock);
425 list_add(&pCifsFile->tlist, &tcon->openFileList);
426 397
427 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 398 rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
428 if (pCifsInode) { 399 &oplock, buf, full_path, xid);
429 rc = cifs_open_inode_helper(inode, file, pCifsInode,
430 pCifsFile, tcon,
431 &oplock, buf, full_path, xid);
432 } else {
433 write_unlock(&GlobalSMBSeslock);
434 }
435 400
436 if (oplock & CIFS_CREATE_ACTION) { 401 if (oplock & CIFS_CREATE_ACTION) {
437 /* time to set mode which we can not set earlier due to 402 /* time to set mode which we can not set earlier due to
@@ -474,7 +439,8 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
474static int cifs_reopen_file(struct file *file, bool can_flush) 439static int cifs_reopen_file(struct file *file, bool can_flush)
475{ 440{
476 int rc = -EACCES; 441 int rc = -EACCES;
477 int xid, oplock; 442 int xid;
443 __u32 oplock;
478 struct cifs_sb_info *cifs_sb; 444 struct cifs_sb_info *cifs_sb;
479 struct cifsTconInfo *tcon; 445 struct cifsTconInfo *tcon;
480 struct cifsFileInfo *pCifsFile; 446 struct cifsFileInfo *pCifsFile;
@@ -543,7 +509,7 @@ reopen_error_exit:
543 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 509 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
544 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 510 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
545 /* can not refresh inode info since size could be stale */ 511 /* can not refresh inode info since size could be stale */
546 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 512 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
547 cifs_sb->mnt_file_mode /* ignored */, 513 cifs_sb->mnt_file_mode /* ignored */,
548 oflags, &oplock, &netfid, xid); 514 oflags, &oplock, &netfid, xid);
549 if (rc == 0) { 515 if (rc == 0) {
@@ -2308,6 +2274,73 @@ out:
2308 return rc; 2274 return rc;
2309} 2275}
2310 2276
2277static void
2278cifs_oplock_break(struct slow_work *work)
2279{
2280 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2281 oplock_break);
2282 struct inode *inode = cfile->pInode;
2283 struct cifsInodeInfo *cinode = CIFS_I(inode);
2284 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
2285 int rc, waitrc = 0;
2286
2287 if (inode && S_ISREG(inode->i_mode)) {
2288#ifdef CONFIG_CIFS_EXPERIMENTAL
2289 if (cinode->clientCanCacheAll == 0)
2290 break_lease(inode, FMODE_READ);
2291 else if (cinode->clientCanCacheRead == 0)
2292 break_lease(inode, FMODE_WRITE);
2293#endif
2294 rc = filemap_fdatawrite(inode->i_mapping);
2295 if (cinode->clientCanCacheRead == 0) {
2296 waitrc = filemap_fdatawait(inode->i_mapping);
2297 invalidate_remote_inode(inode);
2298 }
2299 if (!rc)
2300 rc = waitrc;
2301 if (rc)
2302 cinode->write_behind_rc = rc;
2303 cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
2304 }
2305
2306 /*
2307 * releasing stale oplock after recent reconnect of smb session using
2308 * a now incorrect file handle is not a data integrity issue but do
2309 * not bother sending an oplock release if session to server still is
2310 * disconnected since oplock already released by the server
2311 */
2312 if (!cfile->closePend && !cfile->oplock_break_cancelled) {
2313 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
2314 LOCKING_ANDX_OPLOCK_RELEASE, false);
2315 cFYI(1, ("Oplock release rc = %d", rc));
2316 }
2317}
2318
2319static int
2320cifs_oplock_break_get(struct slow_work *work)
2321{
2322 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2323 oplock_break);
2324 mntget(cfile->mnt);
2325 cifsFileInfo_get(cfile);
2326 return 0;
2327}
2328
2329static void
2330cifs_oplock_break_put(struct slow_work *work)
2331{
2332 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2333 oplock_break);
2334 mntput(cfile->mnt);
2335 cifsFileInfo_put(cfile);
2336}
2337
2338const struct slow_work_ops cifs_oplock_break_ops = {
2339 .get_ref = cifs_oplock_break_get,
2340 .put_ref = cifs_oplock_break_put,
2341 .execute = cifs_oplock_break,
2342};
2343
2311const struct address_space_operations cifs_addr_ops = { 2344const struct address_space_operations cifs_addr_ops = {
2312 .readpage = cifs_readpage, 2345 .readpage = cifs_readpage,
2313 .readpages = cifs_readpages, 2346 .readpages = cifs_readpages,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 1f09c7619319..5e2492535daa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
1557 1557
1558static int cifs_vmtruncate(struct inode *inode, loff_t offset) 1558static int cifs_vmtruncate(struct inode *inode, loff_t offset)
1559{ 1559{
1560 struct address_space *mapping = inode->i_mapping; 1560 loff_t oldsize;
1561 unsigned long limit; 1561 int err;
1562 1562
1563 spin_lock(&inode->i_lock); 1563 spin_lock(&inode->i_lock);
1564 if (inode->i_size < offset) 1564 err = inode_newsize_ok(inode, offset);
1565 goto do_expand; 1565 if (err) {
1566 /*
1567 * truncation of in-use swapfiles is disallowed - it would cause
1568 * subsequent swapout to scribble on the now-freed blocks.
1569 */
1570 if (IS_SWAPFILE(inode)) {
1571 spin_unlock(&inode->i_lock);
1572 goto out_busy;
1573 }
1574 i_size_write(inode, offset);
1575 spin_unlock(&inode->i_lock);
1576 /*
1577 * unmap_mapping_range is called twice, first simply for efficiency
1578 * so that truncate_inode_pages does fewer single-page unmaps. However
1579 * after this first call, and before truncate_inode_pages finishes,
1580 * it is possible for private pages to be COWed, which remain after
1581 * truncate_inode_pages finishes, hence the second unmap_mapping_range
1582 * call must be made for correctness.
1583 */
1584 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1585 truncate_inode_pages(mapping, offset);
1586 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1587 goto out_truncate;
1588
1589do_expand:
1590 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1591 if (limit != RLIM_INFINITY && offset > limit) {
1592 spin_unlock(&inode->i_lock); 1566 spin_unlock(&inode->i_lock);
1593 goto out_sig; 1567 goto out;
1594 }
1595 if (offset > inode->i_sb->s_maxbytes) {
1596 spin_unlock(&inode->i_lock);
1597 goto out_big;
1598 } 1568 }
1569
1570 oldsize = inode->i_size;
1599 i_size_write(inode, offset); 1571 i_size_write(inode, offset);
1600 spin_unlock(&inode->i_lock); 1572 spin_unlock(&inode->i_lock);
1601out_truncate: 1573 truncate_pagecache(inode, oldsize, offset);
1602 if (inode->i_op->truncate) 1574 if (inode->i_op->truncate)
1603 inode->i_op->truncate(inode); 1575 inode->i_op->truncate(inode);
1604 return 0; 1576out:
1605out_sig: 1577 return err;
1606 send_sig(SIGXFSZ, current, 0);
1607out_big:
1608 return -EFBIG;
1609out_busy:
1610 return -ETXTBSY;
1611} 1578}
1612 1579
1613static int 1580static int
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e079a9190ec4..0241b25ac33f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -32,7 +32,6 @@
32 32
33extern mempool_t *cifs_sm_req_poolp; 33extern mempool_t *cifs_sm_req_poolp;
34extern mempool_t *cifs_req_poolp; 34extern mempool_t *cifs_req_poolp;
35extern struct task_struct *oplockThread;
36 35
37/* The xid serves as a useful identifier for each incoming vfs request, 36/* The xid serves as a useful identifier for each incoming vfs request,
38 in a similar way to the mid which is useful to track each sent smb, 37 in a similar way to the mid which is useful to track each sent smb,
@@ -500,6 +499,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
500 struct cifsTconInfo *tcon; 499 struct cifsTconInfo *tcon;
501 struct cifsInodeInfo *pCifsInode; 500 struct cifsInodeInfo *pCifsInode;
502 struct cifsFileInfo *netfile; 501 struct cifsFileInfo *netfile;
502 int rc;
503 503
504 cFYI(1, ("Checking for oplock break or dnotify response")); 504 cFYI(1, ("Checking for oplock break or dnotify response"));
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -562,30 +562,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
562 continue; 562 continue;
563 563
564 cifs_stats_inc(&tcon->num_oplock_brks); 564 cifs_stats_inc(&tcon->num_oplock_brks);
565 write_lock(&GlobalSMBSeslock); 565 read_lock(&GlobalSMBSeslock);
566 list_for_each(tmp2, &tcon->openFileList) { 566 list_for_each(tmp2, &tcon->openFileList) {
567 netfile = list_entry(tmp2, struct cifsFileInfo, 567 netfile = list_entry(tmp2, struct cifsFileInfo,
568 tlist); 568 tlist);
569 if (pSMB->Fid != netfile->netfid) 569 if (pSMB->Fid != netfile->netfid)
570 continue; 570 continue;
571 571
572 write_unlock(&GlobalSMBSeslock); 572 /*
573 read_unlock(&cifs_tcp_ses_lock); 573 * don't do anything if file is about to be
574 * closed anyway.
575 */
576 if (netfile->closePend) {
577 read_unlock(&GlobalSMBSeslock);
578 read_unlock(&cifs_tcp_ses_lock);
579 return true;
580 }
581
574 cFYI(1, ("file id match, oplock break")); 582 cFYI(1, ("file id match, oplock break"));
575 pCifsInode = CIFS_I(netfile->pInode); 583 pCifsInode = CIFS_I(netfile->pInode);
576 pCifsInode->clientCanCacheAll = false; 584 pCifsInode->clientCanCacheAll = false;
577 if (pSMB->OplockLevel == 0) 585 if (pSMB->OplockLevel == 0)
578 pCifsInode->clientCanCacheRead = false; 586 pCifsInode->clientCanCacheRead = false;
579 pCifsInode->oplockPending = true; 587 rc = slow_work_enqueue(&netfile->oplock_break);
580 AllocOplockQEntry(netfile->pInode, 588 if (rc) {
581 netfile->netfid, tcon); 589 cERROR(1, ("failed to enqueue oplock "
582 cFYI(1, ("about to wake up oplock thread")); 590 "break: %d\n", rc));
583 if (oplockThread) 591 } else {
584 wake_up_process(oplockThread); 592 netfile->oplock_break_cancelled = false;
585 593 }
594 read_unlock(&GlobalSMBSeslock);
595 read_unlock(&cifs_tcp_ses_lock);
586 return true; 596 return true;
587 } 597 }
588 write_unlock(&GlobalSMBSeslock); 598 read_unlock(&GlobalSMBSeslock);
589 read_unlock(&cifs_tcp_ses_lock); 599 read_unlock(&cifs_tcp_ses_lock);
590 cFYI(1, ("No matching file for oplock break")); 600 cFYI(1, ("No matching file for oplock break"));
591 return true; 601 return true;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f823a4a208a7..1f098ca71636 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -146,7 +146,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
146 } 146 }
147} 147}
148 148
149void 149static void
150cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, 150cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
151 struct cifs_sb_info *cifs_sb) 151 struct cifs_sb_info *cifs_sb)
152{ 152{
@@ -161,7 +161,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
161 cifs_fill_common_info(fattr, cifs_sb); 161 cifs_fill_common_info(fattr, cifs_sb);
162} 162}
163 163
164void 164static void
165cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 165cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
166 struct cifs_sb_info *cifs_sb) 166 struct cifs_sb_info *cifs_sb)
167{ 167{
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1da4ab250eae..07b8e71544ee 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -103,56 +103,6 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 103 mempool_free(midEntry, cifs_mid_poolp);
104} 104}
105 105
106struct oplock_q_entry *
107AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
108{
109 struct oplock_q_entry *temp;
110 if ((pinode == NULL) || (tcon == NULL)) {
111 cERROR(1, ("Null parms passed to AllocOplockQEntry"));
112 return NULL;
113 }
114 temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep,
115 GFP_KERNEL);
116 if (temp == NULL)
117 return temp;
118 else {
119 temp->pinode = pinode;
120 temp->tcon = tcon;
121 temp->netfid = fid;
122 spin_lock(&cifs_oplock_lock);
123 list_add_tail(&temp->qhead, &cifs_oplock_list);
124 spin_unlock(&cifs_oplock_lock);
125 }
126 return temp;
127}
128
129void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
130{
131 spin_lock(&cifs_oplock_lock);
132 /* should we check if list empty first? */
133 list_del(&oplockEntry->qhead);
134 spin_unlock(&cifs_oplock_lock);
135 kmem_cache_free(cifs_oplock_cachep, oplockEntry);
136}
137
138
139void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
140{
141 struct oplock_q_entry *temp;
142
143 if (tcon == NULL)
144 return;
145
146 spin_lock(&cifs_oplock_lock);
147 list_for_each_entry(temp, &cifs_oplock_list, qhead) {
148 if ((temp->tcon) && (temp->tcon == tcon)) {
149 list_del(&temp->qhead);
150 kmem_cache_free(cifs_oplock_cachep, temp);
151 }
152 }
153 spin_unlock(&cifs_oplock_lock);
154}
155
156static int 106static int
157smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
158{ 108{
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
2#define _CODA_INT_ 2#define _CODA_INT_
3 3
4struct dentry; 4struct dentry;
5struct file;
5 6
6extern struct file_system_type coda_fs_type; 7extern struct file_system_type coda_fs_type;
7extern unsigned long coda_timeout; 8extern unsigned long coda_timeout;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44a..be4392ca2098 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/major.h> 23#include <linux/major.h>
24#include <linux/time.h> 24#include <linux/time.h>
25#include <linux/sched.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/ioport.h> 27#include <linux/ioport.h>
27#include <linux/fcntl.h> 28#include <linux/fcntl.h>
diff --git a/fs/compat.c b/fs/compat.c
index 6d6f98fe64a0..d576b552e8e2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -100,13 +100,6 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
100 get_compat_timespec(&tv[1], &t[1])) 100 get_compat_timespec(&tv[1], &t[1]))
101 return -EFAULT; 101 return -EFAULT;
102 102
103 if ((tv[0].tv_nsec == UTIME_OMIT || tv[0].tv_nsec == UTIME_NOW)
104 && tv[0].tv_sec != 0)
105 return -EINVAL;
106 if ((tv[1].tv_nsec == UTIME_OMIT || tv[1].tv_nsec == UTIME_NOW)
107 && tv[1].tv_sec != 0)
108 return -EINVAL;
109
110 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) 103 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
111 return 0; 104 return 0;
112 } 105 }
@@ -775,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
775 char __user * type, unsigned long flags, 768 char __user * type, unsigned long flags,
776 void __user * data) 769 void __user * data)
777{ 770{
778 unsigned long type_page; 771 char *kernel_type;
779 unsigned long data_page; 772 unsigned long data_page;
780 unsigned long dev_page; 773 char *kernel_dev;
781 char *dir_page; 774 char *dir_page;
782 int retval; 775 int retval;
783 776
784 retval = copy_mount_options (type, &type_page); 777 retval = copy_mount_string(type, &kernel_type);
785 if (retval < 0) 778 if (retval < 0)
786 goto out; 779 goto out;
787 780
@@ -790,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
790 if (IS_ERR(dir_page)) 783 if (IS_ERR(dir_page))
791 goto out1; 784 goto out1;
792 785
793 retval = copy_mount_options (dev_name, &dev_page); 786 retval = copy_mount_string(dev_name, &kernel_dev);
794 if (retval < 0) 787 if (retval < 0)
795 goto out2; 788 goto out2;
796 789
797 retval = copy_mount_options (data, &data_page); 790 retval = copy_mount_options(data, &data_page);
798 if (retval < 0) 791 if (retval < 0)
799 goto out3; 792 goto out3;
800 793
801 retval = -EINVAL; 794 retval = -EINVAL;
802 795
803 if (type_page && data_page) { 796 if (kernel_type && data_page) {
804 if (!strcmp((char *)type_page, SMBFS_NAME)) { 797 if (!strcmp(kernel_type, SMBFS_NAME)) {
805 do_smb_super_data_conv((void *)data_page); 798 do_smb_super_data_conv((void *)data_page);
806 } else if (!strcmp((char *)type_page, NCPFS_NAME)) { 799 } else if (!strcmp(kernel_type, NCPFS_NAME)) {
807 do_ncp_super_data_conv((void *)data_page); 800 do_ncp_super_data_conv((void *)data_page);
808 } else if (!strcmp((char *)type_page, NFS4_NAME)) { 801 } else if (!strcmp(kernel_type, NFS4_NAME)) {
809 if (do_nfs4_super_data_conv((void *) data_page)) 802 if (do_nfs4_super_data_conv((void *) data_page))
810 goto out4; 803 goto out4;
811 } 804 }
812 } 805 }
813 806
814 retval = do_mount((char*)dev_page, dir_page, (char*)type_page, 807 retval = do_mount(kernel_dev, dir_page, kernel_type,
815 flags, (void*)data_page); 808 flags, (void*)data_page);
816 809
817 out4: 810 out4:
818 free_page(data_page); 811 free_page(data_page);
819 out3: 812 out3:
820 free_page(dev_page); 813 kfree(kernel_dev);
821 out2: 814 out2:
822 putname(dir_page); 815 putname(dir_page);
823 out1: 816 out1:
824 free_page(type_page); 817 kfree(kernel_type);
825 out: 818 out:
826 return retval; 819 return retval;
827} 820}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 75efb028974b..d5f8c96964be 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,14 +18,13 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/tty.h> 19#include <linux/tty.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/magic.h>
21#include <linux/idr.h> 22#include <linux/idr.h>
22#include <linux/devpts_fs.h> 23#include <linux/devpts_fs.h>
23#include <linux/parser.h> 24#include <linux/parser.h>
24#include <linux/fsnotify.h> 25#include <linux/fsnotify.h>
25#include <linux/seq_file.h> 26#include <linux/seq_file.h>
26 27
27#define DEVPTS_SUPER_MAGIC 0x1cd1
28
29#define DEVPTS_DEFAULT_MODE 0600 28#define DEVPTS_DEFAULT_MODE 0600
30/* 29/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single- 30 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1d1d27442235..1c8bb8c3a82e 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -386,9 +386,9 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
386 return rv; 386 return rv;
387} 387}
388 388
389static struct seq_operations format1_seq_ops; 389static const struct seq_operations format1_seq_ops;
390static struct seq_operations format2_seq_ops; 390static const struct seq_operations format2_seq_ops;
391static struct seq_operations format3_seq_ops; 391static const struct seq_operations format3_seq_ops;
392 392
393static void *table_seq_start(struct seq_file *seq, loff_t *pos) 393static void *table_seq_start(struct seq_file *seq, loff_t *pos)
394{ 394{
@@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
534 } 534 }
535} 535}
536 536
537static struct seq_operations format1_seq_ops = { 537static const struct seq_operations format1_seq_ops = {
538 .start = table_seq_start, 538 .start = table_seq_start,
539 .next = table_seq_next, 539 .next = table_seq_next,
540 .stop = table_seq_stop, 540 .stop = table_seq_stop,
541 .show = table_seq_show, 541 .show = table_seq_show,
542}; 542};
543 543
544static struct seq_operations format2_seq_ops = { 544static const struct seq_operations format2_seq_ops = {
545 .start = table_seq_start, 545 .start = table_seq_start,
546 .next = table_seq_next, 546 .next = table_seq_next,
547 .stop = table_seq_stop, 547 .stop = table_seq_stop,
548 .show = table_seq_show, 548 .show = table_seq_show,
549}; 549};
550 550
551static struct seq_operations format3_seq_ops = { 551static const struct seq_operations format3_seq_ops = {
552 .start = table_seq_start, 552 .start = table_seq_start,
553 .next = table_seq_next, 553 .next = table_seq_next,
554 .stop = table_seq_stop, 554 .stop = table_seq_stop,
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
63} 63}
64 64
65int drop_caches_sysctl_handler(ctl_table *table, int write, 65int drop_caches_sysctl_handler(ctl_table *table, int write,
66 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 66 void __user *buffer, size_t *length, loff_t *ppos)
67{ 67{
68 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 68 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 69 if (write) {
70 if (sysctl_drop_caches & 1) 70 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 71 drop_pagecache();
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 0c754e64232b..8aadb99b7634 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,8 @@
1config ECRYPT_FS 1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)" 2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO && NET 3 depends on EXPERIMENTAL && KEYS && NET
4 select CRYPTO_ECB
5 select CRYPTO_CBC
4 help 6 help
5 Encrypted filesystem that operates on the VFS layer. See 7 Encrypted filesystem that operates on the VFS layer. See
6 <file:Documentation/filesystems/ecryptfs.txt> to learn more about 8 <file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b91851f1cda3..fbb6e5eed697 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -245,13 +245,11 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
245 crypto_free_blkcipher(crypt_stat->tfm); 245 crypto_free_blkcipher(crypt_stat->tfm);
246 if (crypt_stat->hash_tfm) 246 if (crypt_stat->hash_tfm)
247 crypto_free_hash(crypt_stat->hash_tfm); 247 crypto_free_hash(crypt_stat->hash_tfm);
248 mutex_lock(&crypt_stat->keysig_list_mutex);
249 list_for_each_entry_safe(key_sig, key_sig_tmp, 248 list_for_each_entry_safe(key_sig, key_sig_tmp,
250 &crypt_stat->keysig_list, crypt_stat_list) { 249 &crypt_stat->keysig_list, crypt_stat_list) {
251 list_del(&key_sig->crypt_stat_list); 250 list_del(&key_sig->crypt_stat_list);
252 kmem_cache_free(ecryptfs_key_sig_cache, key_sig); 251 kmem_cache_free(ecryptfs_key_sig_cache, key_sig);
253 } 252 }
254 mutex_unlock(&crypt_stat->keysig_list_mutex);
255 memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); 253 memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
256} 254}
257 255
@@ -511,13 +509,14 @@ int ecryptfs_encrypt_page(struct page *page)
511 + extent_offset), crypt_stat); 509 + extent_offset), crypt_stat);
512 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, 510 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
513 offset, crypt_stat->extent_size); 511 offset, crypt_stat->extent_size);
514 if (rc) { 512 if (rc < 0) {
515 ecryptfs_printk(KERN_ERR, "Error attempting " 513 ecryptfs_printk(KERN_ERR, "Error attempting "
516 "to write lower page; rc = [%d]" 514 "to write lower page; rc = [%d]"
517 "\n", rc); 515 "\n", rc);
518 goto out; 516 goto out;
519 } 517 }
520 } 518 }
519 rc = 0;
521out: 520out:
522 if (enc_extent_page) { 521 if (enc_extent_page) {
523 kunmap(enc_extent_page); 522 kunmap(enc_extent_page);
@@ -633,7 +632,7 @@ int ecryptfs_decrypt_page(struct page *page)
633 rc = ecryptfs_read_lower(enc_extent_virt, offset, 632 rc = ecryptfs_read_lower(enc_extent_virt, offset,
634 crypt_stat->extent_size, 633 crypt_stat->extent_size,
635 ecryptfs_inode); 634 ecryptfs_inode);
636 if (rc) { 635 if (rc < 0) {
637 ecryptfs_printk(KERN_ERR, "Error attempting " 636 ecryptfs_printk(KERN_ERR, "Error attempting "
638 "to read lower page; rc = [%d]" 637 "to read lower page; rc = [%d]"
639 "\n", rc); 638 "\n", rc);
@@ -797,6 +796,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
797 kfree(full_alg_name); 796 kfree(full_alg_name);
798 if (IS_ERR(crypt_stat->tfm)) { 797 if (IS_ERR(crypt_stat->tfm)) {
799 rc = PTR_ERR(crypt_stat->tfm); 798 rc = PTR_ERR(crypt_stat->tfm);
799 crypt_stat->tfm = NULL;
800 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " 800 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
801 "Error initializing cipher [%s]\n", 801 "Error initializing cipher [%s]\n",
802 crypt_stat->cipher); 802 crypt_stat->cipher);
@@ -925,7 +925,9 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
925 struct ecryptfs_global_auth_tok *global_auth_tok; 925 struct ecryptfs_global_auth_tok *global_auth_tok;
926 int rc = 0; 926 int rc = 0;
927 927
928 mutex_lock(&crypt_stat->keysig_list_mutex);
928 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 929 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
930
929 list_for_each_entry(global_auth_tok, 931 list_for_each_entry(global_auth_tok,
930 &mount_crypt_stat->global_auth_tok_list, 932 &mount_crypt_stat->global_auth_tok_list,
931 mount_crypt_stat_list) { 933 mount_crypt_stat_list) {
@@ -934,13 +936,13 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
934 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); 936 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
935 if (rc) { 937 if (rc) {
936 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); 938 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
937 mutex_unlock(
938 &mount_crypt_stat->global_auth_tok_list_mutex);
939 goto out; 939 goto out;
940 } 940 }
941 } 941 }
942 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 942
943out: 943out:
944 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
945 mutex_unlock(&crypt_stat->keysig_list_mutex);
944 return rc; 946 return rc;
945} 947}
946 948
@@ -1212,14 +1214,15 @@ int ecryptfs_read_and_validate_header_region(char *data,
1212 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; 1214 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
1213 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1215 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
1214 ecryptfs_inode); 1216 ecryptfs_inode);
1215 if (rc) { 1217 if (rc < 0) {
1216 printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", 1218 printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
1217 __func__, rc); 1219 __func__, rc);
1218 goto out; 1220 goto out;
1219 } 1221 }
1220 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { 1222 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1221 rc = -EINVAL; 1223 rc = -EINVAL;
1222 } 1224 } else
1225 rc = 0;
1223out: 1226out:
1224 return rc; 1227 return rc;
1225} 1228}
@@ -1314,10 +1317,11 @@ ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
1314 1317
1315 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1318 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
1316 0, virt_len); 1319 0, virt_len);
1317 if (rc) 1320 if (rc < 0)
1318 printk(KERN_ERR "%s: Error attempting to write header " 1321 printk(KERN_ERR "%s: Error attempting to write header "
1319 "information to lower file; rc = [%d]\n", __func__, 1322 "information to lower file; rc = [%d]\n", __func__, rc);
1320 rc); 1323 else
1324 rc = 0;
1321 return rc; 1325 return rc;
1322} 1326}
1323 1327
@@ -1597,7 +1601,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1597 } 1601 }
1598 rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, 1602 rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
1599 ecryptfs_inode); 1603 ecryptfs_inode);
1600 if (!rc) 1604 if (rc >= 0)
1601 rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, 1605 rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
1602 ecryptfs_dentry, 1606 ecryptfs_dentry,
1603 ECRYPTFS_VALIDATE_HEADER_SIZE); 1607 ECRYPTFS_VALIDATE_HEADER_SIZE);
@@ -1702,7 +1706,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1702 } else { 1706 } else {
1703 printk(KERN_ERR "%s: No support for requested filename " 1707 printk(KERN_ERR "%s: No support for requested filename "
1704 "encryption method in this release\n", __func__); 1708 "encryption method in this release\n", __func__);
1705 rc = -ENOTSUPP; 1709 rc = -EOPNOTSUPP;
1706 goto out; 1710 goto out;
1707 } 1711 }
1708out: 1712out:
@@ -1763,7 +1767,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1763 if (IS_ERR(*key_tfm)) { 1767 if (IS_ERR(*key_tfm)) {
1764 rc = PTR_ERR(*key_tfm); 1768 rc = PTR_ERR(*key_tfm);
1765 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1769 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1766 "[%s]; rc = [%d]\n", cipher_name, rc); 1770 "[%s]; rc = [%d]\n", full_alg_name, rc);
1767 goto out; 1771 goto out;
1768 } 1772 }
1769 crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); 1773 crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
@@ -1776,7 +1780,8 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1776 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1780 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
1777 if (rc) { 1781 if (rc) {
1778 printk(KERN_ERR "Error attempting to set key of size [%zd] for " 1782 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1779 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); 1783 "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
1784 rc);
1780 rc = -EINVAL; 1785 rc = -EINVAL;
1781 goto out; 1786 goto out;
1782 } 1787 }
@@ -2166,7 +2171,7 @@ int ecryptfs_encrypt_and_encode_filename(
2166 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2167 (*encoded_name_size)++; 2172 (*encoded_name_size)++;
2168 } else { 2173 } else {
2169 rc = -ENOTSUPP; 2174 rc = -EOPNOTSUPP;
2170 } 2175 }
2171 if (rc) { 2176 if (rc) {
2172 printk(KERN_ERR "%s: Error attempting to encode " 2177 printk(KERN_ERR "%s: Error attempting to encode "
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 00b30a2d5466..542f625312f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops;
582extern const struct inode_operations ecryptfs_symlink_iops; 582extern const struct inode_operations ecryptfs_symlink_iops;
583extern const struct super_operations ecryptfs_sops; 583extern const struct super_operations ecryptfs_sops;
584extern const struct dentry_operations ecryptfs_dops; 584extern const struct dentry_operations ecryptfs_dops;
585extern struct address_space_operations ecryptfs_aops; 585extern const struct address_space_operations ecryptfs_aops;
586extern int ecryptfs_verbosity; 586extern int ecryptfs_verbosity;
587extern unsigned int ecryptfs_message_buf_len; 587extern unsigned int ecryptfs_message_buf_len;
588extern signed long ecryptfs_message_wait_timeout; 588extern signed long ecryptfs_message_wait_timeout;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 2f0945d63297..056fed62d0de 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -476,6 +476,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
476 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); 476 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
477 struct dentry *lower_dir_dentry; 477 struct dentry *lower_dir_dentry;
478 478
479 dget(lower_dentry);
479 lower_dir_dentry = lock_parent(lower_dentry); 480 lower_dir_dentry = lock_parent(lower_dentry);
480 rc = vfs_unlink(lower_dir_inode, lower_dentry); 481 rc = vfs_unlink(lower_dir_inode, lower_dentry);
481 if (rc) { 482 if (rc) {
@@ -489,6 +490,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
489 d_drop(dentry); 490 d_drop(dentry);
490out_unlock: 491out_unlock:
491 unlock_dir(lower_dir_dentry); 492 unlock_dir(lower_dir_dentry);
493 dput(lower_dentry);
492 return rc; 494 return rc;
493} 495}
494 496
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 259525c9abb8..a0a7847567e9 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -416,7 +416,9 @@ ecryptfs_find_global_auth_tok_for_sig(
416 &mount_crypt_stat->global_auth_tok_list, 416 &mount_crypt_stat->global_auth_tok_list,
417 mount_crypt_stat_list) { 417 mount_crypt_stat_list) {
418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { 418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
419 (*global_auth_tok) = walker; 419 rc = key_validate(walker->global_auth_tok_key);
420 if (!rc)
421 (*global_auth_tok) = walker;
420 goto out; 422 goto out;
421 } 423 }
422 } 424 }
@@ -612,7 +614,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
612 } 614 }
613 /* TODO: Support other key modules than passphrase for 615 /* TODO: Support other key modules than passphrase for
614 * filename encryption */ 616 * filename encryption */
615 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); 617 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
618 rc = -EOPNOTSUPP;
619 printk(KERN_INFO "%s: Filename encryption only supports "
620 "password tokens\n", __func__);
621 goto out_free_unlock;
622 }
616 sg_init_one( 623 sg_init_one(
617 &s->hash_sg, 624 &s->hash_sg,
618 (u8 *)s->auth_tok->token.password.session_key_encryption_key, 625 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
@@ -910,7 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
910 } 917 }
911 /* TODO: Support other key modules than passphrase for 918 /* TODO: Support other key modules than passphrase for
912 * filename encryption */ 919 * filename encryption */
913 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); 920 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
921 rc = -EOPNOTSUPP;
922 printk(KERN_INFO "%s: Filename encryption only supports "
923 "password tokens\n", __func__);
924 goto out_free_unlock;
925 }
914 rc = crypto_blkcipher_setkey( 926 rc = crypto_blkcipher_setkey(
915 s->desc.tfm, 927 s->desc.tfm,
916 s->auth_tok->token.password.session_key_encryption_key, 928 s->auth_tok->token.password.session_key_encryption_key,
@@ -1316,8 +1328,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1316 rc = -EINVAL; 1328 rc = -EINVAL;
1317 goto out_free; 1329 goto out_free;
1318 } 1330 }
1319 ecryptfs_cipher_code_to_string(crypt_stat->cipher, 1331 rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher,
1320 (u16)data[(*packet_size)]); 1332 (u16)data[(*packet_size)]);
1333 if (rc)
1334 goto out_free;
1321 /* A little extra work to differentiate among the AES key 1335 /* A little extra work to differentiate among the AES key
1322 * sizes; see RFC2440 */ 1336 * sizes; see RFC2440 */
1323 switch(data[(*packet_size)++]) { 1337 switch(data[(*packet_size)++]) {
@@ -1328,7 +1342,9 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1328 crypt_stat->key_size = 1342 crypt_stat->key_size =
1329 (*new_auth_tok)->session_key.encrypted_key_size; 1343 (*new_auth_tok)->session_key.encrypted_key_size;
1330 } 1344 }
1331 ecryptfs_init_crypt_ctx(crypt_stat); 1345 rc = ecryptfs_init_crypt_ctx(crypt_stat);
1346 if (rc)
1347 goto out_free;
1332 if (unlikely(data[(*packet_size)++] != 0x03)) { 1348 if (unlikely(data[(*packet_size)++] != 0x03)) {
1333 printk(KERN_WARNING "Only S2K ID 3 is currently supported\n"); 1349 printk(KERN_WARNING "Only S2K ID 3 is currently supported\n");
1334 rc = -ENOSYS; 1350 rc = -ENOSYS;
@@ -2366,21 +2382,18 @@ struct kmem_cache *ecryptfs_key_sig_cache;
2366int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) 2382int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
2367{ 2383{
2368 struct ecryptfs_key_sig *new_key_sig; 2384 struct ecryptfs_key_sig *new_key_sig;
2369 int rc = 0;
2370 2385
2371 new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); 2386 new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
2372 if (!new_key_sig) { 2387 if (!new_key_sig) {
2373 rc = -ENOMEM;
2374 printk(KERN_ERR 2388 printk(KERN_ERR
2375 "Error allocating from ecryptfs_key_sig_cache\n"); 2389 "Error allocating from ecryptfs_key_sig_cache\n");
2376 goto out; 2390 return -ENOMEM;
2377 } 2391 }
2378 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); 2392 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
2379 mutex_lock(&crypt_stat->keysig_list_mutex); 2393 /* Caller must hold keysig_list_mutex */
2380 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); 2394 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
2381 mutex_unlock(&crypt_stat->keysig_list_mutex); 2395
2382out: 2396 return 0;
2383 return rc;
2384} 2397}
2385 2398
2386struct kmem_cache *ecryptfs_global_auth_tok_cache; 2399struct kmem_cache *ecryptfs_global_auth_tok_cache;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c6d7a4d748a0..e14cf7e588db 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -136,6 +136,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
136 const struct cred *cred) 136 const struct cred *cred)
137{ 137{
138 struct ecryptfs_open_req *req; 138 struct ecryptfs_open_req *req;
139 int flags = O_LARGEFILE;
139 int rc = 0; 140 int rc = 0;
140 141
141 /* Corresponding dput() and mntput() are done when the 142 /* Corresponding dput() and mntput() are done when the
@@ -143,10 +144,14 @@ int ecryptfs_privileged_open(struct file **lower_file,
143 * destroyed. */ 144 * destroyed. */
144 dget(lower_dentry); 145 dget(lower_dentry);
145 mntget(lower_mnt); 146 mntget(lower_mnt);
146 (*lower_file) = dentry_open(lower_dentry, lower_mnt, 147 flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
147 (O_RDWR | O_LARGEFILE), cred); 148 (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred);
148 if (!IS_ERR(*lower_file)) 149 if (!IS_ERR(*lower_file))
149 goto out; 150 goto out;
151 if (flags & O_RDONLY) {
152 rc = PTR_ERR((*lower_file));
153 goto out;
154 }
150 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); 155 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
151 if (!req) { 156 if (!req) {
152 rc = -ENOMEM; 157 rc = -ENOMEM;
@@ -180,21 +185,8 @@ int ecryptfs_privileged_open(struct file **lower_file,
180 __func__); 185 __func__);
181 goto out_unlock; 186 goto out_unlock;
182 } 187 }
183 if (IS_ERR(*req->lower_file)) { 188 if (IS_ERR(*req->lower_file))
184 rc = PTR_ERR(*req->lower_file); 189 rc = PTR_ERR(*req->lower_file);
185 dget(lower_dentry);
186 mntget(lower_mnt);
187 (*lower_file) = dentry_open(lower_dentry, lower_mnt,
188 (O_RDONLY | O_LARGEFILE), cred);
189 if (IS_ERR(*lower_file)) {
190 rc = PTR_ERR(*req->lower_file);
191 (*lower_file) = NULL;
192 printk(KERN_WARNING "%s: Error attempting privileged "
193 "open of lower file with either RW or RO "
194 "perms; rc = [%d]. Giving up.\n",
195 __func__, rc);
196 }
197 }
198out_unlock: 190out_unlock:
199 mutex_unlock(&req->mux); 191 mutex_unlock(&req->mux);
200out_free: 192out_free:
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f0aa9883c28..101fe4c7b1ee 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -129,11 +129,10 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
129 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 129 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
130 rc = ecryptfs_privileged_open(&inode_info->lower_file, 130 rc = ecryptfs_privileged_open(&inode_info->lower_file,
131 lower_dentry, lower_mnt, cred); 131 lower_dentry, lower_mnt, cred);
132 if (rc || IS_ERR(inode_info->lower_file)) { 132 if (rc) {
133 printk(KERN_ERR "Error opening lower persistent file " 133 printk(KERN_ERR "Error opening lower persistent file "
134 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 134 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
135 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 135 "rc = [%d]\n", lower_dentry, lower_mnt, rc);
136 rc = PTR_ERR(inode_info->lower_file);
137 inode_info->lower_file = NULL; 136 inode_info->lower_file = NULL;
138 } 137 }
139 } 138 }
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5c6bab9786e3..df4ce99d0597 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -396,9 +396,11 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
396 rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, 396 rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
397 sizeof(u64)); 397 sizeof(u64));
398 kfree(file_size_virt); 398 kfree(file_size_virt);
399 if (rc) 399 if (rc < 0)
400 printk(KERN_ERR "%s: Error writing file size to header; " 400 printk(KERN_ERR "%s: Error writing file size to header; "
401 "rc = [%d]\n", __func__, rc); 401 "rc = [%d]\n", __func__, rc);
402 else
403 rc = 0;
402out: 404out:
403 return rc; 405 return rc;
404} 406}
@@ -545,7 +547,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
545 return rc; 547 return rc;
546} 548}
547 549
548struct address_space_operations ecryptfs_aops = { 550const struct address_space_operations ecryptfs_aops = {
549 .writepage = ecryptfs_writepage, 551 .writepage = ecryptfs_writepage,
550 .readpage = ecryptfs_readpage, 552 .readpage = ecryptfs_readpage,
551 .write_begin = ecryptfs_write_begin, 553 .write_begin = ecryptfs_write_begin,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index a137c6ea2fee..0cc4fafd6552 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -34,15 +34,14 @@
34 * 34 *
35 * Write data to the lower file. 35 * Write data to the lower file.
36 * 36 *
37 * Returns zero on success; non-zero on error 37 * Returns bytes written on success; less than zero on error
38 */ 38 */
39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, 39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
40 loff_t offset, size_t size) 40 loff_t offset, size_t size)
41{ 41{
42 struct ecryptfs_inode_info *inode_info; 42 struct ecryptfs_inode_info *inode_info;
43 ssize_t octets_written;
44 mm_segment_t fs_save; 43 mm_segment_t fs_save;
45 int rc = 0; 44 ssize_t rc;
46 45
47 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
48 mutex_lock(&inode_info->lower_file_mutex); 47 mutex_lock(&inode_info->lower_file_mutex);
@@ -50,14 +49,9 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
50 inode_info->lower_file->f_pos = offset; 49 inode_info->lower_file->f_pos = offset;
51 fs_save = get_fs(); 50 fs_save = get_fs();
52 set_fs(get_ds()); 51 set_fs(get_ds());
53 octets_written = vfs_write(inode_info->lower_file, data, size, 52 rc = vfs_write(inode_info->lower_file, data, size,
54 &inode_info->lower_file->f_pos); 53 &inode_info->lower_file->f_pos);
55 set_fs(fs_save); 54 set_fs(fs_save);
56 if (octets_written < 0) {
57 printk(KERN_ERR "%s: octets_written = [%td]; "
58 "expected [%td]\n", __func__, octets_written, size);
59 rc = -EINVAL;
60 }
61 mutex_unlock(&inode_info->lower_file_mutex); 55 mutex_unlock(&inode_info->lower_file_mutex);
62 mark_inode_dirty_sync(ecryptfs_inode); 56 mark_inode_dirty_sync(ecryptfs_inode);
63 return rc; 57 return rc;
@@ -91,6 +85,8 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
91 + offset_in_page); 85 + offset_in_page);
92 virt = kmap(page_for_lower); 86 virt = kmap(page_for_lower);
93 rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); 87 rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
88 if (rc > 0)
89 rc = 0;
94 kunmap(page_for_lower); 90 kunmap(page_for_lower);
95 return rc; 91 return rc;
96} 92}
@@ -229,30 +225,24 @@ out:
229 * Read @size bytes of data at byte offset @offset from the lower 225 * Read @size bytes of data at byte offset @offset from the lower
230 * inode into memory location @data. 226 * inode into memory location @data.
231 * 227 *
232 * Returns zero on success; non-zero on error 228 * Returns bytes read on success; 0 on EOF; less than zero on error
233 */ 229 */
234int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 230int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
235 struct inode *ecryptfs_inode) 231 struct inode *ecryptfs_inode)
236{ 232{
237 struct ecryptfs_inode_info *inode_info = 233 struct ecryptfs_inode_info *inode_info =
238 ecryptfs_inode_to_private(ecryptfs_inode); 234 ecryptfs_inode_to_private(ecryptfs_inode);
239 ssize_t octets_read;
240 mm_segment_t fs_save; 235 mm_segment_t fs_save;
241 int rc = 0; 236 ssize_t rc;
242 237
243 mutex_lock(&inode_info->lower_file_mutex); 238 mutex_lock(&inode_info->lower_file_mutex);
244 BUG_ON(!inode_info->lower_file); 239 BUG_ON(!inode_info->lower_file);
245 inode_info->lower_file->f_pos = offset; 240 inode_info->lower_file->f_pos = offset;
246 fs_save = get_fs(); 241 fs_save = get_fs();
247 set_fs(get_ds()); 242 set_fs(get_ds());
248 octets_read = vfs_read(inode_info->lower_file, data, size, 243 rc = vfs_read(inode_info->lower_file, data, size,
249 &inode_info->lower_file->f_pos); 244 &inode_info->lower_file->f_pos);
250 set_fs(fs_save); 245 set_fs(fs_save);
251 if (octets_read < 0) {
252 printk(KERN_ERR "%s: octets_read = [%td]; "
253 "expected [%td]\n", __func__, octets_read, size);
254 rc = -EINVAL;
255 }
256 mutex_unlock(&inode_info->lower_file_mutex); 246 mutex_unlock(&inode_info->lower_file_mutex);
257 return rc; 247 return rc;
258} 248}
@@ -284,6 +274,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
284 offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); 274 offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
285 virt = kmap(page_for_ecryptfs); 275 virt = kmap(page_for_ecryptfs);
286 rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); 276 rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
277 if (rc > 0)
278 rc = 0;
287 kunmap(page_for_ecryptfs); 279 kunmap(page_for_ecryptfs);
288 flush_dcache_page(page_for_ecryptfs); 280 flush_dcache_page(page_for_ecryptfs);
289 return rc; 281 return rc;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 12d649602d3a..b15a43a80ab7 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -77,7 +77,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
77 struct ecryptfs_inode_info *inode_info; 77 struct ecryptfs_inode_info *inode_info;
78 78
79 inode_info = ecryptfs_inode_to_private(inode); 79 inode_info = ecryptfs_inode_to_private(inode);
80 mutex_lock(&inode_info->lower_file_mutex);
81 if (inode_info->lower_file) { 80 if (inode_info->lower_file) {
82 struct dentry *lower_dentry = 81 struct dentry *lower_dentry =
83 inode_info->lower_file->f_dentry; 82 inode_info->lower_file->f_dentry;
@@ -89,7 +88,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
89 d_drop(lower_dentry); 88 d_drop(lower_dentry);
90 } 89 }
91 } 90 }
92 mutex_unlock(&inode_info->lower_file_mutex);
93 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
94 kmem_cache_free(ecryptfs_inode_info_cache, inode_info); 92 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
95} 93}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 31d12de83a2a..8b47e4200e65 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -68,11 +68,16 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n)
68} 68}
69EXPORT_SYMBOL_GPL(eventfd_signal); 69EXPORT_SYMBOL_GPL(eventfd_signal);
70 70
71static void eventfd_free_ctx(struct eventfd_ctx *ctx)
72{
73 kfree(ctx);
74}
75
71static void eventfd_free(struct kref *kref) 76static void eventfd_free(struct kref *kref)
72{ 77{
73 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); 78 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
74 79
75 kfree(ctx); 80 eventfd_free_ctx(ctx);
76} 81}
77 82
78/** 83/**
@@ -298,9 +303,23 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
298} 303}
299EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); 304EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
300 305
301SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 306/**
307 * eventfd_file_create - Creates an eventfd file pointer.
308 * @count: Initial eventfd counter value.
309 * @flags: Flags for the eventfd file.
310 *
311 * This function creates an eventfd file pointer, w/out installing it into
312 * the fd table. This is useful when the eventfd file is used during the
313 * initialization of data structures that require extra setup after the eventfd
314 * creation. So the eventfd creation is split into the file pointer creation
315 * phase, and the file descriptor installation phase.
316 * In this way races with userspace closing the newly installed file descriptor
317 * can be avoided.
318 * Returns an eventfd file pointer, or a proper error pointer.
319 */
320struct file *eventfd_file_create(unsigned int count, int flags)
302{ 321{
303 int fd; 322 struct file *file;
304 struct eventfd_ctx *ctx; 323 struct eventfd_ctx *ctx;
305 324
306 /* Check the EFD_* constants for consistency. */ 325 /* Check the EFD_* constants for consistency. */
@@ -308,26 +327,48 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
308 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 327 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
309 328
310 if (flags & ~EFD_FLAGS_SET) 329 if (flags & ~EFD_FLAGS_SET)
311 return -EINVAL; 330 return ERR_PTR(-EINVAL);
312 331
313 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 332 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
314 if (!ctx) 333 if (!ctx)
315 return -ENOMEM; 334 return ERR_PTR(-ENOMEM);
316 335
317 kref_init(&ctx->kref); 336 kref_init(&ctx->kref);
318 init_waitqueue_head(&ctx->wqh); 337 init_waitqueue_head(&ctx->wqh);
319 ctx->count = count; 338 ctx->count = count;
320 ctx->flags = flags; 339 ctx->flags = flags;
321 340
322 /* 341 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
323 * When we call this, the initialization must be complete, since 342 flags & EFD_SHARED_FCNTL_FLAGS);
324 * anon_inode_getfd() will install the fd. 343 if (IS_ERR(file))
325 */ 344 eventfd_free_ctx(ctx);
326 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 345
327 flags & EFD_SHARED_FCNTL_FLAGS); 346 return file;
328 if (fd < 0) 347}
329 kfree(ctx); 348
349SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
350{
351 int fd, error;
352 struct file *file;
353
354 error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
355 if (error < 0)
356 return error;
357 fd = error;
358
359 file = eventfd_file_create(count, flags);
360 if (IS_ERR(file)) {
361 error = PTR_ERR(file);
362 goto err_put_unused_fd;
363 }
364 fd_install(fd, file);
365
330 return fd; 366 return fd;
367
368err_put_unused_fd:
369 put_unused_fd(fd);
370
371 return error;
331} 372}
332 373
333SYSCALL_DEFINE1(eventfd, unsigned int, count) 374SYSCALL_DEFINE1(eventfd, unsigned int, count)
diff --git a/fs/exec.c b/fs/exec.c
index 172ceb6edde4..d49be6bc1793 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h> 36#include <linux/perf_event.h>
37#include <linux/highmem.h> 37#include <linux/highmem.h>
38#include <linux/spinlock.h> 38#include <linux/spinlock.h>
39#include <linux/key.h> 39#include <linux/key.h>
@@ -55,6 +55,7 @@
55#include <linux/kmod.h> 55#include <linux/kmod.h>
56#include <linux/fsnotify.h> 56#include <linux/fsnotify.h>
57#include <linux/fs_struct.h> 57#include <linux/fs_struct.h>
58#include <linux/pipe_fs_i.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 61#include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
63 64
64int core_uses_pid; 65int core_uses_pid;
65char core_pattern[CORENAME_MAX_SIZE] = "core"; 66char core_pattern[CORENAME_MAX_SIZE] = "core";
67unsigned int core_pipe_limit;
66int suid_dumpable = 0; 68int suid_dumpable = 0;
67 69
68/* The maximal length of core_pattern is also specified in sysctl.c */ 70/* The maximal length of core_pattern is also specified in sysctl.c */
@@ -845,6 +847,9 @@ static int de_thread(struct task_struct *tsk)
845 sig->notify_count = 0; 847 sig->notify_count = 0;
846 848
847no_thread_group: 849no_thread_group:
850 if (current->mm)
851 setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
852
848 exit_itimers(sig); 853 exit_itimers(sig);
849 flush_itimer_signals(); 854 flush_itimer_signals();
850 855
@@ -923,7 +928,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
923 task_lock(tsk); 928 task_lock(tsk);
924 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 929 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
925 task_unlock(tsk); 930 task_unlock(tsk);
926 perf_counter_comm(tsk); 931 perf_event_comm(tsk);
927} 932}
928 933
929int flush_old_exec(struct linux_binprm * bprm) 934int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +1002,7 @@ int flush_old_exec(struct linux_binprm * bprm)
997 * security domain: 1002 * security domain:
998 */ 1003 */
999 if (!get_dumpable(current->mm)) 1004 if (!get_dumpable(current->mm))
1000 perf_counter_exit_task(current); 1005 perf_event_exit_task(current);
1001 1006
1002 /* An exec changes our domain. We are no longer part of the thread 1007 /* An exec changes our domain. We are no longer part of the thread
1003 group */ 1008 group */
@@ -1354,6 +1359,8 @@ int do_execve(char * filename,
1354 if (retval < 0) 1359 if (retval < 0)
1355 goto out; 1360 goto out;
1356 1361
1362 current->stack_start = current->mm->start_stack;
1363
1357 /* execve succeeded */ 1364 /* execve succeeded */
1358 current->fs->in_exec = 0; 1365 current->fs->in_exec = 0;
1359 current->in_execve = 0; 1366 current->in_execve = 0;
@@ -1388,18 +1395,16 @@ out_ret:
1388 return retval; 1395 return retval;
1389} 1396}
1390 1397
1391int set_binfmt(struct linux_binfmt *new) 1398void set_binfmt(struct linux_binfmt *new)
1392{ 1399{
1393 struct linux_binfmt *old = current->binfmt; 1400 struct mm_struct *mm = current->mm;
1394 1401
1395 if (new) { 1402 if (mm->binfmt)
1396 if (!try_module_get(new->module)) 1403 module_put(mm->binfmt->module);
1397 return -1; 1404
1398 } 1405 mm->binfmt = new;
1399 current->binfmt = new; 1406 if (new)
1400 if (old) 1407 __module_get(new->module);
1401 module_put(old->module);
1402 return 0;
1403} 1408}
1404 1409
1405EXPORT_SYMBOL(set_binfmt); 1410EXPORT_SYMBOL(set_binfmt);
@@ -1723,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
1723 return (ret >= 2) ? 2 : ret; 1728 return (ret >= 2) ? 2 : ret;
1724} 1729}
1725 1730
1731static void wait_for_dump_helpers(struct file *file)
1732{
1733 struct pipe_inode_info *pipe;
1734
1735 pipe = file->f_path.dentry->d_inode->i_pipe;
1736
1737 pipe_lock(pipe);
1738 pipe->readers++;
1739 pipe->writers--;
1740
1741 while ((pipe->readers > 1) && (!signal_pending(current))) {
1742 wake_up_interruptible_sync(&pipe->wait);
1743 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1744 pipe_wait(pipe);
1745 }
1746
1747 pipe->readers--;
1748 pipe->writers++;
1749 pipe_unlock(pipe);
1750
1751}
1752
1753
1726void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1754void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1727{ 1755{
1728 struct core_state core_state; 1756 struct core_state core_state;
@@ -1739,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1739 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; 1767 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1740 char **helper_argv = NULL; 1768 char **helper_argv = NULL;
1741 int helper_argc = 0; 1769 int helper_argc = 0;
1742 char *delimit; 1770 int dump_count = 0;
1771 static atomic_t core_dump_count = ATOMIC_INIT(0);
1743 1772
1744 audit_core_dumps(signr); 1773 audit_core_dumps(signr);
1745 1774
1746 binfmt = current->binfmt; 1775 binfmt = mm->binfmt;
1747 if (!binfmt || !binfmt->core_dump) 1776 if (!binfmt || !binfmt->core_dump)
1748 goto fail; 1777 goto fail;
1749 1778
@@ -1794,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1794 lock_kernel(); 1823 lock_kernel();
1795 ispipe = format_corename(corename, signr); 1824 ispipe = format_corename(corename, signr);
1796 unlock_kernel(); 1825 unlock_kernel();
1797 /* 1826
1798 * Don't bother to check the RLIMIT_CORE value if core_pattern points
1799 * to a pipe. Since we're not writing directly to the filesystem
1800 * RLIMIT_CORE doesn't really apply, as no actual core file will be
1801 * created unless the pipe reader choses to write out the core file
1802 * at which point file size limits and permissions will be imposed
1803 * as it does with any other process
1804 */
1805 if ((!ispipe) && (core_limit < binfmt->min_coredump)) 1827 if ((!ispipe) && (core_limit < binfmt->min_coredump))
1806 goto fail_unlock; 1828 goto fail_unlock;
1807 1829
1808 if (ispipe) { 1830 if (ispipe) {
1831 if (core_limit == 0) {
1832 /*
1833 * Normally core limits are irrelevant to pipes, since
1834 * we're not writing to the file system, but we use
1835 * core_limit of 0 here as a speacial value. Any
1836 * non-zero limit gets set to RLIM_INFINITY below, but
1837 * a limit of 0 skips the dump. This is a consistent
1838 * way to catch recursive crashes. We can still crash
1839 * if the core_pattern binary sets RLIM_CORE = !0
1840 * but it runs as root, and can do lots of stupid things
1841 * Note that we use task_tgid_vnr here to grab the pid
1842 * of the process group leader. That way we get the
1843 * right pid if a thread in a multi-threaded
1844 * core_pattern process dies.
1845 */
1846 printk(KERN_WARNING
1847 "Process %d(%s) has RLIMIT_CORE set to 0\n",
1848 task_tgid_vnr(current), current->comm);
1849 printk(KERN_WARNING "Aborting core\n");
1850 goto fail_unlock;
1851 }
1852
1853 dump_count = atomic_inc_return(&core_dump_count);
1854 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
1855 printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
1856 task_tgid_vnr(current), current->comm);
1857 printk(KERN_WARNING "Skipping core dump\n");
1858 goto fail_dropcount;
1859 }
1860
1809 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1861 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1810 if (!helper_argv) { 1862 if (!helper_argv) {
1811 printk(KERN_WARNING "%s failed to allocate memory\n", 1863 printk(KERN_WARNING "%s failed to allocate memory\n",
1812 __func__); 1864 __func__);
1813 goto fail_unlock; 1865 goto fail_dropcount;
1814 }
1815 /* Terminate the string before the first option */
1816 delimit = strchr(corename, ' ');
1817 if (delimit)
1818 *delimit = '\0';
1819 delimit = strrchr(helper_argv[0], '/');
1820 if (delimit)
1821 delimit++;
1822 else
1823 delimit = helper_argv[0];
1824 if (!strcmp(delimit, current->comm)) {
1825 printk(KERN_NOTICE "Recursive core dump detected, "
1826 "aborting\n");
1827 goto fail_unlock;
1828 } 1866 }
1829 1867
1830 core_limit = RLIM_INFINITY; 1868 core_limit = RLIM_INFINITY;
1831 1869
1832 /* SIGPIPE can happen, but it's just never processed */ 1870 /* SIGPIPE can happen, but it's just never processed */
1833 if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, 1871 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1834 &file)) { 1872 &file)) {
1835 printk(KERN_INFO "Core dump to %s pipe failed\n", 1873 printk(KERN_INFO "Core dump to %s pipe failed\n",
1836 corename); 1874 corename);
1837 goto fail_unlock; 1875 goto fail_dropcount;
1838 } 1876 }
1839 } else 1877 } else
1840 file = filp_open(corename, 1878 file = filp_open(corename,
1841 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1879 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1842 0600); 1880 0600);
1843 if (IS_ERR(file)) 1881 if (IS_ERR(file))
1844 goto fail_unlock; 1882 goto fail_dropcount;
1845 inode = file->f_path.dentry->d_inode; 1883 inode = file->f_path.dentry->d_inode;
1846 if (inode->i_nlink > 1) 1884 if (inode->i_nlink > 1)
1847 goto close_fail; /* multiple links - don't dump */ 1885 goto close_fail; /* multiple links - don't dump */
@@ -1870,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1870 if (retval) 1908 if (retval)
1871 current->signal->group_exit_code |= 0x80; 1909 current->signal->group_exit_code |= 0x80;
1872close_fail: 1910close_fail:
1911 if (ispipe && core_pipe_limit)
1912 wait_for_dump_helpers(file);
1873 filp_close(file, NULL); 1913 filp_close(file, NULL);
1914fail_dropcount:
1915 if (dump_count)
1916 atomic_dec(&core_dump_count);
1874fail_unlock: 1917fail_unlock:
1875 if (helper_argv) 1918 if (helper_argv)
1876 argv_free(helper_argv); 1919 argv_free(helper_argv);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5ab10c3bbebe..9f500dec3b59 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
214 } 214 }
215 215
216 lock_super(sb); 216 lock_super(sb);
217 lock_kernel();
218 sbi = sb->s_fs_info; 217 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 218 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); 219 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
@@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
245out: 244out:
246 if (or) 245 if (or)
247 osd_end_request(or); 246 osd_end_request(or);
248 unlock_kernel();
249 unlock_super(sb); 247 unlock_super(sb);
250 kfree(fscb); 248 kfree(fscb);
251 return ret; 249 return ret;
@@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb)
268 int num_pend; 266 int num_pend;
269 struct exofs_sb_info *sbi = sb->s_fs_info; 267 struct exofs_sb_info *sbi = sb->s_fs_info;
270 268
271 lock_kernel();
272
273 if (sb->s_dirt) 269 if (sb->s_dirt)
274 exofs_write_super(sb); 270 exofs_write_super(sb);
275 271
@@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb)
286 osduld_put_device(sbi->s_dev); 282 osduld_put_device(sbi->s_dev);
287 kfree(sb->s_fs_info); 283 kfree(sb->s_fs_info);
288 sb->s_fs_info = NULL; 284 sb->s_fs_info = NULL;
289
290 unlock_kernel();
291} 285}
292 286
293/* 287/*
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1c1638f873a4..ade634076d0a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
819 .writepages = ext2_writepages, 819 .writepages = ext2_writepages,
820 .migratepage = buffer_migrate_page, 820 .migratepage = buffer_migrate_page,
821 .is_partially_uptodate = block_is_partially_uptodate, 821 .is_partially_uptodate = block_is_partially_uptodate,
822 .error_remove_page = generic_error_remove_page,
822}; 823};
823 824
824const struct address_space_operations ext2_aops_xip = { 825const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
837 .direct_IO = ext2_direct_IO, 838 .direct_IO = ext2_direct_IO,
838 .writepages = ext2_writepages, 839 .writepages = ext2_writepages,
839 .migratepage = buffer_migrate_page, 840 .migratepage = buffer_migrate_page,
841 .error_remove_page = generic_error_remove_page,
840}; 842};
841 843
842/* 844/*
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 23701f289e98..dd7175ce5606 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -70,7 +70,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
70 if (PTR_ERR(inode) == -ESTALE) { 70 if (PTR_ERR(inode) == -ESTALE) {
71 ext2_error(dir->i_sb, __func__, 71 ext2_error(dir->i_sb, __func__,
72 "deleted inode referenced: %lu", 72 "deleted inode referenced: %lu",
73 ino); 73 (unsigned long) ino);
74 return ERR_PTR(-EIO); 74 return ERR_PTR(-EIO);
75 } else { 75 } else {
76 return ERR_CAST(inode); 76 return ERR_CAST(inode);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..c18fbf3e4068 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
20 void **kaddr, unsigned long *pfn) 20 void **kaddr, unsigned long *pfn)
21{ 21{
22 struct block_device *bdev = inode->i_sb->s_bdev; 22 struct block_device *bdev = inode->i_sb->s_bdev;
23 struct block_device_operations *ops = bdev->bd_disk->fops; 23 const struct block_device_operations *ops = bdev->bd_disk->fops;
24 sector_t sector; 24 sector_t sector;
25 25
26 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ 26 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index cd098a7b77fc..acf1b1423327 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
1830 .direct_IO = ext3_direct_IO, 1830 .direct_IO = ext3_direct_IO,
1831 .migratepage = buffer_migrate_page, 1831 .migratepage = buffer_migrate_page,
1832 .is_partially_uptodate = block_is_partially_uptodate, 1832 .is_partially_uptodate = block_is_partially_uptodate,
1833 .error_remove_page = generic_error_remove_page,
1833}; 1834};
1834 1835
1835static const struct address_space_operations ext3_writeback_aops = { 1836static const struct address_space_operations ext3_writeback_aops = {
@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
1845 .direct_IO = ext3_direct_IO, 1846 .direct_IO = ext3_direct_IO,
1846 .migratepage = buffer_migrate_page, 1847 .migratepage = buffer_migrate_page,
1847 .is_partially_uptodate = block_is_partially_uptodate, 1848 .is_partially_uptodate = block_is_partially_uptodate,
1849 .error_remove_page = generic_error_remove_page,
1848}; 1850};
1849 1851
1850static const struct address_space_operations ext3_journalled_aops = { 1852static const struct address_space_operations ext3_journalled_aops = {
@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
1859 .invalidatepage = ext3_invalidatepage, 1861 .invalidatepage = ext3_invalidatepage,
1860 .releasepage = ext3_releasepage, 1862 .releasepage = ext3_releasepage,
1861 .is_partially_uptodate = block_is_partially_uptodate, 1863 .is_partially_uptodate = block_is_partially_uptodate,
1864 .error_remove_page = generic_error_remove_page,
1862}; 1865};
1863 1866
1864void ext3_set_aops(struct inode *inode) 1867void ext3_set_aops(struct inode *inode)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a8d80a7f1105..72743d360509 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
720static ssize_t ext3_quota_write(struct super_block *sb, int type, 720static ssize_t ext3_quota_write(struct super_block *sb, int type,
721 const char *data, size_t len, loff_t off); 721 const char *data, size_t len, loff_t off);
722 722
723static struct dquot_operations ext3_quota_operations = { 723static const struct dquot_operations ext3_quota_operations = {
724 .initialize = dquot_initialize, 724 .initialize = dquot_initialize,
725 .drop = dquot_drop, 725 .drop = dquot_drop,
726 .alloc_space = dquot_alloc_space, 726 .alloc_space = dquot_alloc_space,
@@ -737,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = {
737 .destroy_dquot = dquot_destroy, 737 .destroy_dquot = dquot_destroy,
738}; 738};
739 739
740static struct quotactl_ops ext3_qctl_operations = { 740static const struct quotactl_ops ext3_qctl_operations = {
741 .quota_on = ext3_quota_on, 741 .quota_on = ext3_quota_on,
742 .quota_off = vfs_quota_off, 742 .quota_off = vfs_quota_off,
743 .quota_sync = vfs_quota_sync, 743 .quota_sync = vfs_quota_sync,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
34 renamed from ext4dev to ext4. Unfortunately there are some
35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..984ca0cb38c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
72 * flag field is exported via the traceport interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 0x0001 76#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
127 int pages_written; 133 int pages_written;
128 int retval; 134 int retval;
129}; 135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
130 146
131/* 147/*
132 * Special inodes numbers 148 * Special inodes numbers
@@ -347,7 +363,16 @@ struct ext4_new_group_data {
347 /* Call ext4_da_update_reserve_space() after successfully 363 /* Call ext4_da_update_reserve_space() after successfully
348 allocating the blocks */ 364 allocating the blocks */
349#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 365#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
350 366 /* caller is from the direct IO path, request to creation of an
367 unitialized extents if not allocated, split the uninitialized
368 extent if blocks has been preallocated already*/
369#define EXT4_GET_BLOCKS_DIO 0x0010
370#define EXT4_GET_BLOCKS_CONVERT 0x0020
371#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
372 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
373 /* Convert extent to initialized after direct IO complete */
374#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
375 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
351 376
352/* 377/*
353 * ioctl commands 378 * ioctl commands
@@ -500,8 +525,8 @@ struct move_extent {
500static inline __le32 ext4_encode_extra_time(struct timespec *time) 525static inline __le32 ext4_encode_extra_time(struct timespec *time)
501{ 526{
502 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 527 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
503 time->tv_sec >> 32 : 0) | 528 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
504 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 529 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
505} 530}
506 531
507static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 532static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
509 if (sizeof(time->tv_sec) > 4) 534 if (sizeof(time->tv_sec) > 4)
510 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 535 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
511 << 32; 536 << 32;
512 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 537 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
513} 538}
514 539
515#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 540#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -672,6 +697,11 @@ struct ext4_inode_info {
672 __u16 i_extra_isize; 697 __u16 i_extra_isize;
673 698
674 spinlock_t i_block_reservation_lock; 699 spinlock_t i_block_reservation_lock;
700
701 /* completed async DIOs that might need unwritten extents handling */
702 struct list_head i_aio_dio_complete_list;
703 /* current io_end structure for async DIO write*/
704 ext4_io_end_t *cur_aio_dio;
675}; 705};
676 706
677/* 707/*
@@ -942,18 +972,11 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 972 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 973 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 974 unsigned int s_mb_group_prealloc;
975 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 976 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 977 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 978 unsigned long s_mb_last_start;
948 979
949 /* history to debug policy */
950 struct ext4_mb_history *s_mb_history;
951 int s_mb_history_cur;
952 int s_mb_history_max;
953 int s_mb_history_num;
954 spinlock_t s_mb_history_lock;
955 int s_mb_history_filter;
956
957 /* stats for buddy allocator */ 980 /* stats for buddy allocator */
958 spinlock_t s_mb_pa_lock; 981 spinlock_t s_mb_pa_lock;
959 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 982 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1003,9 @@ struct ext4_sb_info {
980 1003
981 unsigned int s_log_groups_per_flex; 1004 unsigned int s_log_groups_per_flex;
982 struct flex_groups *s_flex_groups; 1005 struct flex_groups *s_flex_groups;
1006
1007 /* workqueue for dio unwritten */
1008 struct workqueue_struct *dio_unwritten_wq;
983}; 1009};
984 1010
985static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1011static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1397 struct address_space *mapping, loff_t from); 1423 struct address_space *mapping, loff_t from);
1398extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1424extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1399extern qsize_t ext4_get_reserved_space(struct inode *inode); 1425extern qsize_t ext4_get_reserved_space(struct inode *inode);
1400 1426extern int flush_aio_dio_completed_IO(struct inode *inode);
1401/* ioctl.c */ 1427/* ioctl.c */
1402extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1428extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1403extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1429extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
1699extern void ext4_ext_release(struct super_block *); 1725extern void ext4_ext_release(struct super_block *);
1700extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1726extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1701 loff_t len); 1727 loff_t len);
1728extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1729 loff_t len);
1702extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1730extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1703 sector_t block, unsigned int max_blocks, 1731 sector_t block, unsigned int max_blocks,
1704 struct buffer_head *bh, int flags); 1732 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..10539e364283 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
723 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
725 */ 725 */
726static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
728 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
729{ 729{
@@ -1586,7 +1586,7 @@ out:
1586 */ 1586 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1589 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1590{ 1590{
1591 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1592 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1602 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1603 1603
1604 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext), 1608 ext4_ext_is_uninitialized(newext),
1608 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
1722 1723
1723merge: 1724merge:
1724 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1725 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1726 1728
1727 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1728 1730
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2378 */ 2380 */
2379 2381
2380 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2381 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2382#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2383 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2389 printk(", stats"); 2392 printk(", stats");
2390#endif 2393#endif
2391 printk("\n"); 2394 printk("\n");
2395#endif
2392#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2393 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2490} 2494}
2491 2495
2492#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2493
2494/* 2497/*
2495 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2496 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2583 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2584 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2585 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2586 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2587 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2588 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2589 if (err) 2593 if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2639 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2642 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err) 2649 if (err)
@@ -2757,7 +2761,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2757 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2758 goto out; 2762 goto out;
2759insert: 2763insert:
2760 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2761 if (err == -ENOSPC) { 2765 if (err == -ENOSPC) {
2762 err = ext4_ext_zeroout(inode, &orig_ex); 2766 err = ext4_ext_zeroout(inode, &orig_ex);
2763 if (err) 2767 if (err)
@@ -2785,6 +2789,324 @@ fix_extent_len:
2785} 2789}
2786 2790
2787/* 2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_blocks_dio_write() when DIO to write
2794 * to an uninitialized extent.
2795 *
2796 * Writing to an uninitized extent may result in splitting the uninitialized
2797 * extent into multiple /intialized unintialized extents (up to three)
2798 * There are three possibilities:
2799 * a> There is no split required: Entire extent should be uninitialized
2800 * b> Splits in two extents: Write is happening at either end of the extent
2801 * c> Splits in three extents: Somone is writing in middle of the extent
2802 *
2803 * One of more index blocks maybe needed if the extent tree grow after
2804 * the unintialized extent split. To prevent ENOSPC occur at the IO
2805 * complete, we need to split the uninitialized extent before DIO submit
2806 * the IO. The uninitilized extent called at this time will be split
2807 * into three uninitialized extent(at most). After IO complete, the part
2808 * being filled will be convert to initialized by the end_io callback function
2809 * via ext4_convert_unwritten_extents().
2810 */
2811static int ext4_split_unwritten_extents(handle_t *handle,
2812 struct inode *inode,
2813 struct ext4_ext_path *path,
2814 ext4_lblk_t iblock,
2815 unsigned int max_blocks,
2816 int flags)
2817{
2818 struct ext4_extent *ex, newex, orig_ex;
2819 struct ext4_extent *ex1 = NULL;
2820 struct ext4_extent *ex2 = NULL;
2821 struct ext4_extent *ex3 = NULL;
2822 struct ext4_extent_header *eh;
2823 ext4_lblk_t ee_block;
2824 unsigned int allocated, ee_len, depth;
2825 ext4_fsblk_t newblock;
2826 int err = 0;
2827 int ret = 0;
2828
2829 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2830 "iblock %llu, max_blocks %u\n", inode->i_ino,
2831 (unsigned long long)iblock, max_blocks);
2832 depth = ext_depth(inode);
2833 eh = path[depth].p_hdr;
2834 ex = path[depth].p_ext;
2835 ee_block = le32_to_cpu(ex->ee_block);
2836 ee_len = ext4_ext_get_actual_len(ex);
2837 allocated = ee_len - (iblock - ee_block);
2838 newblock = iblock - ee_block + ext_pblock(ex);
2839 ex2 = ex;
2840 orig_ex.ee_block = ex->ee_block;
2841 orig_ex.ee_len = cpu_to_le16(ee_len);
2842 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2843
2844 /*
2845 * if the entire unintialized extent length less than
2846 * the size of extent to write, there is no need to split
2847 * uninitialized extent
2848 */
2849 if (allocated <= max_blocks)
2850 return ret;
2851
2852 err = ext4_ext_get_access(handle, inode, path + depth);
2853 if (err)
2854 goto out;
2855 /* ex1: ee_block to iblock - 1 : uninitialized */
2856 if (iblock > ee_block) {
2857 ex1 = ex;
2858 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2859 ext4_ext_mark_uninitialized(ex1);
2860 ex2 = &newex;
2861 }
2862 /*
2863 * for sanity, update the length of the ex2 extent before
2864 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2865 * overlap of blocks.
2866 */
2867 if (!ex1 && allocated > max_blocks)
2868 ex2->ee_len = cpu_to_le16(max_blocks);
2869 /* ex3: to ee_block + ee_len : uninitialised */
2870 if (allocated > max_blocks) {
2871 unsigned int newdepth;
2872 ex3 = &newex;
2873 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2874 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2875 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2876 ext4_ext_mark_uninitialized(ex3);
2877 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2878 if (err == -ENOSPC) {
2879 err = ext4_ext_zeroout(inode, &orig_ex);
2880 if (err)
2881 goto fix_extent_len;
2882 /* update the extent length and mark as initialized */
2883 ex->ee_block = orig_ex.ee_block;
2884 ex->ee_len = orig_ex.ee_len;
2885 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2886 ext4_ext_dirty(handle, inode, path + depth);
2887 /* zeroed the full extent */
2888 /* blocks available from iblock */
2889 return allocated;
2890
2891 } else if (err)
2892 goto fix_extent_len;
2893 /*
2894 * The depth, and hence eh & ex might change
2895 * as part of the insert above.
2896 */
2897 newdepth = ext_depth(inode);
2898 /*
2899 * update the extent length after successful insert of the
2900 * split extent
2901 */
2902 orig_ex.ee_len = cpu_to_le16(ee_len -
2903 ext4_ext_get_actual_len(ex3));
2904 depth = newdepth;
2905 ext4_ext_drop_refs(path);
2906 path = ext4_ext_find_extent(inode, iblock, path);
2907 if (IS_ERR(path)) {
2908 err = PTR_ERR(path);
2909 goto out;
2910 }
2911 eh = path[depth].p_hdr;
2912 ex = path[depth].p_ext;
2913 if (ex2 != &newex)
2914 ex2 = ex;
2915
2916 err = ext4_ext_get_access(handle, inode, path + depth);
2917 if (err)
2918 goto out;
2919
2920 allocated = max_blocks;
2921 }
2922 /*
2923 * If there was a change of depth as part of the
2924 * insertion of ex3 above, we need to update the length
2925 * of the ex1 extent again here
2926 */
2927 if (ex1 && ex1 != ex) {
2928 ex1 = ex;
2929 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2930 ext4_ext_mark_uninitialized(ex1);
2931 ex2 = &newex;
2932 }
2933 /*
2934 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
2935 * uninitialised still.
2936 */
2937 ex2->ee_block = cpu_to_le32(iblock);
2938 ext4_ext_store_pblock(ex2, newblock);
2939 ex2->ee_len = cpu_to_le16(allocated);
2940 ext4_ext_mark_uninitialized(ex2);
2941 if (ex2 != ex)
2942 goto insert;
2943 /* Mark modified extent as dirty */
2944 err = ext4_ext_dirty(handle, inode, path + depth);
2945 ext_debug("out here\n");
2946 goto out;
2947insert:
2948 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2949 if (err == -ENOSPC) {
2950 err = ext4_ext_zeroout(inode, &orig_ex);
2951 if (err)
2952 goto fix_extent_len;
2953 /* update the extent length and mark as initialized */
2954 ex->ee_block = orig_ex.ee_block;
2955 ex->ee_len = orig_ex.ee_len;
2956 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2957 ext4_ext_dirty(handle, inode, path + depth);
2958 /* zero out the first half */
2959 return allocated;
2960 } else if (err)
2961 goto fix_extent_len;
2962out:
2963 ext4_ext_show_leaf(inode, path);
2964 return err ? err : allocated;
2965
2966fix_extent_len:
2967 ex->ee_block = orig_ex.ee_block;
2968 ex->ee_len = orig_ex.ee_len;
2969 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2970 ext4_ext_mark_uninitialized(ex);
2971 ext4_ext_dirty(handle, inode, path + depth);
2972 return err;
2973}
2974static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2975 struct inode *inode,
2976 struct ext4_ext_path *path)
2977{
2978 struct ext4_extent *ex;
2979 struct ext4_extent_header *eh;
2980 int depth;
2981 int err = 0;
2982 int ret = 0;
2983
2984 depth = ext_depth(inode);
2985 eh = path[depth].p_hdr;
2986 ex = path[depth].p_ext;
2987
2988 err = ext4_ext_get_access(handle, inode, path + depth);
2989 if (err)
2990 goto out;
2991 /* first mark the extent as initialized */
2992 ext4_ext_mark_initialized(ex);
2993
2994 /*
2995 * We have to see if it can be merged with the extent
2996 * on the left.
2997 */
2998 if (ex > EXT_FIRST_EXTENT(eh)) {
2999 /*
3000 * To merge left, pass "ex - 1" to try_to_merge(),
3001 * since it merges towards right _only_.
3002 */
3003 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3004 if (ret) {
3005 err = ext4_ext_correct_indexes(handle, inode, path);
3006 if (err)
3007 goto out;
3008 depth = ext_depth(inode);
3009 ex--;
3010 }
3011 }
3012 /*
3013 * Try to Merge towards right.
3014 */
3015 ret = ext4_ext_try_to_merge(inode, path, ex);
3016 if (ret) {
3017 err = ext4_ext_correct_indexes(handle, inode, path);
3018 if (err)
3019 goto out;
3020 depth = ext_depth(inode);
3021 }
3022 /* Mark modified extent as dirty */
3023 err = ext4_ext_dirty(handle, inode, path + depth);
3024out:
3025 ext4_ext_show_leaf(inode, path);
3026 return err;
3027}
3028
3029static int
3030ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3031 ext4_lblk_t iblock, unsigned int max_blocks,
3032 struct ext4_ext_path *path, int flags,
3033 unsigned int allocated, struct buffer_head *bh_result,
3034 ext4_fsblk_t newblock)
3035{
3036 int ret = 0;
3037 int err = 0;
3038 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3039
3040 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3041 "block %llu, max_blocks %u, flags %d, allocated %u",
3042 inode->i_ino, (unsigned long long)iblock, max_blocks,
3043 flags, allocated);
3044 ext4_ext_show_leaf(inode, path);
3045
3046 /* DIO get_block() before submit the IO, split the extent */
3047 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3048 ret = ext4_split_unwritten_extents(handle,
3049 inode, path, iblock,
3050 max_blocks, flags);
3051 /* flag the io_end struct that we need convert when IO done */
3052 if (io)
3053 io->flag = DIO_AIO_UNWRITTEN;
3054 goto out;
3055 }
3056 /* DIO end_io complete, convert the filled extent to written */
3057 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3058 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3059 path);
3060 goto out2;
3061 }
3062 /* buffered IO case */
3063 /*
3064 * repeat fallocate creation request
3065 * we already have an unwritten extent
3066 */
3067 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3068 goto map_out;
3069
3070 /* buffered READ or buffered write_begin() lookup */
3071 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3072 /*
3073 * We have blocks reserved already. We
3074 * return allocated blocks so that delalloc
3075 * won't do block reservation for us. But
3076 * the buffer head will be unmapped so that
3077 * a read from the block returns 0s.
3078 */
3079 set_buffer_unwritten(bh_result);
3080 goto out1;
3081 }
3082
3083 /* buffered write, writepage time, convert*/
3084 ret = ext4_ext_convert_to_initialized(handle, inode,
3085 path, iblock,
3086 max_blocks);
3087out:
3088 if (ret <= 0) {
3089 err = ret;
3090 goto out2;
3091 } else
3092 allocated = ret;
3093 set_buffer_new(bh_result);
3094map_out:
3095 set_buffer_mapped(bh_result);
3096out1:
3097 if (allocated > max_blocks)
3098 allocated = max_blocks;
3099 ext4_ext_show_leaf(inode, path);
3100 bh_result->b_bdev = inode->i_sb->s_bdev;
3101 bh_result->b_blocknr = newblock;
3102out2:
3103 if (path) {
3104 ext4_ext_drop_refs(path);
3105 kfree(path);
3106 }
3107 return err ? err : allocated;
3108}
3109/*
2788 * Block allocation/map/preallocation routine for extents based files 3110 * Block allocation/map/preallocation routine for extents based files
2789 * 3111 *
2790 * 3112 *
@@ -2814,6 +3136,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2814 int err = 0, depth, ret, cache_type; 3136 int err = 0, depth, ret, cache_type;
2815 unsigned int allocated = 0; 3137 unsigned int allocated = 0;
2816 struct ext4_allocation_request ar; 3138 struct ext4_allocation_request ar;
3139 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2817 3140
2818 __clear_bit(BH_New, &bh_result->b_state); 3141 __clear_bit(BH_New, &bh_result->b_state);
2819 ext_debug("blocks %u/%u requested for inode %lu\n", 3142 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2889 EXT4_EXT_CACHE_EXTENT); 3212 EXT4_EXT_CACHE_EXTENT);
2890 goto out; 3213 goto out;
2891 } 3214 }
2892 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3215 ret = ext4_ext_handle_uninitialized_extents(handle,
2893 goto out; 3216 inode, iblock, max_blocks, path,
2894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3217 flags, allocated, bh_result, newblock);
2895 if (allocated > max_blocks) 3218 return ret;
2896 allocated = max_blocks;
2897 /*
2898 * We have blocks reserved already. We
2899 * return allocated blocks so that delalloc
2900 * won't do block reservation for us. But
2901 * the buffer head will be unmapped so that
2902 * a read from the block returns 0s.
2903 */
2904 set_buffer_unwritten(bh_result);
2905 bh_result->b_bdev = inode->i_sb->s_bdev;
2906 bh_result->b_blocknr = newblock;
2907 goto out2;
2908 }
2909
2910 ret = ext4_ext_convert_to_initialized(handle, inode,
2911 path, iblock,
2912 max_blocks);
2913 if (ret <= 0) {
2914 err = ret;
2915 goto out2;
2916 } else
2917 allocated = ret;
2918 goto outnew;
2919 } 3219 }
2920 } 3220 }
2921 3221
@@ -2986,9 +3286,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2986 /* try to insert new extent into found leaf and return */ 3286 /* try to insert new extent into found leaf and return */
2987 ext4_ext_store_pblock(&newex, newblock); 3287 ext4_ext_store_pblock(&newex, newblock);
2988 newex.ee_len = cpu_to_le16(ar.len); 3288 newex.ee_len = cpu_to_le16(ar.len);
2989 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3289 /* Mark uninitialized */
3290 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
2990 ext4_ext_mark_uninitialized(&newex); 3291 ext4_ext_mark_uninitialized(&newex);
2991 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3292 /*
3293 * io_end structure was created for every async
3294 * direct IO write to the middle of the file.
3295 * To avoid unecessary convertion for every aio dio rewrite
3296 * to the mid of file, here we flag the IO that is really
3297 * need the convertion.
3298 *
3299 */
3300 if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
3301 io->flag = DIO_AIO_UNWRITTEN;
3302 }
3303 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2992 if (err) { 3304 if (err) {
2993 /* free data blocks we just allocated */ 3305 /* free data blocks we just allocated */
2994 /* not a good idea to call discard here directly, 3306 /* not a good idea to call discard here directly,
@@ -3002,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3002 /* previous routine could use block we allocated */ 3314 /* previous routine could use block we allocated */
3003 newblock = ext_pblock(&newex); 3315 newblock = ext_pblock(&newex);
3004 allocated = ext4_ext_get_actual_len(&newex); 3316 allocated = ext4_ext_get_actual_len(&newex);
3005outnew:
3006 set_buffer_new(bh_result); 3317 set_buffer_new(bh_result);
3007 3318
3008 /* Cache only when it is _not_ an uninitialized extent */ 3319 /* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3512,63 @@ retry:
3201} 3512}
3202 3513
3203/* 3514/*
3515 * This function convert a range of blocks to written extents
3516 * The caller of this function will pass the start offset and the size.
3517 * all unwritten extents within this range will be converted to
3518 * written extents.
3519 *
3520 * This function is called from the direct IO end io call back
3521 * function, to convert the fallocated extents after IO is completed.
3522 */
3523int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3524 loff_t len)
3525{
3526 handle_t *handle;
3527 ext4_lblk_t block;
3528 unsigned int max_blocks;
3529 int ret = 0;
3530 int ret2 = 0;
3531 struct buffer_head map_bh;
3532 unsigned int credits, blkbits = inode->i_blkbits;
3533
3534 block = offset >> blkbits;
3535 /*
3536 * We can't just convert len to max_blocks because
3537 * If blocksize = 4096 offset = 3072 and len = 2048
3538 */
3539 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3540 - block;
3541 /*
3542 * credits to insert 1 extent into extent tree
3543 */
3544 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3545 while (ret >= 0 && ret < max_blocks) {
3546 block = block + ret;
3547 max_blocks = max_blocks - ret;
3548 handle = ext4_journal_start(inode, credits);
3549 if (IS_ERR(handle)) {
3550 ret = PTR_ERR(handle);
3551 break;
3552 }
3553 map_bh.b_state = 0;
3554 ret = ext4_get_blocks(handle, inode, block,
3555 max_blocks, &map_bh,
3556 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3557 if (ret <= 0) {
3558 WARN_ON(ret <= 0);
3559 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3560 "returned error inode#%lu, block=%u, "
3561 "max_blocks=%u", __func__,
3562 inode->i_ino, block, max_blocks);
3563 }
3564 ext4_mark_inode_dirty(handle, inode);
3565 ret2 = ext4_journal_stop(handle);
3566 if (ret <= 0 || ret2 )
3567 break;
3568 }
3569 return ret > 0 ? ret2 : ret;
3570}
3571/*
3204 * Callback function called for each extent to gather FIEMAP information. 3572 * Callback function called for each extent to gather FIEMAP information.
3205 */ 3573 */
3206static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3574static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5ca3eca70a1e..9630583cef28 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -81,7 +81,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
81 return generic_file_aio_write(iocb, iov, nr_segs, pos); 81 return generic_file_aio_write(iocb, iov, nr_segs, pos);
82} 82}
83 83
84static struct vm_operations_struct ext4_file_vm_ops = { 84static const struct vm_operations_struct ext4_file_vm_ops = {
85 .fault = filemap_fault, 85 .fault = filemap_fault,
86 .page_mkwrite = ext4_page_mkwrite, 86 .page_mkwrite = ext4_page_mkwrite,
87}; 87};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4abd683b963d..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -1145,6 +1146,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1146}
1146 1147
1147/* 1148/*
1149 * Return the number of contiguous dirty pages in a given inode
1150 * starting at page frame idx.
1151 */
1152static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1153 unsigned int max_pages)
1154{
1155 struct address_space *mapping = inode->i_mapping;
1156 pgoff_t index;
1157 struct pagevec pvec;
1158 pgoff_t num = 0;
1159 int i, nr_pages, done = 0;
1160
1161 if (max_pages == 0)
1162 return 0;
1163 pagevec_init(&pvec, 0);
1164 while (!done) {
1165 index = idx;
1166 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1167 PAGECACHE_TAG_DIRTY,
1168 (pgoff_t)PAGEVEC_SIZE);
1169 if (nr_pages == 0)
1170 break;
1171 for (i = 0; i < nr_pages; i++) {
1172 struct page *page = pvec.pages[i];
1173 struct buffer_head *bh, *head;
1174
1175 lock_page(page);
1176 if (unlikely(page->mapping != mapping) ||
1177 !PageDirty(page) ||
1178 PageWriteback(page) ||
1179 page->index != idx) {
1180 done = 1;
1181 unlock_page(page);
1182 break;
1183 }
1184 if (page_has_buffers(page)) {
1185 bh = head = page_buffers(page);
1186 do {
1187 if (!buffer_delay(bh) &&
1188 !buffer_unwritten(bh))
1189 done = 1;
1190 bh = bh->b_this_page;
1191 } while (!done && (bh != head));
1192 }
1193 unlock_page(page);
1194 if (done)
1195 break;
1196 idx++;
1197 num++;
1198 if (num >= max_pages)
1199 break;
1200 }
1201 pagevec_release(&pvec);
1202 }
1203 return num;
1204}
1205
1206/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1207 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1208 * and returns if the blocks are already mapped.
1150 * 1209 *
@@ -1175,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1234 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1235 clear_buffer_unwritten(bh);
1177 1236
1237 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1238 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1239 (unsigned long)block);
1178 /* 1240 /*
1179 * Try to see if we can get the block without requesting a new 1241 * Try to see if we can get the block without requesting a new
1180 * file system block. 1242 * file system block.
@@ -1796,11 +1858,11 @@ repeat:
1796 1858
1797 if (ext4_claim_free_blocks(sbi, total)) { 1859 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1860 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1861 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1862 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1863 yield();
1801 goto repeat; 1864 goto repeat;
1802 } 1865 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1866 return -ENOSPC;
1805 } 1867 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1868 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2154static void ext4_print_free_blocks(struct inode *inode)
2093{ 2155{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2156 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2157 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2158 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2159 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2160 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2161 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2162 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2163 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2164 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2165 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2166 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2167 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2168 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2169 return;
2108} 2170}
2109 2171
@@ -2189,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2251 * writepage and writepages will again try to write
2190 * the same. 2252 * the same.
2191 */ 2253 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2254 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2255 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2256 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2257 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2258 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2259 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2260 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2261 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2262 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2263 ext4_print_free_blocks(mpd->inode);
2202 } 2264 }
@@ -2337,7 +2399,7 @@ static int __mpage_da_writepage(struct page *page,
2337 /* 2399 /*
2338 * Rest of the page in the page_vec 2400 * Rest of the page in the page_vec
2339 * redirty then and skip then. We will 2401 * redirty then and skip then. We will
2340 * try to to write them again after 2402 * try to write them again after
2341 * starting a new transaction 2403 * starting a new transaction
2342 */ 2404 */
2343 redirty_page_for_writepage(wbc, page); 2405 redirty_page_for_writepage(wbc, page);
@@ -2743,8 +2805,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2805 int no_nrwrite_index_update;
2744 int pages_written = 0; 2806 int pages_written = 0;
2745 long pages_skipped; 2807 long pages_skipped;
2808 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2809 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2810 int needed_blocks, ret = 0;
2811 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2812 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2813 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2814
@@ -2771,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2835 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2836 return -EROFS;
2773 2837
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2838 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2839 range_whole = 1;
2786 2840
@@ -2795,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2849 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2850 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2851
2852 /*
2853 * This works around two forms of stupidity. The first is in
2854 * the writeback code, which caps the maximum number of pages
2855 * written to be 1024 pages. This is wrong on multiple
2856 * levels; different architectues have a different page size,
2857 * which changes the maximum amount of data which gets
2858 * written. Secondly, 4 megabytes is way too small. XFS
2859 * forces this value to be 16 megabytes by multiplying
2860 * nr_to_write parameter by four, and then relies on its
2861 * allocator to allocate larger extents to make them
2862 * contiguous. Unfortunately this brings us to the second
2863 * stupidity, which is that ext4's mballoc code only allocates
2864 * at most 2048 blocks. So we force contiguous writes up to
2865 * the number of dirty blocks in the inode, or
2866 * sbi->max_writeback_mb_bump whichever is smaller.
2867 */
2868 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2869 if (!range_cyclic && range_whole)
2870 desired_nr_to_write = wbc->nr_to_write * 8;
2871 else
2872 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2873 max_pages);
2874 if (desired_nr_to_write > max_pages)
2875 desired_nr_to_write = max_pages;
2876
2877 if (wbc->nr_to_write < desired_nr_to_write) {
2878 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2879 wbc->nr_to_write = desired_nr_to_write;
2880 }
2881
2798 mpd.wbc = wbc; 2882 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2883 mpd.inode = mapping->host;
2800 2884
@@ -2822,10 +2906,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2906 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2907 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2908 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2909 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2910 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2911 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2912 goto out_writepages;
2830 } 2913 }
2831 2914
@@ -2897,9 +2980,10 @@ retry:
2897 goto retry; 2980 goto retry;
2898 } 2981 }
2899 if (pages_skipped != wbc->pages_skipped) 2982 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2983 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2984 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2985 "with nr_to_write = %ld ret = %d\n",
2986 __func__, wbc->nr_to_write, ret);
2903 2987
2904 /* Update index */ 2988 /* Update index */
2905 index += pages_written; 2989 index += pages_written;
@@ -2914,7 +2998,8 @@ retry:
2914out_writepages: 2998out_writepages:
2915 if (!no_nrwrite_index_update) 2999 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3000 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3001 if (wbc->nr_to_write > nr_to_writebump)
3002 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3003 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3004 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3005 return ret;
@@ -3272,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3357}
3273 3358
3274/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3280,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3369 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3372 unsigned long nr_segs)
3286{ 3373{
@@ -3291,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3378 ssize_t ret;
3292 int orphan = 0; 3379 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3380 size_t count = iov_length(iov, nr_segs);
3381 int retries = 0;
3294 3382
3295 if (rw == WRITE) { 3383 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3384 loff_t final_size = offset + count;
@@ -3313,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3401 }
3314 } 3402 }
3315 3403
3404retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3405 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3406 offset, nr_segs,
3318 ext4_get_block, NULL); 3407 ext4_get_block, NULL);
3408 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3409 goto retry;
3319 3410
3320 if (orphan) { 3411 if (orphan) {
3321 int err; 3412 int err;
@@ -3354,6 +3445,359 @@ out:
3354 return ret; 3445 return ret;
3355} 3446}
3356 3447
3448/* Maximum number of blocks we map for direct IO at once. */
3449
3450static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 struct buffer_head *bh_result, int create)
3452{
3453 handle_t *handle = NULL;
3454 int ret = 0;
3455 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3456 int dio_credits;
3457
3458 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3459 inode->i_ino, create);
3460 /*
3461 * DIO VFS code passes create = 0 flag for write to
3462 * the middle of file. It does this to avoid block
3463 * allocation for holes, to prevent expose stale data
3464 * out when there is parallel buffered read (which does
3465 * not hold the i_mutex lock) while direct IO write has
3466 * not completed. DIO request on holes finally falls back
3467 * to buffered IO for this reason.
3468 *
3469 * For ext4 extent based file, since we support fallocate,
3470 * new allocated extent as uninitialized, for holes, we
3471 * could fallocate blocks for holes, thus parallel
3472 * buffered IO read will zero out the page when read on
3473 * a hole while parallel DIO write to the hole has not completed.
3474 *
3475 * when we come here, we know it's a direct IO write to
3476 * to the middle of file (<i_size)
3477 * so it's safe to override the create flag from VFS.
3478 */
3479 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3480
3481 if (max_blocks > DIO_MAX_BLOCKS)
3482 max_blocks = DIO_MAX_BLOCKS;
3483 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 handle = ext4_journal_start(inode, dio_credits);
3485 if (IS_ERR(handle)) {
3486 ret = PTR_ERR(handle);
3487 goto out;
3488 }
3489 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3490 create);
3491 if (ret > 0) {
3492 bh_result->b_size = (ret << inode->i_blkbits);
3493 ret = 0;
3494 }
3495 ext4_journal_stop(handle);
3496out:
3497 return ret;
3498}
3499
3500static void ext4_free_io_end(ext4_io_end_t *io)
3501{
3502 BUG_ON(!io);
3503 iput(io->inode);
3504 kfree(io);
3505}
3506static void dump_aio_dio_list(struct inode * inode)
3507{
3508#ifdef EXT4_DEBUG
3509 struct list_head *cur, *before, *after;
3510 ext4_io_end_t *io, *io0, *io1;
3511
3512 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3513 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3514 return;
3515 }
3516
3517 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3518 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3519 cur = &io->list;
3520 before = cur->prev;
3521 io0 = container_of(before, ext4_io_end_t, list);
3522 after = cur->next;
3523 io1 = container_of(after, ext4_io_end_t, list);
3524
3525 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3526 io, inode->i_ino, io0, io1);
3527 }
3528#endif
3529}
3530
3531/*
3532 * check a range of space and convert unwritten extents to written.
3533 */
3534static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3535{
3536 struct inode *inode = io->inode;
3537 loff_t offset = io->offset;
3538 size_t size = io->size;
3539 int ret = 0;
3540
3541 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
3542 "list->prev 0x%p\n",
3543 io, inode->i_ino, io->list.next, io->list.prev);
3544
3545 if (list_empty(&io->list))
3546 return ret;
3547
3548 if (io->flag != DIO_AIO_UNWRITTEN)
3549 return ret;
3550
3551 if (offset + size <= i_size_read(inode))
3552 ret = ext4_convert_unwritten_extents(inode, offset, size);
3553
3554 if (ret < 0) {
3555 printk(KERN_EMERG "%s: failed to convert unwritten"
3556 "extents to written extents, error is %d"
3557 " io is still on inode %lu aio dio list\n",
3558 __func__, ret, inode->i_ino);
3559 return ret;
3560 }
3561
3562 /* clear the DIO AIO unwritten flag */
3563 io->flag = 0;
3564 return ret;
3565}
3566/*
3567 * work on completed aio dio IO, to convert unwritten extents to extents
3568 */
3569static void ext4_end_aio_dio_work(struct work_struct *work)
3570{
3571 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3572 struct inode *inode = io->inode;
3573 int ret = 0;
3574
3575 mutex_lock(&inode->i_mutex);
3576 ret = ext4_end_aio_dio_nolock(io);
3577 if (ret >= 0) {
3578 if (!list_empty(&io->list))
3579 list_del_init(&io->list);
3580 ext4_free_io_end(io);
3581 }
3582 mutex_unlock(&inode->i_mutex);
3583}
3584/*
3585 * This function is called from ext4_sync_file().
3586 *
3587 * When AIO DIO IO is completed, the work to convert unwritten
3588 * extents to written is queued on workqueue but may not get immediately
3589 * scheduled. When fsync is called, we need to ensure the
3590 * conversion is complete before fsync returns.
3591 * The inode keeps track of a list of completed AIO from DIO path
3592 * that might needs to do the conversion. This function walks through
3593 * the list and convert the related unwritten extents to written.
3594 */
3595int flush_aio_dio_completed_IO(struct inode *inode)
3596{
3597 ext4_io_end_t *io;
3598 int ret = 0;
3599 int ret2 = 0;
3600
3601 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3602 return ret;
3603
3604 dump_aio_dio_list(inode);
3605 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3606 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3607 ext4_io_end_t, list);
3608 /*
3609 * Calling ext4_end_aio_dio_nolock() to convert completed
3610 * IO to written.
3611 *
3612 * When ext4_sync_file() is called, run_queue() may already
3613 * about to flush the work corresponding to this io structure.
3614 * It will be upset if it founds the io structure related
3615 * to the work-to-be schedule is freed.
3616 *
3617 * Thus we need to keep the io structure still valid here after
3618 * convertion finished. The io structure has a flag to
3619 * avoid double converting from both fsync and background work
3620 * queue work.
3621 */
3622 ret = ext4_end_aio_dio_nolock(io);
3623 if (ret < 0)
3624 ret2 = ret;
3625 else
3626 list_del_init(&io->list);
3627 }
3628 return (ret2 < 0) ? ret2 : 0;
3629}
3630
3631static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3632{
3633 ext4_io_end_t *io = NULL;
3634
3635 io = kmalloc(sizeof(*io), GFP_NOFS);
3636
3637 if (io) {
3638 igrab(inode);
3639 io->inode = inode;
3640 io->flag = 0;
3641 io->offset = 0;
3642 io->size = 0;
3643 io->error = 0;
3644 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3645 INIT_LIST_HEAD(&io->list);
3646 }
3647
3648 return io;
3649}
3650
3651static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652 ssize_t size, void *private)
3653{
3654 ext4_io_end_t *io_end = iocb->private;
3655 struct workqueue_struct *wq;
3656
3657 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3658 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3659 iocb->private, io_end->inode->i_ino, iocb, offset,
3660 size);
3661 /* if not async direct IO or dio with 0 bytes write, just return */
3662 if (!io_end || !size)
3663 return;
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
3679 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
3685 * For ext4 extent files, ext4 will do direct-io write to holes,
3686 * preallocated extents, and those write extend the file, no need to
3687 * fall back to buffered IO.
3688 *
3689 * For holes, we fallocate those blocks, mark them as unintialized
3690 * If those blocks were preallocated, we mark sure they are splited, but
3691 * still keep the range to write as unintialized.
3692 *
3693 * The unwrritten extents will be converted to written when DIO is completed.
3694 * For async direct IO, since the IO may still pending when return, we
3695 * set up an end_io call back function, which will do the convertion
3696 * when async direct IO completed.
3697 *
3698 * If the O_DIRECT write will extend the file then add this inode to the
3699 * orphan list. So recovery will truncate it back to the original size
3700 * if the machine crashes during the write.
3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714 /*
3715 * We could direct write to holes and fallocate.
3716 *
3717 * Allocated blocks to fill the hole are marked as uninitialized
3718 * to prevent paralel buffered read to expose the stale data
3719 * before DIO complete the data IO.
3720 *
3721 * As to previously fallocated extents, ext4 get_block
3722 * will just simply mark the buffer mapped but still
3723 * keep the extents uninitialized.
3724 *
3725 * for non AIO case, we will convert those unwritten extents
3726 * to written after return back from blockdev_direct_IO.
3727 *
3728 * for async DIO, the conversion needs to be defered when
3729 * the IO is completed. The ext4 end_io callback function
3730 * will be called to take care of the conversion work.
3731 * Here for async case, we allocate an io_end structure to
3732 * hook to the iocb.
3733 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740 /*
3741 * we save the io structure for current async
3742 * direct IO, so that later ext4_get_blocks()
3743 * could flag the io structure whether there
3744 * is a unwritten extents needs to be converted
3745 * when IO is completed.
3746 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757 /*
3758 * The io_end structure takes a reference to the inode,
3759 * that structure needs to be destroyed and the
3760 * reference to the inode need to be dropped, when IO is
3761 * complete, even with 0 byte write, or failed.
3762 *
3763 * In the successful AIO DIO case, the io_end structure will be
3764 * desctroyed and the reference to the inode will be dropped
3765 * after the end_io call back function is called.
3766 *
3767 * In the case there is 0 byte write, or error case, since
3768 * VFS direct IO won't invoke the end_io call back function,
3769 * we need to free the end_io structure here.
3770 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0)
3775 /*
3776 * for non AIO case, since the IO is already
3777 * completed, we could do the convertion right here
3778 */
3779 ret = ext4_convert_unwritten_extents(inode,
3780 offset, ret);
3781 return ret;
3782 }
3783
3784 /* for write the the end of file case, we fall back to old way */
3785 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3786}
3787
3788static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3789 const struct iovec *iov, loff_t offset,
3790 unsigned long nr_segs)
3791{
3792 struct file *file = iocb->ki_filp;
3793 struct inode *inode = file->f_mapping->host;
3794
3795 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3796 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3797
3798 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3799}
3800
3357/* 3801/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3802 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3803 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3386,6 +3830,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3386 .direct_IO = ext4_direct_IO, 3830 .direct_IO = ext4_direct_IO,
3387 .migratepage = buffer_migrate_page, 3831 .migratepage = buffer_migrate_page,
3388 .is_partially_uptodate = block_is_partially_uptodate, 3832 .is_partially_uptodate = block_is_partially_uptodate,
3833 .error_remove_page = generic_error_remove_page,
3389}; 3834};
3390 3835
3391static const struct address_space_operations ext4_writeback_aops = { 3836static const struct address_space_operations ext4_writeback_aops = {
@@ -3401,6 +3846,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3401 .direct_IO = ext4_direct_IO, 3846 .direct_IO = ext4_direct_IO,
3402 .migratepage = buffer_migrate_page, 3847 .migratepage = buffer_migrate_page,
3403 .is_partially_uptodate = block_is_partially_uptodate, 3848 .is_partially_uptodate = block_is_partially_uptodate,
3849 .error_remove_page = generic_error_remove_page,
3404}; 3850};
3405 3851
3406static const struct address_space_operations ext4_journalled_aops = { 3852static const struct address_space_operations ext4_journalled_aops = {
@@ -3415,6 +3861,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3415 .invalidatepage = ext4_invalidatepage, 3861 .invalidatepage = ext4_invalidatepage,
3416 .releasepage = ext4_releasepage, 3862 .releasepage = ext4_releasepage,
3417 .is_partially_uptodate = block_is_partially_uptodate, 3863 .is_partially_uptodate = block_is_partially_uptodate,
3864 .error_remove_page = generic_error_remove_page,
3418}; 3865};
3419 3866
3420static const struct address_space_operations ext4_da_aops = { 3867static const struct address_space_operations ext4_da_aops = {
@@ -3431,6 +3878,7 @@ static const struct address_space_operations ext4_da_aops = {
3431 .direct_IO = ext4_direct_IO, 3878 .direct_IO = ext4_direct_IO,
3432 .migratepage = buffer_migrate_page, 3879 .migratepage = buffer_migrate_page,
3433 .is_partially_uptodate = block_is_partially_uptodate, 3880 .is_partially_uptodate = block_is_partially_uptodate,
3881 .error_remove_page = generic_error_remove_page,
3434}; 3882};
3435 3883
3436void ext4_set_aops(struct inode *inode) 3884void ext4_set_aops(struct inode *inode)
@@ -4547,8 +4995,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4547 */ 4995 */
4548static int ext4_do_update_inode(handle_t *handle, 4996static int ext4_do_update_inode(handle_t *handle,
4549 struct inode *inode, 4997 struct inode *inode,
4550 struct ext4_iloc *iloc, 4998 struct ext4_iloc *iloc)
4551 int do_sync)
4552{ 4999{
4553 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5000 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4554 struct ext4_inode_info *ei = EXT4_I(inode); 5001 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4649,22 +5096,10 @@ static int ext4_do_update_inode(handle_t *handle,
4649 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5096 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4650 } 5097 }
4651 5098
4652 /* 5099 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4653 * If we're not using a journal and we were called from 5100 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4654 * ext4_write_inode() to sync the inode (making do_sync true), 5101 if (!err)
4655 * we can just use sync_dirty_buffer() directly to do our dirty 5102 err = rc;
4656 * work. Testing s_journal here is a bit redundant but it's
4657 * worth it to avoid potential future trouble.
4658 */
4659 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4660 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4661 sync_dirty_buffer(bh);
4662 } else {
4663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4664 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4665 if (!err)
4666 err = rc;
4667 }
4668 ei->i_state &= ~EXT4_STATE_NEW; 5103 ei->i_state &= ~EXT4_STATE_NEW;
4669 5104
4670out_brelse: 5105out_brelse:
@@ -4732,8 +5167,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4732 err = ext4_get_inode_loc(inode, &iloc); 5167 err = ext4_get_inode_loc(inode, &iloc);
4733 if (err) 5168 if (err)
4734 return err; 5169 return err;
4735 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5170 if (wait)
4736 inode, &iloc, wait); 5171 sync_dirty_buffer(iloc.bh);
5172 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5173 ext4_error(inode->i_sb, __func__,
5174 "IO error syncing inode, "
5175 "inode=%lu, block=%llu",
5176 inode->i_ino,
5177 (unsigned long long)iloc.bh->b_blocknr);
5178 err = -EIO;
5179 }
4737 } 5180 }
4738 return err; 5181 return err;
4739} 5182}
@@ -5029,7 +5472,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5029 get_bh(iloc->bh); 5472 get_bh(iloc->bh);
5030 5473
5031 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5474 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5032 err = ext4_do_update_inode(handle, inode, iloc, 0); 5475 err = ext4_do_update_inode(handle, inode, iloc);
5033 put_bh(iloc->bh); 5476 put_bh(iloc->bh);
5034 return err; 5477 return err;
5035} 5478}
@@ -5173,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5173 */ 5616 */
5174void ext4_dirty_inode(struct inode *inode) 5617void ext4_dirty_inode(struct inode *inode)
5175{ 5618{
5176 handle_t *current_handle = ext4_journal_current_handle();
5177 handle_t *handle; 5619 handle_t *handle;
5178 5620
5179 if (!ext4_handle_valid(current_handle)) {
5180 ext4_mark_inode_dirty(current_handle, inode);
5181 return;
5182 }
5183
5184 handle = ext4_journal_start(inode, 2); 5621 handle = ext4_journal_start(inode, 2);
5185 if (IS_ERR(handle)) 5622 if (IS_ERR(handle))
5186 goto out; 5623 goto out;
5187 if (current_handle && 5624
5188 current_handle->h_transaction != handle->h_transaction) { 5625 ext4_mark_inode_dirty(handle, inode);
5189 /* This task has a transaction open against a different fs */ 5626
5190 printk(KERN_EMERG "%s: transactions do not match!\n",
5191 __func__);
5192 } else {
5193 jbd_debug(5, "marking dirty. outer handle=%p\n",
5194 current_handle);
5195 ext4_mark_inode_dirty(handle, inode);
5196 }
5197 ext4_journal_stop(handle); 5627 ext4_journal_stop(handle);
5198out: 5628out:
5199 return; 5629 return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..7c8fe80bacdd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2076 struct ext4_iloc iloc;
2077 int err = 0; 2077 int err = 0;
2078 2078
2079 if (!ext4_handle_valid(handle)) 2079 /* ext4_handle_valid() assumes a valid handle_t pointer */
2080 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2081 return 0;
2081 2082
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2083 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a6b1ab734728..312211ee05af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -964,7 +984,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
964static ssize_t ext4_quota_write(struct super_block *sb, int type, 984static ssize_t ext4_quota_write(struct super_block *sb, int type,
965 const char *data, size_t len, loff_t off); 985 const char *data, size_t len, loff_t off);
966 986
967static struct dquot_operations ext4_quota_operations = { 987static const struct dquot_operations ext4_quota_operations = {
968 .initialize = dquot_initialize, 988 .initialize = dquot_initialize,
969 .drop = dquot_drop, 989 .drop = dquot_drop,
970 .alloc_space = dquot_alloc_space, 990 .alloc_space = dquot_alloc_space,
@@ -985,7 +1005,7 @@ static struct dquot_operations ext4_quota_operations = {
985 .destroy_dquot = dquot_destroy, 1005 .destroy_dquot = dquot_destroy,
986}; 1006};
987 1007
988static struct quotactl_ops ext4_qctl_operations = { 1008static const struct quotactl_ops ext4_qctl_operations = {
989 .quota_on = ext4_quota_on, 1009 .quota_on = ext4_quota_on,
990 .quota_off = vfs_quota_off, 1010 .quota_off = vfs_quota_off,
991 .quota_sync = vfs_quota_sync, 1011 .quota_sync = vfs_quota_sync,
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1359 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1360 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1361 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1362#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1363 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1364 qtype = USRQUOTA;
@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1658 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1659 sbi->s_mount_opt);
1648 1660
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1661 return res;
1657} 1662}
1658 1663
@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2202EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2203EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2204EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2205EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2206
2201static struct attribute *ext4_attrs[] = { 2207static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2208 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2216 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2217 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2218 ATTR_LIST(mb_group_prealloc),
2219 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2220 NULL,
2214}; 2221};
2215 2222
@@ -2413,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2420 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2421 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2422 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2423
2418 set_opt(sbi->s_mount_opt, BARRIER); 2424 set_opt(sbi->s_mount_opt, BARRIER);
2419 2425
@@ -2679,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2685 }
2680 2686
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2687 sbi->s_stripe = ext4_get_stripe_size(sbi);
2688 sbi->s_max_writeback_mb_bump = 128;
2682 2689
2683 /* 2690 /*
2684 * set up enough so that it can read an inode 2691 * set up enough so that it can read an inode
@@ -2798,6 +2805,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2805 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2806 }
2800 } 2807 }
2808 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2809 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2810 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2811 goto failed_mount_wq;
2812 }
2813
2801 /* 2814 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2815 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2816 * so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2862,12 @@ no_journal:
2849 "available"); 2862 "available");
2850 } 2863 }
2851 2864
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2865 if (test_opt(sb, DELALLOC) &&
2866 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2867 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2868 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2869 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2870 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2871
2859 err = ext4_setup_system_zone(sb); 2872 err = ext4_setup_system_zone(sb);
2860 if (err) { 2873 if (err) {
@@ -2910,6 +2923,8 @@ cantfind_ext4:
2910 2923
2911failed_mount4: 2924failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2925 ext4_msg(sb, KERN_ERR, "mount failed");
2926 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2927failed_mount_wq:
2913 ext4_release_system_zone(sb); 2928 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2929 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2930 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3179 return -EINVAL;
3165 } 3180 }
3166 3181
3167 if (journal->j_flags & JBD2_BARRIER) 3182 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3183 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3184
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3185 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3374{
3362 int ret = 0; 3375 int ret = 0;
3363 tid_t target; 3376 tid_t target;
3377 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3378
3365 trace_ext4_sync_fs(sb, wait); 3379 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3380 flush_workqueue(sbi->dio_unwritten_wq);
3381 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3382 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3383 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3384 }
3370 return ret; 3385 return ret;
3371} 3386}
@@ -3951,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3966 .fs_flags = FS_REQUIRES_DEV,
3952}; 3967};
3953 3968
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3969static int __init init_ext4_fs(void)
3976{ 3970{
3977 int err; 3971 int err;
@@ -3996,13 +3990,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3990 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3991 if (err)
3998 goto out; 3992 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 3993 return 0;
4007out: 3994out:
4008 destroy_inodecache(); 3995 destroy_inodecache();
@@ -4021,9 +4008,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4008static void __exit exit_ext4_fs(void)
4022{ 4009{
4023 unregister_filesystem(&ext4_fs_type); 4010 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4011 destroy_inodecache();
4028 exit_ext4_xattr(); 4012 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4013 exit_ext4_mballoc();
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176d..7db0979c6b72 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323/* fat/misc.c */ 323/* fat/misc.c */
324extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
325 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
326extern void fat_clusters_flush(struct super_block *sb); 326extern int fat_clusters_flush(struct super_block *sb);
327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
329 __le16 __time, __le16 __date, u8 time_cs); 329 __le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8970d8c49bb0..76b7961ab663 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
451 451
452static int fat_sync_fs(struct super_block *sb, int wait) 452static int fat_sync_fs(struct super_block *sb, int wait)
453{ 453{
454 lock_super(sb); 454 int err = 0;
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458 455
459 return 0; 456 if (sb->s_dirt) {
457 lock_super(sb);
458 sb->s_dirt = 0;
459 err = fat_clusters_flush(sb);
460 unlock_super(sb);
461 }
462
463 return err;
460} 464}
461 465
462static void fat_put_super(struct super_block *sb) 466static void fat_put_super(struct super_block *sb)
@@ -470,19 +474,11 @@ static void fat_put_super(struct super_block *sb)
470 474
471 iput(sbi->fat_inode); 475 iput(sbi->fat_inode);
472 476
473 if (sbi->nls_disk) { 477 unload_nls(sbi->nls_disk);
474 unload_nls(sbi->nls_disk); 478 unload_nls(sbi->nls_io);
475 sbi->nls_disk = NULL; 479
476 sbi->options.codepage = fat_default_codepage; 480 if (sbi->options.iocharset != fat_default_iocharset)
477 }
478 if (sbi->nls_io) {
479 unload_nls(sbi->nls_io);
480 sbi->nls_io = NULL;
481 }
482 if (sbi->options.iocharset != fat_default_iocharset) {
483 kfree(sbi->options.iocharset); 481 kfree(sbi->options.iocharset);
484 sbi->options.iocharset = fat_default_iocharset;
485 }
486 482
487 sb->s_fs_info = NULL; 483 sb->s_fs_info = NULL;
488 kfree(sbi); 484 kfree(sbi);
@@ -820,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
820 seq_puts(m, ",shortname=mixed"); 816 seq_puts(m, ",shortname=mixed");
821 break; 817 break;
822 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: 818 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
823 /* seq_puts(m, ",shortname=lower"); */ 819 seq_puts(m, ",shortname=lower");
824 break; 820 break;
825 default: 821 default:
826 seq_puts(m, ",shortname=unknown"); 822 seq_puts(m, ",shortname=unknown");
@@ -971,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
971 opts->codepage = fat_default_codepage; 967 opts->codepage = fat_default_codepage;
972 opts->iocharset = fat_default_iocharset; 968 opts->iocharset = fat_default_iocharset;
973 if (is_vfat) { 969 if (is_vfat) {
974 opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; 970 opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
975 opts->rodir = 0; 971 opts->rodir = 0;
976 } else { 972 } else {
977 opts->shortname = 0; 973 opts->shortname = 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 4e35be873e09..0f55f5cb732f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
43 43
44/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
46void fat_clusters_flush(struct super_block *sb) 46int fat_clusters_flush(struct super_block *sb)
47{ 47{
48 struct msdos_sb_info *sbi = MSDOS_SB(sb); 48 struct msdos_sb_info *sbi = MSDOS_SB(sb);
49 struct buffer_head *bh; 49 struct buffer_head *bh;
50 struct fat_boot_fsinfo *fsinfo; 50 struct fat_boot_fsinfo *fsinfo;
51 51
52 if (sbi->fat_bits != 32) 52 if (sbi->fat_bits != 32)
53 return; 53 return 0;
54 54
55 bh = sb_bread(sb, sbi->fsinfo_sector); 55 bh = sb_bread(sb, sbi->fsinfo_sector);
56 if (bh == NULL) { 56 if (bh == NULL) {
57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
58 return; 58 return -EIO;
59 } 59 }
60 60
61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
74 mark_buffer_dirty(bh); 74 mark_buffer_dirty(bh);
75 } 75 }
76 brelse(bh); 76 brelse(bh);
77
78 return 0;
77} 79}
78 80
79/* 81/*
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e83557112..f565f24019b5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
499 int charlen; 499 int charlen;
500 500
501 if (utf8) { 501 if (utf8) {
502 int name_len = strlen(name); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 503 if (*outlen < 0)
504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); 504 return *outlen;
505 505 else if (*outlen > 255)
506 /*
507 * We stripped '.'s before and set len appropriately,
508 * but utf8s_to_utf16s doesn't care about len
509 */
510 *outlen -= (name_len - len);
511
512 if (*outlen > 255)
513 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
514 507
515 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db97..fc089f2f7f56 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
263 return pid; 263 return pid;
264} 264}
265 265
266static int f_setown_ex(struct file *filp, unsigned long arg)
267{
268 struct f_owner_ex * __user owner_p = (void * __user)arg;
269 struct f_owner_ex owner;
270 struct pid *pid;
271 int type;
272 int ret;
273
274 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret)
276 return ret;
277
278 switch (owner.type) {
279 case F_OWNER_TID:
280 type = PIDTYPE_MAX;
281 break;
282
283 case F_OWNER_PID:
284 type = PIDTYPE_PID;
285 break;
286
287 case F_OWNER_GID:
288 type = PIDTYPE_PGID;
289 break;
290
291 default:
292 return -EINVAL;
293 }
294
295 rcu_read_lock();
296 pid = find_vpid(owner.pid);
297 if (owner.pid && !pid)
298 ret = -ESRCH;
299 else
300 ret = __f_setown(filp, pid, type, 1);
301 rcu_read_unlock();
302
303 return ret;
304}
305
306static int f_getown_ex(struct file *filp, unsigned long arg)
307{
308 struct f_owner_ex * __user owner_p = (void * __user)arg;
309 struct f_owner_ex owner;
310 int ret = 0;
311
312 read_lock(&filp->f_owner.lock);
313 owner.pid = pid_vnr(filp->f_owner.pid);
314 switch (filp->f_owner.pid_type) {
315 case PIDTYPE_MAX:
316 owner.type = F_OWNER_TID;
317 break;
318
319 case PIDTYPE_PID:
320 owner.type = F_OWNER_PID;
321 break;
322
323 case PIDTYPE_PGID:
324 owner.type = F_OWNER_GID;
325 break;
326
327 default:
328 WARN_ON(1);
329 ret = -EINVAL;
330 break;
331 }
332 read_unlock(&filp->f_owner.lock);
333
334 if (!ret)
335 ret = copy_to_user(owner_p, &owner, sizeof(owner));
336 return ret;
337}
338
266static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, 339static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
267 struct file *filp) 340 struct file *filp)
268{ 341{
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
313 case F_SETOWN: 386 case F_SETOWN:
314 err = f_setown(filp, arg, 1); 387 err = f_setown(filp, arg, 1);
315 break; 388 break;
389 case F_GETOWN_EX:
390 err = f_getown_ex(filp, arg);
391 break;
392 case F_SETOWN_EX:
393 err = f_setown_ex(filp, arg);
394 break;
316 case F_GETSIG: 395 case F_GETSIG:
317 err = filp->f_owner.signum; 396 err = filp->f_owner.signum;
318 break; 397 break;
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
428 507
429static void send_sigio_to_task(struct task_struct *p, 508static void send_sigio_to_task(struct task_struct *p,
430 struct fown_struct *fown, 509 struct fown_struct *fown,
431 int fd, 510 int fd, int reason, int group)
432 int reason)
433{ 511{
434 /* 512 /*
435 * F_SETSIG can change ->signum lockless in parallel, make 513 * F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
461 else 539 else
462 si.si_band = band_table[reason - POLL_IN]; 540 si.si_band = band_table[reason - POLL_IN];
463 si.si_fd = fd; 541 si.si_fd = fd;
464 if (!group_send_sig_info(signum, &si, p)) 542 if (!do_send_sig_info(signum, &si, p, group))
465 break; 543 break;
466 /* fall-through: fall back on the old plain SIGIO signal */ 544 /* fall-through: fall back on the old plain SIGIO signal */
467 case 0: 545 case 0:
468 group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); 546 do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
469 } 547 }
470} 548}
471 549
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
474 struct task_struct *p; 552 struct task_struct *p;
475 enum pid_type type; 553 enum pid_type type;
476 struct pid *pid; 554 struct pid *pid;
555 int group = 1;
477 556
478 read_lock(&fown->lock); 557 read_lock(&fown->lock);
558
479 type = fown->pid_type; 559 type = fown->pid_type;
560 if (type == PIDTYPE_MAX) {
561 group = 0;
562 type = PIDTYPE_PID;
563 }
564
480 pid = fown->pid; 565 pid = fown->pid;
481 if (!pid) 566 if (!pid)
482 goto out_unlock_fown; 567 goto out_unlock_fown;
483 568
484 read_lock(&tasklist_lock); 569 read_lock(&tasklist_lock);
485 do_each_pid_task(pid, type, p) { 570 do_each_pid_task(pid, type, p) {
486 send_sigio_to_task(p, fown, fd, band); 571 send_sigio_to_task(p, fown, fd, band, group);
487 } while_each_pid_task(pid, type, p); 572 } while_each_pid_task(pid, type, p);
488 read_unlock(&tasklist_lock); 573 read_unlock(&tasklist_lock);
489 out_unlock_fown: 574 out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
491} 576}
492 577
493static void send_sigurg_to_task(struct task_struct *p, 578static void send_sigurg_to_task(struct task_struct *p,
494 struct fown_struct *fown) 579 struct fown_struct *fown, int group)
495{ 580{
496 if (sigio_perm(p, fown, SIGURG)) 581 if (sigio_perm(p, fown, SIGURG))
497 group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); 582 do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
498} 583}
499 584
500int send_sigurg(struct fown_struct *fown) 585int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
502 struct task_struct *p; 587 struct task_struct *p;
503 enum pid_type type; 588 enum pid_type type;
504 struct pid *pid; 589 struct pid *pid;
590 int group = 1;
505 int ret = 0; 591 int ret = 0;
506 592
507 read_lock(&fown->lock); 593 read_lock(&fown->lock);
594
508 type = fown->pid_type; 595 type = fown->pid_type;
596 if (type == PIDTYPE_MAX) {
597 group = 0;
598 type = PIDTYPE_PID;
599 }
600
509 pid = fown->pid; 601 pid = fown->pid;
510 if (!pid) 602 if (!pid)
511 goto out_unlock_fown; 603 goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
514 606
515 read_lock(&tasklist_lock); 607 read_lock(&tasklist_lock);
516 do_each_pid_task(pid, type, p) { 608 do_each_pid_task(pid, type, p) {
517 send_sigurg_to_task(p, fown); 609 send_sigurg_to_task(p, fown, group);
518 } while_each_pid_task(pid, type, p); 610 } while_each_pid_task(pid, type, p);
519 read_unlock(&tasklist_lock); 611 read_unlock(&tasklist_lock);
520 out_unlock_fown: 612 out_unlock_fown:
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
74 * Handle nr_files sysctl 74 * Handle nr_files sysctl
75 */ 75 */
76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
77int proc_nr_files(ctl_table *table, int write, struct file *filp, 77int proc_nr_files(ctl_table *table, int write,
78 void __user *buffer, size_t *lenp, loff_t *ppos) 78 void __user *buffer, size_t *lenp, loff_t *ppos)
79{ 79{
80 files_stat.nr_files = get_nr_files(); 80 files_stat.nr_files = get_nr_files();
81 return proc_dointvec(table, write, filp, buffer, lenp, ppos); 81 return proc_dointvec(table, write, buffer, lenp, ppos);
82} 82}
83#else 83#else
84int proc_nr_files(ctl_table *table, int write, struct file *filp, 84int proc_nr_files(ctl_table *table, int write,
85 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
86{ 86{
87 return -ENOSYS; 87 return -ENOSYS;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8e1e5e19d21e..9d5360c4c2af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,8 +41,9 @@ struct wb_writeback_args {
41 long nr_pages; 41 long nr_pages;
42 struct super_block *sb; 42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode; 43 enum writeback_sync_modes sync_mode;
44 int for_kupdate; 44 int for_kupdate:1;
45 int range_cyclic; 45 int range_cyclic:1;
46 int for_background:1;
46}; 47};
47 48
48/* 49/*
@@ -249,14 +250,25 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
249 * completion. Caller need not hold sb s_umount semaphore. 250 * completion. Caller need not hold sb s_umount semaphore.
250 * 251 *
251 */ 252 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 253void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
254 long nr_pages)
253{ 255{
254 struct wb_writeback_args args = { 256 struct wb_writeback_args args = {
257 .sb = sb,
255 .sync_mode = WB_SYNC_NONE, 258 .sync_mode = WB_SYNC_NONE,
256 .nr_pages = nr_pages, 259 .nr_pages = nr_pages,
257 .range_cyclic = 1, 260 .range_cyclic = 1,
258 }; 261 };
259 262
263 /*
264 * We treat @nr_pages=0 as the special case to do background writeback,
265 * ie. to sync pages until the background dirty threshold is reached.
266 */
267 if (!nr_pages) {
268 args.nr_pages = LONG_MAX;
269 args.for_background = 1;
270 }
271
260 bdi_alloc_queue_work(bdi, &args); 272 bdi_alloc_queue_work(bdi, &args);
261} 273}
262 274
@@ -310,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
310 * For inodes being constantly redirtied, dirtied_when can get stuck. 322 * For inodes being constantly redirtied, dirtied_when can get stuck.
311 * It _appears_ to be in the future, but is actually in distant past. 323 * It _appears_ to be in the future, but is actually in distant past.
312 * This test is necessary to prevent such wrapped-around relative times 324 * This test is necessary to prevent such wrapped-around relative times
313 * from permanently stopping the whole pdflush writeback. 325 * from permanently stopping the whole bdi writeback.
314 */ 326 */
315 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 327 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
316#endif 328#endif
@@ -324,13 +336,38 @@ static void move_expired_inodes(struct list_head *delaying_queue,
324 struct list_head *dispatch_queue, 336 struct list_head *dispatch_queue,
325 unsigned long *older_than_this) 337 unsigned long *older_than_this)
326{ 338{
339 LIST_HEAD(tmp);
340 struct list_head *pos, *node;
341 struct super_block *sb = NULL;
342 struct inode *inode;
343 int do_sb_sort = 0;
344
327 while (!list_empty(delaying_queue)) { 345 while (!list_empty(delaying_queue)) {
328 struct inode *inode = list_entry(delaying_queue->prev, 346 inode = list_entry(delaying_queue->prev, struct inode, i_list);
329 struct inode, i_list);
330 if (older_than_this && 347 if (older_than_this &&
331 inode_dirtied_after(inode, *older_than_this)) 348 inode_dirtied_after(inode, *older_than_this))
332 break; 349 break;
333 list_move(&inode->i_list, dispatch_queue); 350 if (sb && sb != inode->i_sb)
351 do_sb_sort = 1;
352 sb = inode->i_sb;
353 list_move(&inode->i_list, &tmp);
354 }
355
356 /* just one sb in list, splice to dispatch_queue and we're done */
357 if (!do_sb_sort) {
358 list_splice(&tmp, dispatch_queue);
359 return;
360 }
361
362 /* Move inodes from one superblock together */
363 while (!list_empty(&tmp)) {
364 inode = list_entry(tmp.prev, struct inode, i_list);
365 sb = inode->i_sb;
366 list_for_each_prev_safe(pos, node, &tmp) {
367 inode = list_entry(pos, struct inode, i_list);
368 if (inode->i_sb == sb)
369 list_move(&inode->i_list, dispatch_queue);
370 }
334 } 371 }
335} 372}
336 373
@@ -439,8 +476,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
439 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
440 inode->i_state &= ~I_SYNC; 477 inode->i_state &= ~I_SYNC;
441 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 478 if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
442 if (!(inode->i_state & I_DIRTY) && 479 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
443 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 480 /*
481 * More pages get dirtied by a fast dirtier.
482 */
483 goto select_queue;
484 } else if (inode->i_state & I_DIRTY) {
485 /*
486 * At least XFS will redirty the inode during the
487 * writeback (delalloc) and on io completion (isize).
488 */
489 redirty_tail(inode);
490 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
444 /* 491 /*
445 * We didn't write back all the pages. nfs_writepages() 492 * We didn't write back all the pages. nfs_writepages()
446 * sometimes bales out without doing anything. Redirty 493 * sometimes bales out without doing anything. Redirty
@@ -462,6 +509,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
462 * soon as the queue becomes uncongested. 509 * soon as the queue becomes uncongested.
463 */ 510 */
464 inode->i_state |= I_DIRTY_PAGES; 511 inode->i_state |= I_DIRTY_PAGES;
512select_queue:
465 if (wbc->nr_to_write <= 0) { 513 if (wbc->nr_to_write <= 0) {
466 /* 514 /*
467 * slice used up: queue for next turn 515 * slice used up: queue for next turn
@@ -484,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
484 inode->i_state |= I_DIRTY_PAGES; 532 inode->i_state |= I_DIRTY_PAGES;
485 redirty_tail(inode); 533 redirty_tail(inode);
486 } 534 }
487 } else if (inode->i_state & I_DIRTY) {
488 /*
489 * Someone redirtied the inode while were writing back
490 * the pages.
491 */
492 redirty_tail(inode);
493 } else if (atomic_read(&inode->i_count)) { 535 } else if (atomic_read(&inode->i_count)) {
494 /* 536 /*
495 * The inode is clean, inuse 537 * The inode is clean, inuse
@@ -506,6 +548,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
506 return ret; 548 return ret;
507} 549}
508 550
551static void unpin_sb_for_writeback(struct super_block **psb)
552{
553 struct super_block *sb = *psb;
554
555 if (sb) {
556 up_read(&sb->s_umount);
557 put_super(sb);
558 *psb = NULL;
559 }
560}
561
509/* 562/*
510 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 563 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
511 * before calling writeback. So make sure that we do pin it, so it doesn't 564 * before calling writeback. So make sure that we do pin it, so it doesn't
@@ -515,11 +568,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
515 * 1 if we failed. 568 * 1 if we failed.
516 */ 569 */
517static int pin_sb_for_writeback(struct writeback_control *wbc, 570static int pin_sb_for_writeback(struct writeback_control *wbc,
518 struct inode *inode) 571 struct inode *inode, struct super_block **psb)
519{ 572{
520 struct super_block *sb = inode->i_sb; 573 struct super_block *sb = inode->i_sb;
521 574
522 /* 575 /*
576 * If this sb is already pinned, nothing more to do. If not and
577 * *psb is non-NULL, unpin the old one first
578 */
579 if (sb == *psb)
580 return 0;
581 else if (*psb)
582 unpin_sb_for_writeback(psb);
583
584 /*
523 * Caller must already hold the ref for this 585 * Caller must already hold the ref for this
524 */ 586 */
525 if (wbc->sync_mode == WB_SYNC_ALL) { 587 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -532,7 +594,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
532 if (down_read_trylock(&sb->s_umount)) { 594 if (down_read_trylock(&sb->s_umount)) {
533 if (sb->s_root) { 595 if (sb->s_root) {
534 spin_unlock(&sb_lock); 596 spin_unlock(&sb_lock);
535 return 0; 597 goto pinned;
536 } 598 }
537 /* 599 /*
538 * umounted, drop rwsem again and fall through to failure 600 * umounted, drop rwsem again and fall through to failure
@@ -543,24 +605,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
543 sb->s_count--; 605 sb->s_count--;
544 spin_unlock(&sb_lock); 606 spin_unlock(&sb_lock);
545 return 1; 607 return 1;
546} 608pinned:
547 609 *psb = sb;
548static void unpin_sb_for_writeback(struct writeback_control *wbc, 610 return 0;
549 struct inode *inode)
550{
551 struct super_block *sb = inode->i_sb;
552
553 if (wbc->sync_mode == WB_SYNC_ALL)
554 return;
555
556 up_read(&sb->s_umount);
557 put_super(sb);
558} 611}
559 612
560static void writeback_inodes_wb(struct bdi_writeback *wb, 613static void writeback_inodes_wb(struct bdi_writeback *wb,
561 struct writeback_control *wbc) 614 struct writeback_control *wbc)
562{ 615{
563 struct super_block *sb = wbc->sb; 616 struct super_block *sb = wbc->sb, *pin_sb = NULL;
564 const int is_blkdev_sb = sb_is_blkdev_sb(sb); 617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
565 const unsigned long start = jiffies; /* livelock avoidance */ 618 const unsigned long start = jiffies; /* livelock avoidance */
566 619
@@ -619,7 +672,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
619 if (inode_dirtied_after(inode, start)) 672 if (inode_dirtied_after(inode, start))
620 break; 673 break;
621 674
622 if (pin_sb_for_writeback(wbc, inode)) { 675 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
623 requeue_io(inode); 676 requeue_io(inode);
624 continue; 677 continue;
625 } 678 }
@@ -628,7 +681,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
628 __iget(inode); 681 __iget(inode);
629 pages_skipped = wbc->pages_skipped; 682 pages_skipped = wbc->pages_skipped;
630 writeback_single_inode(inode, wbc); 683 writeback_single_inode(inode, wbc);
631 unpin_sb_for_writeback(wbc, inode);
632 if (wbc->pages_skipped != pages_skipped) { 684 if (wbc->pages_skipped != pages_skipped) {
633 /* 685 /*
634 * writeback is not making progress due to locked 686 * writeback is not making progress due to locked
@@ -648,6 +700,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
648 wbc->more_io = 1; 700 wbc->more_io = 1;
649 } 701 }
650 702
703 unpin_sb_for_writeback(&pin_sb);
704
651 spin_unlock(&inode_lock); 705 spin_unlock(&inode_lock);
652 /* Leave any unwritten inodes on b_io */ 706 /* Leave any unwritten inodes on b_io */
653} 707}
@@ -706,6 +760,7 @@ static long wb_writeback(struct bdi_writeback *wb,
706 }; 760 };
707 unsigned long oldest_jif; 761 unsigned long oldest_jif;
708 long wrote = 0; 762 long wrote = 0;
763 struct inode *inode;
709 764
710 if (wbc.for_kupdate) { 765 if (wbc.for_kupdate) {
711 wbc.older_than_this = &oldest_jif; 766 wbc.older_than_this = &oldest_jif;
@@ -719,20 +774,16 @@ static long wb_writeback(struct bdi_writeback *wb,
719 774
720 for (;;) { 775 for (;;) {
721 /* 776 /*
722 * Don't flush anything for non-integrity writeback where 777 * Stop writeback when nr_pages has been consumed
723 * no nr_pages was given
724 */ 778 */
725 if (!args->for_kupdate && args->nr_pages <= 0 && 779 if (args->nr_pages <= 0)
726 args->sync_mode == WB_SYNC_NONE)
727 break; 780 break;
728 781
729 /* 782 /*
730 * If no specific pages were given and this is just a 783 * For background writeout, stop when we are below the
731 * periodic background writeout and we are below the 784 * background dirty threshold
732 * background dirty threshold, don't do anything
733 */ 785 */
734 if (args->for_kupdate && args->nr_pages <= 0 && 786 if (args->for_background && !over_bground_thresh())
735 !over_bground_thresh())
736 break; 787 break;
737 788
738 wbc.more_io = 0; 789 wbc.more_io = 0;
@@ -744,13 +795,32 @@ static long wb_writeback(struct bdi_writeback *wb,
744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 795 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
745 796
746 /* 797 /*
747 * If we ran out of stuff to write, bail unless more_io got set 798 * If we consumed everything, see if we have more
748 */ 799 */
749 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 800 if (wbc.nr_to_write <= 0)
750 if (wbc.more_io && !wbc.for_kupdate) 801 continue;
751 continue; 802 /*
803 * Didn't write everything and we don't have more IO, bail
804 */
805 if (!wbc.more_io)
752 break; 806 break;
807 /*
808 * Did we write something? Try for more
809 */
810 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
811 continue;
812 /*
813 * Nothing written. Wait for some inode to
814 * become available for writeback. Otherwise
815 * we'll just busyloop.
816 */
817 spin_lock(&inode_lock);
818 if (!list_empty(&wb->b_more_io)) {
819 inode = list_entry(wb->b_more_io.prev,
820 struct inode, i_list);
821 inode_wait_for_writeback(inode);
753 } 822 }
823 spin_unlock(&inode_lock);
754 } 824 }
755 825
756 return wrote; 826 return wrote;
@@ -1060,9 +1130,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
1060 * If older_than_this is non-NULL, then only write out inodes which 1130 * If older_than_this is non-NULL, then only write out inodes which
1061 * had their first dirtying at a time earlier than *older_than_this. 1131 * had their first dirtying at a time earlier than *older_than_this.
1062 * 1132 *
1063 * If we're a pdlfush thread, then implement pdflush collision avoidance
1064 * against the entire list.
1065 *
1066 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 1133 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1067 * This function assumes that the blockdev superblock's inodes are backed by 1134 * This function assumes that the blockdev superblock's inodes are backed by
1068 * a variety of queues, so all inodes are searched. For other superblocks, 1135 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -1141,7 +1208,7 @@ void writeback_inodes_sb(struct super_block *sb)
1141 nr_to_write = nr_dirty + nr_unstable + 1208 nr_to_write = nr_dirty + nr_unstable +
1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1209 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1143 1210
1144 bdi_writeback_all(sb, nr_to_write); 1211 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1145} 1212}
1146EXPORT_SYMBOL(writeback_inodes_sb); 1213EXPORT_SYMBOL(writeback_inodes_sb);
1147 1214
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e703654e7f40..992f6c9410bb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1276 return 0; 1276 return 0;
1277 1277
1278 if (attr->ia_valid & ATTR_SIZE) { 1278 if (attr->ia_valid & ATTR_SIZE) {
1279 unsigned long limit; 1279 err = inode_newsize_ok(inode, attr->ia_size);
1280 if (IS_SWAPFILE(inode)) 1280 if (err)
1281 return -ETXTBSY; 1281 return err;
1282 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1283 if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
1284 send_sig(SIGXFSZ, current, 0);
1285 return -EFBIG;
1286 }
1287 is_truncate = true; 1282 is_truncate = true;
1288 } 1283 }
1289 1284
@@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1350 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1345 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1351 */ 1346 */
1352 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1347 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1353 if (outarg.attr.size < oldsize) 1348 truncate_pagecache(inode, oldsize, outarg.attr.size);
1354 fuse_truncate(inode->i_mapping, outarg.attr.size);
1355 invalidate_inode_pages2(inode->i_mapping); 1349 invalidate_inode_pages2(inode->i_mapping);
1356 } 1350 }
1357 1351
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index cbc464043b6f..a3492f7d207c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1313 return 0; 1313 return 0;
1314} 1314}
1315 1315
1316static struct vm_operations_struct fuse_file_vm_ops = { 1316static const struct vm_operations_struct fuse_file_vm_ops = {
1317 .close = fuse_vma_close, 1317 .close = fuse_vma_close,
1318 .fault = filemap_fault, 1318 .fault = filemap_fault,
1319 .page_mkwrite = fuse_page_mkwrite, 1319 .page_mkwrite = fuse_page_mkwrite,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fc9c79feb5f7..01cc462ff45d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -606,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
606void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, 606void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
607 u64 attr_valid); 607 u64 attr_valid);
608 608
609void fuse_truncate(struct address_space *mapping, loff_t offset);
610
611/** 609/**
612 * Initialize the client device 610 * Initialize the client device
613 */ 611 */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6da947daabda..1a822ce2b24b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -140,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
140 return 0; 140 return 0;
141} 141}
142 142
143void fuse_truncate(struct address_space *mapping, loff_t offset)
144{
145 /* See vmtruncate() */
146 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
147 truncate_inode_pages(mapping, offset);
148 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
149}
150
151void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, 143void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
152 u64 attr_valid) 144 u64 attr_valid)
153{ 145{
@@ -205,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
205 spin_unlock(&fc->lock); 197 spin_unlock(&fc->lock);
206 198
207 if (S_ISREG(inode->i_mode) && oldsize != attr->size) { 199 if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
208 if (attr->size < oldsize) 200 truncate_pagecache(inode, oldsize, attr->size);
209 fuse_truncate(inode->i_mapping, attr->size);
210 invalidate_inode_pages2(inode->i_mapping); 201 invalidate_inode_pages2(inode->i_mapping);
211 } 202 }
212} 203}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc0..694b5d48f036 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
1135 .direct_IO = gfs2_direct_IO, 1135 .direct_IO = gfs2_direct_IO,
1136 .migratepage = buffer_migrate_page, 1136 .migratepage = buffer_migrate_page,
1137 .is_partially_uptodate = block_is_partially_uptodate, 1137 .is_partially_uptodate = block_is_partially_uptodate,
1138 .error_remove_page = generic_error_remove_page,
1138}; 1139};
1139 1140
1140static const struct address_space_operations gfs2_ordered_aops = { 1141static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
1151 .direct_IO = gfs2_direct_IO, 1152 .direct_IO = gfs2_direct_IO,
1152 .migratepage = buffer_migrate_page, 1153 .migratepage = buffer_migrate_page,
1153 .is_partially_uptodate = block_is_partially_uptodate, 1154 .is_partially_uptodate = block_is_partially_uptodate,
1155 .error_remove_page = generic_error_remove_page,
1154}; 1156};
1155 1157
1156static const struct address_space_operations gfs2_jdata_aops = { 1158static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
1166 .invalidatepage = gfs2_invalidatepage, 1168 .invalidatepage = gfs2_invalidatepage,
1167 .releasepage = gfs2_releasepage, 1169 .releasepage = gfs2_releasepage,
1168 .is_partially_uptodate = block_is_partially_uptodate, 1170 .is_partially_uptodate = block_is_partially_uptodate,
1171 .error_remove_page = generic_error_remove_page,
1169}; 1172};
1170 1173
1171void gfs2_set_aops(struct inode *inode) 1174void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 166f38fbd246..4eb308aa3234 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -418,7 +418,7 @@ out:
418 return ret; 418 return ret;
419} 419}
420 420
421static struct vm_operations_struct gfs2_vm_ops = { 421static const struct vm_operations_struct gfs2_vm_ops = {
422 .fault = filemap_fault, 422 .fault = filemap_fault,
423 .page_mkwrite = gfs2_page_mkwrite, 423 .page_mkwrite = gfs2_page_mkwrite,
424}; 424};
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c3ac18054057..247436c10deb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/utsname.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
17#include <linux/xattr.h> 16#include <linux/xattr.h>
18#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 28c590b7c9da..8f1cfb02a6cb 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
179 * always aligned to a 64 bit boundary. 179 * always aligned to a 64 bit boundary.
180 * 180 *
181 * The size of the buffer is in bytes, but is it assumed that it is 181 * The size of the buffer is in bytes, but is it assumed that it is
182 * always ok to to read a complete multiple of 64 bits at the end 182 * always ok to read a complete multiple of 64 bits at the end
183 * of the block in case the end is no aligned to a natural boundary. 183 * of the block in case the end is no aligned to a natural boundary.
184 * 184 *
185 * Return: the block number (bitmap buffer scope) that was found 185 * Return: the block number (bitmap buffer scope) that was found
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 7b6165f25fbe..8bbe03c3f6d5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb)
344 brelse(HFS_SB(sb)->mdb_bh); 344 brelse(HFS_SB(sb)->mdb_bh);
345 brelse(HFS_SB(sb)->alt_mdb_bh); 345 brelse(HFS_SB(sb)->alt_mdb_bh);
346 346
347 if (HFS_SB(sb)->nls_io) 347 unload_nls(HFS_SB(sb)->nls_io);
348 unload_nls(HFS_SB(sb)->nls_io); 348 unload_nls(HFS_SB(sb)->nls_disk);
349 if (HFS_SB(sb)->nls_disk)
350 unload_nls(HFS_SB(sb)->nls_disk);
351 349
352 free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); 350 free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
353 kfree(HFS_SB(sb)); 351 kfree(HFS_SB(sb));
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c0759fe0855b..43022f3d5148 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb)
229 iput(HFSPLUS_SB(sb).alloc_file); 229 iput(HFSPLUS_SB(sb).alloc_file);
230 iput(HFSPLUS_SB(sb).hidden_dir); 230 iput(HFSPLUS_SB(sb).hidden_dir);
231 brelse(HFSPLUS_SB(sb).s_vhbh); 231 brelse(HFSPLUS_SB(sb).s_vhbh);
232 if (HFSPLUS_SB(sb).nls) 232 unload_nls(HFSPLUS_SB(sb).nls);
233 unload_nls(HFSPLUS_SB(sb).nls);
234 kfree(sb->s_fs_info); 233 kfree(sb->s_fs_info);
235 sb->s_fs_info = NULL; 234 sb->s_fs_info = NULL;
236 235
@@ -464,8 +463,7 @@ out:
464 463
465cleanup: 464cleanup:
466 hfsplus_put_super(sb); 465 hfsplus_put_super(sb);
467 if (nls) 466 unload_nls(nls);
468 unload_nls(nls);
469 return err; 467 return err;
470} 468}
471 469
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a93b885311d8..87a1258953b8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,12 +31,10 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/ima.h> 33#include <linux/ima.h>
34#include <linux/magic.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37/* some random number */
38#define HUGETLBFS_MAGIC 0x958458f6
39
40static const struct super_operations hugetlbfs_ops; 38static const struct super_operations hugetlbfs_ops;
41static const struct address_space_operations hugetlbfs_aops; 39static const struct address_space_operations hugetlbfs_aops;
42const struct file_operations hugetlbfs_file_operations; 40const struct file_operations hugetlbfs_file_operations;
@@ -382,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
382 380
383static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) 381static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
384{ 382{
385 struct super_block *sb = inode->i_sb; 383 if (generic_detach_inode(inode)) {
386 384 truncate_hugepages(inode, 0);
387 if (!hlist_unhashed(&inode->i_hash)) { 385 clear_inode(inode);
388 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 386 destroy_inode(inode);
389 list_move(&inode->i_list, &inode_unused);
390 inodes_stat.nr_unused++;
391 if (!sb || (sb->s_flags & MS_ACTIVE)) {
392 spin_unlock(&inode_lock);
393 return;
394 }
395 inode->i_state |= I_WILL_FREE;
396 spin_unlock(&inode_lock);
397 /*
398 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
399 * in our backing_dev_info.
400 */
401 write_inode_now(inode, 1);
402 spin_lock(&inode_lock);
403 inode->i_state &= ~I_WILL_FREE;
404 inodes_stat.nr_unused--;
405 hlist_del_init(&inode->i_hash);
406 } 387 }
407 list_del_init(&inode->i_list);
408 list_del_init(&inode->i_sb_list);
409 inode->i_state |= I_FREEING;
410 inodes_stat.nr_inodes--;
411 spin_unlock(&inode_lock);
412 truncate_hugepages(inode, 0);
413 clear_inode(inode);
414 destroy_inode(inode);
415} 388}
416 389
417static void hugetlbfs_drop_inode(struct inode *inode) 390static void hugetlbfs_drop_inode(struct inode *inode)
@@ -507,6 +480,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
507 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 480 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
508 INIT_LIST_HEAD(&inode->i_mapping->private_list); 481 INIT_LIST_HEAD(&inode->i_mapping->private_list);
509 info = HUGETLBFS_I(inode); 482 info = HUGETLBFS_I(inode);
483 /*
484 * The policy is initialized here even if we are creating a
485 * private inode because initialization simply creates an
486 * an empty rb tree and calls spin_lock_init(), later when we
487 * call mpol_free_shared_policy() it will just return because
488 * the rb tree will still be empty.
489 */
510 mpol_shared_policy_init(&info->policy, NULL); 490 mpol_shared_policy_init(&info->policy, NULL);
511 switch (mode & S_IFMT) { 491 switch (mode & S_IFMT) {
512 default: 492 default:
@@ -937,7 +917,7 @@ static int can_do_hugetlb_shm(void)
937} 917}
938 918
939struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 919struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
940 struct user_struct **user) 920 struct user_struct **user, int creat_flags)
941{ 921{
942 int error = -ENOMEM; 922 int error = -ENOMEM;
943 struct file *file; 923 struct file *file;
@@ -949,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
949 if (!hugetlbfs_vfsmount) 929 if (!hugetlbfs_vfsmount)
950 return ERR_PTR(-ENOENT); 930 return ERR_PTR(-ENOENT);
951 931
952 if (!can_do_hugetlb_shm()) { 932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
953 *user = current_user(); 933 *user = current_user();
954 if (user_shm_lock(size, *user)) { 934 if (user_shm_lock(size, *user)) {
955 WARN_ONCE(1, 935 WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index b2ba83d2c4e1..4d8e3be55976 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/rwsem.h>
17#include <linux/hash.h> 18#include <linux/hash.h>
18#include <linux/swap.h> 19#include <linux/swap.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -87,14 +88,18 @@ static struct hlist_head *inode_hashtable __read_mostly;
87DEFINE_SPINLOCK(inode_lock); 88DEFINE_SPINLOCK(inode_lock);
88 89
89/* 90/*
90 * iprune_mutex provides exclusion between the kswapd or try_to_free_pages 91 * iprune_sem provides exclusion between the kswapd or try_to_free_pages
91 * icache shrinking path, and the umount path. Without this exclusion, 92 * icache shrinking path, and the umount path. Without this exclusion,
92 * by the time prune_icache calls iput for the inode whose pages it has 93 * by the time prune_icache calls iput for the inode whose pages it has
93 * been invalidating, or by the time it calls clear_inode & destroy_inode 94 * been invalidating, or by the time it calls clear_inode & destroy_inode
94 * from its final dispose_list, the struct super_block they refer to 95 * from its final dispose_list, the struct super_block they refer to
95 * (for inode->i_sb->s_op) may already have been freed and reused. 96 * (for inode->i_sb->s_op) may already have been freed and reused.
97 *
98 * We make this an rwsem because the fastpath is icache shrinking. In
99 * some cases a filesystem may be doing a significant amount of work in
100 * its inode reclaim code, so this should improve parallelism.
96 */ 101 */
97static DEFINE_MUTEX(iprune_mutex); 102static DECLARE_RWSEM(iprune_sem);
98 103
99/* 104/*
100 * Statistics gathering.. 105 * Statistics gathering..
@@ -123,7 +128,7 @@ static void wake_up_inode(struct inode *inode)
123int inode_init_always(struct super_block *sb, struct inode *inode) 128int inode_init_always(struct super_block *sb, struct inode *inode)
124{ 129{
125 static const struct address_space_operations empty_aops; 130 static const struct address_space_operations empty_aops;
126 static struct inode_operations empty_iops; 131 static const struct inode_operations empty_iops;
127 static const struct file_operations empty_fops; 132 static const struct file_operations empty_fops;
128 struct address_space *const mapping = &inode->i_data; 133 struct address_space *const mapping = &inode->i_data;
129 134
@@ -381,7 +386,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
381 /* 386 /*
382 * We can reschedule here without worrying about the list's 387 * We can reschedule here without worrying about the list's
383 * consistency because the per-sb list of inodes must not 388 * consistency because the per-sb list of inodes must not
384 * change during umount anymore, and because iprune_mutex keeps 389 * change during umount anymore, and because iprune_sem keeps
385 * shrink_icache_memory() away. 390 * shrink_icache_memory() away.
386 */ 391 */
387 cond_resched_lock(&inode_lock); 392 cond_resched_lock(&inode_lock);
@@ -420,7 +425,7 @@ int invalidate_inodes(struct super_block *sb)
420 int busy; 425 int busy;
421 LIST_HEAD(throw_away); 426 LIST_HEAD(throw_away);
422 427
423 mutex_lock(&iprune_mutex); 428 down_write(&iprune_sem);
424 spin_lock(&inode_lock); 429 spin_lock(&inode_lock);
425 inotify_unmount_inodes(&sb->s_inodes); 430 inotify_unmount_inodes(&sb->s_inodes);
426 fsnotify_unmount_inodes(&sb->s_inodes); 431 fsnotify_unmount_inodes(&sb->s_inodes);
@@ -428,7 +433,7 @@ int invalidate_inodes(struct super_block *sb)
428 spin_unlock(&inode_lock); 433 spin_unlock(&inode_lock);
429 434
430 dispose_list(&throw_away); 435 dispose_list(&throw_away);
431 mutex_unlock(&iprune_mutex); 436 up_write(&iprune_sem);
432 437
433 return busy; 438 return busy;
434} 439}
@@ -467,7 +472,7 @@ static void prune_icache(int nr_to_scan)
467 int nr_scanned; 472 int nr_scanned;
468 unsigned long reap = 0; 473 unsigned long reap = 0;
469 474
470 mutex_lock(&iprune_mutex); 475 down_read(&iprune_sem);
471 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
472 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 477 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
473 struct inode *inode; 478 struct inode *inode;
@@ -509,7 +514,7 @@ static void prune_icache(int nr_to_scan)
509 spin_unlock(&inode_lock); 514 spin_unlock(&inode_lock);
510 515
511 dispose_list(&freeable); 516 dispose_list(&freeable);
512 mutex_unlock(&iprune_mutex); 517 up_read(&iprune_sem);
513} 518}
514 519
515/* 520/*
@@ -695,13 +700,15 @@ void unlock_new_inode(struct inode *inode)
695 } 700 }
696#endif 701#endif
697 /* 702 /*
698 * This is special! We do not need the spinlock 703 * This is special! We do not need the spinlock when clearing I_LOCK,
699 * when clearing I_LOCK, because we're guaranteed 704 * because we're guaranteed that nobody else tries to do anything about
700 * that nobody else tries to do anything about the 705 * the state of the inode when it is locked, as we just created it (so
701 * state of the inode when it is locked, as we 706 * there can be no old holders that haven't tested I_LOCK).
702 * just created it (so there can be no old holders 707 * However we must emit the memory barrier so that other CPUs reliably
703 * that haven't tested I_LOCK). 708 * see the clearing of I_LOCK after the other inode initialisation has
709 * completed.
704 */ 710 */
711 smp_mb();
705 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); 712 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
706 inode->i_state &= ~(I_LOCK|I_NEW); 713 inode->i_state &= ~(I_LOCK|I_NEW);
707 wake_up_inode(inode); 714 wake_up_inode(inode);
@@ -1234,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
1234} 1241}
1235EXPORT_SYMBOL(generic_delete_inode); 1242EXPORT_SYMBOL(generic_delete_inode);
1236 1243
1237static void generic_forget_inode(struct inode *inode) 1244/**
1245 * generic_detach_inode - remove inode from inode lists
1246 * @inode: inode to remove
1247 *
1248 * Remove inode from inode lists, write it if it's dirty. This is just an
1249 * internal VFS helper exported for hugetlbfs. Do not use!
1250 *
1251 * Returns 1 if inode should be completely destroyed.
1252 */
1253int generic_detach_inode(struct inode *inode)
1238{ 1254{
1239 struct super_block *sb = inode->i_sb; 1255 struct super_block *sb = inode->i_sb;
1240 1256
@@ -1244,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
1244 inodes_stat.nr_unused++; 1260 inodes_stat.nr_unused++;
1245 if (sb->s_flags & MS_ACTIVE) { 1261 if (sb->s_flags & MS_ACTIVE) {
1246 spin_unlock(&inode_lock); 1262 spin_unlock(&inode_lock);
1247 return; 1263 return 0;
1248 } 1264 }
1249 WARN_ON(inode->i_state & I_NEW); 1265 WARN_ON(inode->i_state & I_NEW);
1250 inode->i_state |= I_WILL_FREE; 1266 inode->i_state |= I_WILL_FREE;
@@ -1262,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
1262 inode->i_state |= I_FREEING; 1278 inode->i_state |= I_FREEING;
1263 inodes_stat.nr_inodes--; 1279 inodes_stat.nr_inodes--;
1264 spin_unlock(&inode_lock); 1280 spin_unlock(&inode_lock);
1281 return 1;
1282}
1283EXPORT_SYMBOL_GPL(generic_detach_inode);
1284
1285static void generic_forget_inode(struct inode *inode)
1286{
1287 if (!generic_detach_inode(inode))
1288 return;
1265 if (inode->i_data.nrpages) 1289 if (inode->i_data.nrpages)
1266 truncate_inode_pages(&inode->i_data, 0); 1290 truncate_inode_pages(&inode->i_data, 0);
1267 clear_inode(inode); 1291 clear_inode(inode);
@@ -1392,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1392 struct inode *inode = dentry->d_inode; 1416 struct inode *inode = dentry->d_inode;
1393 struct timespec now; 1417 struct timespec now;
1394 1418
1395 if (mnt_want_write(mnt))
1396 return;
1397 if (inode->i_flags & S_NOATIME) 1419 if (inode->i_flags & S_NOATIME)
1398 goto out; 1420 return;
1399 if (IS_NOATIME(inode)) 1421 if (IS_NOATIME(inode))
1400 goto out; 1422 return;
1401 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1423 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1402 goto out; 1424 return;
1403 1425
1404 if (mnt->mnt_flags & MNT_NOATIME) 1426 if (mnt->mnt_flags & MNT_NOATIME)
1405 goto out; 1427 return;
1406 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1428 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1407 goto out; 1429 return;
1408 1430
1409 now = current_fs_time(inode->i_sb); 1431 now = current_fs_time(inode->i_sb);
1410 1432
1411 if (!relatime_need_update(mnt, inode, now)) 1433 if (!relatime_need_update(mnt, inode, now))
1412 goto out; 1434 return;
1413 1435
1414 if (timespec_equal(&inode->i_atime, &now)) 1436 if (timespec_equal(&inode->i_atime, &now))
1415 goto out; 1437 return;
1438
1439 if (mnt_want_write(mnt))
1440 return;
1416 1441
1417 inode->i_atime = now; 1442 inode->i_atime = now;
1418 mark_inode_dirty_sync(inode); 1443 mark_inode_dirty_sync(inode);
1419out:
1420 mnt_drop_write(mnt); 1444 mnt_drop_write(mnt);
1421} 1445}
1422EXPORT_SYMBOL(touch_atime); 1446EXPORT_SYMBOL(touch_atime);
@@ -1437,34 +1461,37 @@ void file_update_time(struct file *file)
1437{ 1461{
1438 struct inode *inode = file->f_path.dentry->d_inode; 1462 struct inode *inode = file->f_path.dentry->d_inode;
1439 struct timespec now; 1463 struct timespec now;
1440 int sync_it = 0; 1464 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
1441 int err;
1442 1465
1466 /* First try to exhaust all avenues to not sync */
1443 if (IS_NOCMTIME(inode)) 1467 if (IS_NOCMTIME(inode))
1444 return; 1468 return;
1445 1469
1446 err = mnt_want_write_file(file);
1447 if (err)
1448 return;
1449
1450 now = current_fs_time(inode->i_sb); 1470 now = current_fs_time(inode->i_sb);
1451 if (!timespec_equal(&inode->i_mtime, &now)) { 1471 if (!timespec_equal(&inode->i_mtime, &now))
1452 inode->i_mtime = now; 1472 sync_it = S_MTIME;
1453 sync_it = 1;
1454 }
1455 1473
1456 if (!timespec_equal(&inode->i_ctime, &now)) { 1474 if (!timespec_equal(&inode->i_ctime, &now))
1457 inode->i_ctime = now; 1475 sync_it |= S_CTIME;
1458 sync_it = 1;
1459 }
1460 1476
1461 if (IS_I_VERSION(inode)) { 1477 if (IS_I_VERSION(inode))
1462 inode_inc_iversion(inode); 1478 sync_it |= S_VERSION;
1463 sync_it = 1; 1479
1464 } 1480 if (!sync_it)
1481 return;
1465 1482
1466 if (sync_it) 1483 /* Finally allowed to write? Takes lock. */
1467 mark_inode_dirty_sync(inode); 1484 if (mnt_want_write_file(file))
1485 return;
1486
1487 /* Only change inode inside the lock region */
1488 if (sync_it & S_VERSION)
1489 inode_inc_iversion(inode);
1490 if (sync_it & S_CTIME)
1491 inode->i_ctime = now;
1492 if (sync_it & S_MTIME)
1493 inode->i_mtime = now;
1494 mark_inode_dirty_sync(inode);
1468 mnt_drop_write(file->f_path.mnt); 1495 mnt_drop_write(file->f_path.mnt);
1469} 1496}
1470EXPORT_SYMBOL(file_update_time); 1497EXPORT_SYMBOL(file_update_time);
@@ -1592,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1592 else if (S_ISSOCK(mode)) 1619 else if (S_ISSOCK(mode))
1593 inode->i_fop = &bad_sock_fops; 1620 inode->i_fop = &bad_sock_fops;
1594 else 1621 else
1595 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", 1622 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
1596 mode); 1623 " inode %s:%lu\n", mode, inode->i_sb->s_id,
1624 inode->i_ino);
1597} 1625}
1598EXPORT_SYMBOL(init_special_inode); 1626EXPORT_SYMBOL(init_special_inode);
diff --git a/fs/internal.h b/fs/internal.h
index d55ef562f0bb..515175b8b72e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *);
57 * namespace.c 57 * namespace.c
58 */ 58 */
59extern int copy_mount_options(const void __user *, unsigned long *); 59extern int copy_mount_options(const void __user *, unsigned long *);
60extern int copy_mount_string(const void __user *, char **);
60 61
61extern void free_vfsmnt(struct vfsmount *); 62extern void free_vfsmnt(struct vfsmount *);
62extern struct vfsmount *alloc_vfsmnt(const char *); 63extern struct vfsmount *alloc_vfsmnt(const char *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5612880fcbe7..7b17a14396ff 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags);
162static int fiemap_check_ranges(struct super_block *sb, 162static int fiemap_check_ranges(struct super_block *sb,
163 u64 start, u64 len, u64 *new_len) 163 u64 start, u64 len, u64 *new_len)
164{ 164{
165 u64 maxbytes = (u64) sb->s_maxbytes;
166
165 *new_len = len; 167 *new_len = len;
166 168
167 if (len == 0) 169 if (len == 0)
168 return -EINVAL; 170 return -EINVAL;
169 171
170 if (start > sb->s_maxbytes) 172 if (start > maxbytes)
171 return -EFBIG; 173 return -EFBIG;
172 174
173 /* 175 /*
174 * Shrink request scope to what the fs can actually handle. 176 * Shrink request scope to what the fs can actually handle.
175 */ 177 */
176 if ((len > sb->s_maxbytes) || 178 if (len > maxbytes || (maxbytes - len) < start)
177 (sb->s_maxbytes - len) < start) 179 *new_len = maxbytes - start;
178 *new_len = sb->s_maxbytes - start;
179 180
180 return 0; 181 return 0;
181} 182}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 85f96bc651c7..6b4dcd4f2943 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb)
46#ifdef CONFIG_JOLIET 46#ifdef CONFIG_JOLIET
47 lock_kernel(); 47 lock_kernel();
48 48
49 if (sbi->s_nls_iocharset) { 49 unload_nls(sbi->s_nls_iocharset);
50 unload_nls(sbi->s_nls_iocharset);
51 sbi->s_nls_iocharset = NULL;
52 }
53 50
54 unlock_kernel(); 51 unlock_kernel();
55#endif 52#endif
@@ -912,8 +909,7 @@ out_no_root:
912 printk(KERN_WARNING "%s: get root inode failed\n", __func__); 909 printk(KERN_WARNING "%s: get root inode failed\n", __func__);
913out_no_inode: 910out_no_inode:
914#ifdef CONFIG_JOLIET 911#ifdef CONFIG_JOLIET
915 if (sbi->s_nls_iocharset) 912 unload_nls(sbi->s_nls_iocharset);
916 unload_nls(sbi->s_nls_iocharset);
917#endif 913#endif
918 goto out_freesbi; 914 goto out_freesbi;
919out_no_read: 915out_no_read:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49b..ca0f5eb62b20 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
643 643
644int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 644int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
645{ 645{
646 struct transaction_chp_stats_s *stats;
646 transaction_t *transaction; 647 transaction_t *transaction;
647 journal_t *journal; 648 journal_t *journal;
648 int ret = 0; 649 int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
679 680
680 /* OK, that was the last buffer for the transaction: we can now 681 /* OK, that was the last buffer for the transaction: we can now
681 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
683 stats = &transaction->t_chp_stats;
684 if (stats->cs_chp_time)
685 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
686 jiffies);
687 trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
688 transaction->t_tid, stats);
682 689
683 __jbd2_journal_drop_transaction(journal, transaction); 690 __jbd2_journal_drop_transaction(journal, transaction);
684 kfree(transaction); 691 kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 26d991ddc1e6..d4cfd6d2779e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction); 412 trace_jbd2_commit_locking(journal, commit_transaction);
413 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.run.rs_wait = commit_transaction->t_max_wait;
414 stats.u.run.rs_locked = jiffies; 414 stats.run.rs_locked = jiffies;
415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
416 stats.u.run.rs_locked); 416 stats.run.rs_locked);
417 417
418 spin_lock(&commit_transaction->t_handle_lock); 418 spin_lock(&commit_transaction->t_handle_lock);
419 while (commit_transaction->t_updates) { 419 while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
486 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
487 487
488 trace_jbd2_commit_flushing(journal, commit_transaction); 488 trace_jbd2_commit_flushing(journal, commit_transaction);
489 stats.u.run.rs_flushing = jiffies; 489 stats.run.rs_flushing = jiffies;
490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
491 stats.u.run.rs_flushing); 491 stats.run.rs_flushing);
492 492
493 commit_transaction->t_state = T_FLUSH; 493 commit_transaction->t_state = T_FLUSH;
494 journal->j_committing_transaction = commit_transaction; 494 journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
524 524
525 trace_jbd2_commit_logging(journal, commit_transaction); 525 trace_jbd2_commit_logging(journal, commit_transaction);
526 stats.u.run.rs_logging = jiffies; 526 stats.run.rs_logging = jiffies;
527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
528 stats.u.run.rs_logging); 528 stats.run.rs_logging);
529 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; 529 stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
530 stats.u.run.rs_blocks_logged = 0; 530 stats.run.rs_blocks_logged = 0;
531 531
532 J_ASSERT(commit_transaction->t_nr_buffers <= 532 J_ASSERT(commit_transaction->t_nr_buffers <=
533 commit_transaction->t_outstanding_credits); 533 commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
695 submit_bh(write_op, bh); 695 submit_bh(write_op, bh);
696 } 696 }
697 cond_resched(); 697 cond_resched();
698 stats.u.run.rs_blocks_logged += bufs; 698 stats.run.rs_blocks_logged += bufs;
699 699
700 /* Force a new descriptor to be generated next 700 /* Force a new descriptor to be generated next
701 time round the loop. */ 701 time round the loop. */
@@ -988,33 +988,30 @@ restart_loop:
988 J_ASSERT(commit_transaction->t_state == T_COMMIT); 988 J_ASSERT(commit_transaction->t_state == T_COMMIT);
989 989
990 commit_transaction->t_start = jiffies; 990 commit_transaction->t_start = jiffies;
991 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, 991 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
992 commit_transaction->t_start); 992 commit_transaction->t_start);
993 993
994 /* 994 /*
995 * File the transaction for history 995 * File the transaction statistics
996 */ 996 */
997 stats.ts_type = JBD2_STATS_RUN;
998 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
999 stats.u.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count = commit_transaction->t_handle_count;
1000 spin_lock(&journal->j_history_lock); 999 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1001 memcpy(journal->j_history + journal->j_history_cur, &stats, 1000 commit_transaction->t_tid, &stats.run);
1002 sizeof(stats));
1003 if (++journal->j_history_cur == journal->j_history_max)
1004 journal->j_history_cur = 0;
1005 1001
1006 /* 1002 /*
1007 * Calculate overall stats 1003 * Calculate overall stats
1008 */ 1004 */
1005 spin_lock(&journal->j_history_lock);
1009 journal->j_stats.ts_tid++; 1006 journal->j_stats.ts_tid++;
1010 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; 1007 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1011 journal->j_stats.u.run.rs_running += stats.u.run.rs_running; 1008 journal->j_stats.run.rs_running += stats.run.rs_running;
1012 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; 1009 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1013 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; 1010 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1014 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; 1011 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1015 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; 1012 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1016 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; 1013 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1017 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; 1014 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1018 spin_unlock(&journal->j_history_lock); 1015 spin_unlock(&journal->j_history_lock);
1019 1016
1020 commit_transaction->t_state = T_FINISHED; 1017 commit_transaction->t_state = T_FINISHED;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a8a358bc0f21..b0ab5219becb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
136 journal->j_task = current; 136 journal->j_task = current;
137 wake_up(&journal->j_wait_done_commit); 137 wake_up(&journal->j_wait_done_commit);
138 138
139 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
140 "commit interval %ld seconds\n", current->pid,
141 journal->j_devname, journal->j_commit_interval / HZ);
142
143 /* 139 /*
144 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
145 */ 141 */
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
223{ 219{
224 struct task_struct *t; 220 struct task_struct *t;
225 221
226 t = kthread_run(kjournald2, journal, "kjournald2"); 222 t = kthread_run(kjournald2, journal, "jbd2/%s",
223 journal->j_devname);
227 if (IS_ERR(t)) 224 if (IS_ERR(t))
228 return PTR_ERR(t); 225 return PTR_ERR(t);
229 226
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
679 int max; 676 int max;
680}; 677};
681 678
682static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
683 struct transaction_stats_s *ts,
684 int first)
685{
686 if (ts == s->stats + s->max)
687 ts = s->stats;
688 if (!first && ts == s->stats + s->start)
689 return NULL;
690 while (ts->ts_type == 0) {
691 ts++;
692 if (ts == s->stats + s->max)
693 ts = s->stats;
694 if (ts == s->stats + s->start)
695 return NULL;
696 }
697 return ts;
698
699}
700
701static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
702{
703 struct jbd2_stats_proc_session *s = seq->private;
704 struct transaction_stats_s *ts;
705 int l = *pos;
706
707 if (l == 0)
708 return SEQ_START_TOKEN;
709 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
710 if (!ts)
711 return NULL;
712 l--;
713 while (l) {
714 ts = jbd2_history_skip_empty(s, ++ts, 0);
715 if (!ts)
716 break;
717 l--;
718 }
719 return ts;
720}
721
722static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
723{
724 struct jbd2_stats_proc_session *s = seq->private;
725 struct transaction_stats_s *ts = v;
726
727 ++*pos;
728 if (v == SEQ_START_TOKEN)
729 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
730 else
731 return jbd2_history_skip_empty(s, ++ts, 0);
732}
733
734static int jbd2_seq_history_show(struct seq_file *seq, void *v)
735{
736 struct transaction_stats_s *ts = v;
737 if (v == SEQ_START_TOKEN) {
738 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
739 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
740 "wait", "run", "lock", "flush", "log", "hndls",
741 "block", "inlog", "ctime", "write", "drop",
742 "close");
743 return 0;
744 }
745 if (ts->ts_type == JBD2_STATS_RUN)
746 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
747 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
748 jiffies_to_msecs(ts->u.run.rs_wait),
749 jiffies_to_msecs(ts->u.run.rs_running),
750 jiffies_to_msecs(ts->u.run.rs_locked),
751 jiffies_to_msecs(ts->u.run.rs_flushing),
752 jiffies_to_msecs(ts->u.run.rs_logging),
753 ts->u.run.rs_handle_count,
754 ts->u.run.rs_blocks,
755 ts->u.run.rs_blocks_logged);
756 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
757 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
758 "C", ts->ts_tid, " ",
759 jiffies_to_msecs(ts->u.chp.cs_chp_time),
760 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
761 ts->u.chp.cs_forced_to_close);
762 else
763 J_ASSERT(0);
764 return 0;
765}
766
767static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
768{
769}
770
771static struct seq_operations jbd2_seq_history_ops = {
772 .start = jbd2_seq_history_start,
773 .next = jbd2_seq_history_next,
774 .stop = jbd2_seq_history_stop,
775 .show = jbd2_seq_history_show,
776};
777
778static int jbd2_seq_history_open(struct inode *inode, struct file *file)
779{
780 journal_t *journal = PDE(inode)->data;
781 struct jbd2_stats_proc_session *s;
782 int rc, size;
783
784 s = kmalloc(sizeof(*s), GFP_KERNEL);
785 if (s == NULL)
786 return -ENOMEM;
787 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
788 s->stats = kmalloc(size, GFP_KERNEL);
789 if (s->stats == NULL) {
790 kfree(s);
791 return -ENOMEM;
792 }
793 spin_lock(&journal->j_history_lock);
794 memcpy(s->stats, journal->j_history, size);
795 s->max = journal->j_history_max;
796 s->start = journal->j_history_cur % s->max;
797 spin_unlock(&journal->j_history_lock);
798
799 rc = seq_open(file, &jbd2_seq_history_ops);
800 if (rc == 0) {
801 struct seq_file *m = file->private_data;
802 m->private = s;
803 } else {
804 kfree(s->stats);
805 kfree(s);
806 }
807 return rc;
808
809}
810
811static int jbd2_seq_history_release(struct inode *inode, struct file *file)
812{
813 struct seq_file *seq = file->private_data;
814 struct jbd2_stats_proc_session *s = seq->private;
815
816 kfree(s->stats);
817 kfree(s);
818 return seq_release(inode, file);
819}
820
821static struct file_operations jbd2_seq_history_fops = {
822 .owner = THIS_MODULE,
823 .open = jbd2_seq_history_open,
824 .read = seq_read,
825 .llseek = seq_lseek,
826 .release = jbd2_seq_history_release,
827};
828
829static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 679static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
830{ 680{
831 return *pos ? NULL : SEQ_START_TOKEN; 681 return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
842 692
843 if (v != SEQ_START_TOKEN) 693 if (v != SEQ_START_TOKEN)
844 return 0; 694 return 0;
845 seq_printf(seq, "%lu transaction, each upto %u blocks\n", 695 seq_printf(seq, "%lu transaction, each up to %u blocks\n",
846 s->stats->ts_tid, 696 s->stats->ts_tid,
847 s->journal->j_max_transaction_buffers); 697 s->journal->j_max_transaction_buffers);
848 if (s->stats->ts_tid == 0) 698 if (s->stats->ts_tid == 0)
849 return 0; 699 return 0;
850 seq_printf(seq, "average: \n %ums waiting for transaction\n", 700 seq_printf(seq, "average: \n %ums waiting for transaction\n",
851 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); 701 jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
852 seq_printf(seq, " %ums running transaction\n", 702 seq_printf(seq, " %ums running transaction\n",
853 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); 703 jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
854 seq_printf(seq, " %ums transaction was being locked\n", 704 seq_printf(seq, " %ums transaction was being locked\n",
855 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); 705 jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
856 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 706 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
857 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 707 jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
858 seq_printf(seq, " %ums logging transaction\n", 708 seq_printf(seq, " %ums logging transaction\n",
859 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 709 jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
860 seq_printf(seq, " %lluus average transaction commit time\n", 710 seq_printf(seq, " %lluus average transaction commit time\n",
861 div_u64(s->journal->j_average_commit_time, 1000)); 711 div_u64(s->journal->j_average_commit_time, 1000));
862 seq_printf(seq, " %lu handles per transaction\n", 712 seq_printf(seq, " %lu handles per transaction\n",
863 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 713 s->stats->run.rs_handle_count / s->stats->ts_tid);
864 seq_printf(seq, " %lu blocks per transaction\n", 714 seq_printf(seq, " %lu blocks per transaction\n",
865 s->stats->u.run.rs_blocks / s->stats->ts_tid); 715 s->stats->run.rs_blocks / s->stats->ts_tid);
866 seq_printf(seq, " %lu logged blocks per transaction\n", 716 seq_printf(seq, " %lu logged blocks per transaction\n",
867 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); 717 s->stats->run.rs_blocks_logged / s->stats->ts_tid);
868 return 0; 718 return 0;
869} 719}
870 720
@@ -872,7 +722,7 @@ static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
872{ 722{
873} 723}
874 724
875static struct seq_operations jbd2_seq_info_ops = { 725static const struct seq_operations jbd2_seq_info_ops = {
876 .start = jbd2_seq_info_start, 726 .start = jbd2_seq_info_start,
877 .next = jbd2_seq_info_next, 727 .next = jbd2_seq_info_next,
878 .stop = jbd2_seq_info_stop, 728 .stop = jbd2_seq_info_stop,
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
920 return seq_release(inode, file); 770 return seq_release(inode, file);
921} 771}
922 772
923static struct file_operations jbd2_seq_info_fops = { 773static const struct file_operations jbd2_seq_info_fops = {
924 .owner = THIS_MODULE, 774 .owner = THIS_MODULE,
925 .open = jbd2_seq_info_open, 775 .open = jbd2_seq_info_open,
926 .read = seq_read, 776 .read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
934{ 784{
935 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 785 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
936 if (journal->j_proc_entry) { 786 if (journal->j_proc_entry) {
937 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
938 &jbd2_seq_history_fops, journal);
939 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 787 proc_create_data("info", S_IRUGO, journal->j_proc_entry,
940 &jbd2_seq_info_fops, journal); 788 &jbd2_seq_info_fops, journal);
941 } 789 }
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
944static void jbd2_stats_proc_exit(journal_t *journal) 792static void jbd2_stats_proc_exit(journal_t *journal)
945{ 793{
946 remove_proc_entry("info", journal->j_proc_entry); 794 remove_proc_entry("info", journal->j_proc_entry);
947 remove_proc_entry("history", journal->j_proc_entry);
948 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 795 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
949} 796}
950 797
951static void journal_init_stats(journal_t *journal)
952{
953 int size;
954
955 if (!proc_jbd2_stats)
956 return;
957
958 journal->j_history_max = 100;
959 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
960 journal->j_history = kzalloc(size, GFP_KERNEL);
961 if (!journal->j_history) {
962 journal->j_history_max = 0;
963 return;
964 }
965 spin_lock_init(&journal->j_history_lock);
966}
967
968/* 798/*
969 * Management for journal control blocks: functions to create and 799 * Management for journal control blocks: functions to create and
970 * destroy journal_t structures, and to initialise and read existing 800 * destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
1009 goto fail; 839 goto fail;
1010 } 840 }
1011 841
1012 journal_init_stats(journal); 842 spin_lock_init(&journal->j_history_lock);
1013 843
1014 return journal; 844 return journal;
1015fail: 845fail:
@@ -1115,7 +945,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1115 while ((p = strchr(p, '/'))) 945 while ((p = strchr(p, '/')))
1116 *p = '!'; 946 *p = '!';
1117 p = journal->j_devname + strlen(journal->j_devname); 947 p = journal->j_devname + strlen(journal->j_devname);
1118 sprintf(p, ":%lu", journal->j_inode->i_ino); 948 sprintf(p, "-%lu", journal->j_inode->i_ino);
1119 jbd_debug(1, 949 jbd_debug(1,
1120 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 950 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1121 journal, inode->i_sb->s_id, inode->i_ino, 951 journal, inode->i_sb->s_id, inode->i_ino,
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index e9580104b6ba..3ff50da94789 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
15#include <linux/completion.h> 15#include <linux/completion.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h>
18#include "nodelist.h" 19#include "nodelist.h"
19 20
20 21
@@ -31,7 +32,7 @@ void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
31/* This must only ever be called when no GC thread is currently running */ 32/* This must only ever be called when no GC thread is currently running */
32int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) 33int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
33{ 34{
34 pid_t pid; 35 struct task_struct *tsk;
35 int ret = 0; 36 int ret = 0;
36 37
37 BUG_ON(c->gc_task); 38 BUG_ON(c->gc_task);
@@ -39,15 +40,16 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
39 init_completion(&c->gc_thread_start); 40 init_completion(&c->gc_thread_start);
40 init_completion(&c->gc_thread_exit); 41 init_completion(&c->gc_thread_exit);
41 42
42 pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES); 43 tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
43 if (pid < 0) { 44 if (IS_ERR(tsk)) {
44 printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %d\n", -pid); 45 printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk));
45 complete(&c->gc_thread_exit); 46 complete(&c->gc_thread_exit);
46 ret = pid; 47 ret = PTR_ERR(tsk);
47 } else { 48 } else {
48 /* Wait for it... */ 49 /* Wait for it... */
49 D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid)); 50 D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid));
50 wait_for_completion(&c->gc_thread_start); 51 wait_for_completion(&c->gc_thread_start);
52 ret = tsk->pid;
51 } 53 }
52 54
53 return ret; 55 return ret;
@@ -71,7 +73,6 @@ static int jffs2_garbage_collect_thread(void *_c)
71{ 73{
72 struct jffs2_sb_info *c = _c; 74 struct jffs2_sb_info *c = _c;
73 75
74 daemonize("jffs2_gcd_mtd%d", c->mtd->index);
75 allow_signal(SIGKILL); 76 allow_signal(SIGKILL);
76 allow_signal(SIGSTOP); 77 allow_signal(SIGSTOP);
77 allow_signal(SIGCONT); 78 allow_signal(SIGCONT);
@@ -107,6 +108,11 @@ static int jffs2_garbage_collect_thread(void *_c)
107 * the GC thread get there first. */ 108 * the GC thread get there first. */
108 schedule_timeout_interruptible(msecs_to_jiffies(50)); 109 schedule_timeout_interruptible(msecs_to_jiffies(50));
109 110
111 if (kthread_should_stop()) {
112 D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n"));
113 goto die;
114 }
115
110 /* Put_super will send a SIGKILL and then wait on the sem. 116 /* Put_super will send a SIGKILL and then wait on the sem.
111 */ 117 */
112 while (signal_pending(current) || freezing(current)) { 118 while (signal_pending(current) || freezing(current)) {
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 9eff2bdae8a7..c082868910f2 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -39,13 +39,13 @@ int __init jffs2_create_slab_caches(void)
39 39
40 raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", 40 raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
41 sizeof(struct jffs2_raw_dirent), 41 sizeof(struct jffs2_raw_dirent),
42 0, 0, NULL); 42 0, SLAB_HWCACHE_ALIGN, NULL);
43 if (!raw_dirent_slab) 43 if (!raw_dirent_slab)
44 goto err; 44 goto err;
45 45
46 raw_inode_slab = kmem_cache_create("jffs2_raw_inode", 46 raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
47 sizeof(struct jffs2_raw_inode), 47 sizeof(struct jffs2_raw_inode),
48 0, 0, NULL); 48 0, SLAB_HWCACHE_ALIGN, NULL);
49 if (!raw_inode_slab) 49 if (!raw_inode_slab)
50 goto err; 50 goto err;
51 51
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0035c021395a..9a80e8e595d0 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
123 return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); 123 return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
124} 124}
125 125
126static struct export_operations jffs2_export_ops = { 126static const struct export_operations jffs2_export_ops = {
127 .get_parent = jffs2_get_parent, 127 .get_parent = jffs2_get_parent,
128 .fh_to_dentry = jffs2_fh_to_dentry, 128 .fh_to_dentry = jffs2_fh_to_dentry,
129 .fh_to_parent = jffs2_fh_to_parent, 129 .fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 37e6dcda8fc8..2234c73fc577 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb)
178 rc = jfs_umount(sb); 178 rc = jfs_umount(sb);
179 if (rc) 179 if (rc)
180 jfs_err("jfs_umount failed with return code %d", rc); 180 jfs_err("jfs_umount failed with return code %d", rc);
181 if (sbi->nls_tab) 181
182 unload_nls(sbi->nls_tab); 182 unload_nls(sbi->nls_tab);
183 sbi->nls_tab = NULL;
184 183
185 truncate_inode_pages(sbi->direct_inode->i_mapping, 0); 184 truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
186 iput(sbi->direct_inode); 185 iput(sbi->direct_inode);
187 sbi->direct_inode = NULL;
188 186
189 kfree(sbi); 187 kfree(sbi);
190 188
@@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
347 345
348 if (nls_map != (void *) -1) { 346 if (nls_map != (void *) -1) {
349 /* Discard old (if remount) */ 347 /* Discard old (if remount) */
350 if (sbi->nls_tab) 348 unload_nls(sbi->nls_tab);
351 unload_nls(sbi->nls_tab);
352 sbi->nls_tab = nls_map; 349 sbi->nls_tab = nls_map;
353 } 350 }
354 return 1; 351 return 1;
diff --git a/fs/libfs.c b/fs/libfs.c
index dcec3d3ea64f..219576c52d80 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
527 const void *from, size_t available) 527 const void *from, size_t available)
528{ 528{
529 loff_t pos = *ppos; 529 loff_t pos = *ppos;
530 size_t ret;
531
530 if (pos < 0) 532 if (pos < 0)
531 return -EINVAL; 533 return -EINVAL;
532 if (pos >= available) 534 if (pos >= available || !count)
533 return 0; 535 return 0;
534 if (count > available - pos) 536 if (count > available - pos)
535 count = available - pos; 537 count = available - pos;
536 if (copy_to_user(to, from + pos, count)) 538 ret = copy_to_user(to, from + pos, count);
539 if (ret == count)
537 return -EFAULT; 540 return -EFAULT;
541 count -= ret;
538 *ppos = pos + count; 542 *ppos = pos + count;
539 return count; 543 return count;
540} 544}
@@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
735 if (copy_from_user(attr->set_buf, buf, size)) 739 if (copy_from_user(attr->set_buf, buf, size))
736 goto out; 740 goto out;
737 741
738 ret = len; /* claim we got the whole input */
739 attr->set_buf[size] = '\0'; 742 attr->set_buf[size] = '\0';
740 val = simple_strtol(attr->set_buf, NULL, 0); 743 val = simple_strtol(attr->set_buf, NULL, 0);
741 attr->set(attr->data, val); 744 ret = attr->set(attr->data, val);
745 if (ret == 0)
746 ret = len; /* on success, claim we got the whole input */
742out: 747out:
743 mutex_unlock(&attr->mutex); 748 mutex_unlock(&attr->mutex);
744 return ret; 749 return ret;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..fc9032dc8862 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
166 */ 166 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 168 continue;
169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) 169 if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
170 continue; 170 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 172 continue;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
459} 459}
460 460
461static struct file_lock_operations nlmclnt_lock_ops = { 461static const struct file_lock_operations nlmclnt_lock_ops = {
462 .fl_copy_lock = nlmclnt_locks_copy_lock, 462 .fl_copy_lock = nlmclnt_locks_copy_lock,
463 .fl_release_private = nlmclnt_locks_release_private, 463 .fl_release_private = nlmclnt_locks_release_private,
464}; 464};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 7cb076ac6b45..4600c2037b8b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -111,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
111 */ 111 */
112 chain = &nlm_hosts[nlm_hash_address(ni->sap)]; 112 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
113 hlist_for_each_entry(host, pos, chain, h_hash) { 113 hlist_for_each_entry(host, pos, chain, h_hash) {
114 if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) 114 if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
115 continue; 115 continue;
116 116
117 /* See if we have an NSM handle for this client */ 117 /* See if we have an NSM handle for this client */
@@ -125,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
125 if (host->h_server != ni->server) 125 if (host->h_server != ni->server)
126 continue; 126 continue;
127 if (ni->server && 127 if (ni->server &&
128 !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) 128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue; 129 continue;
130 130
131 /* Move to head of hash chain. */ 131 /* Move to head of hash chain. */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 30c933188dd7..f956651d0f65 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -209,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
209 struct nsm_handle *nsm; 209 struct nsm_handle *nsm;
210 210
211 list_for_each_entry(nsm, &nsm_handles, sm_link) 211 list_for_each_entry(nsm, &nsm_handles, sm_link)
212 if (nlm_cmp_addr(nsm_addr(nsm), sap)) 212 if (rpc_cmp_addr(nsm_addr(nsm), sap))
213 return nsm; 213 return nsm;
214 return NULL; 214 return NULL;
215} 215}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
705 return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; 705 return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
706} 706}
707 707
708struct lock_manager_operations nlmsvc_lock_operations = { 708const struct lock_manager_operations nlmsvc_lock_operations = {
709 .fl_compare_owner = nlmsvc_same_owner, 709 .fl_compare_owner = nlmsvc_same_owner,
710 .fl_notify = nlmsvc_notify_blocked, 710 .fl_notify = nlmsvc_notify_blocked,
711 .fl_grant = nlmsvc_grant_deferred, 711 .fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 9e4d6aab611b..ad478da7ca63 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
417static int 417static int
418nlmsvc_match_ip(void *datap, struct nlm_host *host) 418nlmsvc_match_ip(void *datap, struct nlm_host *host)
419{ 419{
420 return nlm_cmp_addr(nlm_srcaddr(host), datap); 420 return rpc_cmp_addr(nlm_srcaddr(host), datap);
421} 421}
422 422
423/** 423/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacde..b583ab0a4cbb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/utsname.h>
12#include <linux/nfs.h> 11#include <linux/nfs.h>
13 12
14#include <linux/sunrpc/xdr.h> 13#include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d528653192..ad9dbbc9145d 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/utsname.h>
13#include <linux/nfs.h> 12#include <linux/nfs.h>
14 13
15#include <linux/sunrpc/xdr.h> 14#include <linux/sunrpc/xdr.h>
diff --git a/fs/locks.c b/fs/locks.c
index 19ee18a6829b..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
434 return fl->fl_file == try->fl_file; 434 return fl->fl_file == try->fl_file;
435} 435}
436 436
437static struct lock_manager_operations lease_manager_ops = { 437static const struct lock_manager_operations lease_manager_ops = {
438 .fl_break = lease_break_callback, 438 .fl_break = lease_break_callback,
439 .fl_release_private = lease_release_private_callback, 439 .fl_release_private = lease_release_private_callback,
440 .fl_mylease = lease_mylease_callback, 440 .fl_mylease = lease_mylease_callback,
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d407e7a0b6fe..6198731d7fcd 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -308,14 +308,18 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
308 struct inode *inode = (struct inode*)mapping->host; 308 struct inode *inode = (struct inode*)mapping->host;
309 char *kaddr = page_address(page); 309 char *kaddr = page_address(page);
310 loff_t pos = page_offset(page) + (char*)de - kaddr; 310 loff_t pos = page_offset(page) + (char*)de - kaddr;
311 unsigned len = minix_sb(inode->i_sb)->s_dirsize; 311 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
312 unsigned len = sbi->s_dirsize;
312 int err; 313 int err;
313 314
314 lock_page(page); 315 lock_page(page);
315 err = __minix_write_begin(NULL, mapping, pos, len, 316 err = __minix_write_begin(NULL, mapping, pos, len,
316 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 317 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
317 if (err == 0) { 318 if (err == 0) {
318 de->inode = 0; 319 if (sbi->s_version == MINIX_V3)
320 ((minix3_dirent *) de)->inode = 0;
321 else
322 de->inode = 0;
319 err = dir_commit_chunk(page, pos, len); 323 err = dir_commit_chunk(page, pos, len);
320 } else { 324 } else {
321 unlock_page(page); 325 unlock_page(page);
@@ -440,7 +444,10 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
440 err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize, 444 err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize,
441 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 445 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
442 if (err == 0) { 446 if (err == 0) {
443 de->inode = inode->i_ino; 447 if (sbi->s_version == MINIX_V3)
448 ((minix3_dirent *) de)->inode = inode->i_ino;
449 else
450 de->inode = inode->i_ino;
444 err = dir_commit_chunk(page, pos, sbi->s_dirsize); 451 err = dir_commit_chunk(page, pos, sbi->s_dirsize);
445 } else { 452 } else {
446 unlock_page(page); 453 unlock_page(page);
@@ -470,7 +477,14 @@ ino_t minix_inode_by_name(struct dentry *dentry)
470 ino_t res = 0; 477 ino_t res = 0;
471 478
472 if (de) { 479 if (de) {
473 res = de->inode; 480 struct address_space *mapping = page->mapping;
481 struct inode *inode = mapping->host;
482 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
483
484 if (sbi->s_version == MINIX_V3)
485 res = ((minix3_dirent *) de)->inode;
486 else
487 res = de->inode;
474 dir_put_page(page); 488 dir_put_page(page);
475 } 489 }
476 return res; 490 return res;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7230787d18b0..bdc3cb4fd222 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1640{ 1640{
1641 struct vfsmount *mnt; 1641 struct vfsmount *mnt;
1642 1642
1643 if (!type || !memchr(type, 0, PAGE_SIZE)) 1643 if (!type)
1644 return -EINVAL; 1644 return -EINVAL;
1645 1645
1646 /* we need capabilities... */ 1646 /* we need capabilities... */
@@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
1871 return 0; 1871 return 0;
1872} 1872}
1873 1873
1874int copy_mount_string(const void __user *data, char **where)
1875{
1876 char *tmp;
1877
1878 if (!data) {
1879 *where = NULL;
1880 return 0;
1881 }
1882
1883 tmp = strndup_user(data, PAGE_SIZE);
1884 if (IS_ERR(tmp))
1885 return PTR_ERR(tmp);
1886
1887 *where = tmp;
1888 return 0;
1889}
1890
1874/* 1891/*
1875 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 1892 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
1876 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 1893 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1900 1917
1901 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) 1918 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
1902 return -EINVAL; 1919 return -EINVAL;
1903 if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
1904 return -EINVAL;
1905 1920
1906 if (data_page) 1921 if (data_page)
1907 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1922 ((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns);
2070SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2085SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2071 char __user *, type, unsigned long, flags, void __user *, data) 2086 char __user *, type, unsigned long, flags, void __user *, data)
2072{ 2087{
2073 int retval; 2088 int ret;
2089 char *kernel_type;
2090 char *kernel_dir;
2091 char *kernel_dev;
2074 unsigned long data_page; 2092 unsigned long data_page;
2075 unsigned long type_page;
2076 unsigned long dev_page;
2077 char *dir_page;
2078 2093
2079 retval = copy_mount_options(type, &type_page); 2094 ret = copy_mount_string(type, &kernel_type);
2080 if (retval < 0) 2095 if (ret < 0)
2081 return retval; 2096 goto out_type;
2082 2097
2083 dir_page = getname(dir_name); 2098 kernel_dir = getname(dir_name);
2084 retval = PTR_ERR(dir_page); 2099 if (IS_ERR(kernel_dir)) {
2085 if (IS_ERR(dir_page)) 2100 ret = PTR_ERR(kernel_dir);
2086 goto out1; 2101 goto out_dir;
2102 }
2087 2103
2088 retval = copy_mount_options(dev_name, &dev_page); 2104 ret = copy_mount_string(dev_name, &kernel_dev);
2089 if (retval < 0) 2105 if (ret < 0)
2090 goto out2; 2106 goto out_dev;
2091 2107
2092 retval = copy_mount_options(data, &data_page); 2108 ret = copy_mount_options(data, &data_page);
2093 if (retval < 0) 2109 if (ret < 0)
2094 goto out3; 2110 goto out_data;
2095 2111
2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page, 2112 ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
2097 flags, (void *)data_page); 2113 (void *) data_page);
2098 free_page(data_page);
2099 2114
2100out3: 2115 free_page(data_page);
2101 free_page(dev_page); 2116out_data:
2102out2: 2117 kfree(kernel_dev);
2103 putname(dir_page); 2118out_dev:
2104out1: 2119 putname(kernel_dir);
2105 free_page(type_page); 2120out_dir:
2106 return retval; 2121 kfree(kernel_type);
2122out_type:
2123 return ret;
2107} 2124}
2108 2125
2109/* 2126/*
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9c590722d87e..b8b5b30d53f0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1241,7 +1241,7 @@ ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
1241 month = 2; 1241 month = 2;
1242 } else { 1242 } else {
1243 nl_day = (year & 3) || day <= 59 ? day : day - 1; 1243 nl_day = (year & 3) || day <= 59 ? day : day - 1;
1244 for (month = 0; month < 12; month++) 1244 for (month = 1; month < 12; month++)
1245 if (day_n[month] > nl_day) 1245 if (day_n[month] > nl_day)
1246 break; 1246 break;
1247 } 1247 }
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b99ce205b1bd..cf98da1be23e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb)
746 746
747#ifdef CONFIG_NCPFS_NLS 747#ifdef CONFIG_NCPFS_NLS
748 /* unload the NLS charsets */ 748 /* unload the NLS charsets */
749 if (server->nls_vol) 749 unload_nls(server->nls_vol);
750 { 750 unload_nls(server->nls_io);
751 unload_nls(server->nls_vol);
752 server->nls_vol = NULL;
753 }
754 if (server->nls_io)
755 {
756 unload_nls(server->nls_io);
757 server->nls_io = NULL;
758 }
759#endif /* CONFIG_NCPFS_NLS */ 751#endif /* CONFIG_NCPFS_NLS */
760 752
761 if (server->info_filp) 753 if (server->info_filp)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index fa038df63ac8..0d58caf4a6e1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
223 oldset_io = server->nls_io; 223 oldset_io = server->nls_io;
224 server->nls_io = iocharset; 224 server->nls_io = iocharset;
225 225
226 if (oldset_cp) 226 unload_nls(oldset_cp);
227 unload_nls(oldset_cp); 227 unload_nls(oldset_io);
228 if (oldset_io)
229 unload_nls(oldset_io);
230 228
231 return 0; 229 return 0;
232} 230}
@@ -442,7 +440,7 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
442 if (dentry) { 440 if (dentry) {
443 struct inode* s_inode = dentry->d_inode; 441 struct inode* s_inode = dentry->d_inode;
444 442
445 if (inode) { 443 if (s_inode) {
446 NCP_FINFO(s_inode)->volNumber = vnum; 444 NCP_FINFO(s_inode)->volNumber = vnum;
447 NCP_FINFO(s_inode)->dirEntNum = de; 445 NCP_FINFO(s_inode)->dirEntNum = de;
448 NCP_FINFO(s_inode)->DosDirNum = dosde; 446 NCP_FINFO(s_inode)->DosDirNum = dosde;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5d8dcb9ee326..15458decdb8a 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -95,7 +95,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
95 return VM_FAULT_MAJOR; 95 return VM_FAULT_MAJOR;
96} 96}
97 97
98static struct vm_operations_struct ncp_file_mmap = 98static const struct vm_operations_struct ncp_file_mmap =
99{ 99{
100 .fault = ncp_file_mmap_fault, 100 .fault = ncp_file_mmap_fault,
101}; 101};
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e5a2dac5f715..76b0aa0f73bf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
222 222
223 p = read_buf(xdr, len); 223 p = read_buf(xdr, len);
224 if (unlikely(p == NULL)) 224 if (unlikely(p == NULL))
225 return htonl(NFS4ERR_RESOURCE);; 225 return htonl(NFS4ERR_RESOURCE);
226 226
227 memcpy(sid->data, p, len); 227 memcpy(sid->data, p, len);
228 return 0; 228 return 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e350bd6a2334..63976c0ccc25 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server)
648 .hostname = clp->cl_hostname, 648 .hostname = clp->cl_hostname,
649 .address = (struct sockaddr *)&clp->cl_addr, 649 .address = (struct sockaddr *)&clp->cl_addr,
650 .addrlen = clp->cl_addrlen, 650 .addrlen = clp->cl_addrlen,
651 .protocol = server->flags & NFS_MOUNT_TCP ?
652 IPPROTO_TCP : IPPROTO_UDP,
653 .nfs_version = clp->rpc_ops->version, 651 .nfs_version = clp->rpc_ops->version,
654 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 652 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
655 1 : 0, 653 1 : 0,
@@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server)
660 if (server->flags & NFS_MOUNT_NONLM) 658 if (server->flags & NFS_MOUNT_NONLM)
661 return 0; 659 return 0;
662 660
661 switch (clp->cl_proto) {
662 default:
663 nlm_init.protocol = IPPROTO_TCP;
664 break;
665 case XPRT_TRANSPORT_UDP:
666 nlm_init.protocol = IPPROTO_UDP;
667 }
668
663 host = nlmclnt_init(&nlm_init); 669 host = nlmclnt_init(&nlm_init);
664 if (IS_ERR(host)) 670 if (IS_ERR(host))
665 return PTR_ERR(host); 671 return PTR_ERR(host);
@@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server,
787 dprintk("--> nfs_init_server()\n"); 793 dprintk("--> nfs_init_server()\n");
788 794
789#ifdef CONFIG_NFS_V3 795#ifdef CONFIG_NFS_V3
790 if (data->flags & NFS_MOUNT_VER3) 796 if (data->version == 3)
791 cl_init.rpc_ops = &nfs_v3_clientops; 797 cl_init.rpc_ops = &nfs_v3_clientops;
792#endif 798#endif
793 799
@@ -933,10 +939,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
933 goto out_error; 939 goto out_error;
934 940
935 nfs_server_set_fsinfo(server, &fsinfo); 941 nfs_server_set_fsinfo(server, &fsinfo);
936 error = bdi_init(&server->backing_dev_info);
937 if (error)
938 goto out_error;
939
940 942
941 /* Get some general file system info */ 943 /* Get some general file system info */
942 if (server->namelen == 0) { 944 if (server->namelen == 0) {
@@ -968,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
968 target->acdirmin = source->acdirmin; 970 target->acdirmin = source->acdirmin;
969 target->acdirmax = source->acdirmax; 971 target->acdirmax = source->acdirmax;
970 target->caps = source->caps; 972 target->caps = source->caps;
973 target->options = source->options;
971} 974}
972 975
973/* 976/*
@@ -995,6 +998,12 @@ static struct nfs_server *nfs_alloc_server(void)
995 return NULL; 998 return NULL;
996 } 999 }
997 1000
1001 if (bdi_init(&server->backing_dev_info)) {
1002 nfs_free_iostats(server->io_stats);
1003 kfree(server);
1004 return NULL;
1005 }
1006
998 return server; 1007 return server;
999} 1008}
1000 1009
@@ -1529,7 +1538,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
1529static void nfs_server_list_stop(struct seq_file *p, void *v); 1538static void nfs_server_list_stop(struct seq_file *p, void *v);
1530static int nfs_server_list_show(struct seq_file *m, void *v); 1539static int nfs_server_list_show(struct seq_file *m, void *v);
1531 1540
1532static struct seq_operations nfs_server_list_ops = { 1541static const struct seq_operations nfs_server_list_ops = {
1533 .start = nfs_server_list_start, 1542 .start = nfs_server_list_start,
1534 .next = nfs_server_list_next, 1543 .next = nfs_server_list_next,
1535 .stop = nfs_server_list_stop, 1544 .stop = nfs_server_list_stop,
@@ -1550,7 +1559,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
1550static void nfs_volume_list_stop(struct seq_file *p, void *v); 1559static void nfs_volume_list_stop(struct seq_file *p, void *v);
1551static int nfs_volume_list_show(struct seq_file *m, void *v); 1560static int nfs_volume_list_show(struct seq_file *m, void *v);
1552 1561
1553static struct seq_operations nfs_volume_list_ops = { 1562static const struct seq_operations nfs_volume_list_ops = {
1554 .start = nfs_volume_list_start, 1563 .start = nfs_volume_list_start,
1555 .next = nfs_volume_list_next, 1564 .next = nfs_volume_list_next,
1556 .stop = nfs_volume_list_stop, 1565 .stop = nfs_volume_list_stop,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5021b75d2d1e..f5fdd39e037a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
60static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); 60static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
61 61
62static struct vm_operations_struct nfs_file_vm_ops; 62static const struct vm_operations_struct nfs_file_vm_ops;
63 63
64const struct file_operations nfs_file_operations = { 64const struct file_operations nfs_file_operations = {
65 .llseek = nfs_file_llseek, 65 .llseek = nfs_file_llseek,
@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
525 .direct_IO = nfs_direct_IO, 525 .direct_IO = nfs_direct_IO,
526 .migratepage = nfs_migrate_page, 526 .migratepage = nfs_migrate_page,
527 .launder_page = nfs_launder_page, 527 .launder_page = nfs_launder_page,
528 .error_remove_page = generic_error_remove_page,
528}; 529};
529 530
530/* 531/*
@@ -571,7 +572,7 @@ out_unlock:
571 return VM_FAULT_SIGBUS; 572 return VM_FAULT_SIGBUS;
572} 573}
573 574
574static struct vm_operations_struct nfs_file_vm_ops = { 575static const struct vm_operations_struct nfs_file_vm_ops = {
575 .fault = filemap_fault, 576 .fault = filemap_fault,
576 .page_mkwrite = nfs_vm_page_mkwrite, 577 .page_mkwrite = nfs_vm_page_mkwrite,
577}; 578};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 379be678cb7e..70fad69eb959 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
58/* 58/*
59 * Get the cache cookie for an NFS superblock. We have to handle 59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us. 60 * uniquification here because the cache doesn't do it for us.
61 *
62 * The default uniquifier is just an empty string, but it may be overridden
63 * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
64 * superblock across an automount point of some nature.
61 */ 65 */
62void nfs_fscache_get_super_cookie(struct super_block *sb, 66void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
63 struct nfs_parsed_mount_data *data) 67 struct nfs_clone_mount *mntdata)
64{ 68{
65 struct nfs_fscache_key *key, *xkey; 69 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb); 70 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent; 71 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen; 72 int diff, ulen;
70 73
71 ulen = strlen(uniq); 74 if (uniq) {
75 ulen = strlen(uniq);
76 } else if (mntdata) {
77 struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
78 if (mnt_s->fscache_key) {
79 uniq = mnt_s->fscache_key->key.uniquifier;
80 ulen = mnt_s->fscache_key->key.uniq_len;
81 }
82 }
83
84 if (!uniq) {
85 uniq = "";
86 ulen = 1;
87 }
88
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); 89 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key) 90 if (!key)
74 return; 91 return;
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6e809bb0ff08..b9c572d0679f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *); 74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75 75
76extern void nfs_fscache_get_super_cookie(struct super_block *, 76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *); 77 const char *,
78 struct nfs_clone_mount *);
78extern void nfs_fscache_release_super_cookie(struct super_block *); 79extern void nfs_fscache_release_super_cookie(struct super_block *);
79 80
80extern void nfs_fscache_init_inode_cookie(struct inode *); 81extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173 174
174static inline void nfs_fscache_get_super_cookie( 175static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb, 176 struct super_block *sb,
176 struct nfs_parsed_mount_data *data) 177 const char *uniq,
178 struct nfs_clone_mount *mntdata)
177{ 179{
178} 180}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} 181static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 060022b4651c..faa091865ad0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -458,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
458 */ 458 */
459static int nfs_vmtruncate(struct inode * inode, loff_t offset) 459static int nfs_vmtruncate(struct inode * inode, loff_t offset)
460{ 460{
461 if (i_size_read(inode) < offset) { 461 loff_t oldsize;
462 unsigned long limit; 462 int err;
463
464 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
465 if (limit != RLIM_INFINITY && offset > limit)
466 goto out_sig;
467 if (offset > inode->i_sb->s_maxbytes)
468 goto out_big;
469 spin_lock(&inode->i_lock);
470 i_size_write(inode, offset);
471 spin_unlock(&inode->i_lock);
472 } else {
473 struct address_space *mapping = inode->i_mapping;
474 463
475 /* 464 err = inode_newsize_ok(inode, offset);
476 * truncation of in-use swapfiles is disallowed - it would 465 if (err)
477 * cause subsequent swapout to scribble on the now-freed 466 goto out;
478 * blocks.
479 */
480 if (IS_SWAPFILE(inode))
481 return -ETXTBSY;
482 spin_lock(&inode->i_lock);
483 i_size_write(inode, offset);
484 spin_unlock(&inode->i_lock);
485 467
486 /* 468 spin_lock(&inode->i_lock);
487 * unmap_mapping_range is called twice, first simply for 469 oldsize = inode->i_size;
488 * efficiency so that truncate_inode_pages does fewer 470 i_size_write(inode, offset);
489 * single-page unmaps. However after this first call, and 471 spin_unlock(&inode->i_lock);
490 * before truncate_inode_pages finishes, it is possible for 472
491 * private pages to be COWed, which remain after 473 truncate_pagecache(inode, oldsize, offset);
492 * truncate_inode_pages finishes, hence the second 474out:
493 * unmap_mapping_range call must be made for correctness. 475 return err;
494 */
495 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
496 truncate_inode_pages(mapping, offset);
497 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
498 }
499 return 0;
500out_sig:
501 send_sig(SIGXFSZ, current, 0);
502out_big:
503 return -EFBIG;
504} 476}
505 477
506/** 478/**
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9a..5e078b222b4e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/utsname.h>
17#include <linux/errno.h> 16#include <linux/errno.h>
18#include <linux/string.h> 17#include <linux/string.h>
19#include <linux/in.h> 18#include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ee6a13f05443..3f8881d1a050 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/utsname.h>
11#include <linux/errno.h> 10#include <linux/errno.h>
12#include <linux/string.h> 11#include <linux/string.h>
13#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f1..5fe5492fbd29 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/utsname.h>
14#include <linux/errno.h> 13#include <linux/errno.h>
15#include <linux/string.h> 14#include <linux/string.h>
16#include <linux/in.h> 15#include <linux/in.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index be6544aef41f..ed7c269e2514 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
36 */ 36 */
37 37
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/utsname.h>
40#include <linux/delay.h> 39#include <linux/delay.h>
41#include <linux/errno.h> 40#include <linux/errno.h>
42#include <linux/string.h> 41#include <linux/string.h>
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1434080aefeb..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
638 nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); 638 nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
639} 639}
640 640
641static struct file_lock_operations nfs4_fl_lock_ops = { 641static const struct file_lock_operations nfs4_fl_lock_ops = {
642 .fl_copy_lock = nfs4_fl_copy_lock, 642 .fl_copy_lock = nfs4_fl_copy_lock,
643 .fl_release_private = nfs4_fl_release_lock, 643 .fl_release_private = nfs4_fl_release_lock,
644}; 644};
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cfc30d362f94..83ad47cbdd8a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/utsname.h>
43#include <linux/errno.h> 42#include <linux/errno.h>
44#include <linux/string.h> 43#include <linux/string.h>
45#include <linux/in.h> 44#include <linux/in.h>
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49d..ef583854d8d0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/time.h> 33#include <linux/time.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/utsname.h>
36#include <linux/errno.h> 35#include <linux/errno.h>
37#include <linux/string.h> 36#include <linux/string.h>
38#include <linux/in.h> 37#include <linux/in.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index de935692d40d..29786d3b9326 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,6 +728,27 @@ static void nfs_umount_begin(struct super_block *sb)
728 unlock_kernel(); 728 unlock_kernel();
729} 729}
730 730
731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags)
732{
733 struct nfs_parsed_mount_data *data;
734
735 data = kzalloc(sizeof(*data), GFP_KERNEL);
736 if (data) {
737 data->flags = flags;
738 data->rsize = NFS_MAX_FILE_IO_SIZE;
739 data->wsize = NFS_MAX_FILE_IO_SIZE;
740 data->acregmin = NFS_DEF_ACREGMIN;
741 data->acregmax = NFS_DEF_ACREGMAX;
742 data->acdirmin = NFS_DEF_ACDIRMIN;
743 data->acdirmax = NFS_DEF_ACDIRMAX;
744 data->nfs_server.port = NFS_UNSPEC_PORT;
745 data->auth_flavors[0] = RPC_AUTH_UNIX;
746 data->auth_flavor_len = 1;
747 data->minorversion = 0;
748 }
749 return data;
750}
751
731/* 752/*
732 * Sanity-check a server address provided by the mount command. 753 * Sanity-check a server address provided by the mount command.
733 * 754 *
@@ -1430,10 +1451,13 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1430 int status; 1451 int status;
1431 1452
1432 if (args->mount_server.version == 0) { 1453 if (args->mount_server.version == 0) {
1433 if (args->flags & NFS_MOUNT_VER3) 1454 switch (args->version) {
1434 args->mount_server.version = NFS_MNT3_VERSION; 1455 default:
1435 else 1456 args->mount_server.version = NFS_MNT3_VERSION;
1436 args->mount_server.version = NFS_MNT_VERSION; 1457 break;
1458 case 2:
1459 args->mount_server.version = NFS_MNT_VERSION;
1460 }
1437 } 1461 }
1438 request.version = args->mount_server.version; 1462 request.version = args->mount_server.version;
1439 1463
@@ -1634,20 +1658,6 @@ static int nfs_validate_mount_data(void *options,
1634 if (data == NULL) 1658 if (data == NULL)
1635 goto out_no_data; 1659 goto out_no_data;
1636 1660
1637 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
1638 args->rsize = NFS_MAX_FILE_IO_SIZE;
1639 args->wsize = NFS_MAX_FILE_IO_SIZE;
1640 args->acregmin = NFS_DEF_ACREGMIN;
1641 args->acregmax = NFS_DEF_ACREGMAX;
1642 args->acdirmin = NFS_DEF_ACDIRMIN;
1643 args->acdirmax = NFS_DEF_ACDIRMAX;
1644 args->mount_server.port = NFS_UNSPEC_PORT;
1645 args->nfs_server.port = NFS_UNSPEC_PORT;
1646 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1647 args->auth_flavors[0] = RPC_AUTH_UNIX;
1648 args->auth_flavor_len = 1;
1649 args->minorversion = 0;
1650
1651 switch (data->version) { 1661 switch (data->version) {
1652 case 1: 1662 case 1:
1653 data->namlen = 0; 1663 data->namlen = 0;
@@ -1701,6 +1711,8 @@ static int nfs_validate_mount_data(void *options,
1701 1711
1702 if (!(data->flags & NFS_MOUNT_TCP)) 1712 if (!(data->flags & NFS_MOUNT_TCP))
1703 args->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1713 args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1714 else
1715 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1704 /* N.B. caller will free nfs_server.hostname in all cases */ 1716 /* N.B. caller will free nfs_server.hostname in all cases */
1705 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); 1717 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
1706 args->namlen = data->namlen; 1718 args->namlen = data->namlen;
@@ -1778,7 +1790,7 @@ static int nfs_validate_mount_data(void *options,
1778 } 1790 }
1779 1791
1780#ifndef CONFIG_NFS_V3 1792#ifndef CONFIG_NFS_V3
1781 if (args->flags & NFS_MOUNT_VER3) 1793 if (args->version == 3)
1782 goto out_v3_not_compiled; 1794 goto out_v3_not_compiled;
1783#endif /* !CONFIG_NFS_V3 */ 1795#endif /* !CONFIG_NFS_V3 */
1784 1796
@@ -1936,7 +1948,7 @@ static void nfs_fill_super(struct super_block *sb,
1936 if (data->bsize) 1948 if (data->bsize)
1937 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); 1949 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
1938 1950
1939 if (server->flags & NFS_MOUNT_VER3) { 1951 if (server->nfs_client->rpc_ops->version == 3) {
1940 /* The VFS shouldn't apply the umask to mode bits. We will do 1952 /* The VFS shouldn't apply the umask to mode bits. We will do
1941 * so ourselves when necessary. 1953 * so ourselves when necessary.
1942 */ 1954 */
@@ -1960,7 +1972,7 @@ static void nfs_clone_super(struct super_block *sb,
1960 sb->s_blocksize = old_sb->s_blocksize; 1972 sb->s_blocksize = old_sb->s_blocksize;
1961 sb->s_maxbytes = old_sb->s_maxbytes; 1973 sb->s_maxbytes = old_sb->s_maxbytes;
1962 1974
1963 if (server->flags & NFS_MOUNT_VER3) { 1975 if (server->nfs_client->rpc_ops->version == 3) {
1964 /* The VFS shouldn't apply the umask to mode bits. We will do 1976 /* The VFS shouldn't apply the umask to mode bits. We will do
1965 * so ourselves when necessary. 1977 * so ourselves when necessary.
1966 */ 1978 */
@@ -2094,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2094 }; 2106 };
2095 int error = -ENOMEM; 2107 int error = -ENOMEM;
2096 2108
2097 data = kzalloc(sizeof(*data), GFP_KERNEL); 2109 data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
2098 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2110 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
2099 if (data == NULL || mntfh == NULL) 2111 if (data == NULL || mntfh == NULL)
2100 goto out_free_fh; 2112 goto out_free_fh;
@@ -2144,7 +2156,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2144 if (!s->s_root) { 2156 if (!s->s_root) {
2145 /* initial superblock/root creation */ 2157 /* initial superblock/root creation */
2146 nfs_fill_super(s, data); 2158 nfs_fill_super(s, data);
2147 nfs_fscache_get_super_cookie(s, data); 2159 nfs_fscache_get_super_cookie(
2160 s, data ? data->fscache_uniq : NULL, NULL);
2148 } 2161 }
2149 2162
2150 mntroot = nfs_get_root(s, mntfh); 2163 mntroot = nfs_get_root(s, mntfh);
@@ -2190,8 +2203,8 @@ static void nfs_kill_super(struct super_block *s)
2190{ 2203{
2191 struct nfs_server *server = NFS_SB(s); 2204 struct nfs_server *server = NFS_SB(s);
2192 2205
2193 bdi_unregister(&server->backing_dev_info);
2194 kill_anon_super(s); 2206 kill_anon_super(s);
2207 bdi_unregister(&server->backing_dev_info);
2195 nfs_fscache_release_super_cookie(s); 2208 nfs_fscache_release_super_cookie(s);
2196 nfs_free_server(server); 2209 nfs_free_server(server);
2197} 2210}
@@ -2245,6 +2258,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2245 if (!s->s_root) { 2258 if (!s->s_root) {
2246 /* initial superblock/root creation */ 2259 /* initial superblock/root creation */
2247 nfs_clone_super(s, data->sb); 2260 nfs_clone_super(s, data->sb);
2261 nfs_fscache_get_super_cookie(s, NULL, data);
2248 } 2262 }
2249 2263
2250 mntroot = nfs_get_root(s, data->fh); 2264 mntroot = nfs_get_root(s, data->fh);
@@ -2362,18 +2376,7 @@ static int nfs4_validate_mount_data(void *options,
2362 if (data == NULL) 2376 if (data == NULL)
2363 goto out_no_data; 2377 goto out_no_data;
2364 2378
2365 args->rsize = NFS_MAX_FILE_IO_SIZE;
2366 args->wsize = NFS_MAX_FILE_IO_SIZE;
2367 args->acregmin = NFS_DEF_ACREGMIN;
2368 args->acregmax = NFS_DEF_ACREGMAX;
2369 args->acdirmin = NFS_DEF_ACDIRMIN;
2370 args->acdirmax = NFS_DEF_ACDIRMAX;
2371 args->nfs_server.port = NFS_UNSPEC_PORT;
2372 args->auth_flavors[0] = RPC_AUTH_UNIX;
2373 args->auth_flavor_len = 1;
2374 args->version = 4; 2379 args->version = 4;
2375 args->minorversion = 0;
2376
2377 switch (data->version) { 2380 switch (data->version) {
2378 case 1: 2381 case 1:
2379 if (data->host_addrlen > sizeof(args->nfs_server.address)) 2382 if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2508,7 +2511,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2508 if (!s->s_root) { 2511 if (!s->s_root) {
2509 /* initial superblock/root creation */ 2512 /* initial superblock/root creation */
2510 nfs4_fill_super(s); 2513 nfs4_fill_super(s);
2511 nfs_fscache_get_super_cookie(s, data); 2514 nfs_fscache_get_super_cookie(
2515 s, data ? data->fscache_uniq : NULL, NULL);
2512 } 2516 }
2513 2517
2514 mntroot = nfs4_get_root(s, mntfh); 2518 mntroot = nfs4_get_root(s, mntfh);
@@ -2656,7 +2660,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2656 struct nfs_parsed_mount_data *data; 2660 struct nfs_parsed_mount_data *data;
2657 int error = -ENOMEM; 2661 int error = -ENOMEM;
2658 2662
2659 data = kzalloc(sizeof(*data), GFP_KERNEL); 2663 data = nfs_alloc_parsed_mount_data(0);
2660 if (data == NULL) 2664 if (data == NULL)
2661 goto out_free_data; 2665 goto out_free_data;
2662 2666
@@ -2741,6 +2745,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2741 if (!s->s_root) { 2745 if (!s->s_root) {
2742 /* initial superblock/root creation */ 2746 /* initial superblock/root creation */
2743 nfs4_clone_super(s, data->sb); 2747 nfs4_clone_super(s, data->sb);
2748 nfs_fscache_get_super_cookie(s, NULL, data);
2744 } 2749 }
2745 2750
2746 mntroot = nfs4_get_root(s, data->fh); 2751 mntroot = nfs4_get_root(s, data->fh);
@@ -2822,6 +2827,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2822 if (!s->s_root) { 2827 if (!s->s_root) {
2823 /* initial superblock/root creation */ 2828 /* initial superblock/root creation */
2824 nfs4_fill_super(s); 2829 nfs4_fill_super(s);
2830 nfs_fscache_get_super_cookie(s, NULL, data);
2825 } 2831 }
2826 2832
2827 mntroot = nfs4_get_root(s, &mntfh); 2833 mntroot = nfs4_get_root(s, &mntfh);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index d9462643155c..c1c9e035d4a4 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1341,6 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1341 if (rv) 1341 if (rv)
1342 goto out; 1342 goto out;
1343 rv = check_nfsd_access(exp, rqstp); 1343 rv = check_nfsd_access(exp, rqstp);
1344 if (rv)
1345 fh_put(fhp);
1344out: 1346out:
1345 exp_put(exp); 1347 exp_put(exp);
1346 return rv; 1348 return rv;
@@ -1515,7 +1517,7 @@ static int e_show(struct seq_file *m, void *p)
1515 return svc_export_show(m, &svc_export_cache, cp); 1517 return svc_export_show(m, &svc_export_cache, cp);
1516} 1518}
1517 1519
1518struct seq_operations nfs_exports_op = { 1520const struct seq_operations nfs_exports_op = {
1519 .start = e_start, 1521 .start = e_start,
1520 .next = e_next, 1522 .next = e_next,
1521 .stop = e_stop, 1523 .stop = e_stop,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 01d4ec1c88e0..edf926e1062f 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
814 return p; 814 return p;
815} 815}
816 816
817static __be32 *
818encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
819 struct svc_fh *fhp)
820{
821 p = encode_post_op_attr(cd->rqstp, p, fhp);
822 *p++ = xdr_one; /* yes, a file handle follows */
823 p = encode_fh(p, fhp);
824 fh_put(fhp);
825 return p;
826}
827
828static int 817static int
829compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, 818compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
830 const char *name, int namlen) 819 const char *name, int namlen)
@@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
836 dparent = cd->fh.fh_dentry; 825 dparent = cd->fh.fh_dentry;
837 exp = cd->fh.fh_export; 826 exp = cd->fh.fh_export;
838 827
839 fh_init(fhp, NFS3_FHSIZE);
840 if (isdotent(name, namlen)) { 828 if (isdotent(name, namlen)) {
841 if (namlen == 2) { 829 if (namlen == 2) {
842 dchild = dget_parent(dparent); 830 dchild = dget_parent(dparent);
843 if (dchild == dparent) { 831 if (dchild == dparent) {
844 /* filesystem root - cannot return filehandle for ".." */ 832 /* filesystem root - cannot return filehandle for ".." */
845 dput(dchild); 833 dput(dchild);
846 return 1; 834 return -ENOENT;
847 } 835 }
848 } else 836 } else
849 dchild = dget(dparent); 837 dchild = dget(dparent);
850 } else 838 } else
851 dchild = lookup_one_len(name, dparent, namlen); 839 dchild = lookup_one_len(name, dparent, namlen);
852 if (IS_ERR(dchild)) 840 if (IS_ERR(dchild))
853 return 1; 841 return -ENOENT;
854 if (d_mountpoint(dchild) || 842 rv = -ENOENT;
855 fh_compose(fhp, exp, dchild, &cd->fh) != 0 || 843 if (d_mountpoint(dchild))
856 !dchild->d_inode) 844 goto out;
857 rv = 1; 845 rv = fh_compose(fhp, exp, dchild, &cd->fh);
846 if (rv)
847 goto out;
848 if (!dchild->d_inode)
849 goto out;
850 rv = 0;
851out:
858 dput(dchild); 852 dput(dchild);
859 return rv; 853 return rv;
860} 854}
861 855
856__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
857{
858 struct svc_fh fh;
859 int err;
860
861 fh_init(&fh, NFS3_FHSIZE);
862 err = compose_entry_fh(cd, &fh, name, namlen);
863 if (err) {
864 *p++ = 0;
865 *p++ = 0;
866 goto out;
867 }
868 p = encode_post_op_attr(cd->rqstp, p, &fh);
869 *p++ = xdr_one; /* yes, a file handle follows */
870 p = encode_fh(p, &fh);
871out:
872 fh_put(&fh);
873 return p;
874}
875
862/* 876/*
863 * Encode a directory entry. This one works for both normal readdir 877 * Encode a directory entry. This one works for both normal readdir
864 * and readdirplus. 878 * and readdirplus.
@@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
929 943
930 p = encode_entry_baggage(cd, p, name, namlen, ino); 944 p = encode_entry_baggage(cd, p, name, namlen, ino);
931 945
932 /* throw in readdirplus baggage */ 946 if (plus)
933 if (plus) { 947 p = encode_entryplus_baggage(cd, p, name, namlen);
934 struct svc_fh fh;
935
936 if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
937 *p++ = 0;
938 *p++ = 0;
939 } else
940 p = encode_entryplus_baggage(cd, p, &fh);
941 }
942 num_entry_words = p - cd->buffer; 948 num_entry_words = p - cd->buffer;
943 } else if (cd->rqstp->rq_respages[pn+1] != NULL) { 949 } else if (cd->rqstp->rq_respages[pn+1] != NULL) {
944 /* temporarily encode entry into next page, then move back to 950 /* temporarily encode entry into next page, then move back to
@@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
951 957
952 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
953 959
954 /* throw in readdirplus baggage */ 960 if (plus)
955 if (plus) { 961 p = encode_entryplus_baggage(cd, p1, name, namlen);
956 struct svc_fh fh;
957
958 if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
959 /* zero out the filehandle */
960 *p1++ = 0;
961 *p1++ = 0;
962 } else
963 p1 = encode_entryplus_baggage(cd, p1, &fh);
964 }
965 962
966 /* determine entry word length and lengths to go in pages */ 963 /* determine entry word length and lengths to go in pages */
967 num_entry_words = p1 - tmp; 964 num_entry_words = p1 - tmp;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 54b8b4140c8f..725d02f210e2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
321 deny = ~pas.group & pas.other; 321 deny = ~pas.group & pas.other;
322 if (deny) { 322 if (deny) {
323 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; 323 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
324 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; 324 ace->flag = eflag;
325 ace->access_mask = deny_mask_from_posix(deny, flags); 325 ace->access_mask = deny_mask_from_posix(deny, flags);
326 ace->whotype = NFS4_ACL_WHO_GROUP; 326 ace->whotype = NFS4_ACL_WHO_GROUP;
327 ace++; 327 ace++;
@@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
335 if (deny) { 335 if (deny) {
336 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; 336 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
337 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; 337 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
338 ace->access_mask = mask_from_posix(deny, flags); 338 ace->access_mask = deny_mask_from_posix(deny, flags);
339 ace->whotype = NFS4_ACL_WHO_NAMED; 339 ace->whotype = NFS4_ACL_WHO_NAMED;
340 ace->who = pa->e_id; 340 ace->who = pa->e_id;
341 ace++; 341 ace++;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3fd23f7aceca..24e8d78f8dde 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,25 +43,30 @@
43#include <linux/sunrpc/xdr.h> 43#include <linux/sunrpc/xdr.h>
44#include <linux/sunrpc/svc.h> 44#include <linux/sunrpc/svc.h>
45#include <linux/sunrpc/clnt.h> 45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/svcsock.h>
46#include <linux/nfsd/nfsd.h> 47#include <linux/nfsd/nfsd.h>
47#include <linux/nfsd/state.h> 48#include <linux/nfsd/state.h>
48#include <linux/sunrpc/sched.h> 49#include <linux/sunrpc/sched.h>
49#include <linux/nfs4.h> 50#include <linux/nfs4.h>
51#include <linux/sunrpc/xprtsock.h>
50 52
51#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
52 54
53#define NFSPROC4_CB_NULL 0 55#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 56#define NFSPROC4_CB_COMPOUND 1
57#define NFS4_STATEID_SIZE 16
55 58
56/* Index of predefined Linux callback client operations */ 59/* Index of predefined Linux callback client operations */
57 60
58enum { 61enum {
59 NFSPROC4_CLNT_CB_NULL = 0, 62 NFSPROC4_CLNT_CB_NULL = 0,
60 NFSPROC4_CLNT_CB_RECALL, 63 NFSPROC4_CLNT_CB_RECALL,
64 NFSPROC4_CLNT_CB_SEQUENCE,
61}; 65};
62 66
63enum nfs_cb_opnum4 { 67enum nfs_cb_opnum4 {
64 OP_CB_RECALL = 4, 68 OP_CB_RECALL = 4,
69 OP_CB_SEQUENCE = 11,
65}; 70};
66 71
67#define NFS4_MAXTAGLEN 20 72#define NFS4_MAXTAGLEN 20
@@ -70,17 +75,29 @@ enum nfs_cb_opnum4 {
70#define NFS4_dec_cb_null_sz 0 75#define NFS4_dec_cb_null_sz 0
71#define cb_compound_enc_hdr_sz 4 76#define cb_compound_enc_hdr_sz 4
72#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
78#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
79#define cb_sequence_enc_sz (sessionid_sz + 4 + \
80 1 /* no referring calls list yet */)
81#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
82
73#define op_enc_sz 1 83#define op_enc_sz 1
74#define op_dec_sz 2 84#define op_dec_sz 2
75#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) 85#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
76#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) 86#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
77#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ 87#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
88 cb_sequence_enc_sz + \
78 1 + enc_stateid_sz + \ 89 1 + enc_stateid_sz + \
79 enc_nfs4_fh_sz) 90 enc_nfs4_fh_sz)
80 91
81#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 92#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
93 cb_sequence_dec_sz + \
82 op_dec_sz) 94 op_dec_sz)
83 95
96struct nfs4_rpc_args {
97 void *args_op;
98 struct nfsd4_cb_sequence args_seq;
99};
100
84/* 101/*
85* Generic encode routines from fs/nfs/nfs4xdr.c 102* Generic encode routines from fs/nfs/nfs4xdr.c
86*/ 103*/
@@ -137,11 +154,13 @@ xdr_error: \
137} while (0) 154} while (0)
138 155
139struct nfs4_cb_compound_hdr { 156struct nfs4_cb_compound_hdr {
140 int status; 157 /* args */
141 u32 ident; 158 u32 ident; /* minorversion 0 only */
142 u32 nops; 159 u32 nops;
143 __be32 *nops_p; 160 __be32 *nops_p;
144 u32 minorversion; 161 u32 minorversion;
162 /* res */
163 int status;
145 u32 taglen; 164 u32 taglen;
146 char *tag; 165 char *tag;
147}; 166};
@@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
238 hdr->nops++; 257 hdr->nops++;
239} 258}
240 259
260static void
261encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
262 struct nfs4_cb_compound_hdr *hdr)
263{
264 __be32 *p;
265
266 if (hdr->minorversion == 0)
267 return;
268
269 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
270
271 WRITE32(OP_CB_SEQUENCE);
272 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
273 WRITE32(args->cbs_clp->cl_cb_seq_nr);
274 WRITE32(0); /* slotid, always 0 */
275 WRITE32(0); /* highest slotid always 0 */
276 WRITE32(0); /* cachethis always 0 */
277 WRITE32(0); /* FIXME: support referring_call_lists */
278 hdr->nops++;
279}
280
241static int 281static int
242nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 282nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
243{ 283{
@@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
249} 289}
250 290
251static int 291static int
252nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args) 292nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
293 struct nfs4_rpc_args *rpc_args)
253{ 294{
254 struct xdr_stream xdr; 295 struct xdr_stream xdr;
296 struct nfs4_delegation *args = rpc_args->args_op;
255 struct nfs4_cb_compound_hdr hdr = { 297 struct nfs4_cb_compound_hdr hdr = {
256 .ident = args->dl_ident, 298 .ident = args->dl_ident,
299 .minorversion = rpc_args->args_seq.cbs_minorversion,
257 }; 300 };
258 301
259 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 302 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 encode_cb_compound_hdr(&xdr, &hdr); 303 encode_cb_compound_hdr(&xdr, &hdr);
304 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
261 encode_cb_recall(&xdr, args, &hdr); 305 encode_cb_recall(&xdr, args, &hdr);
262 encode_cb_nops(&hdr); 306 encode_cb_nops(&hdr);
263 return 0; 307 return 0;
@@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
299 return 0; 343 return 0;
300} 344}
301 345
346/*
347 * Our current back channel implmentation supports a single backchannel
348 * with a single slot.
349 */
350static int
351decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
352 struct rpc_rqst *rqstp)
353{
354 struct nfs4_sessionid id;
355 int status;
356 u32 dummy;
357 __be32 *p;
358
359 if (res->cbs_minorversion == 0)
360 return 0;
361
362 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
363 if (status)
364 return status;
365
366 /*
367 * If the server returns different values for sessionID, slotID or
368 * sequence number, the server is looney tunes.
369 */
370 status = -ESERVERFAULT;
371
372 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
373 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
374 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
375 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
376 NFS4_MAX_SESSIONID_LEN)) {
377 dprintk("%s Invalid session id\n", __func__);
378 goto out;
379 }
380 READ32(dummy);
381 if (dummy != res->cbs_clp->cl_cb_seq_nr) {
382 dprintk("%s Invalid sequence number\n", __func__);
383 goto out;
384 }
385 READ32(dummy); /* slotid must be 0 */
386 if (dummy != 0) {
387 dprintk("%s Invalid slotid\n", __func__);
388 goto out;
389 }
390 /* FIXME: process highest slotid and target highest slotid */
391 status = 0;
392out:
393 return status;
394}
395
396
302static int 397static int
303nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 398nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
304{ 399{
@@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
306} 401}
307 402
308static int 403static int
309nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) 404nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
405 struct nfsd4_cb_sequence *seq)
310{ 406{
311 struct xdr_stream xdr; 407 struct xdr_stream xdr;
312 struct nfs4_cb_compound_hdr hdr; 408 struct nfs4_cb_compound_hdr hdr;
@@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
316 status = decode_cb_compound_hdr(&xdr, &hdr); 412 status = decode_cb_compound_hdr(&xdr, &hdr);
317 if (status) 413 if (status)
318 goto out; 414 goto out;
415 if (seq) {
416 status = decode_cb_sequence(&xdr, seq, rqstp);
417 if (status)
418 goto out;
419 }
319 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 420 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
320out: 421out:
321 return status; 422 return status;
@@ -377,16 +478,15 @@ static int max_cb_time(void)
377 478
378int setup_callback_client(struct nfs4_client *clp) 479int setup_callback_client(struct nfs4_client *clp)
379{ 480{
380 struct sockaddr_in addr;
381 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 481 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
382 struct rpc_timeout timeparms = { 482 struct rpc_timeout timeparms = {
383 .to_initval = max_cb_time(), 483 .to_initval = max_cb_time(),
384 .to_retries = 0, 484 .to_retries = 0,
385 }; 485 };
386 struct rpc_create_args args = { 486 struct rpc_create_args args = {
387 .protocol = IPPROTO_TCP, 487 .protocol = XPRT_TRANSPORT_TCP,
388 .address = (struct sockaddr *)&addr, 488 .address = (struct sockaddr *) &cb->cb_addr,
389 .addrsize = sizeof(addr), 489 .addrsize = cb->cb_addrlen,
390 .timeout = &timeparms, 490 .timeout = &timeparms,
391 .program = &cb_program, 491 .program = &cb_program,
392 .prognumber = cb->cb_prog, 492 .prognumber = cb->cb_prog,
@@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp)
399 499
400 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 500 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
401 return -EINVAL; 501 return -EINVAL;
402 502 if (cb->cb_minorversion) {
403 /* Initialize address */ 503 args.bc_xprt = clp->cl_cb_xprt;
404 memset(&addr, 0, sizeof(addr)); 504 args.protocol = XPRT_TRANSPORT_BC_TCP;
405 addr.sin_family = AF_INET; 505 }
406 addr.sin_port = htons(cb->cb_port);
407 addr.sin_addr.s_addr = htonl(cb->cb_addr);
408
409 /* Create RPC client */ 506 /* Create RPC client */
410 client = rpc_create(&args); 507 client = rpc_create(&args);
411 if (IS_ERR(client)) { 508 if (IS_ERR(client)) {
@@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
439 .rpc_call_done = nfsd4_cb_probe_done, 536 .rpc_call_done = nfsd4_cb_probe_done,
440}; 537};
441 538
442static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb) 539static struct rpc_cred *callback_cred;
443{
444 struct auth_cred acred = {
445 .machine_cred = 1
446 };
447 540
448 /* 541int set_callback_cred(void)
449 * Note in the gss case this doesn't actually have to wait for a 542{
450 * gss upcall (or any calls to the client); this just creates a 543 callback_cred = rpc_lookup_machine_cred();
451 * non-uptodate cred which the rpc state machine will fill in with 544 if (!callback_cred)
452 * a refresh_upcall later. 545 return -ENOMEM;
453 */ 546 return 0;
454 return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
455 RPCAUTH_LOOKUP_NEW);
456} 547}
457 548
549
458void do_probe_callback(struct nfs4_client *clp) 550void do_probe_callback(struct nfs4_client *clp)
459{ 551{
460 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 552 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
461 struct rpc_message msg = { 553 struct rpc_message msg = {
462 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 554 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
463 .rpc_argp = clp, 555 .rpc_argp = clp,
556 .rpc_cred = callback_cred
464 }; 557 };
465 struct rpc_cred *cred;
466 int status; 558 int status;
467 559
468 cred = lookup_cb_cred(cb);
469 if (IS_ERR(cred)) {
470 status = PTR_ERR(cred);
471 goto out;
472 }
473 cb->cb_cred = cred;
474 msg.rpc_cred = cb->cb_cred;
475 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 560 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
476 &nfsd4_cb_probe_ops, (void *)clp); 561 &nfsd4_cb_probe_ops, (void *)clp);
477out:
478 if (status) { 562 if (status) {
479 warn_no_callback_path(clp, status); 563 warn_no_callback_path(clp, status);
480 put_nfs4_client(clp); 564 put_nfs4_client(clp);
@@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp)
503 do_probe_callback(clp); 587 do_probe_callback(clp);
504} 588}
505 589
590/*
591 * There's currently a single callback channel slot.
592 * If the slot is available, then mark it busy. Otherwise, set the
593 * thread for sleeping on the callback RPC wait queue.
594 */
595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
596 struct rpc_task *task)
597{
598 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
599 u32 *ptr = (u32 *)clp->cl_sessionid.data;
600 int status = 0;
601
602 dprintk("%s: %u:%u:%u:%u\n", __func__,
603 ptr[0], ptr[1], ptr[2], ptr[3]);
604
605 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
606 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
607 dprintk("%s slot is busy\n", __func__);
608 status = -EAGAIN;
609 goto out;
610 }
611
612 /*
613 * We'll need the clp during XDR encoding and decoding,
614 * and the sequence during decoding to verify the reply
615 */
616 args->args_seq.cbs_clp = clp;
617 task->tk_msg.rpc_resp = &args->args_seq;
618
619out:
620 dprintk("%s status=%d\n", __func__, status);
621 return status;
622}
623
624/*
625 * TODO: cb_sequence should support referring call lists, cachethis, multiple
626 * slots, and mark callback channel down on communication errors.
627 */
628static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
629{
630 struct nfs4_delegation *dp = calldata;
631 struct nfs4_client *clp = dp->dl_client;
632 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
633 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
634 int status = 0;
635
636 args->args_seq.cbs_minorversion = minorversion;
637 if (minorversion) {
638 status = nfsd41_cb_setup_sequence(clp, task);
639 if (status) {
640 if (status != -EAGAIN) {
641 /* terminate rpc task */
642 task->tk_status = status;
643 task->tk_action = NULL;
644 }
645 return;
646 }
647 }
648 rpc_call_start(task);
649}
650
651static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
652{
653 struct nfs4_delegation *dp = calldata;
654 struct nfs4_client *clp = dp->dl_client;
655
656 dprintk("%s: minorversion=%d\n", __func__,
657 clp->cl_cb_conn.cb_minorversion);
658
659 if (clp->cl_cb_conn.cb_minorversion) {
660 /* No need for lock, access serialized in nfsd4_cb_prepare */
661 ++clp->cl_cb_seq_nr;
662 clear_bit(0, &clp->cl_cb_slot_busy);
663 rpc_wake_up_next(&clp->cl_cb_waitq);
664 dprintk("%s: freed slot, new seqid=%d\n", __func__,
665 clp->cl_cb_seq_nr);
666
667 /* We're done looking into the sequence information */
668 task->tk_msg.rpc_resp = NULL;
669 }
670}
671
506static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 672static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
507{ 673{
508 struct nfs4_delegation *dp = calldata; 674 struct nfs4_delegation *dp = calldata;
509 struct nfs4_client *clp = dp->dl_client; 675 struct nfs4_client *clp = dp->dl_client;
510 676
677 nfsd4_cb_done(task, calldata);
678
511 switch (task->tk_status) { 679 switch (task->tk_status) {
512 case -EIO: 680 case -EIO:
513 /* Network partition? */ 681 /* Network partition? */
@@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
520 break; 688 break;
521 default: 689 default:
522 /* success, or error we can't handle */ 690 /* success, or error we can't handle */
523 return; 691 goto done;
524 } 692 }
525 if (dp->dl_retries--) { 693 if (dp->dl_retries--) {
526 rpc_delay(task, 2*HZ); 694 rpc_delay(task, 2*HZ);
527 task->tk_status = 0; 695 task->tk_status = 0;
528 rpc_restart_call(task); 696 rpc_restart_call(task);
697 return;
529 } else { 698 } else {
530 atomic_set(&clp->cl_cb_conn.cb_set, 0); 699 atomic_set(&clp->cl_cb_conn.cb_set, 0);
531 warn_no_callback_path(clp, task->tk_status); 700 warn_no_callback_path(clp, task->tk_status);
532 } 701 }
702done:
703 kfree(task->tk_msg.rpc_argp);
533} 704}
534 705
535static void nfsd4_cb_recall_release(void *calldata) 706static void nfsd4_cb_recall_release(void *calldata)
@@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata)
542} 713}
543 714
544static const struct rpc_call_ops nfsd4_cb_recall_ops = { 715static const struct rpc_call_ops nfsd4_cb_recall_ops = {
716 .rpc_call_prepare = nfsd4_cb_prepare,
545 .rpc_call_done = nfsd4_cb_recall_done, 717 .rpc_call_done = nfsd4_cb_recall_done,
546 .rpc_release = nfsd4_cb_recall_release, 718 .rpc_release = nfsd4_cb_recall_release,
547}; 719};
@@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
554{ 726{
555 struct nfs4_client *clp = dp->dl_client; 727 struct nfs4_client *clp = dp->dl_client;
556 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 728 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
729 struct nfs4_rpc_args *args;
557 struct rpc_message msg = { 730 struct rpc_message msg = {
558 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 731 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
559 .rpc_argp = dp, 732 .rpc_cred = callback_cred
560 .rpc_cred = clp->cl_cb_conn.cb_cred
561 }; 733 };
562 int status; 734 int status = -ENOMEM;
563 735
736 args = kzalloc(sizeof(*args), GFP_KERNEL);
737 if (!args)
738 goto out;
739 args->args_op = dp;
740 msg.rpc_argp = args;
564 dp->dl_retries = 1; 741 dp->dl_retries = 1;
565 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 742 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
566 &nfsd4_cb_recall_ops, dp); 743 &nfsd4_cb_recall_ops, dp);
744out:
567 if (status) { 745 if (status) {
746 kfree(args);
568 put_nfs4_client(clp); 747 put_nfs4_client(clp);
569 nfs4_put_delegation(dp); 748 nfs4_put_delegation(dp);
570 } 749 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index cdfa86fa1471..ba2c199592fd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
38#include <linux/init.h> 38#include <linux/init.h>
39 39
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/utsname.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7c8801769a3c..bebc0c2e1b0a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -68,7 +68,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
68 u32 *bmval, u32 *writable) 68 u32 *bmval, u32 *writable)
69{ 69{
70 struct dentry *dentry = cstate->current_fh.fh_dentry; 70 struct dentry *dentry = cstate->current_fh.fh_dentry;
71 struct svc_export *exp = cstate->current_fh.fh_export;
72 71
73 /* 72 /*
74 * Check about attributes are supported by the NFSv4 server or not. 73 * Check about attributes are supported by the NFSv4 server or not.
@@ -80,17 +79,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
80 return nfserr_attrnotsupp; 79 return nfserr_attrnotsupp;
81 80
82 /* 81 /*
83 * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported 82 * Check FATTR4_WORD0_ACL can be supported
84 * in current environment or not. 83 * in current environment or not.
85 */ 84 */
86 if (bmval[0] & FATTR4_WORD0_ACL) { 85 if (bmval[0] & FATTR4_WORD0_ACL) {
87 if (!IS_POSIXACL(dentry->d_inode)) 86 if (!IS_POSIXACL(dentry->d_inode))
88 return nfserr_attrnotsupp; 87 return nfserr_attrnotsupp;
89 } 88 }
90 if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
91 if (exp->ex_fslocs.locations == NULL)
92 return nfserr_attrnotsupp;
93 }
94 89
95 /* 90 /*
96 * According to spec, read-only attributes return ERR_INVAL. 91 * According to spec, read-only attributes return ERR_INVAL.
@@ -123,6 +118,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp,
123 return status; 118 return status;
124} 119}
125 120
121static int
122is_create_with_attrs(struct nfsd4_open *open)
123{
124 return open->op_create == NFS4_OPEN_CREATE
125 && (open->op_createmode == NFS4_CREATE_UNCHECKED
126 || open->op_createmode == NFS4_CREATE_GUARDED
127 || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
128}
129
130/*
131 * if error occurs when setting the acl, just clear the acl bit
132 * in the returned attr bitmap.
133 */
134static void
135do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
136 struct nfs4_acl *acl, u32 *bmval)
137{
138 __be32 status;
139
140 status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
141 if (status)
142 /*
143 * We should probably fail the whole open at this point,
144 * but we've already created the file, so it's too late;
145 * So this seems the least of evils:
146 */
147 bmval[0] &= ~FATTR4_WORD0_ACL;
148}
149
126static inline void 150static inline void
127fh_dup2(struct svc_fh *dst, struct svc_fh *src) 151fh_dup2(struct svc_fh *dst, struct svc_fh *src)
128{ 152{
@@ -206,6 +230,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
206 if (status) 230 if (status)
207 goto out; 231 goto out;
208 232
233 if (is_create_with_attrs(open) && open->op_acl != NULL)
234 do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
235
209 set_change_info(&open->op_cinfo, current_fh); 236 set_change_info(&open->op_cinfo, current_fh);
210 fh_dup2(current_fh, &resfh); 237 fh_dup2(current_fh, &resfh);
211 238
@@ -536,12 +563,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
536 status = nfserr_badtype; 563 status = nfserr_badtype;
537 } 564 }
538 565
539 if (!status) { 566 if (status)
540 fh_unlock(&cstate->current_fh); 567 goto out;
541 set_change_info(&create->cr_cinfo, &cstate->current_fh); 568
542 fh_dup2(&cstate->current_fh, &resfh); 569 if (create->cr_acl != NULL)
543 } 570 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
571 create->cr_bmval);
544 572
573 fh_unlock(&cstate->current_fh);
574 set_change_info(&create->cr_cinfo, &cstate->current_fh);
575 fh_dup2(&cstate->current_fh, &resfh);
576out:
545 fh_put(&resfh); 577 fh_put(&resfh);
546 return status; 578 return status;
547} 579}
@@ -947,34 +979,6 @@ static struct nfsd4_operation nfsd4_ops[];
947static const char *nfsd4_op_name(unsigned opnum); 979static const char *nfsd4_op_name(unsigned opnum);
948 980
949/* 981/*
950 * This is a replay of a compound for which no cache entry pages
951 * were used. Encode the sequence operation, and if cachethis is FALSE
952 * encode the uncache rep error on the next operation.
953 */
954static __be32
955nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
956 struct nfsd4_compoundres *resp)
957{
958 struct nfsd4_op *op;
959
960 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
961 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
962
963 /* Encode the replayed sequence operation */
964 BUG_ON(resp->opcnt != 1);
965 op = &args->ops[resp->opcnt - 1];
966 nfsd4_encode_operation(resp, op);
967
968 /*return nfserr_retry_uncached_rep in next operation. */
969 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
970 op = &args->ops[resp->opcnt++];
971 op->status = nfserr_retry_uncached_rep;
972 nfsd4_encode_operation(resp, op);
973 }
974 return op->status;
975}
976
977/*
978 * Enforce NFSv4.1 COMPOUND ordering rules. 982 * Enforce NFSv4.1 COMPOUND ordering rules.
979 * 983 *
980 * TODO: 984 * TODO:
@@ -1083,13 +1087,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1083 BUG_ON(op->status == nfs_ok); 1087 BUG_ON(op->status == nfs_ok);
1084 1088
1085encode_op: 1089encode_op:
1086 /* Only from SEQUENCE or CREATE_SESSION */ 1090 /* Only from SEQUENCE */
1087 if (resp->cstate.status == nfserr_replay_cache) { 1091 if (resp->cstate.status == nfserr_replay_cache) {
1088 dprintk("%s NFS4.1 replay from cache\n", __func__); 1092 dprintk("%s NFS4.1 replay from cache\n", __func__);
1089 if (nfsd4_not_cached(resp)) 1093 status = op->status;
1090 status = nfsd4_enc_uncached_replay(args, resp);
1091 else
1092 status = op->status;
1093 goto out; 1094 goto out;
1094 } 1095 }
1095 if (op->status == nfserr_replay_me) { 1096 if (op->status == nfserr_replay_me) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..2153f9bdbebd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,7 @@
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h> 57#include <linux/sunrpc/svcauth_gss.h>
58#include <linux/sunrpc/clnt.h>
58 59
59#define NFSDDBG_FACILITY NFSDDBG_PROC 60#define NFSDDBG_FACILITY NFSDDBG_PROC
60 61
@@ -413,36 +414,65 @@ gen_sessionid(struct nfsd4_session *ses)
413} 414}
414 415
415/* 416/*
416 * Give the client the number of slots it requests bound by 417 * The protocol defines ca_maxresponssize_cached to include the size of
417 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. 418 * the rpc header, but all we need to cache is the data starting after
419 * the end of the initial SEQUENCE operation--the rest we regenerate
420 * each time. Therefore we can advertise a ca_maxresponssize_cached
421 * value that is the number of bytes in our cache plus a few additional
422 * bytes. In order to stay on the safe side, and not promise more than
423 * we can cache, those additional bytes must be the minimum possible: 24
424 * bytes of rpc header (xid through accept state, with AUTH_NULL
425 * verifier), 12 for the compound header (with zero-length tag), and 44
426 * for the SEQUENCE op response:
427 */
428#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
429
430/*
431 * Give the client the number of ca_maxresponsesize_cached slots it
432 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
433 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
434 * than NFSD_MAX_SLOTS_PER_SESSION.
418 * 435 *
419 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we 436 * If we run out of reserved DRC memory we should (up to a point)
420 * should (up to a point) re-negotiate active sessions and reduce their 437 * re-negotiate active sessions and reduce their slot usage to make
421 * slot usage to make rooom for new connections. For now we just fail the 438 * rooom for new connections. For now we just fail the create session.
422 * create session.
423 */ 439 */
424static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) 440static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
425{ 441{
426 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; 442 int mem, size = fchan->maxresp_cached;
427 443
428 if (fchan->maxreqs < 1) 444 if (fchan->maxreqs < 1)
429 return nfserr_inval; 445 return nfserr_inval;
430 else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
431 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
432 446
433 spin_lock(&nfsd_serv->sv_lock); 447 if (size < NFSD_MIN_HDR_SEQ_SZ)
434 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) 448 size = NFSD_MIN_HDR_SEQ_SZ;
435 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; 449 size -= NFSD_MIN_HDR_SEQ_SZ;
436 nfsd_serv->sv_drc_pages_used += np; 450 if (size > NFSD_SLOT_CACHE_SIZE)
437 spin_unlock(&nfsd_serv->sv_lock); 451 size = NFSD_SLOT_CACHE_SIZE;
452
453 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
454 mem = fchan->maxreqs * size;
455 if (mem > NFSD_MAX_MEM_PER_SESSION) {
456 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
457 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
458 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
459 mem = fchan->maxreqs * size;
460 }
438 461
439 if (np <= 0) { 462 spin_lock(&nfsd_drc_lock);
440 status = nfserr_resource; 463 /* bound the total session drc memory ussage */
441 fchan->maxreqs = 0; 464 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
442 } else 465 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
443 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; 466 mem = fchan->maxreqs * size;
467 }
468 nfsd_drc_mem_used += mem;
469 spin_unlock(&nfsd_drc_lock);
444 470
445 return status; 471 if (fchan->maxreqs == 0)
472 return nfserr_serverfault;
473
474 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
475 return 0;
446} 476}
447 477
448/* 478/*
@@ -466,36 +496,41 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
466 fchan->maxresp_sz = maxcount; 496 fchan->maxresp_sz = maxcount;
467 session_fchan->maxresp_sz = fchan->maxresp_sz; 497 session_fchan->maxresp_sz = fchan->maxresp_sz;
468 498
469 /* Set the max response cached size our default which is
470 * a multiple of PAGE_SIZE and small */
471 session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
472 fchan->maxresp_cached = session_fchan->maxresp_cached;
473
474 /* Use the client's maxops if possible */ 499 /* Use the client's maxops if possible */
475 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 500 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
476 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 501 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
477 session_fchan->maxops = fchan->maxops; 502 session_fchan->maxops = fchan->maxops;
478 503
479 /* try to use the client requested number of slots */
480 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
481 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
482
483 /* FIXME: Error means no more DRC pages so the server should 504 /* FIXME: Error means no more DRC pages so the server should
484 * recover pages from existing sessions. For now fail session 505 * recover pages from existing sessions. For now fail session
485 * creation. 506 * creation.
486 */ 507 */
487 status = set_forechannel_maxreqs(fchan); 508 status = set_forechannel_drc_size(fchan);
488 509
510 session_fchan->maxresp_cached = fchan->maxresp_cached;
489 session_fchan->maxreqs = fchan->maxreqs; 511 session_fchan->maxreqs = fchan->maxreqs;
512
513 dprintk("%s status %d\n", __func__, status);
490 return status; 514 return status;
491} 515}
492 516
517static void
518free_session_slots(struct nfsd4_session *ses)
519{
520 int i;
521
522 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
523 kfree(ses->se_slots[i]);
524}
525
493static int 526static int
494alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, 527alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
495 struct nfsd4_create_session *cses) 528 struct nfsd4_create_session *cses)
496{ 529{
497 struct nfsd4_session *new, tmp; 530 struct nfsd4_session *new, tmp;
498 int idx, status = nfserr_resource, slotsize; 531 struct nfsd4_slot *sp;
532 int idx, slotsize, cachesize, i;
533 int status;
499 534
500 memset(&tmp, 0, sizeof(tmp)); 535 memset(&tmp, 0, sizeof(tmp));
501 536
@@ -506,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
506 if (status) 541 if (status)
507 goto out; 542 goto out;
508 543
509 /* allocate struct nfsd4_session and slot table in one piece */ 544 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
510 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot); 545 + sizeof(struct nfsd4_session) > PAGE_SIZE);
546
547 status = nfserr_serverfault;
548 /* allocate struct nfsd4_session and slot table pointers in one piece */
549 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
511 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); 550 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
512 if (!new) 551 if (!new)
513 goto out; 552 goto out;
514 553
515 memcpy(new, &tmp, sizeof(*new)); 554 memcpy(new, &tmp, sizeof(*new));
516 555
556 /* allocate each struct nfsd4_slot and data cache in one piece */
557 cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
558 for (i = 0; i < new->se_fchannel.maxreqs; i++) {
559 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
560 if (!sp)
561 goto out_free;
562 new->se_slots[i] = sp;
563 }
564
517 new->se_client = clp; 565 new->se_client = clp;
518 gen_sessionid(new); 566 gen_sessionid(new);
519 idx = hash_sessionid(&new->se_sessionid); 567 idx = hash_sessionid(&new->se_sessionid);
@@ -530,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
530 status = nfs_ok; 578 status = nfs_ok;
531out: 579out:
532 return status; 580 return status;
581out_free:
582 free_session_slots(new);
583 kfree(new);
584 goto out;
533} 585}
534 586
535/* caller must hold sessionid_lock */ 587/* caller must hold sessionid_lock */
@@ -572,19 +624,16 @@ release_session(struct nfsd4_session *ses)
572 nfsd4_put_session(ses); 624 nfsd4_put_session(ses);
573} 625}
574 626
575static void nfsd4_release_respages(struct page **respages, short resused);
576
577void 627void
578free_session(struct kref *kref) 628free_session(struct kref *kref)
579{ 629{
580 struct nfsd4_session *ses; 630 struct nfsd4_session *ses;
581 int i;
582 631
583 ses = container_of(kref, struct nfsd4_session, se_ref); 632 ses = container_of(kref, struct nfsd4_session, se_ref);
584 for (i = 0; i < ses->se_fchannel.maxreqs; i++) { 633 spin_lock(&nfsd_drc_lock);
585 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; 634 nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
586 nfsd4_release_respages(e->ce_respages, e->ce_resused); 635 spin_unlock(&nfsd_drc_lock);
587 } 636 free_session_slots(ses);
588 kfree(ses); 637 kfree(ses);
589} 638}
590 639
@@ -647,18 +696,14 @@ shutdown_callback_client(struct nfs4_client *clp)
647 clp->cl_cb_conn.cb_client = NULL; 696 clp->cl_cb_conn.cb_client = NULL;
648 rpc_shutdown_client(clnt); 697 rpc_shutdown_client(clnt);
649 } 698 }
650 if (clp->cl_cb_conn.cb_cred) {
651 put_rpccred(clp->cl_cb_conn.cb_cred);
652 clp->cl_cb_conn.cb_cred = NULL;
653 }
654} 699}
655 700
656static inline void 701static inline void
657free_client(struct nfs4_client *clp) 702free_client(struct nfs4_client *clp)
658{ 703{
659 shutdown_callback_client(clp); 704 shutdown_callback_client(clp);
660 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, 705 if (clp->cl_cb_xprt)
661 clp->cl_slot.sl_cache_entry.ce_resused); 706 svc_xprt_put(clp->cl_cb_xprt);
662 if (clp->cl_cred.cr_group_info) 707 if (clp->cl_cred.cr_group_info)
663 put_group_info(clp->cl_cred.cr_group_info); 708 put_group_info(clp->cl_cred.cr_group_info);
664 kfree(clp->cl_principal); 709 kfree(clp->cl_principal);
@@ -714,25 +759,6 @@ expire_client(struct nfs4_client *clp)
714 put_nfs4_client(clp); 759 put_nfs4_client(clp);
715} 760}
716 761
717static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
718{
719 struct nfs4_client *clp;
720
721 clp = alloc_client(name);
722 if (clp == NULL)
723 return NULL;
724 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
725 atomic_set(&clp->cl_count, 1);
726 atomic_set(&clp->cl_cb_conn.cb_set, 0);
727 INIT_LIST_HEAD(&clp->cl_idhash);
728 INIT_LIST_HEAD(&clp->cl_strhash);
729 INIT_LIST_HEAD(&clp->cl_openowners);
730 INIT_LIST_HEAD(&clp->cl_delegations);
731 INIT_LIST_HEAD(&clp->cl_sessions);
732 INIT_LIST_HEAD(&clp->cl_lru);
733 return clp;
734}
735
736static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 762static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
737{ 763{
738 memcpy(target->cl_verifier.data, source->data, 764 memcpy(target->cl_verifier.data, source->data,
@@ -795,6 +821,46 @@ static void gen_confirm(struct nfs4_client *clp)
795 *p++ = i++; 821 *p++ = i++;
796} 822}
797 823
824static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
825 struct svc_rqst *rqstp, nfs4_verifier *verf)
826{
827 struct nfs4_client *clp;
828 struct sockaddr *sa = svc_addr(rqstp);
829 char *princ;
830
831 clp = alloc_client(name);
832 if (clp == NULL)
833 return NULL;
834
835 princ = svc_gss_principal(rqstp);
836 if (princ) {
837 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
838 if (clp->cl_principal == NULL) {
839 free_client(clp);
840 return NULL;
841 }
842 }
843
844 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
845 atomic_set(&clp->cl_count, 1);
846 atomic_set(&clp->cl_cb_conn.cb_set, 0);
847 INIT_LIST_HEAD(&clp->cl_idhash);
848 INIT_LIST_HEAD(&clp->cl_strhash);
849 INIT_LIST_HEAD(&clp->cl_openowners);
850 INIT_LIST_HEAD(&clp->cl_delegations);
851 INIT_LIST_HEAD(&clp->cl_sessions);
852 INIT_LIST_HEAD(&clp->cl_lru);
853 clear_bit(0, &clp->cl_cb_slot_busy);
854 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
855 copy_verf(clp, verf);
856 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
857 clp->cl_flavor = rqstp->rq_flavor;
858 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
859 gen_confirm(clp);
860
861 return clp;
862}
863
798static int check_name(struct xdr_netobj name) 864static int check_name(struct xdr_netobj name)
799{ 865{
800 if (name.len == 0) 866 if (name.len == 0)
@@ -902,93 +968,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
902 return NULL; 968 return NULL;
903} 969}
904 970
905/* a helper function for parse_callback */
906static int
907parse_octet(unsigned int *lenp, char **addrp)
908{
909 unsigned int len = *lenp;
910 char *p = *addrp;
911 int n = -1;
912 char c;
913
914 for (;;) {
915 if (!len)
916 break;
917 len--;
918 c = *p++;
919 if (c == '.')
920 break;
921 if ((c < '0') || (c > '9')) {
922 n = -1;
923 break;
924 }
925 if (n < 0)
926 n = 0;
927 n = (n * 10) + (c - '0');
928 if (n > 255) {
929 n = -1;
930 break;
931 }
932 }
933 *lenp = len;
934 *addrp = p;
935 return n;
936}
937
938/* parse and set the setclientid ipv4 callback address */
939static int
940parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
941{
942 int temp = 0;
943 u32 cbaddr = 0;
944 u16 cbport = 0;
945 u32 addrlen = addr_len;
946 char *addr = addr_val;
947 int i, shift;
948
949 /* ipaddress */
950 shift = 24;
951 for(i = 4; i > 0 ; i--) {
952 if ((temp = parse_octet(&addrlen, &addr)) < 0) {
953 return 0;
954 }
955 cbaddr |= (temp << shift);
956 if (shift > 0)
957 shift -= 8;
958 }
959 *cbaddrp = cbaddr;
960
961 /* port */
962 shift = 8;
963 for(i = 2; i > 0 ; i--) {
964 if ((temp = parse_octet(&addrlen, &addr)) < 0) {
965 return 0;
966 }
967 cbport |= (temp << shift);
968 if (shift > 0)
969 shift -= 8;
970 }
971 *cbportp = cbport;
972 return 1;
973}
974
975static void 971static void
976gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) 972gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
977{ 973{
978 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 974 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
979 975 unsigned short expected_family;
980 /* Currently, we only support tcp for the callback channel */ 976
981 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) 977 /* Currently, we only support tcp and tcp6 for the callback channel */
978 if (se->se_callback_netid_len == 3 &&
979 !memcmp(se->se_callback_netid_val, "tcp", 3))
980 expected_family = AF_INET;
981 else if (se->se_callback_netid_len == 4 &&
982 !memcmp(se->se_callback_netid_val, "tcp6", 4))
983 expected_family = AF_INET6;
984 else
982 goto out_err; 985 goto out_err;
983 986
984 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, 987 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
985 &cb->cb_addr, &cb->cb_port))) 988 se->se_callback_addr_len,
989 (struct sockaddr *) &cb->cb_addr,
990 sizeof(cb->cb_addr));
991
992 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
986 goto out_err; 993 goto out_err;
994
995 if (cb->cb_addr.ss_family == AF_INET6)
996 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
997
987 cb->cb_minorversion = 0; 998 cb->cb_minorversion = 0;
988 cb->cb_prog = se->se_callback_prog; 999 cb->cb_prog = se->se_callback_prog;
989 cb->cb_ident = se->se_callback_ident; 1000 cb->cb_ident = se->se_callback_ident;
990 return; 1001 return;
991out_err: 1002out_err:
1003 cb->cb_addr.ss_family = AF_UNSPEC;
1004 cb->cb_addrlen = 0;
992 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1005 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
993 "will not receive delegations\n", 1006 "will not receive delegations\n",
994 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1007 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -996,175 +1009,87 @@ out_err:
996 return; 1009 return;
997} 1010}
998 1011
999void
1000nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
1001{
1002 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1003
1004 resp->cstate.statp = statp;
1005}
1006
1007/* 1012/*
1008 * Dereference the result pages. 1013 * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
1009 */ 1014 */
1010static void 1015void
1011nfsd4_release_respages(struct page **respages, short resused) 1016nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1012{ 1017{
1013 int i; 1018 struct nfsd4_slot *slot = resp->cstate.slot;
1019 unsigned int base;
1014 1020
1015 dprintk("--> %s\n", __func__); 1021 dprintk("--> %s slot %p\n", __func__, slot);
1016 for (i = 0; i < resused; i++) {
1017 if (!respages[i])
1018 continue;
1019 put_page(respages[i]);
1020 respages[i] = NULL;
1021 }
1022}
1023 1022
1024static void 1023 slot->sl_opcnt = resp->opcnt;
1025nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) 1024 slot->sl_status = resp->cstate.status;
1026{
1027 int i;
1028 1025
1029 for (i = 0; i < count; i++) { 1026 if (nfsd4_not_cached(resp)) {
1030 topages[i] = frompages[i]; 1027 slot->sl_datalen = 0;
1031 if (!topages[i]) 1028 return;
1032 continue;
1033 get_page(topages[i]);
1034 } 1029 }
1030 slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
1031 base = (char *)resp->cstate.datap -
1032 (char *)resp->xbuf->head[0].iov_base;
1033 if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
1034 slot->sl_datalen))
1035 WARN("%s: sessions DRC could not cache compound\n", __func__);
1036 return;
1035} 1037}
1036 1038
1037/* 1039/*
1038 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous 1040 * Encode the replay sequence operation from the slot values.
1039 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total 1041 * If cachethis is FALSE encode the uncached rep error on the next
1040 * length of the XDR response is less than se_fmaxresp_cached 1042 * operation which sets resp->p and increments resp->opcnt for
1041 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a 1043 * nfs4svc_encode_compoundres.
1042 * of the reply (e.g. readdir).
1043 * 1044 *
1044 * Store the base and length of the rq_req.head[0] page
1045 * of the NFSv4.1 data, just past the rpc header.
1046 */ 1045 */
1047void 1046static __be32
1048nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 1047nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
1048 struct nfsd4_compoundres *resp)
1049{ 1049{
1050 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; 1050 struct nfsd4_op *op;
1051 struct svc_rqst *rqstp = resp->rqstp; 1051 struct nfsd4_slot *slot = resp->cstate.slot;
1052 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1053 struct nfsd4_op *op = &args->ops[resp->opcnt];
1054 struct kvec *resv = &rqstp->rq_res.head[0];
1055
1056 dprintk("--> %s entry %p\n", __func__, entry);
1057
1058 /* Don't cache a failed OP_SEQUENCE. */
1059 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1060 return;
1061 1052
1062 nfsd4_release_respages(entry->ce_respages, entry->ce_resused); 1053 dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
1063 entry->ce_opcnt = resp->opcnt; 1054 resp->opcnt, resp->cstate.slot->sl_cachethis);
1064 entry->ce_status = resp->cstate.status;
1065 1055
1066 /* 1056 /* Encode the replayed sequence operation */
1067 * Don't need a page to cache just the sequence operation - the slot 1057 op = &args->ops[resp->opcnt - 1];
1068 * does this for us! 1058 nfsd4_encode_operation(resp, op);
1069 */
1070 1059
1071 if (nfsd4_not_cached(resp)) { 1060 /* Return nfserr_retry_uncached_rep in next operation. */
1072 entry->ce_resused = 0; 1061 if (args->opcnt > 1 && slot->sl_cachethis == 0) {
1073 entry->ce_rpchdrlen = 0; 1062 op = &args->ops[resp->opcnt++];
1074 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, 1063 op->status = nfserr_retry_uncached_rep;
1075 resp->cstate.slot->sl_cache_entry.ce_cachethis); 1064 nfsd4_encode_operation(resp, op);
1076 return;
1077 }
1078 entry->ce_resused = rqstp->rq_resused;
1079 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1080 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1081 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1082 entry->ce_resused);
1083 entry->ce_datav.iov_base = resp->cstate.statp;
1084 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1085 (char *)page_address(rqstp->rq_respages[0]));
1086 /* Current request rpc header length*/
1087 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1088 (char *)page_address(rqstp->rq_respages[0]);
1089}
1090
1091/*
1092 * We keep the rpc header, but take the nfs reply from the replycache.
1093 */
1094static int
1095nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1096 struct nfsd4_cache_entry *entry)
1097{
1098 struct svc_rqst *rqstp = resp->rqstp;
1099 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1100 int len;
1101
1102 /* Current request rpc header length*/
1103 len = (char *)resp->cstate.statp -
1104 (char *)page_address(rqstp->rq_respages[0]);
1105 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1106 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1107 entry->ce_datav.iov_len);
1108 return 0;
1109 } 1065 }
1110 /* copy the cached reply nfsd data past the current rpc header */ 1066 return op->status;
1111 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1112 entry->ce_datav.iov_len);
1113 resv->iov_len = len + entry->ce_datav.iov_len;
1114 return 1;
1115} 1067}
1116 1068
1117/* 1069/*
1118 * Keep the first page of the replay. Copy the NFSv4.1 data from the first 1070 * The sequence operation is not cached because we can use the slot and
1119 * cached page. Replace any futher replay pages from the cache. 1071 * session values.
1120 */ 1072 */
1121__be32 1073__be32
1122nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 1074nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1123 struct nfsd4_sequence *seq) 1075 struct nfsd4_sequence *seq)
1124{ 1076{
1125 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; 1077 struct nfsd4_slot *slot = resp->cstate.slot;
1126 __be32 status; 1078 __be32 status;
1127 1079
1128 dprintk("--> %s entry %p\n", __func__, entry); 1080 dprintk("--> %s slot %p\n", __func__, slot);
1129
1130 /*
1131 * If this is just the sequence operation, we did not keep
1132 * a page in the cache entry because we can just use the
1133 * slot info stored in struct nfsd4_sequence that was checked
1134 * against the slot in nfsd4_sequence().
1135 *
1136 * This occurs when seq->cachethis is FALSE, or when the client
1137 * session inactivity timer fires and a solo sequence operation
1138 * is sent (lease renewal).
1139 */
1140 if (seq && nfsd4_not_cached(resp)) {
1141 seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
1142 return nfs_ok;
1143 }
1144
1145 if (!nfsd41_copy_replay_data(resp, entry)) {
1146 /*
1147 * Not enough room to use the replay rpc header, send the
1148 * cached header. Release all the allocated result pages.
1149 */
1150 svc_free_res_pages(resp->rqstp);
1151 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1152 entry->ce_resused);
1153 } else {
1154 /* Release all but the first allocated result page */
1155 1081
1156 resp->rqstp->rq_resused--; 1082 /* Either returns 0 or nfserr_retry_uncached */
1157 svc_free_res_pages(resp->rqstp); 1083 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
1084 if (status == nfserr_retry_uncached_rep)
1085 return status;
1158 1086
1159 nfsd4_copy_pages(&resp->rqstp->rq_respages[1], 1087 /* The sequence operation has been encoded, cstate->datap set. */
1160 &entry->ce_respages[1], 1088 memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
1161 entry->ce_resused - 1);
1162 }
1163 1089
1164 resp->rqstp->rq_resused = entry->ce_resused; 1090 resp->opcnt = slot->sl_opcnt;
1165 resp->opcnt = entry->ce_opcnt; 1091 resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
1166 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; 1092 status = slot->sl_status;
1167 status = entry->ce_status;
1168 1093
1169 return status; 1094 return status;
1170} 1095}
@@ -1194,13 +1119,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1194 int status; 1119 int status;
1195 unsigned int strhashval; 1120 unsigned int strhashval;
1196 char dname[HEXDIR_LEN]; 1121 char dname[HEXDIR_LEN];
1122 char addr_str[INET6_ADDRSTRLEN];
1197 nfs4_verifier verf = exid->verifier; 1123 nfs4_verifier verf = exid->verifier;
1198 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; 1124 struct sockaddr *sa = svc_addr(rqstp);
1199 1125
1126 rpc_ntop(sa, addr_str, sizeof(addr_str));
1200 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " 1127 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1201 " ip_addr=%u flags %x, spa_how %d\n", 1128 "ip_addr=%s flags %x, spa_how %d\n",
1202 __func__, rqstp, exid, exid->clname.len, exid->clname.data, 1129 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1203 ip_addr, exid->flags, exid->spa_how); 1130 addr_str, exid->flags, exid->spa_how);
1204 1131
1205 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) 1132 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1206 return nfserr_inval; 1133 return nfserr_inval;
@@ -1281,28 +1208,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1281 1208
1282out_new: 1209out_new:
1283 /* Normal case */ 1210 /* Normal case */
1284 new = create_client(exid->clname, dname); 1211 new = create_client(exid->clname, dname, rqstp, &verf);
1285 if (new == NULL) { 1212 if (new == NULL) {
1286 status = nfserr_resource; 1213 status = nfserr_serverfault;
1287 goto out; 1214 goto out;
1288 } 1215 }
1289 1216
1290 copy_verf(new, &verf);
1291 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1292 new->cl_addr = ip_addr;
1293 gen_clid(new); 1217 gen_clid(new);
1294 gen_confirm(new);
1295 add_to_unconfirmed(new, strhashval); 1218 add_to_unconfirmed(new, strhashval);
1296out_copy: 1219out_copy:
1297 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 1220 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1298 exid->clientid.cl_id = new->cl_clientid.cl_id; 1221 exid->clientid.cl_id = new->cl_clientid.cl_id;
1299 1222
1300 new->cl_slot.sl_seqid = 0;
1301 exid->seqid = 1; 1223 exid->seqid = 1;
1302 nfsd4_set_ex_flags(new, exid); 1224 nfsd4_set_ex_flags(new, exid);
1303 1225
1304 dprintk("nfsd4_exchange_id seqid %d flags %x\n", 1226 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1305 new->cl_slot.sl_seqid, new->cl_exchange_flags); 1227 new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
1306 status = nfs_ok; 1228 status = nfs_ok;
1307 1229
1308out: 1230out:
@@ -1313,40 +1235,60 @@ error:
1313} 1235}
1314 1236
1315static int 1237static int
1316check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) 1238check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
1317{ 1239{
1318 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, 1240 dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
1319 slot->sl_seqid); 1241 slot_seqid);
1320 1242
1321 /* The slot is in use, and no response has been sent. */ 1243 /* The slot is in use, and no response has been sent. */
1322 if (slot->sl_inuse) { 1244 if (slot_inuse) {
1323 if (seqid == slot->sl_seqid) 1245 if (seqid == slot_seqid)
1324 return nfserr_jukebox; 1246 return nfserr_jukebox;
1325 else 1247 else
1326 return nfserr_seq_misordered; 1248 return nfserr_seq_misordered;
1327 } 1249 }
1328 /* Normal */ 1250 /* Normal */
1329 if (likely(seqid == slot->sl_seqid + 1)) 1251 if (likely(seqid == slot_seqid + 1))
1330 return nfs_ok; 1252 return nfs_ok;
1331 /* Replay */ 1253 /* Replay */
1332 if (seqid == slot->sl_seqid) 1254 if (seqid == slot_seqid)
1333 return nfserr_replay_cache; 1255 return nfserr_replay_cache;
1334 /* Wraparound */ 1256 /* Wraparound */
1335 if (seqid == 1 && (slot->sl_seqid + 1) == 0) 1257 if (seqid == 1 && (slot_seqid + 1) == 0)
1336 return nfs_ok; 1258 return nfs_ok;
1337 /* Misordered replay or misordered new request */ 1259 /* Misordered replay or misordered new request */
1338 return nfserr_seq_misordered; 1260 return nfserr_seq_misordered;
1339} 1261}
1340 1262
1263/*
1264 * Cache the create session result into the create session single DRC
1265 * slot cache by saving the xdr structure. sl_seqid has been set.
1266 * Do this for solo or embedded create session operations.
1267 */
1268static void
1269nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
1270 struct nfsd4_clid_slot *slot, int nfserr)
1271{
1272 slot->sl_status = nfserr;
1273 memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
1274}
1275
1276static __be32
1277nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
1278 struct nfsd4_clid_slot *slot)
1279{
1280 memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
1281 return slot->sl_status;
1282}
1283
1341__be32 1284__be32
1342nfsd4_create_session(struct svc_rqst *rqstp, 1285nfsd4_create_session(struct svc_rqst *rqstp,
1343 struct nfsd4_compound_state *cstate, 1286 struct nfsd4_compound_state *cstate,
1344 struct nfsd4_create_session *cr_ses) 1287 struct nfsd4_create_session *cr_ses)
1345{ 1288{
1346 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; 1289 struct sockaddr *sa = svc_addr(rqstp);
1347 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1348 struct nfs4_client *conf, *unconf; 1290 struct nfs4_client *conf, *unconf;
1349 struct nfsd4_slot *slot = NULL; 1291 struct nfsd4_clid_slot *cs_slot = NULL;
1350 int status = 0; 1292 int status = 0;
1351 1293
1352 nfs4_lock_state(); 1294 nfs4_lock_state();
@@ -1354,40 +1296,38 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1354 conf = find_confirmed_client(&cr_ses->clientid); 1296 conf = find_confirmed_client(&cr_ses->clientid);
1355 1297
1356 if (conf) { 1298 if (conf) {
1357 slot = &conf->cl_slot; 1299 cs_slot = &conf->cl_cs_slot;
1358 status = check_slot_seqid(cr_ses->seqid, slot); 1300 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1359 if (status == nfserr_replay_cache) { 1301 if (status == nfserr_replay_cache) {
1360 dprintk("Got a create_session replay! seqid= %d\n", 1302 dprintk("Got a create_session replay! seqid= %d\n",
1361 slot->sl_seqid); 1303 cs_slot->sl_seqid);
1362 cstate->slot = slot;
1363 cstate->status = status;
1364 /* Return the cached reply status */ 1304 /* Return the cached reply status */
1365 status = nfsd4_replay_cache_entry(resp, NULL); 1305 status = nfsd4_replay_create_session(cr_ses, cs_slot);
1366 goto out; 1306 goto out;
1367 } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { 1307 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
1368 status = nfserr_seq_misordered; 1308 status = nfserr_seq_misordered;
1369 dprintk("Sequence misordered!\n"); 1309 dprintk("Sequence misordered!\n");
1370 dprintk("Expected seqid= %d but got seqid= %d\n", 1310 dprintk("Expected seqid= %d but got seqid= %d\n",
1371 slot->sl_seqid, cr_ses->seqid); 1311 cs_slot->sl_seqid, cr_ses->seqid);
1372 goto out; 1312 goto out;
1373 } 1313 }
1374 conf->cl_slot.sl_seqid++; 1314 cs_slot->sl_seqid++;
1375 } else if (unconf) { 1315 } else if (unconf) {
1376 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1316 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1377 (ip_addr != unconf->cl_addr)) { 1317 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
1378 status = nfserr_clid_inuse; 1318 status = nfserr_clid_inuse;
1379 goto out; 1319 goto out;
1380 } 1320 }
1381 1321
1382 slot = &unconf->cl_slot; 1322 cs_slot = &unconf->cl_cs_slot;
1383 status = check_slot_seqid(cr_ses->seqid, slot); 1323 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1384 if (status) { 1324 if (status) {
1385 /* an unconfirmed replay returns misordered */ 1325 /* an unconfirmed replay returns misordered */
1386 status = nfserr_seq_misordered; 1326 status = nfserr_seq_misordered;
1387 goto out; 1327 goto out_cache;
1388 } 1328 }
1389 1329
1390 slot->sl_seqid++; /* from 0 to 1 */ 1330 cs_slot->sl_seqid++; /* from 0 to 1 */
1391 move_to_confirmed(unconf); 1331 move_to_confirmed(unconf);
1392 1332
1393 /* 1333 /*
@@ -1396,6 +1336,19 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1396 cr_ses->flags &= ~SESSION4_PERSIST; 1336 cr_ses->flags &= ~SESSION4_PERSIST;
1397 cr_ses->flags &= ~SESSION4_RDMA; 1337 cr_ses->flags &= ~SESSION4_RDMA;
1398 1338
1339 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1340 unconf->cl_cb_xprt = rqstp->rq_xprt;
1341 svc_xprt_get(unconf->cl_cb_xprt);
1342 rpc_copy_addr(
1343 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1344 sa);
1345 unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
1346 unconf->cl_cb_conn.cb_minorversion =
1347 cstate->minorversion;
1348 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1349 unconf->cl_cb_seq_nr = 1;
1350 nfsd4_probe_callback(unconf);
1351 }
1399 conf = unconf; 1352 conf = unconf;
1400 } else { 1353 } else {
1401 status = nfserr_stale_clientid; 1354 status = nfserr_stale_clientid;
@@ -1408,12 +1361,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1408 1361
1409 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, 1362 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1410 NFS4_MAX_SESSIONID_LEN); 1363 NFS4_MAX_SESSIONID_LEN);
1411 cr_ses->seqid = slot->sl_seqid; 1364 cr_ses->seqid = cs_slot->sl_seqid;
1412 1365
1413 slot->sl_inuse = true; 1366out_cache:
1414 cstate->slot = slot; 1367 /* cache solo and embedded create sessions under the state lock */
1415 /* Ensure a page is used for the cache */ 1368 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1416 slot->sl_cache_entry.ce_cachethis = 1;
1417out: 1369out:
1418 nfs4_unlock_state(); 1370 nfs4_unlock_state();
1419 dprintk("%s returns %d\n", __func__, ntohl(status)); 1371 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1478,18 +1430,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1478 if (seq->slotid >= session->se_fchannel.maxreqs) 1430 if (seq->slotid >= session->se_fchannel.maxreqs)
1479 goto out; 1431 goto out;
1480 1432
1481 slot = &session->se_slots[seq->slotid]; 1433 slot = session->se_slots[seq->slotid];
1482 dprintk("%s: slotid %d\n", __func__, seq->slotid); 1434 dprintk("%s: slotid %d\n", __func__, seq->slotid);
1483 1435
1484 status = check_slot_seqid(seq->seqid, slot); 1436 /* We do not negotiate the number of slots yet, so set the
1437 * maxslots to the session maxreqs which is used to encode
1438 * sr_highest_slotid and the sr_target_slot id to maxslots */
1439 seq->maxslots = session->se_fchannel.maxreqs;
1440
1441 status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse);
1485 if (status == nfserr_replay_cache) { 1442 if (status == nfserr_replay_cache) {
1486 cstate->slot = slot; 1443 cstate->slot = slot;
1487 cstate->session = session; 1444 cstate->session = session;
1488 /* Return the cached reply status and set cstate->status 1445 /* Return the cached reply status and set cstate->status
1489 * for nfsd4_svc_encode_compoundres processing */ 1446 * for nfsd4_proc_compound processing */
1490 status = nfsd4_replay_cache_entry(resp, seq); 1447 status = nfsd4_replay_cache_entry(resp, seq);
1491 cstate->status = nfserr_replay_cache; 1448 cstate->status = nfserr_replay_cache;
1492 goto replay_cache; 1449 goto out;
1493 } 1450 }
1494 if (status) 1451 if (status)
1495 goto out; 1452 goto out;
@@ -1497,23 +1454,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1497 /* Success! bump slot seqid */ 1454 /* Success! bump slot seqid */
1498 slot->sl_inuse = true; 1455 slot->sl_inuse = true;
1499 slot->sl_seqid = seq->seqid; 1456 slot->sl_seqid = seq->seqid;
1500 slot->sl_cache_entry.ce_cachethis = seq->cachethis; 1457 slot->sl_cachethis = seq->cachethis;
1501 /* Always set the cache entry cachethis for solo sequence */
1502 if (nfsd4_is_solo_sequence(resp))
1503 slot->sl_cache_entry.ce_cachethis = 1;
1504 1458
1505 cstate->slot = slot; 1459 cstate->slot = slot;
1506 cstate->session = session; 1460 cstate->session = session;
1507 1461
1508replay_cache: 1462 /* Hold a session reference until done processing the compound:
1509 /* Renew the clientid on success and on replay.
1510 * Hold a session reference until done processing the compound:
1511 * nfsd4_put_session called only if the cstate slot is set. 1463 * nfsd4_put_session called only if the cstate slot is set.
1512 */ 1464 */
1513 renew_client(session->se_client);
1514 nfsd4_get_session(session); 1465 nfsd4_get_session(session);
1515out: 1466out:
1516 spin_unlock(&sessionid_lock); 1467 spin_unlock(&sessionid_lock);
1468 /* Renew the clientid on success and on replay */
1469 if (cstate->session) {
1470 nfs4_lock_state();
1471 renew_client(session->se_client);
1472 nfs4_unlock_state();
1473 }
1517 dprintk("%s: return %d\n", __func__, ntohl(status)); 1474 dprintk("%s: return %d\n", __func__, ntohl(status));
1518 return status; 1475 return status;
1519} 1476}
@@ -1522,7 +1479,7 @@ __be32
1522nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1479nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1523 struct nfsd4_setclientid *setclid) 1480 struct nfsd4_setclientid *setclid)
1524{ 1481{
1525 struct sockaddr_in *sin = svc_addr_in(rqstp); 1482 struct sockaddr *sa = svc_addr(rqstp);
1526 struct xdr_netobj clname = { 1483 struct xdr_netobj clname = {
1527 .len = setclid->se_namelen, 1484 .len = setclid->se_namelen,
1528 .data = setclid->se_name, 1485 .data = setclid->se_name,
@@ -1531,7 +1488,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1531 unsigned int strhashval; 1488 unsigned int strhashval;
1532 struct nfs4_client *conf, *unconf, *new; 1489 struct nfs4_client *conf, *unconf, *new;
1533 __be32 status; 1490 __be32 status;
1534 char *princ;
1535 char dname[HEXDIR_LEN]; 1491 char dname[HEXDIR_LEN];
1536 1492
1537 if (!check_name(clname)) 1493 if (!check_name(clname))
@@ -1554,8 +1510,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1554 /* RFC 3530 14.2.33 CASE 0: */ 1510 /* RFC 3530 14.2.33 CASE 0: */
1555 status = nfserr_clid_inuse; 1511 status = nfserr_clid_inuse;
1556 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1512 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1557 dprintk("NFSD: setclientid: string in use by client" 1513 char addr_str[INET6_ADDRSTRLEN];
1558 " at %pI4\n", &conf->cl_addr); 1514 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
1515 sizeof(addr_str));
1516 dprintk("NFSD: setclientid: string in use by client "
1517 "at %s\n", addr_str);
1559 goto out; 1518 goto out;
1560 } 1519 }
1561 } 1520 }
@@ -1573,7 +1532,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1573 */ 1532 */
1574 if (unconf) 1533 if (unconf)
1575 expire_client(unconf); 1534 expire_client(unconf);
1576 new = create_client(clname, dname); 1535 new = create_client(clname, dname, rqstp, &clverifier);
1577 if (new == NULL) 1536 if (new == NULL)
1578 goto out; 1537 goto out;
1579 gen_clid(new); 1538 gen_clid(new);
@@ -1590,7 +1549,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1590 */ 1549 */
1591 expire_client(unconf); 1550 expire_client(unconf);
1592 } 1551 }
1593 new = create_client(clname, dname); 1552 new = create_client(clname, dname, rqstp, &clverifier);
1594 if (new == NULL) 1553 if (new == NULL)
1595 goto out; 1554 goto out;
1596 copy_clid(new, conf); 1555 copy_clid(new, conf);
@@ -1600,7 +1559,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1600 * probable client reboot; state will be removed if 1559 * probable client reboot; state will be removed if
1601 * confirmed. 1560 * confirmed.
1602 */ 1561 */
1603 new = create_client(clname, dname); 1562 new = create_client(clname, dname, rqstp, &clverifier);
1604 if (new == NULL) 1563 if (new == NULL)
1605 goto out; 1564 goto out;
1606 gen_clid(new); 1565 gen_clid(new);
@@ -1611,25 +1570,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1611 * confirmed. 1570 * confirmed.
1612 */ 1571 */
1613 expire_client(unconf); 1572 expire_client(unconf);
1614 new = create_client(clname, dname); 1573 new = create_client(clname, dname, rqstp, &clverifier);
1615 if (new == NULL) 1574 if (new == NULL)
1616 goto out; 1575 goto out;
1617 gen_clid(new); 1576 gen_clid(new);
1618 } 1577 }
1619 copy_verf(new, &clverifier); 1578 gen_callback(new, setclid, rpc_get_scope_id(sa));
1620 new->cl_addr = sin->sin_addr.s_addr;
1621 new->cl_flavor = rqstp->rq_flavor;
1622 princ = svc_gss_principal(rqstp);
1623 if (princ) {
1624 new->cl_principal = kstrdup(princ, GFP_KERNEL);
1625 if (new->cl_principal == NULL) {
1626 free_client(new);
1627 goto out;
1628 }
1629 }
1630 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1631 gen_confirm(new);
1632 gen_callback(new, setclid);
1633 add_to_unconfirmed(new, strhashval); 1579 add_to_unconfirmed(new, strhashval);
1634 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1580 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1635 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1581 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1651,7 +1597,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1651 struct nfsd4_compound_state *cstate, 1597 struct nfsd4_compound_state *cstate,
1652 struct nfsd4_setclientid_confirm *setclientid_confirm) 1598 struct nfsd4_setclientid_confirm *setclientid_confirm)
1653{ 1599{
1654 struct sockaddr_in *sin = svc_addr_in(rqstp); 1600 struct sockaddr *sa = svc_addr(rqstp);
1655 struct nfs4_client *conf, *unconf; 1601 struct nfs4_client *conf, *unconf;
1656 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 1602 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
1657 clientid_t * clid = &setclientid_confirm->sc_clientid; 1603 clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -1670,9 +1616,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1670 unconf = find_unconfirmed_client(clid); 1616 unconf = find_unconfirmed_client(clid);
1671 1617
1672 status = nfserr_clid_inuse; 1618 status = nfserr_clid_inuse;
1673 if (conf && conf->cl_addr != sin->sin_addr.s_addr) 1619 if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
1674 goto out; 1620 goto out;
1675 if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) 1621 if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
1676 goto out; 1622 goto out;
1677 1623
1678 /* 1624 /*
@@ -2163,7 +2109,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2163 return -EAGAIN; 2109 return -EAGAIN;
2164} 2110}
2165 2111
2166static struct lock_manager_operations nfsd_lease_mng_ops = { 2112static const struct lock_manager_operations nfsd_lease_mng_ops = {
2167 .fl_break = nfsd_break_deleg_cb, 2113 .fl_break = nfsd_break_deleg_cb,
2168 .fl_release_private = nfsd_release_deleg_cb, 2114 .fl_release_private = nfsd_release_deleg_cb,
2169 .fl_copy_lock = nfsd_copy_lock_deleg_cb, 2115 .fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -3368,7 +3314,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
3368 3314
3369/* Hack!: For now, we're defining this just so we can use a pointer to it 3315/* Hack!: For now, we're defining this just so we can use a pointer to it
3370 * as a unique cookie to identify our (NFSv4's) posix locks. */ 3316 * as a unique cookie to identify our (NFSv4's) posix locks. */
3371static struct lock_manager_operations nfsd_posix_mng_ops = { 3317static const struct lock_manager_operations nfsd_posix_mng_ops = {
3372}; 3318};
3373 3319
3374static inline void 3320static inline void
@@ -4072,7 +4018,7 @@ set_max_delegations(void)
4072 4018
4073/* initialization to perform when the nfsd service is started: */ 4019/* initialization to perform when the nfsd service is started: */
4074 4020
4075static void 4021static int
4076__nfs4_state_start(void) 4022__nfs4_state_start(void)
4077{ 4023{
4078 unsigned long grace_time; 4024 unsigned long grace_time;
@@ -4084,19 +4030,26 @@ __nfs4_state_start(void)
4084 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 4030 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
4085 grace_time/HZ); 4031 grace_time/HZ);
4086 laundry_wq = create_singlethread_workqueue("nfsd4"); 4032 laundry_wq = create_singlethread_workqueue("nfsd4");
4033 if (laundry_wq == NULL)
4034 return -ENOMEM;
4087 queue_delayed_work(laundry_wq, &laundromat_work, grace_time); 4035 queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
4088 set_max_delegations(); 4036 set_max_delegations();
4037 return set_callback_cred();
4089} 4038}
4090 4039
4091void 4040int
4092nfs4_state_start(void) 4041nfs4_state_start(void)
4093{ 4042{
4043 int ret;
4044
4094 if (nfs4_init) 4045 if (nfs4_init)
4095 return; 4046 return 0;
4096 nfsd4_load_reboot_recovery_data(); 4047 nfsd4_load_reboot_recovery_data();
4097 __nfs4_state_start(); 4048 ret = __nfs4_state_start();
4049 if (ret)
4050 return ret;
4098 nfs4_init = 1; 4051 nfs4_init = 1;
4099 return; 4052 return 0;
4100} 4053}
4101 4054
4102time_t 4055time_t
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2dcc7feaa6ff..0fbd50cee1f6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1599,7 +1599,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1599static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) 1599static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
1600{ 1600{
1601 struct svc_fh tmp_fh; 1601 struct svc_fh tmp_fh;
1602 char *path, *rootpath; 1602 char *path = NULL, *rootpath;
1603 size_t rootlen;
1603 1604
1604 fh_init(&tmp_fh, NFS4_FHSIZE); 1605 fh_init(&tmp_fh, NFS4_FHSIZE);
1605 *stat = exp_pseudoroot(rqstp, &tmp_fh); 1606 *stat = exp_pseudoroot(rqstp, &tmp_fh);
@@ -1609,14 +1610,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
1609 1610
1610 path = exp->ex_pathname; 1611 path = exp->ex_pathname;
1611 1612
1612 if (strncmp(path, rootpath, strlen(rootpath))) { 1613 rootlen = strlen(rootpath);
1614 if (strncmp(path, rootpath, rootlen)) {
1613 dprintk("nfsd: fs_locations failed;" 1615 dprintk("nfsd: fs_locations failed;"
1614 "%s is not contained in %s\n", path, rootpath); 1616 "%s is not contained in %s\n", path, rootpath);
1615 *stat = nfserr_notsupp; 1617 *stat = nfserr_notsupp;
1616 return NULL; 1618 path = NULL;
1619 goto out;
1617 } 1620 }
1618 1621 path += rootlen;
1619 return path + strlen(rootpath); 1622out:
1623 fh_put(&tmp_fh);
1624 return path;
1620} 1625}
1621 1626
1622/* 1627/*
@@ -1793,11 +1798,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1793 goto out_nfserr; 1798 goto out_nfserr;
1794 } 1799 }
1795 } 1800 }
1796 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
1797 if (exp->ex_fslocs.locations == NULL) {
1798 bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1799 }
1800 }
1801 if ((buflen -= 16) < 0) 1801 if ((buflen -= 16) < 0)
1802 goto out_resource; 1802 goto out_resource;
1803 1803
@@ -1825,8 +1825,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1825 goto out_resource; 1825 goto out_resource;
1826 if (!aclsupport) 1826 if (!aclsupport)
1827 word0 &= ~FATTR4_WORD0_ACL; 1827 word0 &= ~FATTR4_WORD0_ACL;
1828 if (!exp->ex_fslocs.locations)
1829 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1830 if (!word2) { 1828 if (!word2) {
1831 WRITE32(2); 1829 WRITE32(2);
1832 WRITE32(word0); 1830 WRITE32(word0);
@@ -3064,6 +3062,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3064 WRITE32(0); 3062 WRITE32(0);
3065 3063
3066 ADJUST_ARGS(); 3064 ADJUST_ARGS();
3065 resp->cstate.datap = p; /* DRC cache data pointer */
3067 return 0; 3066 return 0;
3068} 3067}
3069 3068
@@ -3166,7 +3165,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3166 return status; 3165 return status;
3167 3166
3168 session = resp->cstate.session; 3167 session = resp->cstate.session;
3169 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) 3168 if (session == NULL || slot->sl_cachethis == 0)
3170 return status; 3169 return status;
3171 3170
3172 if (resp->opcnt >= args->opcnt) 3171 if (resp->opcnt >= args->opcnt)
@@ -3291,6 +3290,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3291 /* 3290 /*
3292 * All that remains is to write the tag and operation count... 3291 * All that remains is to write the tag and operation count...
3293 */ 3292 */
3293 struct nfsd4_compound_state *cs = &resp->cstate;
3294 struct kvec *iov; 3294 struct kvec *iov;
3295 p = resp->tagp; 3295 p = resp->tagp;
3296 *p++ = htonl(resp->taglen); 3296 *p++ = htonl(resp->taglen);
@@ -3304,17 +3304,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3304 iov = &rqstp->rq_res.head[0]; 3304 iov = &rqstp->rq_res.head[0];
3305 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3305 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
3306 BUG_ON(iov->iov_len > PAGE_SIZE); 3306 BUG_ON(iov->iov_len > PAGE_SIZE);
3307 if (nfsd4_has_session(&resp->cstate)) { 3307 if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
3308 if (resp->cstate.status == nfserr_replay_cache && 3308 nfsd4_store_cache_entry(resp);
3309 !nfsd4_not_cached(resp)) { 3309 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3310 iov->iov_len = resp->cstate.iovlen; 3310 resp->cstate.slot->sl_inuse = false;
3311 } else { 3311 nfsd4_put_session(resp->cstate.session);
3312 nfsd4_store_cache_entry(resp);
3313 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3314 resp->cstate.slot->sl_inuse = 0;
3315 }
3316 if (resp->cstate.session)
3317 nfsd4_put_session(resp->cstate.session);
3318 } 3312 }
3319 return 1; 3313 return 1;
3320} 3314}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7e906c5b7671..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -174,12 +174,13 @@ static const struct file_operations exports_operations = {
174}; 174};
175 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
177 178
178static struct file_operations pool_stats_operations = { 179static const struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open, 180 .open = nfsd_pool_stats_open,
180 .read = seq_read, 181 .read = seq_read,
181 .llseek = seq_lseek, 182 .llseek = seq_lseek,
182 .release = seq_release, 183 .release = nfsd_pool_stats_release,
183 .owner = THIS_MODULE, 184 .owner = THIS_MODULE,
184}; 185};
185 186
@@ -776,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
776 size -= len; 777 size -= len;
777 mesg += len; 778 mesg += len;
778 } 779 }
779 780 rv = mesg - buf;
780 mutex_unlock(&nfsd_mutex);
781 return (mesg-buf);
782
783out_free: 781out_free:
784 kfree(nthreads); 782 kfree(nthreads);
785 mutex_unlock(&nfsd_mutex); 783 mutex_unlock(&nfsd_mutex);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8847f3fbfc1e..01965b2f3a76 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -397,44 +397,51 @@ static inline void _fh_update_old(struct dentry *dentry,
397 fh->ofh_dirino = 0; 397 fh->ofh_dirino = 0;
398} 398}
399 399
400__be32 400static bool is_root_export(struct svc_export *exp)
401fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
402 struct svc_fh *ref_fh)
403{ 401{
404 /* ref_fh is a reference file handle. 402 return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
405 * if it is non-null and for the same filesystem, then we should compose 403}
406 * a filehandle which is of the same version, where possible.
407 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
408 * Then create a 32byte filehandle using nfs_fhbase_old
409 *
410 */
411 404
412 u8 version; 405static struct super_block *exp_sb(struct svc_export *exp)
413 u8 fsid_type = 0; 406{
414 struct inode * inode = dentry->d_inode; 407 return exp->ex_path.dentry->d_inode->i_sb;
415 struct dentry *parent = dentry->d_parent; 408}
416 __u32 *datap;
417 dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev;
418 int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root);
419 409
420 dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", 410static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
421 MAJOR(ex_dev), MINOR(ex_dev), 411{
422 (long) exp->ex_path.dentry->d_inode->i_ino, 412 switch (fsid_type) {
423 parent->d_name.name, dentry->d_name.name, 413 case FSID_DEV:
424 (inode ? inode->i_ino : 0)); 414 if (!old_valid_dev(exp_sb(exp)->s_dev))
415 return 0;
416 /* FALL THROUGH */
417 case FSID_MAJOR_MINOR:
418 case FSID_ENCODE_DEV:
419 return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
420 case FSID_NUM:
421 return exp->ex_flags & NFSEXP_FSID;
422 case FSID_UUID8:
423 case FSID_UUID16:
424 if (!is_root_export(exp))
425 return 0;
426 /* fall through */
427 case FSID_UUID4_INUM:
428 case FSID_UUID16_INUM:
429 return exp->ex_uuid != NULL;
430 }
431 return 1;
432}
425 433
426 /* Choose filehandle version and fsid type based on 434
427 * the reference filehandle (if it is in the same export) 435static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
428 * or the export options. 436{
429 */ 437 u8 version;
430 retry: 438 u8 fsid_type;
439retry:
431 version = 1; 440 version = 1;
432 if (ref_fh && ref_fh->fh_export == exp) { 441 if (ref_fh && ref_fh->fh_export == exp) {
433 version = ref_fh->fh_handle.fh_version; 442 version = ref_fh->fh_handle.fh_version;
434 fsid_type = ref_fh->fh_handle.fh_fsid_type; 443 fsid_type = ref_fh->fh_handle.fh_fsid_type;
435 444
436 if (ref_fh == fhp)
437 fh_put(ref_fh);
438 ref_fh = NULL; 445 ref_fh = NULL;
439 446
440 switch (version) { 447 switch (version) {
@@ -447,58 +454,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
447 goto retry; 454 goto retry;
448 } 455 }
449 456
450 /* Need to check that this type works for this 457 /*
451 * export point. As the fsid -> filesystem mapping 458 * As the fsid -> filesystem mapping was guided by
452 * was guided by user-space, there is no guarantee 459 * user-space, there is no guarantee that the filesystem
453 * that the filesystem actually supports that fsid 460 * actually supports that fsid type. If it doesn't we
454 * type. If it doesn't we loop around again without 461 * loop around again without ref_fh set.
455 * ref_fh set.
456 */ 462 */
457 switch(fsid_type) { 463 if (!fsid_type_ok_for_exp(fsid_type, exp))
458 case FSID_DEV: 464 goto retry;
459 if (!old_valid_dev(ex_dev))
460 goto retry;
461 /* FALL THROUGH */
462 case FSID_MAJOR_MINOR:
463 case FSID_ENCODE_DEV:
464 if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
465 & FS_REQUIRES_DEV))
466 goto retry;
467 break;
468 case FSID_NUM:
469 if (! (exp->ex_flags & NFSEXP_FSID))
470 goto retry;
471 break;
472 case FSID_UUID8:
473 case FSID_UUID16:
474 if (!root_export)
475 goto retry;
476 /* fall through */
477 case FSID_UUID4_INUM:
478 case FSID_UUID16_INUM:
479 if (exp->ex_uuid == NULL)
480 goto retry;
481 break;
482 }
483 } else if (exp->ex_flags & NFSEXP_FSID) { 465 } else if (exp->ex_flags & NFSEXP_FSID) {
484 fsid_type = FSID_NUM; 466 fsid_type = FSID_NUM;
485 } else if (exp->ex_uuid) { 467 } else if (exp->ex_uuid) {
486 if (fhp->fh_maxsize >= 64) { 468 if (fhp->fh_maxsize >= 64) {
487 if (root_export) 469 if (is_root_export(exp))
488 fsid_type = FSID_UUID16; 470 fsid_type = FSID_UUID16;
489 else 471 else
490 fsid_type = FSID_UUID16_INUM; 472 fsid_type = FSID_UUID16_INUM;
491 } else { 473 } else {
492 if (root_export) 474 if (is_root_export(exp))
493 fsid_type = FSID_UUID8; 475 fsid_type = FSID_UUID8;
494 else 476 else
495 fsid_type = FSID_UUID4_INUM; 477 fsid_type = FSID_UUID4_INUM;
496 } 478 }
497 } else if (!old_valid_dev(ex_dev)) 479 } else if (!old_valid_dev(exp_sb(exp)->s_dev))
498 /* for newer device numbers, we must use a newer fsid format */ 480 /* for newer device numbers, we must use a newer fsid format */
499 fsid_type = FSID_ENCODE_DEV; 481 fsid_type = FSID_ENCODE_DEV;
500 else 482 else
501 fsid_type = FSID_DEV; 483 fsid_type = FSID_DEV;
484 fhp->fh_handle.fh_version = version;
485 if (version)
486 fhp->fh_handle.fh_fsid_type = fsid_type;
487}
488
489__be32
490fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
491 struct svc_fh *ref_fh)
492{
493 /* ref_fh is a reference file handle.
494 * if it is non-null and for the same filesystem, then we should compose
495 * a filehandle which is of the same version, where possible.
496 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
497 * Then create a 32byte filehandle using nfs_fhbase_old
498 *
499 */
500
501 struct inode * inode = dentry->d_inode;
502 struct dentry *parent = dentry->d_parent;
503 __u32 *datap;
504 dev_t ex_dev = exp_sb(exp)->s_dev;
505
506 dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
507 MAJOR(ex_dev), MINOR(ex_dev),
508 (long) exp->ex_path.dentry->d_inode->i_ino,
509 parent->d_name.name, dentry->d_name.name,
510 (inode ? inode->i_ino : 0));
511
512 /* Choose filehandle version and fsid type based on
513 * the reference filehandle (if it is in the same export)
514 * or the export options.
515 */
516 set_version_and_fsid_type(fhp, exp, ref_fh);
502 517
503 if (ref_fh == fhp) 518 if (ref_fh == fhp)
504 fh_put(ref_fh); 519 fh_put(ref_fh);
@@ -516,7 +531,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
516 fhp->fh_export = exp; 531 fhp->fh_export = exp;
517 cache_get(&exp->h); 532 cache_get(&exp->h);
518 533
519 if (version == 0xca) { 534 if (fhp->fh_handle.fh_version == 0xca) {
520 /* old style filehandle please */ 535 /* old style filehandle please */
521 memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); 536 memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
522 fhp->fh_handle.fh_size = NFS_FHSIZE; 537 fhp->fh_handle.fh_size = NFS_FHSIZE;
@@ -530,22 +545,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
530 _fh_update_old(dentry, exp, &fhp->fh_handle); 545 _fh_update_old(dentry, exp, &fhp->fh_handle);
531 } else { 546 } else {
532 int len; 547 int len;
533 fhp->fh_handle.fh_version = 1;
534 fhp->fh_handle.fh_auth_type = 0; 548 fhp->fh_handle.fh_auth_type = 0;
535 datap = fhp->fh_handle.fh_auth+0; 549 datap = fhp->fh_handle.fh_auth+0;
536 fhp->fh_handle.fh_fsid_type = fsid_type; 550 mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
537 mk_fsid(fsid_type, datap, ex_dev,
538 exp->ex_path.dentry->d_inode->i_ino, 551 exp->ex_path.dentry->d_inode->i_ino,
539 exp->ex_fsid, exp->ex_uuid); 552 exp->ex_fsid, exp->ex_uuid);
540 553
541 len = key_len(fsid_type); 554 len = key_len(fhp->fh_handle.fh_fsid_type);
542 datap += len/4; 555 datap += len/4;
543 fhp->fh_handle.fh_size = 4 + len; 556 fhp->fh_handle.fh_size = 4 + len;
544 557
545 if (inode) 558 if (inode)
546 _fh_update(fhp, exp, dentry); 559 _fh_update(fhp, exp, dentry);
547 if (fhp->fh_handle.fh_fileid_type == 255) 560 if (fhp->fh_handle.fh_fileid_type == 255) {
561 fh_put(fhp);
548 return nfserr_opnotsupp; 562 return nfserr_opnotsupp;
563 }
549 } 564 }
550 565
551 return 0; 566 return 0;
@@ -639,8 +654,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
639 case FSID_DEV: 654 case FSID_DEV:
640 case FSID_ENCODE_DEV: 655 case FSID_ENCODE_DEV:
641 case FSID_MAJOR_MINOR: 656 case FSID_MAJOR_MINOR:
642 if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags 657 if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV)
643 & FS_REQUIRES_DEV)
644 return FSIDSOURCE_DEV; 658 return FSIDSOURCE_DEV;
645 break; 659 break;
646 case FSID_NUM: 660 case FSID_NUM:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 24d58adfe5fd..67ea83eedd43 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
34#include <linux/nfsd/syscall.h> 34#include <linux/nfsd/syscall.h>
35#include <linux/lockd/bind.h> 35#include <linux/lockd/bind.h>
36#include <linux/nfsacl.h> 36#include <linux/nfsacl.h>
37#include <linux/seq_file.h>
37 38
38#define NFSDDBG_FACILITY NFSDDBG_SVC 39#define NFSDDBG_FACILITY NFSDDBG_SVC
39 40
@@ -66,6 +67,16 @@ struct timeval nfssvc_boot;
66DEFINE_MUTEX(nfsd_mutex); 67DEFINE_MUTEX(nfsd_mutex);
67struct svc_serv *nfsd_serv; 68struct svc_serv *nfsd_serv;
68 69
70/*
71 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
72 * nfsd_drc_max_pages limits the total amount of memory available for
73 * version 4.1 DRC caches.
74 * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
75 */
76spinlock_t nfsd_drc_lock;
77unsigned int nfsd_drc_max_mem;
78unsigned int nfsd_drc_mem_used;
79
69#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 80#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
70static struct svc_stat nfsd_acl_svcstats; 81static struct svc_stat nfsd_acl_svcstats;
71static struct svc_version * nfsd_acl_version[] = { 82static struct svc_version * nfsd_acl_version[] = {
@@ -235,13 +246,12 @@ void nfsd_reset_versions(void)
235 */ 246 */
236static void set_max_drc(void) 247static void set_max_drc(void)
237{ 248{
238 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ 249 #define NFSD_DRC_SIZE_SHIFT 10
239 #define NFSD_DRC_SIZE_SHIFT 7 250 nfsd_drc_max_mem = (nr_free_buffer_pages()
240 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() 251 >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
241 >> NFSD_DRC_SIZE_SHIFT; 252 nfsd_drc_mem_used = 0;
242 nfsd_serv->sv_drc_pages_used = 0; 253 spin_lock_init(&nfsd_drc_lock);
243 dprintk("%s svc_drc_max_pages %u\n", __func__, 254 dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
244 nfsd_serv->sv_drc_max_pages);
245} 255}
246 256
247int nfsd_create_serv(void) 257int nfsd_create_serv(void)
@@ -401,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs)
401 error = nfsd_racache_init(2*nrservs); 411 error = nfsd_racache_init(2*nrservs);
402 if (error<0) 412 if (error<0)
403 goto out; 413 goto out;
404 nfs4_state_start(); 414 error = nfs4_state_start();
415 if (error)
416 goto out;
405 417
406 nfsd_reset_versions(); 418 nfsd_reset_versions();
407 419
@@ -569,10 +581,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
569 + rqstp->rq_res.head[0].iov_len; 581 + rqstp->rq_res.head[0].iov_len;
570 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 582 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
571 583
572 /* NFSv4.1 DRC requires statp */
573 if (rqstp->rq_vers == 4)
574 nfsd4_set_statp(rqstp, statp);
575
576 /* Now call the procedure handler, and encode NFS status. */ 584 /* Now call the procedure handler, and encode NFS status. */
577 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 585 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
578 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 586 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -607,7 +615,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
607 615
608int nfsd_pool_stats_open(struct inode *inode, struct file *file) 616int nfsd_pool_stats_open(struct inode *inode, struct file *file)
609{ 617{
610 if (nfsd_serv == NULL) 618 int ret;
619 mutex_lock(&nfsd_mutex);
620 if (nfsd_serv == NULL) {
621 mutex_unlock(&nfsd_mutex);
611 return -ENODEV; 622 return -ENODEV;
612 return svc_pool_stats_open(nfsd_serv, file); 623 }
624 /* bump up the psudo refcount while traversing */
625 svc_get(nfsd_serv);
626 ret = svc_pool_stats_open(nfsd_serv, file);
627 mutex_unlock(&nfsd_mutex);
628 return ret;
629}
630
631int nfsd_pool_stats_release(struct inode *inode, struct file *file)
632{
633 int ret = seq_release(inode, file);
634 mutex_lock(&nfsd_mutex);
635 /* this function really, really should have been called svc_put() */
636 svc_destroy(nfsd_serv);
637 mutex_unlock(&nfsd_mutex);
638 return ret;
613} 639}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8fa09bfbcba7..a293f0273263 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -89,6 +89,12 @@ struct raparm_hbucket {
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
90static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 90static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
91 91
92static inline int
93nfsd_v4client(struct svc_rqst *rq)
94{
95 return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
96}
97
92/* 98/*
93 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
94 * a mount point. 100 * a mount point.
@@ -115,7 +121,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
115 path_put(&path); 121 path_put(&path);
116 goto out; 122 goto out;
117 } 123 }
118 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 124 if (nfsd_v4client(rqstp) ||
125 (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
119 /* successfully crossed mount point */ 126 /* successfully crossed mount point */
120 /* 127 /*
121 * This is subtle: path.dentry is *not* on path.mnt 128 * This is subtle: path.dentry is *not* on path.mnt
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c668bca579c1..5941958f1e47 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
36 36
37void nilfs_btnode_cache_init_once(struct address_space *btnc) 37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{ 38{
39 memset(btnc, 0, sizeof(*btnc));
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); 40 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock); 41 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list); 42 INIT_LIST_HEAD(&btnc->private_list);
@@ -46,7 +47,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); 47 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47} 48}
48 49
49static struct address_space_operations def_btnode_aops = { 50static const struct address_space_operations def_btnode_aops = {
50 .sync_page = block_sync_page, 51 .sync_page = block_sync_page,
51}; 52};
52 53
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
697 return 0; 697 return 0;
698} 698}
699 699
700struct file_operations nilfs_dir_operations = { 700const struct file_operations nilfs_dir_operations = {
701 .llseek = generic_file_llseek, 701 .llseek = generic_file_llseek,
702 .read = generic_read_dir, 702 .read = generic_read_dir,
703 .readdir = nilfs_readdir, 703 .readdir = nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 6bd84a0d8238..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
117 return 0; 117 return 0;
118} 118}
119 119
120struct vm_operations_struct nilfs_file_vm_ops = { 120static const struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault, 121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite, 122 .page_mkwrite = nilfs_page_mkwrite,
123}; 123};
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
134 * We have mostly NULL's here: the current defaults are ok for 134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem. 135 * the nilfs filesystem.
136 */ 136 */
137struct file_operations nilfs_file_operations = { 137const struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek, 138 .llseek = generic_file_llseek,
139 .read = do_sync_read, 139 .read = do_sync_read,
140 .write = do_sync_write, 140 .write = do_sync_write,
@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = {
151 .splice_read = generic_file_splice_read, 151 .splice_read = generic_file_splice_read,
152}; 152};
153 153
154struct inode_operations nilfs_file_inode_operations = { 154const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1b3c2bb20da9..e6de0a27ab5d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,7 +52,7 @@
52#include "dat.h" 52#include "dat.h"
53#include "ifile.h" 53#include "ifile.h"
54 54
55static struct address_space_operations def_gcinode_aops = { 55static const struct address_space_operations def_gcinode_aops = {
56 .sync_page = block_sync_page, 56 .sync_page = block_sync_page,
57}; 57};
58 58
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 807e584b163d..5040220c3732 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
238 return size; 238 return size;
239} 239}
240 240
241struct address_space_operations nilfs_aops = { 241const struct address_space_operations nilfs_aops = {
242 .writepage = nilfs_writepage, 242 .writepage = nilfs_writepage,
243 .readpage = nilfs_readpage, 243 .readpage = nilfs_readpage,
244 .sync_page = block_sync_page, 244 .sync_page = block_sync_page,
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
400 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 400 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
401 0 : le32_to_cpu(raw_inode->i_dir_acl); 401 0 : le32_to_cpu(raw_inode->i_dir_acl);
402#endif 402#endif
403 ii->i_dir_start_lookup = 0;
403 ii->i_cno = 0; 404 ii->i_cno = 0;
404 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 405 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
405 406
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 156bf6091a96..f6326112d647 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -427,13 +427,13 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
427} 427}
428 428
429 429
430static struct address_space_operations def_mdt_aops = { 430static const struct address_space_operations def_mdt_aops = {
431 .writepage = nilfs_mdt_write_page, 431 .writepage = nilfs_mdt_write_page,
432 .sync_page = block_sync_page, 432 .sync_page = block_sync_page,
433}; 433};
434 434
435static struct inode_operations def_mdt_iops; 435static const struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops; 436static const struct file_operations def_mdt_fops;
437 437
438/* 438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index df70dadb336f..ed02e886fa79 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -448,7 +448,7 @@ out:
448 return err; 448 return err;
449} 449}
450 450
451struct inode_operations nilfs_dir_inode_operations = { 451const struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create, 452 .create = nilfs_create,
453 .lookup = nilfs_lookup, 453 .lookup = nilfs_lookup,
454 .link = nilfs_link, 454 .link = nilfs_link,
@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = {
462 .permission = nilfs_permission, 462 .permission = nilfs_permission,
463}; 463};
464 464
465struct inode_operations nilfs_special_inode_operations = { 465const struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr, 466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission, 467 .permission = nilfs_permission,
468}; 468};
469 469
470struct inode_operations nilfs_symlink_inode_operations = { 470const struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink, 471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light, 472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link, 473 .put_link = page_put_link,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 724c63766e82..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,13 +294,13 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
294/* 294/*
295 * Inodes and files operations 295 * Inodes and files operations
296 */ 296 */
297extern struct file_operations nilfs_dir_operations; 297extern const struct file_operations nilfs_dir_operations;
298extern struct inode_operations nilfs_file_inode_operations; 298extern const struct inode_operations nilfs_file_inode_operations;
299extern struct file_operations nilfs_file_operations; 299extern const struct file_operations nilfs_file_operations;
300extern struct address_space_operations nilfs_aops; 300extern const struct address_space_operations nilfs_aops;
301extern struct inode_operations nilfs_dir_inode_operations; 301extern const struct inode_operations nilfs_dir_inode_operations;
302extern struct inode_operations nilfs_special_inode_operations; 302extern const struct inode_operations nilfs_special_inode_operations;
303extern struct inode_operations nilfs_symlink_inode_operations; 303extern const struct inode_operations nilfs_symlink_inode_operations;
304 304
305/* 305/*
306 * filesystem type 306 * filesystem type
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 55f3d6b60732..644e66727dd0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -504,7 +504,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
504 return 0; 504 return 0;
505} 505}
506 506
507static struct super_operations nilfs_sops = { 507static const struct super_operations nilfs_sops = {
508 .alloc_inode = nilfs_alloc_inode, 508 .alloc_inode = nilfs_alloc_inode,
509 .destroy_inode = nilfs_destroy_inode, 509 .destroy_inode = nilfs_destroy_inode,
510 .dirty_inode = nilfs_dirty_inode, 510 .dirty_inode = nilfs_dirty_inode,
@@ -560,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
560 nilfs_nfs_get_inode); 560 nilfs_nfs_get_inode);
561} 561}
562 562
563static struct export_operations nilfs_export_ops = { 563static const struct export_operations nilfs_export_ops = {
564 .fh_to_dentry = nilfs_fh_to_dentry, 564 .fh_to_dentry = nilfs_fh_to_dentry,
565 .fh_to_parent = nilfs_fh_to_parent, 565 .fh_to_parent = nilfs_fh_to_parent,
566 .get_parent = nilfs_get_parent, 566 .get_parent = nilfs_get_parent,
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 477d37d83b31..44a88a9fa2c8 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
124 while (*s && len > 0) { 124 while (*s && len > 0) {
125 if (*s & 0x80) { 125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u); 126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) { 127 if (size < 0)
128 /* Ignore character and move on */ 128 return -EINVAL;
129 size = 1; 129
130 } else if (u >= PLANE_SIZE) { 130 if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE; 131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR | 132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS)); 133 ((u >> 10) & SURROGATE_BITS));
@@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset)
270 270
271void unload_nls(struct nls_table *nls) 271void unload_nls(struct nls_table *nls)
272{ 272{
273 module_put(nls->owner); 273 if (nls)
274 module_put(nls->owner);
274} 275}
275 276
276static const wchar_t charset2uni[256] = { 277static const wchar_t charset2uni[256] = {
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..cfce53cb65d7 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
1550 .migratepage = buffer_migrate_page, /* Move a page cache page from 1550 .migratepage = buffer_migrate_page, /* Move a page cache page from
1551 one physical page to an 1551 one physical page to an
1552 other. */ 1552 other. */
1553 .error_remove_page = generic_error_remove_page,
1553}; 1554};
1554 1555
1555/** 1556/**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
1569 .migratepage = buffer_migrate_page, /* Move a page cache page from 1570 .migratepage = buffer_migrate_page, /* Move a page cache page from
1570 one physical page to an 1571 one physical page to an
1571 other. */ 1572 other. */
1573 .error_remove_page = generic_error_remove_page,
1572}; 1574};
1573 1575
1574#ifdef NTFS_RW 1576#ifdef NTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 4350d4993b18..663c0e341f8b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2146,46 +2146,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2146} 2146}
2147 2147
2148/** 2148/**
2149 * ntfs_file_writev -
2150 *
2151 * Basically the same as generic_file_writev() except that it ends up calling
2152 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2153 */
2154static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2155 unsigned long nr_segs, loff_t *ppos)
2156{
2157 struct address_space *mapping = file->f_mapping;
2158 struct inode *inode = mapping->host;
2159 struct kiocb kiocb;
2160 ssize_t ret;
2161
2162 mutex_lock(&inode->i_mutex);
2163 init_sync_kiocb(&kiocb, file);
2164 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2165 if (ret == -EIOCBQUEUED)
2166 ret = wait_on_sync_kiocb(&kiocb);
2167 mutex_unlock(&inode->i_mutex);
2168 if (ret > 0) {
2169 int err = generic_write_sync(file, *ppos - ret, ret);
2170 if (err < 0)
2171 ret = err;
2172 }
2173 return ret;
2174}
2175
2176/**
2177 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2178 */
2179static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2180 size_t count, loff_t *ppos)
2181{
2182 struct iovec local_iov = { .iov_base = (void __user *)buf,
2183 .iov_len = count };
2184
2185 return ntfs_file_writev(file, &local_iov, 1, ppos);
2186}
2187
2188/**
2189 * ntfs_file_fsync - sync a file to disk 2149 * ntfs_file_fsync - sync a file to disk
2190 * @filp: file to be synced 2150 * @filp: file to be synced
2191 * @dentry: dentry describing the file to sync 2151 * @dentry: dentry describing the file to sync
@@ -2247,7 +2207,7 @@ const struct file_operations ntfs_file_ops = {
2247 .read = do_sync_read, /* Read from file. */ 2207 .read = do_sync_read, /* Read from file. */
2248 .aio_read = generic_file_aio_read, /* Async read from file. */ 2208 .aio_read = generic_file_aio_read, /* Async read from file. */
2249#ifdef NTFS_RW 2209#ifdef NTFS_RW
2250 .write = ntfs_file_write, /* Write to file. */ 2210 .write = do_sync_write, /* Write to file. */
2251 .aio_write = ntfs_file_aio_write, /* Async write to file. */ 2211 .aio_write = ntfs_file_aio_write, /* Async write to file. */
2252 /*.release = ,*/ /* Last file is closed. See 2212 /*.release = ,*/ /* Last file is closed. See
2253 fs/ext2/file.c:: 2213 fs/ext2/file.c::
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 50931b1ce4b9..8b2549f672bf 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -829,7 +829,7 @@ enum {
829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the 829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, 830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask 831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
832 is used to to obtain all flags that are valid for setting. */ 832 is used to obtain all flags that are valid for setting. */
833 /* 833 /*
834 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all 834 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION 835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
47 return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); 47 return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
48 /* return (void *)__get_free_page(gfp_mask); */ 48 /* return (void *)__get_free_page(gfp_mask); */
49 } 49 }
50 if (likely(size >> PAGE_SHIFT < num_physpages)) 50 if (likely((size >> PAGE_SHIFT) < totalram_pages))
51 return __vmalloc(size, gfp_mask, PAGE_KERNEL); 51 return __vmalloc(size, gfp_mask, PAGE_KERNEL);
52 return NULL; 52 return NULL;
53} 53}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index abaaa1cbf8de..80b04770e8e9 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -201,8 +201,7 @@ use_utf8:
201 v, old_nls->charset); 201 v, old_nls->charset);
202 nls_map = old_nls; 202 nls_map = old_nls;
203 } else /* nls_map */ { 203 } else /* nls_map */ {
204 if (old_nls) 204 unload_nls(old_nls);
205 unload_nls(old_nls);
206 } 205 }
207 } else if (!strcmp(p, "utf8")) { 206 } else if (!strcmp(p, "utf8")) {
208 bool val = false; 207 bool val = false;
@@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb)
2427 ntfs_free(vol->upcase); 2426 ntfs_free(vol->upcase);
2428 vol->upcase = NULL; 2427 vol->upcase = NULL;
2429 } 2428 }
2430 if (vol->nls_map) { 2429
2431 unload_nls(vol->nls_map); 2430 unload_nls(vol->nls_map);
2432 vol->nls_map = NULL; 2431
2433 }
2434 sb->s_fs_info = NULL; 2432 sb->s_fs_info = NULL;
2435 kfree(vol); 2433 kfree(vol);
2436 2434
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 01596079dd63..31f25ce32c97 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -28,6 +28,7 @@ ocfs2-objs := \
28 locks.o \ 28 locks.o \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \
31 resize.o \ 32 resize.o \
32 slot_map.o \ 33 slot_map.o \
33 suballoc.o \ 34 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab513ddaeff2..38a42f5d59ff 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,10 +49,21 @@
49#include "super.h" 49#include "super.h"
50#include "uptodate.h" 50#include "uptodate.h"
51#include "xattr.h" 51#include "xattr.h"
52#include "refcounttree.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
54 55
56enum ocfs2_contig_type {
57 CONTIG_NONE = 0,
58 CONTIG_LEFT,
59 CONTIG_RIGHT,
60 CONTIG_LEFTRIGHT,
61};
55 62
63static enum ocfs2_contig_type
64 ocfs2_extent_rec_contig(struct super_block *sb,
65 struct ocfs2_extent_rec *ext,
66 struct ocfs2_extent_rec *insert_rec);
56/* 67/*
57 * Operations for a specific extent tree type. 68 * Operations for a specific extent tree type.
58 * 69 *
@@ -79,18 +90,30 @@ struct ocfs2_extent_tree_operations {
79 * that value. new_clusters is the delta, and must be 90 * that value. new_clusters is the delta, and must be
80 * added to the total. Required. 91 * added to the total. Required.
81 */ 92 */
82 void (*eo_update_clusters)(struct inode *inode, 93 void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
83 struct ocfs2_extent_tree *et,
84 u32 new_clusters); 94 u32 new_clusters);
85 95
86 /* 96 /*
97 * If this extent tree is supported by an extent map, insert
98 * a record into the map.
99 */
100 void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
101 struct ocfs2_extent_rec *rec);
102
103 /*
104 * If this extent tree is supported by an extent map, truncate the
105 * map to clusters,
106 */
107 void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
108 u32 clusters);
109
110 /*
87 * If ->eo_insert_check() exists, it is called before rec is 111 * If ->eo_insert_check() exists, it is called before rec is
88 * inserted into the extent tree. It is optional. 112 * inserted into the extent tree. It is optional.
89 */ 113 */
90 int (*eo_insert_check)(struct inode *inode, 114 int (*eo_insert_check)(struct ocfs2_extent_tree *et,
91 struct ocfs2_extent_tree *et,
92 struct ocfs2_extent_rec *rec); 115 struct ocfs2_extent_rec *rec);
93 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); 116 int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
94 117
95 /* 118 /*
96 * -------------------------------------------------------------- 119 * --------------------------------------------------------------
@@ -109,8 +132,17 @@ struct ocfs2_extent_tree_operations {
109 * it exists. If it does not, et->et_max_leaf_clusters is set 132 * it exists. If it does not, et->et_max_leaf_clusters is set
110 * to 0 (unlimited). Optional. 133 * to 0 (unlimited). Optional.
111 */ 134 */
112 void (*eo_fill_max_leaf_clusters)(struct inode *inode, 135 void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
113 struct ocfs2_extent_tree *et); 136
137 /*
138 * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
139 * are contiguous or not. Optional. Don't need to set it if use
140 * ocfs2_extent_rec as the tree leaf.
141 */
142 enum ocfs2_contig_type
143 (*eo_extent_contig)(struct ocfs2_extent_tree *et,
144 struct ocfs2_extent_rec *ext,
145 struct ocfs2_extent_rec *insert_rec);
114}; 146};
115 147
116 148
@@ -121,19 +153,22 @@ struct ocfs2_extent_tree_operations {
121static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); 153static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
122static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 154static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
123 u64 blkno); 155 u64 blkno);
124static void ocfs2_dinode_update_clusters(struct inode *inode, 156static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
125 struct ocfs2_extent_tree *et,
126 u32 clusters); 157 u32 clusters);
127static int ocfs2_dinode_insert_check(struct inode *inode, 158static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
128 struct ocfs2_extent_tree *et, 159 struct ocfs2_extent_rec *rec);
160static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161 u32 clusters);
162static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
129 struct ocfs2_extent_rec *rec); 163 struct ocfs2_extent_rec *rec);
130static int ocfs2_dinode_sanity_check(struct inode *inode, 164static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
131 struct ocfs2_extent_tree *et);
132static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 165static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
133static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 166static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 167 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
135 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 168 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
136 .eo_update_clusters = ocfs2_dinode_update_clusters, 169 .eo_update_clusters = ocfs2_dinode_update_clusters,
170 .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
171 .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
137 .eo_insert_check = ocfs2_dinode_insert_check, 172 .eo_insert_check = ocfs2_dinode_insert_check,
138 .eo_sanity_check = ocfs2_dinode_sanity_check, 173 .eo_sanity_check = ocfs2_dinode_sanity_check,
139 .eo_fill_root_el = ocfs2_dinode_fill_root_el, 174 .eo_fill_root_el = ocfs2_dinode_fill_root_el,
@@ -156,40 +191,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
156 return le64_to_cpu(di->i_last_eb_blk); 191 return le64_to_cpu(di->i_last_eb_blk);
157} 192}
158 193
159static void ocfs2_dinode_update_clusters(struct inode *inode, 194static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
160 struct ocfs2_extent_tree *et,
161 u32 clusters) 195 u32 clusters)
162{ 196{
197 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
163 struct ocfs2_dinode *di = et->et_object; 198 struct ocfs2_dinode *di = et->et_object;
164 199
165 le32_add_cpu(&di->i_clusters, clusters); 200 le32_add_cpu(&di->i_clusters, clusters);
166 spin_lock(&OCFS2_I(inode)->ip_lock); 201 spin_lock(&oi->ip_lock);
167 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 202 oi->ip_clusters = le32_to_cpu(di->i_clusters);
168 spin_unlock(&OCFS2_I(inode)->ip_lock); 203 spin_unlock(&oi->ip_lock);
169} 204}
170 205
171static int ocfs2_dinode_insert_check(struct inode *inode, 206static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
172 struct ocfs2_extent_tree *et, 207 struct ocfs2_extent_rec *rec)
208{
209 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
210
211 ocfs2_extent_map_insert_rec(inode, rec);
212}
213
214static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
215 u32 clusters)
216{
217 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
218
219 ocfs2_extent_map_trunc(inode, clusters);
220}
221
222static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
173 struct ocfs2_extent_rec *rec) 223 struct ocfs2_extent_rec *rec)
174{ 224{
175 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 225 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226 struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
176 227
177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 228 BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 229 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 (OCFS2_I(inode)->ip_clusters != 230 (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
180 le32_to_cpu(rec->e_cpos)),
181 "Device %s, asking for sparse allocation: inode %llu, " 231 "Device %s, asking for sparse allocation: inode %llu, "
182 "cpos %u, clusters %u\n", 232 "cpos %u, clusters %u\n",
183 osb->dev_str, 233 osb->dev_str,
184 (unsigned long long)OCFS2_I(inode)->ip_blkno, 234 (unsigned long long)oi->ip_blkno,
185 rec->e_cpos, 235 rec->e_cpos, oi->ip_clusters);
186 OCFS2_I(inode)->ip_clusters);
187 236
188 return 0; 237 return 0;
189} 238}
190 239
191static int ocfs2_dinode_sanity_check(struct inode *inode, 240static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
192 struct ocfs2_extent_tree *et)
193{ 241{
194 struct ocfs2_dinode *di = et->et_object; 242 struct ocfs2_dinode *di = et->et_object;
195 243
@@ -229,8 +277,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
229 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); 277 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
230} 278}
231 279
232static void ocfs2_xattr_value_update_clusters(struct inode *inode, 280static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
233 struct ocfs2_extent_tree *et,
234 u32 clusters) 281 u32 clusters)
235{ 282{
236 struct ocfs2_xattr_value_buf *vb = et->et_object; 283 struct ocfs2_xattr_value_buf *vb = et->et_object;
@@ -252,12 +299,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
252 et->et_root_el = &xb->xb_attrs.xb_root.xt_list; 299 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253} 300}
254 301
255static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, 302static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
256 struct ocfs2_extent_tree *et)
257{ 303{
304 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
258 et->et_max_leaf_clusters = 305 et->et_max_leaf_clusters =
259 ocfs2_clusters_for_bytes(inode->i_sb, 306 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
260 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261} 307}
262 308
263static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 309static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -277,8 +323,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
277 return le64_to_cpu(xt->xt_last_eb_blk); 323 return le64_to_cpu(xt->xt_last_eb_blk);
278} 324}
279 325
280static void ocfs2_xattr_tree_update_clusters(struct inode *inode, 326static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
281 struct ocfs2_extent_tree *et,
282 u32 clusters) 327 u32 clusters)
283{ 328{
284 struct ocfs2_xattr_block *xb = et->et_object; 329 struct ocfs2_xattr_block *xb = et->et_object;
@@ -309,8 +354,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
309 return le64_to_cpu(dx_root->dr_last_eb_blk); 354 return le64_to_cpu(dx_root->dr_last_eb_blk);
310} 355}
311 356
312static void ocfs2_dx_root_update_clusters(struct inode *inode, 357static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
313 struct ocfs2_extent_tree *et,
314 u32 clusters) 358 u32 clusters)
315{ 359{
316 struct ocfs2_dx_root_block *dx_root = et->et_object; 360 struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -318,8 +362,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode,
318 le32_add_cpu(&dx_root->dr_clusters, clusters); 362 le32_add_cpu(&dx_root->dr_clusters, clusters);
319} 363}
320 364
321static int ocfs2_dx_root_sanity_check(struct inode *inode, 365static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
322 struct ocfs2_extent_tree *et)
323{ 366{
324 struct ocfs2_dx_root_block *dx_root = et->et_object; 367 struct ocfs2_dx_root_block *dx_root = et->et_object;
325 368
@@ -343,8 +386,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el, 386 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344}; 387};
345 388
389static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
390{
391 struct ocfs2_refcount_block *rb = et->et_object;
392
393 et->et_root_el = &rb->rf_list;
394}
395
396static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
397 u64 blkno)
398{
399 struct ocfs2_refcount_block *rb = et->et_object;
400
401 rb->rf_last_eb_blk = cpu_to_le64(blkno);
402}
403
404static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
405{
406 struct ocfs2_refcount_block *rb = et->et_object;
407
408 return le64_to_cpu(rb->rf_last_eb_blk);
409}
410
411static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
412 u32 clusters)
413{
414 struct ocfs2_refcount_block *rb = et->et_object;
415
416 le32_add_cpu(&rb->rf_clusters, clusters);
417}
418
419static enum ocfs2_contig_type
420ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
421 struct ocfs2_extent_rec *ext,
422 struct ocfs2_extent_rec *insert_rec)
423{
424 return CONTIG_NONE;
425}
426
427static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
428 .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
429 .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
430 .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
431 .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
432 .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
433};
434
346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 435static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
347 struct inode *inode, 436 struct ocfs2_caching_info *ci,
348 struct buffer_head *bh, 437 struct buffer_head *bh,
349 ocfs2_journal_access_func access, 438 ocfs2_journal_access_func access,
350 void *obj, 439 void *obj,
@@ -352,6 +441,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
352{ 441{
353 et->et_ops = ops; 442 et->et_ops = ops;
354 et->et_root_bh = bh; 443 et->et_root_bh = bh;
444 et->et_ci = ci;
355 et->et_root_journal_access = access; 445 et->et_root_journal_access = access;
356 if (!obj) 446 if (!obj)
357 obj = (void *)bh->b_data; 447 obj = (void *)bh->b_data;
@@ -361,41 +451,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
361 if (!et->et_ops->eo_fill_max_leaf_clusters) 451 if (!et->et_ops->eo_fill_max_leaf_clusters)
362 et->et_max_leaf_clusters = 0; 452 et->et_max_leaf_clusters = 0;
363 else 453 else
364 et->et_ops->eo_fill_max_leaf_clusters(inode, et); 454 et->et_ops->eo_fill_max_leaf_clusters(et);
365} 455}
366 456
367void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 457void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
368 struct inode *inode, 458 struct ocfs2_caching_info *ci,
369 struct buffer_head *bh) 459 struct buffer_head *bh)
370{ 460{
371 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, 461 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
372 NULL, &ocfs2_dinode_et_ops); 462 NULL, &ocfs2_dinode_et_ops);
373} 463}
374 464
375void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 465void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
376 struct inode *inode, 466 struct ocfs2_caching_info *ci,
377 struct buffer_head *bh) 467 struct buffer_head *bh)
378{ 468{
379 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, 469 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
380 NULL, &ocfs2_xattr_tree_et_ops); 470 NULL, &ocfs2_xattr_tree_et_ops);
381} 471}
382 472
383void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 473void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
384 struct inode *inode, 474 struct ocfs2_caching_info *ci,
385 struct ocfs2_xattr_value_buf *vb) 475 struct ocfs2_xattr_value_buf *vb)
386{ 476{
387 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, 477 __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
388 &ocfs2_xattr_value_et_ops); 478 &ocfs2_xattr_value_et_ops);
389} 479}
390 480
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 481void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode, 482 struct ocfs2_caching_info *ci,
393 struct buffer_head *bh) 483 struct buffer_head *bh)
394{ 484{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, 485 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops); 486 NULL, &ocfs2_dx_root_et_ops);
397} 487}
398 488
489void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
490 struct ocfs2_caching_info *ci,
491 struct buffer_head *bh)
492{
493 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
494 NULL, &ocfs2_refcount_tree_et_ops);
495}
496
399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 497static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
400 u64 new_last_eb_blk) 498 u64 new_last_eb_blk)
401{ 499{
@@ -407,78 +505,71 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
407 return et->et_ops->eo_get_last_eb_blk(et); 505 return et->et_ops->eo_get_last_eb_blk(et);
408} 506}
409 507
410static inline void ocfs2_et_update_clusters(struct inode *inode, 508static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
411 struct ocfs2_extent_tree *et,
412 u32 clusters) 509 u32 clusters)
413{ 510{
414 et->et_ops->eo_update_clusters(inode, et, clusters); 511 et->et_ops->eo_update_clusters(et, clusters);
512}
513
514static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
515 struct ocfs2_extent_rec *rec)
516{
517 if (et->et_ops->eo_extent_map_insert)
518 et->et_ops->eo_extent_map_insert(et, rec);
519}
520
521static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
522 u32 clusters)
523{
524 if (et->et_ops->eo_extent_map_truncate)
525 et->et_ops->eo_extent_map_truncate(et, clusters);
415} 526}
416 527
417static inline int ocfs2_et_root_journal_access(handle_t *handle, 528static inline int ocfs2_et_root_journal_access(handle_t *handle,
418 struct inode *inode,
419 struct ocfs2_extent_tree *et, 529 struct ocfs2_extent_tree *et,
420 int type) 530 int type)
421{ 531{
422 return et->et_root_journal_access(handle, inode, et->et_root_bh, 532 return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
423 type); 533 type);
424} 534}
425 535
426static inline int ocfs2_et_insert_check(struct inode *inode, 536static inline enum ocfs2_contig_type
427 struct ocfs2_extent_tree *et, 537 ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
538 struct ocfs2_extent_rec *rec,
539 struct ocfs2_extent_rec *insert_rec)
540{
541 if (et->et_ops->eo_extent_contig)
542 return et->et_ops->eo_extent_contig(et, rec, insert_rec);
543
544 return ocfs2_extent_rec_contig(
545 ocfs2_metadata_cache_get_super(et->et_ci),
546 rec, insert_rec);
547}
548
549static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
428 struct ocfs2_extent_rec *rec) 550 struct ocfs2_extent_rec *rec)
429{ 551{
430 int ret = 0; 552 int ret = 0;
431 553
432 if (et->et_ops->eo_insert_check) 554 if (et->et_ops->eo_insert_check)
433 ret = et->et_ops->eo_insert_check(inode, et, rec); 555 ret = et->et_ops->eo_insert_check(et, rec);
434 return ret; 556 return ret;
435} 557}
436 558
437static inline int ocfs2_et_sanity_check(struct inode *inode, 559static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
438 struct ocfs2_extent_tree *et)
439{ 560{
440 int ret = 0; 561 int ret = 0;
441 562
442 if (et->et_ops->eo_sanity_check) 563 if (et->et_ops->eo_sanity_check)
443 ret = et->et_ops->eo_sanity_check(inode, et); 564 ret = et->et_ops->eo_sanity_check(et);
444 return ret; 565 return ret;
445} 566}
446 567
447static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
448static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
449 struct ocfs2_extent_block *eb); 570 struct ocfs2_extent_block *eb);
450 571static void ocfs2_adjust_rightmost_records(handle_t *handle,
451/* 572 struct ocfs2_extent_tree *et,
452 * Structures which describe a path through a btree, and functions to
453 * manipulate them.
454 *
455 * The idea here is to be as generic as possible with the tree
456 * manipulation code.
457 */
458struct ocfs2_path_item {
459 struct buffer_head *bh;
460 struct ocfs2_extent_list *el;
461};
462
463#define OCFS2_MAX_PATH_DEPTH 5
464
465struct ocfs2_path {
466 int p_tree_depth;
467 ocfs2_journal_access_func p_root_access;
468 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
469};
470
471#define path_root_bh(_path) ((_path)->p_node[0].bh)
472#define path_root_el(_path) ((_path)->p_node[0].el)
473#define path_root_access(_path)((_path)->p_root_access)
474#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
475#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
476#define path_num_items(_path) ((_path)->p_tree_depth + 1)
477
478static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
479 u32 cpos);
480static void ocfs2_adjust_rightmost_records(struct inode *inode,
481 handle_t *handle,
482 struct ocfs2_path *path, 573 struct ocfs2_path *path,
483 struct ocfs2_extent_rec *insert_rec); 574 struct ocfs2_extent_rec *insert_rec);
484/* 575/*
@@ -486,7 +577,7 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
486 * to build another path. Generally, this involves freeing the buffer 577 * to build another path. Generally, this involves freeing the buffer
487 * heads. 578 * heads.
488 */ 579 */
489static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 580void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
490{ 581{
491 int i, start = 0, depth = 0; 582 int i, start = 0, depth = 0;
492 struct ocfs2_path_item *node; 583 struct ocfs2_path_item *node;
@@ -515,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
515 path->p_tree_depth = depth; 606 path->p_tree_depth = depth;
516} 607}
517 608
518static void ocfs2_free_path(struct ocfs2_path *path) 609void ocfs2_free_path(struct ocfs2_path *path)
519{ 610{
520 if (path) { 611 if (path) {
521 ocfs2_reinit_path(path, 0); 612 ocfs2_reinit_path(path, 0);
@@ -613,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
613 return path; 704 return path;
614} 705}
615 706
616static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) 707struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
617{ 708{
618 return ocfs2_new_path(path_root_bh(path), path_root_el(path), 709 return ocfs2_new_path(path_root_bh(path), path_root_el(path),
619 path_root_access(path)); 710 path_root_access(path));
620} 711}
621 712
622static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) 713struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
623{ 714{
624 return ocfs2_new_path(et->et_root_bh, et->et_root_el, 715 return ocfs2_new_path(et->et_root_bh, et->et_root_el,
625 et->et_root_journal_access); 716 et->et_root_journal_access);
@@ -632,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
632 * I don't like the way this function's name looks next to 723 * I don't like the way this function's name looks next to
633 * ocfs2_journal_access_path(), but I don't have a better one. 724 * ocfs2_journal_access_path(), but I don't have a better one.
634 */ 725 */
635static int ocfs2_path_bh_journal_access(handle_t *handle, 726int ocfs2_path_bh_journal_access(handle_t *handle,
636 struct inode *inode, 727 struct ocfs2_caching_info *ci,
637 struct ocfs2_path *path, 728 struct ocfs2_path *path,
638 int idx) 729 int idx)
639{ 730{
640 ocfs2_journal_access_func access = path_root_access(path); 731 ocfs2_journal_access_func access = path_root_access(path);
641 732
@@ -645,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
645 if (idx) 736 if (idx)
646 access = ocfs2_journal_access_eb; 737 access = ocfs2_journal_access_eb;
647 738
648 return access(handle, inode, path->p_node[idx].bh, 739 return access(handle, ci, path->p_node[idx].bh,
649 OCFS2_JOURNAL_ACCESS_WRITE); 740 OCFS2_JOURNAL_ACCESS_WRITE);
650} 741}
651 742
652/* 743/*
653 * Convenience function to journal all components in a path. 744 * Convenience function to journal all components in a path.
654 */ 745 */
655static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 746int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
656 struct ocfs2_path *path) 747 handle_t *handle,
748 struct ocfs2_path *path)
657{ 749{
658 int i, ret = 0; 750 int i, ret = 0;
659 751
@@ -661,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
661 goto out; 753 goto out;
662 754
663 for(i = 0; i < path_num_items(path); i++) { 755 for(i = 0; i < path_num_items(path); i++) {
664 ret = ocfs2_path_bh_journal_access(handle, inode, path, i); 756 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
665 if (ret < 0) { 757 if (ret < 0) {
666 mlog_errno(ret); 758 mlog_errno(ret);
667 goto out; 759 goto out;
@@ -702,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
702 return ret; 794 return ret;
703} 795}
704 796
705enum ocfs2_contig_type {
706 CONTIG_NONE = 0,
707 CONTIG_LEFT,
708 CONTIG_RIGHT,
709 CONTIG_LEFTRIGHT,
710};
711
712
713/* 797/*
714 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 798 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
715 * ocfs2_extent_contig only work properly against leaf nodes! 799 * ocfs2_extent_rec_contig only work properly against leaf nodes!
716 */ 800 */
717static int ocfs2_block_extent_contig(struct super_block *sb, 801static int ocfs2_block_extent_contig(struct super_block *sb,
718 struct ocfs2_extent_rec *ext, 802 struct ocfs2_extent_rec *ext,
@@ -738,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
738} 822}
739 823
740static enum ocfs2_contig_type 824static enum ocfs2_contig_type
741 ocfs2_extent_contig(struct inode *inode, 825 ocfs2_extent_rec_contig(struct super_block *sb,
742 struct ocfs2_extent_rec *ext, 826 struct ocfs2_extent_rec *ext,
743 struct ocfs2_extent_rec *insert_rec) 827 struct ocfs2_extent_rec *insert_rec)
744{ 828{
745 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 829 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
746 830
@@ -753,12 +837,12 @@ static enum ocfs2_contig_type
753 return CONTIG_NONE; 837 return CONTIG_NONE;
754 838
755 if (ocfs2_extents_adjacent(ext, insert_rec) && 839 if (ocfs2_extents_adjacent(ext, insert_rec) &&
756 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 840 ocfs2_block_extent_contig(sb, ext, blkno))
757 return CONTIG_RIGHT; 841 return CONTIG_RIGHT;
758 842
759 blkno = le64_to_cpu(ext->e_blkno); 843 blkno = le64_to_cpu(ext->e_blkno);
760 if (ocfs2_extents_adjacent(insert_rec, ext) && 844 if (ocfs2_extents_adjacent(insert_rec, ext) &&
761 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 845 ocfs2_block_extent_contig(sb, insert_rec, blkno))
762 return CONTIG_LEFT; 846 return CONTIG_LEFT;
763 847
764 return CONTIG_NONE; 848 return CONTIG_NONE;
@@ -853,13 +937,13 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
853 return 0; 937 return 0;
854} 938}
855 939
856int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 940int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
857 struct buffer_head **bh) 941 struct buffer_head **bh)
858{ 942{
859 int rc; 943 int rc;
860 struct buffer_head *tmp = *bh; 944 struct buffer_head *tmp = *bh;
861 945
862 rc = ocfs2_read_block(inode, eb_blkno, &tmp, 946 rc = ocfs2_read_block(ci, eb_blkno, &tmp,
863 ocfs2_validate_extent_block); 947 ocfs2_validate_extent_block);
864 948
865 /* If ocfs2_read_block() got us a new bh, pass it up. */ 949 /* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -874,7 +958,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
874 * How many free extents have we got before we need more meta data? 958 * How many free extents have we got before we need more meta data?
875 */ 959 */
876int ocfs2_num_free_extents(struct ocfs2_super *osb, 960int ocfs2_num_free_extents(struct ocfs2_super *osb,
877 struct inode *inode,
878 struct ocfs2_extent_tree *et) 961 struct ocfs2_extent_tree *et)
879{ 962{
880 int retval; 963 int retval;
@@ -889,7 +972,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
889 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 972 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
890 973
891 if (last_eb_blk) { 974 if (last_eb_blk) {
892 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); 975 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
976 &eb_bh);
893 if (retval < 0) { 977 if (retval < 0) {
894 mlog_errno(retval); 978 mlog_errno(retval);
895 goto bail; 979 goto bail;
@@ -913,9 +997,8 @@ bail:
913 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 997 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
914 * l_count for you 998 * l_count for you
915 */ 999 */
916static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 1000static int ocfs2_create_new_meta_bhs(handle_t *handle,
917 handle_t *handle, 1001 struct ocfs2_extent_tree *et,
918 struct inode *inode,
919 int wanted, 1002 int wanted,
920 struct ocfs2_alloc_context *meta_ac, 1003 struct ocfs2_alloc_context *meta_ac,
921 struct buffer_head *bhs[]) 1004 struct buffer_head *bhs[])
@@ -924,6 +1007,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
924 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
925 u32 num_got; 1008 u32 num_got;
926 u64 first_blkno; 1009 u64 first_blkno;
1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
927 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
928 1013
929 mlog_entry_void(); 1014 mlog_entry_void();
@@ -949,9 +1034,10 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
949 mlog_errno(status); 1034 mlog_errno(status);
950 goto bail; 1035 goto bail;
951 } 1036 }
952 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 1037 ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
953 1038
954 status = ocfs2_journal_access_eb(handle, inode, bhs[i], 1039 status = ocfs2_journal_access_eb(handle, et->et_ci,
1040 bhs[i],
955 OCFS2_JOURNAL_ACCESS_CREATE); 1041 OCFS2_JOURNAL_ACCESS_CREATE);
956 if (status < 0) { 1042 if (status < 0) {
957 mlog_errno(status); 1043 mlog_errno(status);
@@ -1023,7 +1109,6 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
1023 * extent block's rightmost record. 1109 * extent block's rightmost record.
1024 */ 1110 */
1025static int ocfs2_adjust_rightmost_branch(handle_t *handle, 1111static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1026 struct inode *inode,
1027 struct ocfs2_extent_tree *et) 1112 struct ocfs2_extent_tree *et)
1028{ 1113{
1029 int status; 1114 int status;
@@ -1037,7 +1122,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1037 return status; 1122 return status;
1038 } 1123 }
1039 1124
1040 status = ocfs2_find_path(inode, path, UINT_MAX); 1125 status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1041 if (status < 0) { 1126 if (status < 0) {
1042 mlog_errno(status); 1127 mlog_errno(status);
1043 goto out; 1128 goto out;
@@ -1050,7 +1135,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1050 goto out; 1135 goto out;
1051 } 1136 }
1052 1137
1053 status = ocfs2_journal_access_path(inode, handle, path); 1138 status = ocfs2_journal_access_path(et->et_ci, handle, path);
1054 if (status < 0) { 1139 if (status < 0) {
1055 mlog_errno(status); 1140 mlog_errno(status);
1056 goto out; 1141 goto out;
@@ -1059,7 +1144,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1059 el = path_leaf_el(path); 1144 el = path_leaf_el(path);
1060 rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; 1145 rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1061 1146
1062 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 1147 ocfs2_adjust_rightmost_records(handle, et, path, rec);
1063 1148
1064out: 1149out:
1065 ocfs2_free_path(path); 1150 ocfs2_free_path(path);
@@ -1068,7 +1153,7 @@ out:
1068 1153
1069/* 1154/*
1070 * Add an entire tree branch to our inode. eb_bh is the extent block 1155 * Add an entire tree branch to our inode. eb_bh is the extent block
1071 * to start at, if we don't want to start the branch at the dinode 1156 * to start at, if we don't want to start the branch at the root
1072 * structure. 1157 * structure.
1073 * 1158 *
1074 * last_eb_bh is required as we have to update it's next_leaf pointer 1159 * last_eb_bh is required as we have to update it's next_leaf pointer
@@ -1077,9 +1162,7 @@ out:
1077 * the new branch will be 'empty' in the sense that every block will 1162 * the new branch will be 'empty' in the sense that every block will
1078 * contain a single record with cluster count == 0. 1163 * contain a single record with cluster count == 0.
1079 */ 1164 */
1080static int ocfs2_add_branch(struct ocfs2_super *osb, 1165static int ocfs2_add_branch(handle_t *handle,
1081 handle_t *handle,
1082 struct inode *inode,
1083 struct ocfs2_extent_tree *et, 1166 struct ocfs2_extent_tree *et,
1084 struct buffer_head *eb_bh, 1167 struct buffer_head *eb_bh,
1085 struct buffer_head **last_eb_bh, 1168 struct buffer_head **last_eb_bh,
@@ -1123,7 +1206,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1123 if (root_end > new_cpos) { 1206 if (root_end > new_cpos) {
1124 mlog(0, "adjust the cluster end from %u to %u\n", 1207 mlog(0, "adjust the cluster end from %u to %u\n",
1125 root_end, new_cpos); 1208 root_end, new_cpos);
1126 status = ocfs2_adjust_rightmost_branch(handle, inode, et); 1209 status = ocfs2_adjust_rightmost_branch(handle, et);
1127 if (status) { 1210 if (status) {
1128 mlog_errno(status); 1211 mlog_errno(status);
1129 goto bail; 1212 goto bail;
@@ -1139,7 +1222,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1139 goto bail; 1222 goto bail;
1140 } 1223 }
1141 1224
1142 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 1225 status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
1143 meta_ac, new_eb_bhs); 1226 meta_ac, new_eb_bhs);
1144 if (status < 0) { 1227 if (status < 0) {
1145 mlog_errno(status); 1228 mlog_errno(status);
@@ -1161,7 +1244,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1161 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 1244 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1162 eb_el = &eb->h_list; 1245 eb_el = &eb->h_list;
1163 1246
1164 status = ocfs2_journal_access_eb(handle, inode, bh, 1247 status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1165 OCFS2_JOURNAL_ACCESS_CREATE); 1248 OCFS2_JOURNAL_ACCESS_CREATE);
1166 if (status < 0) { 1249 if (status < 0) {
1167 mlog_errno(status); 1250 mlog_errno(status);
@@ -1201,20 +1284,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1201 * journal_dirty erroring as it won't unless we've aborted the 1284 * journal_dirty erroring as it won't unless we've aborted the
1202 * handle (in which case we would never be here) so reserving 1285 * handle (in which case we would never be here) so reserving
1203 * the write with journal_access is all we need to do. */ 1286 * the write with journal_access is all we need to do. */
1204 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, 1287 status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1205 OCFS2_JOURNAL_ACCESS_WRITE); 1288 OCFS2_JOURNAL_ACCESS_WRITE);
1206 if (status < 0) { 1289 if (status < 0) {
1207 mlog_errno(status); 1290 mlog_errno(status);
1208 goto bail; 1291 goto bail;
1209 } 1292 }
1210 status = ocfs2_et_root_journal_access(handle, inode, et, 1293 status = ocfs2_et_root_journal_access(handle, et,
1211 OCFS2_JOURNAL_ACCESS_WRITE); 1294 OCFS2_JOURNAL_ACCESS_WRITE);
1212 if (status < 0) { 1295 if (status < 0) {
1213 mlog_errno(status); 1296 mlog_errno(status);
1214 goto bail; 1297 goto bail;
1215 } 1298 }
1216 if (eb_bh) { 1299 if (eb_bh) {
1217 status = ocfs2_journal_access_eb(handle, inode, eb_bh, 1300 status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1218 OCFS2_JOURNAL_ACCESS_WRITE); 1301 OCFS2_JOURNAL_ACCESS_WRITE);
1219 if (status < 0) { 1302 if (status < 0) {
1220 mlog_errno(status); 1303 mlog_errno(status);
@@ -1274,9 +1357,7 @@ bail:
1274 * returns back the new extent block so you can add a branch to it 1357 * returns back the new extent block so you can add a branch to it
1275 * after this call. 1358 * after this call.
1276 */ 1359 */
1277static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1360static int ocfs2_shift_tree_depth(handle_t *handle,
1278 handle_t *handle,
1279 struct inode *inode,
1280 struct ocfs2_extent_tree *et, 1361 struct ocfs2_extent_tree *et,
1281 struct ocfs2_alloc_context *meta_ac, 1362 struct ocfs2_alloc_context *meta_ac,
1282 struct buffer_head **ret_new_eb_bh) 1363 struct buffer_head **ret_new_eb_bh)
@@ -1290,7 +1371,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1290 1371
1291 mlog_entry_void(); 1372 mlog_entry_void();
1292 1373
1293 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 1374 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1294 &new_eb_bh); 1375 &new_eb_bh);
1295 if (status < 0) { 1376 if (status < 0) {
1296 mlog_errno(status); 1377 mlog_errno(status);
@@ -1304,7 +1385,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1304 eb_el = &eb->h_list; 1385 eb_el = &eb->h_list;
1305 root_el = et->et_root_el; 1386 root_el = et->et_root_el;
1306 1387
1307 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, 1388 status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1308 OCFS2_JOURNAL_ACCESS_CREATE); 1389 OCFS2_JOURNAL_ACCESS_CREATE);
1309 if (status < 0) { 1390 if (status < 0) {
1310 mlog_errno(status); 1391 mlog_errno(status);
@@ -1323,7 +1404,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1323 goto bail; 1404 goto bail;
1324 } 1405 }
1325 1406
1326 status = ocfs2_et_root_journal_access(handle, inode, et, 1407 status = ocfs2_et_root_journal_access(handle, et,
1327 OCFS2_JOURNAL_ACCESS_WRITE); 1408 OCFS2_JOURNAL_ACCESS_WRITE);
1328 if (status < 0) { 1409 if (status < 0) {
1329 mlog_errno(status); 1410 mlog_errno(status);
@@ -1379,9 +1460,7 @@ bail:
1379 * 1460 *
1380 * return status < 0 indicates an error. 1461 * return status < 0 indicates an error.
1381 */ 1462 */
1382static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1463static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1383 struct inode *inode,
1384 struct ocfs2_extent_tree *et,
1385 struct buffer_head **target_bh) 1464 struct buffer_head **target_bh)
1386{ 1465{
1387 int status = 0, i; 1466 int status = 0, i;
@@ -1399,19 +1478,21 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1399 1478
1400 while(le16_to_cpu(el->l_tree_depth) > 1) { 1479 while(le16_to_cpu(el->l_tree_depth) > 1) {
1401 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1480 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1402 ocfs2_error(inode->i_sb, "Dinode %llu has empty " 1481 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1482 "Owner %llu has empty "
1403 "extent list (next_free_rec == 0)", 1483 "extent list (next_free_rec == 0)",
1404 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1484 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1405 status = -EIO; 1485 status = -EIO;
1406 goto bail; 1486 goto bail;
1407 } 1487 }
1408 i = le16_to_cpu(el->l_next_free_rec) - 1; 1488 i = le16_to_cpu(el->l_next_free_rec) - 1;
1409 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1489 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1410 if (!blkno) { 1490 if (!blkno) {
1411 ocfs2_error(inode->i_sb, "Dinode %llu has extent " 1491 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1492 "Owner %llu has extent "
1412 "list where extent # %d has no physical " 1493 "list where extent # %d has no physical "
1413 "block start", 1494 "block start",
1414 (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 1495 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1415 status = -EIO; 1496 status = -EIO;
1416 goto bail; 1497 goto bail;
1417 } 1498 }
@@ -1419,7 +1500,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1419 brelse(bh); 1500 brelse(bh);
1420 bh = NULL; 1501 bh = NULL;
1421 1502
1422 status = ocfs2_read_extent_block(inode, blkno, &bh); 1503 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1423 if (status < 0) { 1504 if (status < 0) {
1424 mlog_errno(status); 1505 mlog_errno(status);
1425 goto bail; 1506 goto bail;
@@ -1460,20 +1541,18 @@ bail:
1460 * 1541 *
1461 * *last_eb_bh will be updated by ocfs2_add_branch(). 1542 * *last_eb_bh will be updated by ocfs2_add_branch().
1462 */ 1543 */
1463static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1544static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1464 struct ocfs2_extent_tree *et, int *final_depth, 1545 int *final_depth, struct buffer_head **last_eb_bh,
1465 struct buffer_head **last_eb_bh,
1466 struct ocfs2_alloc_context *meta_ac) 1546 struct ocfs2_alloc_context *meta_ac)
1467{ 1547{
1468 int ret, shift; 1548 int ret, shift;
1469 struct ocfs2_extent_list *el = et->et_root_el; 1549 struct ocfs2_extent_list *el = et->et_root_el;
1470 int depth = le16_to_cpu(el->l_tree_depth); 1550 int depth = le16_to_cpu(el->l_tree_depth);
1471 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1472 struct buffer_head *bh = NULL; 1551 struct buffer_head *bh = NULL;
1473 1552
1474 BUG_ON(meta_ac == NULL); 1553 BUG_ON(meta_ac == NULL);
1475 1554
1476 shift = ocfs2_find_branch_target(osb, inode, et, &bh); 1555 shift = ocfs2_find_branch_target(et, &bh);
1477 if (shift < 0) { 1556 if (shift < 0) {
1478 ret = shift; 1557 ret = shift;
1479 mlog_errno(ret); 1558 mlog_errno(ret);
@@ -1490,8 +1569,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1490 /* ocfs2_shift_tree_depth will return us a buffer with 1569 /* ocfs2_shift_tree_depth will return us a buffer with
1491 * the new extent block (so we can pass that to 1570 * the new extent block (so we can pass that to
1492 * ocfs2_add_branch). */ 1571 * ocfs2_add_branch). */
1493 ret = ocfs2_shift_tree_depth(osb, handle, inode, et, 1572 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1494 meta_ac, &bh);
1495 if (ret < 0) { 1573 if (ret < 0) {
1496 mlog_errno(ret); 1574 mlog_errno(ret);
1497 goto out; 1575 goto out;
@@ -1517,7 +1595,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1517 /* call ocfs2_add_branch to add the final part of the tree with 1595 /* call ocfs2_add_branch to add the final part of the tree with
1518 * the new data. */ 1596 * the new data. */
1519 mlog(0, "add branch. bh = %p\n", bh); 1597 mlog(0, "add branch. bh = %p\n", bh);
1520 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, 1598 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1521 meta_ac); 1599 meta_ac);
1522 if (ret < 0) { 1600 if (ret < 0) {
1523 mlog_errno(ret); 1601 mlog_errno(ret);
@@ -1687,7 +1765,7 @@ set_and_inc:
1687 * 1765 *
1688 * The array index of the subtree root is passed back. 1766 * The array index of the subtree root is passed back.
1689 */ 1767 */
1690static int ocfs2_find_subtree_root(struct inode *inode, 1768static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1691 struct ocfs2_path *left, 1769 struct ocfs2_path *left,
1692 struct ocfs2_path *right) 1770 struct ocfs2_path *right)
1693{ 1771{
@@ -1705,10 +1783,10 @@ static int ocfs2_find_subtree_root(struct inode *inode,
1705 * The caller didn't pass two adjacent paths. 1783 * The caller didn't pass two adjacent paths.
1706 */ 1784 */
1707 mlog_bug_on_msg(i > left->p_tree_depth, 1785 mlog_bug_on_msg(i > left->p_tree_depth,
1708 "Inode %lu, left depth %u, right depth %u\n" 1786 "Owner %llu, left depth %u, right depth %u\n"
1709 "left leaf blk %llu, right leaf blk %llu\n", 1787 "left leaf blk %llu, right leaf blk %llu\n",
1710 inode->i_ino, left->p_tree_depth, 1788 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1711 right->p_tree_depth, 1789 left->p_tree_depth, right->p_tree_depth,
1712 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1790 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1713 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1791 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1714 } while (left->p_node[i].bh->b_blocknr == 1792 } while (left->p_node[i].bh->b_blocknr ==
@@ -1725,7 +1803,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *);
1725 * This code can be called with a cpos larger than the tree, in which 1803 * This code can be called with a cpos larger than the tree, in which
1726 * case it will return the rightmost path. 1804 * case it will return the rightmost path.
1727 */ 1805 */
1728static int __ocfs2_find_path(struct inode *inode, 1806static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1729 struct ocfs2_extent_list *root_el, u32 cpos, 1807 struct ocfs2_extent_list *root_el, u32 cpos,
1730 path_insert_t *func, void *data) 1808 path_insert_t *func, void *data)
1731{ 1809{
@@ -1736,15 +1814,14 @@ static int __ocfs2_find_path(struct inode *inode,
1736 struct ocfs2_extent_block *eb; 1814 struct ocfs2_extent_block *eb;
1737 struct ocfs2_extent_list *el; 1815 struct ocfs2_extent_list *el;
1738 struct ocfs2_extent_rec *rec; 1816 struct ocfs2_extent_rec *rec;
1739 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1740 1817
1741 el = root_el; 1818 el = root_el;
1742 while (el->l_tree_depth) { 1819 while (el->l_tree_depth) {
1743 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1820 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1744 ocfs2_error(inode->i_sb, 1821 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1745 "Inode %llu has empty extent list at " 1822 "Owner %llu has empty extent list at "
1746 "depth %u\n", 1823 "depth %u\n",
1747 (unsigned long long)oi->ip_blkno, 1824 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1748 le16_to_cpu(el->l_tree_depth)); 1825 le16_to_cpu(el->l_tree_depth));
1749 ret = -EROFS; 1826 ret = -EROFS;
1750 goto out; 1827 goto out;
@@ -1767,10 +1844,10 @@ static int __ocfs2_find_path(struct inode *inode,
1767 1844
1768 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1845 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1769 if (blkno == 0) { 1846 if (blkno == 0) {
1770 ocfs2_error(inode->i_sb, 1847 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1771 "Inode %llu has bad blkno in extent list " 1848 "Owner %llu has bad blkno in extent list "
1772 "at depth %u (index %d)\n", 1849 "at depth %u (index %d)\n",
1773 (unsigned long long)oi->ip_blkno, 1850 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1774 le16_to_cpu(el->l_tree_depth), i); 1851 le16_to_cpu(el->l_tree_depth), i);
1775 ret = -EROFS; 1852 ret = -EROFS;
1776 goto out; 1853 goto out;
@@ -1778,7 +1855,7 @@ static int __ocfs2_find_path(struct inode *inode,
1778 1855
1779 brelse(bh); 1856 brelse(bh);
1780 bh = NULL; 1857 bh = NULL;
1781 ret = ocfs2_read_extent_block(inode, blkno, &bh); 1858 ret = ocfs2_read_extent_block(ci, blkno, &bh);
1782 if (ret) { 1859 if (ret) {
1783 mlog_errno(ret); 1860 mlog_errno(ret);
1784 goto out; 1861 goto out;
@@ -1789,10 +1866,10 @@ static int __ocfs2_find_path(struct inode *inode,
1789 1866
1790 if (le16_to_cpu(el->l_next_free_rec) > 1867 if (le16_to_cpu(el->l_next_free_rec) >
1791 le16_to_cpu(el->l_count)) { 1868 le16_to_cpu(el->l_count)) {
1792 ocfs2_error(inode->i_sb, 1869 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1793 "Inode %llu has bad count in extent list " 1870 "Owner %llu has bad count in extent list "
1794 "at block %llu (next free=%u, count=%u)\n", 1871 "at block %llu (next free=%u, count=%u)\n",
1795 (unsigned long long)oi->ip_blkno, 1872 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1796 (unsigned long long)bh->b_blocknr, 1873 (unsigned long long)bh->b_blocknr,
1797 le16_to_cpu(el->l_next_free_rec), 1874 le16_to_cpu(el->l_next_free_rec),
1798 le16_to_cpu(el->l_count)); 1875 le16_to_cpu(el->l_count));
@@ -1836,14 +1913,14 @@ static void find_path_ins(void *data, struct buffer_head *bh)
1836 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1913 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1837 fp->index++; 1914 fp->index++;
1838} 1915}
1839static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1916int ocfs2_find_path(struct ocfs2_caching_info *ci,
1840 u32 cpos) 1917 struct ocfs2_path *path, u32 cpos)
1841{ 1918{
1842 struct find_path_data data; 1919 struct find_path_data data;
1843 1920
1844 data.index = 1; 1921 data.index = 1;
1845 data.path = path; 1922 data.path = path;
1846 return __ocfs2_find_path(inode, path_root_el(path), cpos, 1923 return __ocfs2_find_path(ci, path_root_el(path), cpos,
1847 find_path_ins, &data); 1924 find_path_ins, &data);
1848} 1925}
1849 1926
@@ -1868,13 +1945,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
1868 * 1945 *
1869 * This function doesn't handle non btree extent lists. 1946 * This function doesn't handle non btree extent lists.
1870 */ 1947 */
1871int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1948int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1872 u32 cpos, struct buffer_head **leaf_bh) 1949 struct ocfs2_extent_list *root_el, u32 cpos,
1950 struct buffer_head **leaf_bh)
1873{ 1951{
1874 int ret; 1952 int ret;
1875 struct buffer_head *bh = NULL; 1953 struct buffer_head *bh = NULL;
1876 1954
1877 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1955 ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1878 if (ret) { 1956 if (ret) {
1879 mlog_errno(ret); 1957 mlog_errno(ret);
1880 goto out; 1958 goto out;
@@ -1980,7 +2058,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1980 * - When we've adjusted the last extent record in the left path leaf and the 2058 * - When we've adjusted the last extent record in the left path leaf and the
1981 * 1st extent record in the right path leaf during cross extent block merge. 2059 * 1st extent record in the right path leaf during cross extent block merge.
1982 */ 2060 */
1983static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 2061static void ocfs2_complete_edge_insert(handle_t *handle,
1984 struct ocfs2_path *left_path, 2062 struct ocfs2_path *left_path,
1985 struct ocfs2_path *right_path, 2063 struct ocfs2_path *right_path,
1986 int subtree_index) 2064 int subtree_index)
@@ -2058,8 +2136,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
2058 mlog_errno(ret); 2136 mlog_errno(ret);
2059} 2137}
2060 2138
2061static int ocfs2_rotate_subtree_right(struct inode *inode, 2139static int ocfs2_rotate_subtree_right(handle_t *handle,
2062 handle_t *handle, 2140 struct ocfs2_extent_tree *et,
2063 struct ocfs2_path *left_path, 2141 struct ocfs2_path *left_path,
2064 struct ocfs2_path *right_path, 2142 struct ocfs2_path *right_path,
2065 int subtree_index) 2143 int subtree_index)
@@ -2075,10 +2153,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2075 left_el = path_leaf_el(left_path); 2153 left_el = path_leaf_el(left_path);
2076 2154
2077 if (left_el->l_next_free_rec != left_el->l_count) { 2155 if (left_el->l_next_free_rec != left_el->l_count) {
2078 ocfs2_error(inode->i_sb, 2156 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2079 "Inode %llu has non-full interior leaf node %llu" 2157 "Inode %llu has non-full interior leaf node %llu"
2080 "(next free = %u)", 2158 "(next free = %u)",
2081 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2159 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2082 (unsigned long long)left_leaf_bh->b_blocknr, 2160 (unsigned long long)left_leaf_bh->b_blocknr,
2083 le16_to_cpu(left_el->l_next_free_rec)); 2161 le16_to_cpu(left_el->l_next_free_rec));
2084 return -EROFS; 2162 return -EROFS;
@@ -2094,7 +2172,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2094 root_bh = left_path->p_node[subtree_index].bh; 2172 root_bh = left_path->p_node[subtree_index].bh;
2095 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2173 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2096 2174
2097 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 2175 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2098 subtree_index); 2176 subtree_index);
2099 if (ret) { 2177 if (ret) {
2100 mlog_errno(ret); 2178 mlog_errno(ret);
@@ -2102,14 +2180,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2102 } 2180 }
2103 2181
2104 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2182 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2105 ret = ocfs2_path_bh_journal_access(handle, inode, 2183 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2106 right_path, i); 2184 right_path, i);
2107 if (ret) { 2185 if (ret) {
2108 mlog_errno(ret); 2186 mlog_errno(ret);
2109 goto out; 2187 goto out;
2110 } 2188 }
2111 2189
2112 ret = ocfs2_path_bh_journal_access(handle, inode, 2190 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2113 left_path, i); 2191 left_path, i);
2114 if (ret) { 2192 if (ret) {
2115 mlog_errno(ret); 2193 mlog_errno(ret);
@@ -2123,7 +2201,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2123 /* This is a code error, not a disk corruption. */ 2201 /* This is a code error, not a disk corruption. */
2124 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 2202 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2125 "because rightmost leaf block %llu is empty\n", 2203 "because rightmost leaf block %llu is empty\n",
2126 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2204 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2127 (unsigned long long)right_leaf_bh->b_blocknr); 2205 (unsigned long long)right_leaf_bh->b_blocknr);
2128 2206
2129 ocfs2_create_empty_extent(right_el); 2207 ocfs2_create_empty_extent(right_el);
@@ -2157,8 +2235,8 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2157 goto out; 2235 goto out;
2158 } 2236 }
2159 2237
2160 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2238 ocfs2_complete_edge_insert(handle, left_path, right_path,
2161 subtree_index); 2239 subtree_index);
2162 2240
2163out: 2241out:
2164 return ret; 2242 return ret;
@@ -2248,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2248 int op_credits, 2326 int op_credits,
2249 struct ocfs2_path *path) 2327 struct ocfs2_path *path)
2250{ 2328{
2329 int ret;
2251 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2330 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2252 2331
2253 if (handle->h_buffer_credits < credits) 2332 if (handle->h_buffer_credits < credits) {
2254 return ocfs2_extend_trans(handle, credits); 2333 ret = ocfs2_extend_trans(handle,
2334 credits - handle->h_buffer_credits);
2335 if (ret)
2336 return ret;
2337
2338 if (unlikely(handle->h_buffer_credits < credits))
2339 return ocfs2_extend_trans(handle, credits);
2340 }
2255 2341
2256 return 0; 2342 return 0;
2257} 2343}
@@ -2321,8 +2407,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2321 * *ret_left_path will contain a valid path which can be passed to 2407 * *ret_left_path will contain a valid path which can be passed to
2322 * ocfs2_insert_path(). 2408 * ocfs2_insert_path().
2323 */ 2409 */
2324static int ocfs2_rotate_tree_right(struct inode *inode, 2410static int ocfs2_rotate_tree_right(handle_t *handle,
2325 handle_t *handle, 2411 struct ocfs2_extent_tree *et,
2326 enum ocfs2_split_type split, 2412 enum ocfs2_split_type split,
2327 u32 insert_cpos, 2413 u32 insert_cpos,
2328 struct ocfs2_path *right_path, 2414 struct ocfs2_path *right_path,
@@ -2331,6 +2417,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2331 int ret, start, orig_credits = handle->h_buffer_credits; 2417 int ret, start, orig_credits = handle->h_buffer_credits;
2332 u32 cpos; 2418 u32 cpos;
2333 struct ocfs2_path *left_path = NULL; 2419 struct ocfs2_path *left_path = NULL;
2420 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2334 2421
2335 *ret_left_path = NULL; 2422 *ret_left_path = NULL;
2336 2423
@@ -2341,7 +2428,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2341 goto out; 2428 goto out;
2342 } 2429 }
2343 2430
2344 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 2431 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2345 if (ret) { 2432 if (ret) {
2346 mlog_errno(ret); 2433 mlog_errno(ret);
2347 goto out; 2434 goto out;
@@ -2379,7 +2466,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2379 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2466 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2380 insert_cpos, cpos); 2467 insert_cpos, cpos);
2381 2468
2382 ret = ocfs2_find_path(inode, left_path, cpos); 2469 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2383 if (ret) { 2470 if (ret) {
2384 mlog_errno(ret); 2471 mlog_errno(ret);
2385 goto out; 2472 goto out;
@@ -2387,10 +2474,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2387 2474
2388 mlog_bug_on_msg(path_leaf_bh(left_path) == 2475 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2389 path_leaf_bh(right_path), 2476 path_leaf_bh(right_path),
2390 "Inode %lu: error during insert of %u " 2477 "Owner %llu: error during insert of %u "
2391 "(left path cpos %u) results in two identical " 2478 "(left path cpos %u) results in two identical "
2392 "paths ending at %llu\n", 2479 "paths ending at %llu\n",
2393 inode->i_ino, insert_cpos, cpos, 2480 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2481 insert_cpos, cpos,
2394 (unsigned long long) 2482 (unsigned long long)
2395 path_leaf_bh(left_path)->b_blocknr); 2483 path_leaf_bh(left_path)->b_blocknr);
2396 2484
@@ -2416,7 +2504,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2416 goto out_ret_path; 2504 goto out_ret_path;
2417 } 2505 }
2418 2506
2419 start = ocfs2_find_subtree_root(inode, left_path, right_path); 2507 start = ocfs2_find_subtree_root(et, left_path, right_path);
2420 2508
2421 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2509 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2422 start, 2510 start,
@@ -2430,7 +2518,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2430 goto out; 2518 goto out;
2431 } 2519 }
2432 2520
2433 ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 2521 ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2434 right_path, start); 2522 right_path, start);
2435 if (ret) { 2523 if (ret) {
2436 mlog_errno(ret); 2524 mlog_errno(ret);
@@ -2462,8 +2550,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2462 */ 2550 */
2463 ocfs2_mv_path(right_path, left_path); 2551 ocfs2_mv_path(right_path, left_path);
2464 2552
2465 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 2553 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2466 &cpos);
2467 if (ret) { 2554 if (ret) {
2468 mlog_errno(ret); 2555 mlog_errno(ret);
2469 goto out; 2556 goto out;
@@ -2477,7 +2564,8 @@ out_ret_path:
2477 return ret; 2564 return ret;
2478} 2565}
2479 2566
2480static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, 2567static int ocfs2_update_edge_lengths(handle_t *handle,
2568 struct ocfs2_extent_tree *et,
2481 int subtree_index, struct ocfs2_path *path) 2569 int subtree_index, struct ocfs2_path *path)
2482{ 2570{
2483 int i, idx, ret; 2571 int i, idx, ret;
@@ -2502,7 +2590,7 @@ static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2502 goto out; 2590 goto out;
2503 } 2591 }
2504 2592
2505 ret = ocfs2_journal_access_path(inode, handle, path); 2593 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2506 if (ret) { 2594 if (ret) {
2507 mlog_errno(ret); 2595 mlog_errno(ret);
2508 goto out; 2596 goto out;
@@ -2532,7 +2620,8 @@ out:
2532 return ret; 2620 return ret;
2533} 2621}
2534 2622
2535static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, 2623static void ocfs2_unlink_path(handle_t *handle,
2624 struct ocfs2_extent_tree *et,
2536 struct ocfs2_cached_dealloc_ctxt *dealloc, 2625 struct ocfs2_cached_dealloc_ctxt *dealloc,
2537 struct ocfs2_path *path, int unlink_start) 2626 struct ocfs2_path *path, int unlink_start)
2538{ 2627{
@@ -2554,12 +2643,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2554 mlog(ML_ERROR, 2643 mlog(ML_ERROR,
2555 "Inode %llu, attempted to remove extent block " 2644 "Inode %llu, attempted to remove extent block "
2556 "%llu with %u records\n", 2645 "%llu with %u records\n",
2557 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2646 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2558 (unsigned long long)le64_to_cpu(eb->h_blkno), 2647 (unsigned long long)le64_to_cpu(eb->h_blkno),
2559 le16_to_cpu(el->l_next_free_rec)); 2648 le16_to_cpu(el->l_next_free_rec));
2560 2649
2561 ocfs2_journal_dirty(handle, bh); 2650 ocfs2_journal_dirty(handle, bh);
2562 ocfs2_remove_from_cache(inode, bh); 2651 ocfs2_remove_from_cache(et->et_ci, bh);
2563 continue; 2652 continue;
2564 } 2653 }
2565 2654
@@ -2572,11 +2661,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2572 if (ret) 2661 if (ret)
2573 mlog_errno(ret); 2662 mlog_errno(ret);
2574 2663
2575 ocfs2_remove_from_cache(inode, bh); 2664 ocfs2_remove_from_cache(et->et_ci, bh);
2576 } 2665 }
2577} 2666}
2578 2667
2579static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, 2668static void ocfs2_unlink_subtree(handle_t *handle,
2669 struct ocfs2_extent_tree *et,
2580 struct ocfs2_path *left_path, 2670 struct ocfs2_path *left_path,
2581 struct ocfs2_path *right_path, 2671 struct ocfs2_path *right_path,
2582 int subtree_index, 2672 int subtree_index,
@@ -2607,17 +2697,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2607 ocfs2_journal_dirty(handle, root_bh); 2697 ocfs2_journal_dirty(handle, root_bh);
2608 ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2698 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2609 2699
2610 ocfs2_unlink_path(inode, handle, dealloc, right_path, 2700 ocfs2_unlink_path(handle, et, dealloc, right_path,
2611 subtree_index + 1); 2701 subtree_index + 1);
2612} 2702}
2613 2703
2614static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, 2704static int ocfs2_rotate_subtree_left(handle_t *handle,
2705 struct ocfs2_extent_tree *et,
2615 struct ocfs2_path *left_path, 2706 struct ocfs2_path *left_path,
2616 struct ocfs2_path *right_path, 2707 struct ocfs2_path *right_path,
2617 int subtree_index, 2708 int subtree_index,
2618 struct ocfs2_cached_dealloc_ctxt *dealloc, 2709 struct ocfs2_cached_dealloc_ctxt *dealloc,
2619 int *deleted, 2710 int *deleted)
2620 struct ocfs2_extent_tree *et)
2621{ 2711{
2622 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2712 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2623 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); 2713 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
@@ -2653,7 +2743,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2653 return -EAGAIN; 2743 return -EAGAIN;
2654 2744
2655 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2745 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2656 ret = ocfs2_journal_access_eb(handle, inode, 2746 ret = ocfs2_journal_access_eb(handle, et->et_ci,
2657 path_leaf_bh(right_path), 2747 path_leaf_bh(right_path),
2658 OCFS2_JOURNAL_ACCESS_WRITE); 2748 OCFS2_JOURNAL_ACCESS_WRITE);
2659 if (ret) { 2749 if (ret) {
@@ -2672,7 +2762,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2672 * We have to update i_last_eb_blk during the meta 2762 * We have to update i_last_eb_blk during the meta
2673 * data delete. 2763 * data delete.
2674 */ 2764 */
2675 ret = ocfs2_et_root_journal_access(handle, inode, et, 2765 ret = ocfs2_et_root_journal_access(handle, et,
2676 OCFS2_JOURNAL_ACCESS_WRITE); 2766 OCFS2_JOURNAL_ACCESS_WRITE);
2677 if (ret) { 2767 if (ret) {
2678 mlog_errno(ret); 2768 mlog_errno(ret);
@@ -2688,7 +2778,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2688 */ 2778 */
2689 BUG_ON(right_has_empty && !del_right_subtree); 2779 BUG_ON(right_has_empty && !del_right_subtree);
2690 2780
2691 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 2781 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2692 subtree_index); 2782 subtree_index);
2693 if (ret) { 2783 if (ret) {
2694 mlog_errno(ret); 2784 mlog_errno(ret);
@@ -2696,14 +2786,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2696 } 2786 }
2697 2787
2698 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2788 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2699 ret = ocfs2_path_bh_journal_access(handle, inode, 2789 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2700 right_path, i); 2790 right_path, i);
2701 if (ret) { 2791 if (ret) {
2702 mlog_errno(ret); 2792 mlog_errno(ret);
2703 goto out; 2793 goto out;
2704 } 2794 }
2705 2795
2706 ret = ocfs2_path_bh_journal_access(handle, inode, 2796 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2707 left_path, i); 2797 left_path, i);
2708 if (ret) { 2798 if (ret) {
2709 mlog_errno(ret); 2799 mlog_errno(ret);
@@ -2740,9 +2830,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2740 mlog_errno(ret); 2830 mlog_errno(ret);
2741 2831
2742 if (del_right_subtree) { 2832 if (del_right_subtree) {
2743 ocfs2_unlink_subtree(inode, handle, left_path, right_path, 2833 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2744 subtree_index, dealloc); 2834 subtree_index, dealloc);
2745 ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, 2835 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2746 left_path); 2836 left_path);
2747 if (ret) { 2837 if (ret) {
2748 mlog_errno(ret); 2838 mlog_errno(ret);
@@ -2766,7 +2856,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2766 2856
2767 *deleted = 1; 2857 *deleted = 1;
2768 } else 2858 } else
2769 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2859 ocfs2_complete_edge_insert(handle, left_path, right_path,
2770 subtree_index); 2860 subtree_index);
2771 2861
2772out: 2862out:
@@ -2852,8 +2942,8 @@ out:
2852 return ret; 2942 return ret;
2853} 2943}
2854 2944
2855static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2945static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2856 handle_t *handle, 2946 struct ocfs2_extent_tree *et,
2857 struct ocfs2_path *path) 2947 struct ocfs2_path *path)
2858{ 2948{
2859 int ret; 2949 int ret;
@@ -2863,7 +2953,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2863 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2953 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2864 return 0; 2954 return 0;
2865 2955
2866 ret = ocfs2_path_bh_journal_access(handle, inode, path, 2956 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2867 path_num_items(path) - 1); 2957 path_num_items(path) - 1);
2868 if (ret) { 2958 if (ret) {
2869 mlog_errno(ret); 2959 mlog_errno(ret);
@@ -2880,24 +2970,24 @@ out:
2880 return ret; 2970 return ret;
2881} 2971}
2882 2972
2883static int __ocfs2_rotate_tree_left(struct inode *inode, 2973static int __ocfs2_rotate_tree_left(handle_t *handle,
2884 handle_t *handle, int orig_credits, 2974 struct ocfs2_extent_tree *et,
2975 int orig_credits,
2885 struct ocfs2_path *path, 2976 struct ocfs2_path *path,
2886 struct ocfs2_cached_dealloc_ctxt *dealloc, 2977 struct ocfs2_cached_dealloc_ctxt *dealloc,
2887 struct ocfs2_path **empty_extent_path, 2978 struct ocfs2_path **empty_extent_path)
2888 struct ocfs2_extent_tree *et)
2889{ 2979{
2890 int ret, subtree_root, deleted; 2980 int ret, subtree_root, deleted;
2891 u32 right_cpos; 2981 u32 right_cpos;
2892 struct ocfs2_path *left_path = NULL; 2982 struct ocfs2_path *left_path = NULL;
2893 struct ocfs2_path *right_path = NULL; 2983 struct ocfs2_path *right_path = NULL;
2984 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2894 2985
2895 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2986 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2896 2987
2897 *empty_extent_path = NULL; 2988 *empty_extent_path = NULL;
2898 2989
2899 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, 2990 ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2900 &right_cpos);
2901 if (ret) { 2991 if (ret) {
2902 mlog_errno(ret); 2992 mlog_errno(ret);
2903 goto out; 2993 goto out;
@@ -2920,13 +3010,13 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2920 } 3010 }
2921 3011
2922 while (right_cpos) { 3012 while (right_cpos) {
2923 ret = ocfs2_find_path(inode, right_path, right_cpos); 3013 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2924 if (ret) { 3014 if (ret) {
2925 mlog_errno(ret); 3015 mlog_errno(ret);
2926 goto out; 3016 goto out;
2927 } 3017 }
2928 3018
2929 subtree_root = ocfs2_find_subtree_root(inode, left_path, 3019 subtree_root = ocfs2_find_subtree_root(et, left_path,
2930 right_path); 3020 right_path);
2931 3021
2932 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 3022 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
@@ -2946,16 +3036,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2946 * Caller might still want to make changes to the 3036 * Caller might still want to make changes to the
2947 * tree root, so re-add it to the journal here. 3037 * tree root, so re-add it to the journal here.
2948 */ 3038 */
2949 ret = ocfs2_path_bh_journal_access(handle, inode, 3039 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2950 left_path, 0); 3040 left_path, 0);
2951 if (ret) { 3041 if (ret) {
2952 mlog_errno(ret); 3042 mlog_errno(ret);
2953 goto out; 3043 goto out;
2954 } 3044 }
2955 3045
2956 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 3046 ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2957 right_path, subtree_root, 3047 right_path, subtree_root,
2958 dealloc, &deleted, et); 3048 dealloc, &deleted);
2959 if (ret == -EAGAIN) { 3049 if (ret == -EAGAIN) {
2960 /* 3050 /*
2961 * The rotation has to temporarily stop due to 3051 * The rotation has to temporarily stop due to
@@ -2982,7 +3072,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2982 3072
2983 ocfs2_mv_path(left_path, right_path); 3073 ocfs2_mv_path(left_path, right_path);
2984 3074
2985 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 3075 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
2986 &right_cpos); 3076 &right_cpos);
2987 if (ret) { 3077 if (ret) {
2988 mlog_errno(ret); 3078 mlog_errno(ret);
@@ -2997,10 +3087,10 @@ out:
2997 return ret; 3087 return ret;
2998} 3088}
2999 3089
3000static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 3090static int ocfs2_remove_rightmost_path(handle_t *handle,
3091 struct ocfs2_extent_tree *et,
3001 struct ocfs2_path *path, 3092 struct ocfs2_path *path,
3002 struct ocfs2_cached_dealloc_ctxt *dealloc, 3093 struct ocfs2_cached_dealloc_ctxt *dealloc)
3003 struct ocfs2_extent_tree *et)
3004{ 3094{
3005 int ret, subtree_index; 3095 int ret, subtree_index;
3006 u32 cpos; 3096 u32 cpos;
@@ -3009,7 +3099,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3009 struct ocfs2_extent_list *el; 3099 struct ocfs2_extent_list *el;
3010 3100
3011 3101
3012 ret = ocfs2_et_sanity_check(inode, et); 3102 ret = ocfs2_et_sanity_check(et);
3013 if (ret) 3103 if (ret)
3014 goto out; 3104 goto out;
3015 /* 3105 /*
@@ -3024,13 +3114,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3024 goto out; 3114 goto out;
3025 } 3115 }
3026 3116
3027 ret = ocfs2_journal_access_path(inode, handle, path); 3117 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3028 if (ret) { 3118 if (ret) {
3029 mlog_errno(ret); 3119 mlog_errno(ret);
3030 goto out; 3120 goto out;
3031 } 3121 }
3032 3122
3033 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 3123 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3124 path, &cpos);
3034 if (ret) { 3125 if (ret) {
3035 mlog_errno(ret); 3126 mlog_errno(ret);
3036 goto out; 3127 goto out;
@@ -3048,23 +3139,23 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3048 goto out; 3139 goto out;
3049 } 3140 }
3050 3141
3051 ret = ocfs2_find_path(inode, left_path, cpos); 3142 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3052 if (ret) { 3143 if (ret) {
3053 mlog_errno(ret); 3144 mlog_errno(ret);
3054 goto out; 3145 goto out;
3055 } 3146 }
3056 3147
3057 ret = ocfs2_journal_access_path(inode, handle, left_path); 3148 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3058 if (ret) { 3149 if (ret) {
3059 mlog_errno(ret); 3150 mlog_errno(ret);
3060 goto out; 3151 goto out;
3061 } 3152 }
3062 3153
3063 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 3154 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3064 3155
3065 ocfs2_unlink_subtree(inode, handle, left_path, path, 3156 ocfs2_unlink_subtree(handle, et, left_path, path,
3066 subtree_index, dealloc); 3157 subtree_index, dealloc);
3067 ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, 3158 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3068 left_path); 3159 left_path);
3069 if (ret) { 3160 if (ret) {
3070 mlog_errno(ret); 3161 mlog_errno(ret);
@@ -3078,10 +3169,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3078 * 'path' is also the leftmost path which 3169 * 'path' is also the leftmost path which
3079 * means it must be the only one. This gets 3170 * means it must be the only one. This gets
3080 * handled differently because we want to 3171 * handled differently because we want to
3081 * revert the inode back to having extents 3172 * revert the root back to having extents
3082 * in-line. 3173 * in-line.
3083 */ 3174 */
3084 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 3175 ocfs2_unlink_path(handle, et, dealloc, path, 1);
3085 3176
3086 el = et->et_root_el; 3177 el = et->et_root_el;
3087 el->l_tree_depth = 0; 3178 el->l_tree_depth = 0;
@@ -3114,10 +3205,10 @@ out:
3114 * the rightmost tree leaf record is removed so the caller is 3205 * the rightmost tree leaf record is removed so the caller is
3115 * responsible for detecting and correcting that. 3206 * responsible for detecting and correcting that.
3116 */ 3207 */
3117static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 3208static int ocfs2_rotate_tree_left(handle_t *handle,
3209 struct ocfs2_extent_tree *et,
3118 struct ocfs2_path *path, 3210 struct ocfs2_path *path,
3119 struct ocfs2_cached_dealloc_ctxt *dealloc, 3211 struct ocfs2_cached_dealloc_ctxt *dealloc)
3120 struct ocfs2_extent_tree *et)
3121{ 3212{
3122 int ret, orig_credits = handle->h_buffer_credits; 3213 int ret, orig_credits = handle->h_buffer_credits;
3123 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 3214 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -3134,8 +3225,7 @@ rightmost_no_delete:
3134 * Inline extents. This is trivially handled, so do 3225 * Inline extents. This is trivially handled, so do
3135 * it up front. 3226 * it up front.
3136 */ 3227 */
3137 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 3228 ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3138 path);
3139 if (ret) 3229 if (ret)
3140 mlog_errno(ret); 3230 mlog_errno(ret);
3141 goto out; 3231 goto out;
@@ -3151,7 +3241,7 @@ rightmost_no_delete:
3151 * 3241 *
3152 * 1) is handled via ocfs2_rotate_rightmost_leaf_left() 3242 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
3153 * 2a) we need the left branch so that we can update it with the unlink 3243 * 2a) we need the left branch so that we can update it with the unlink
3154 * 2b) we need to bring the inode back to inline extents. 3244 * 2b) we need to bring the root back to inline extents.
3155 */ 3245 */
3156 3246
3157 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 3247 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
@@ -3167,9 +3257,9 @@ rightmost_no_delete:
3167 3257
3168 if (le16_to_cpu(el->l_next_free_rec) == 0) { 3258 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3169 ret = -EIO; 3259 ret = -EIO;
3170 ocfs2_error(inode->i_sb, 3260 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3171 "Inode %llu has empty extent block at %llu", 3261 "Owner %llu has empty extent block at %llu",
3172 (unsigned long long)OCFS2_I(inode)->ip_blkno, 3262 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3173 (unsigned long long)le64_to_cpu(eb->h_blkno)); 3263 (unsigned long long)le64_to_cpu(eb->h_blkno));
3174 goto out; 3264 goto out;
3175 } 3265 }
@@ -3183,8 +3273,8 @@ rightmost_no_delete:
3183 * nonempty list. 3273 * nonempty list.
3184 */ 3274 */
3185 3275
3186 ret = ocfs2_remove_rightmost_path(inode, handle, path, 3276 ret = ocfs2_remove_rightmost_path(handle, et, path,
3187 dealloc, et); 3277 dealloc);
3188 if (ret) 3278 if (ret)
3189 mlog_errno(ret); 3279 mlog_errno(ret);
3190 goto out; 3280 goto out;
@@ -3195,8 +3285,8 @@ rightmost_no_delete:
3195 * and restarting from there. 3285 * and restarting from there.
3196 */ 3286 */
3197try_rotate: 3287try_rotate:
3198 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 3288 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3199 dealloc, &restart_path, et); 3289 dealloc, &restart_path);
3200 if (ret && ret != -EAGAIN) { 3290 if (ret && ret != -EAGAIN) {
3201 mlog_errno(ret); 3291 mlog_errno(ret);
3202 goto out; 3292 goto out;
@@ -3206,9 +3296,9 @@ try_rotate:
3206 tmp_path = restart_path; 3296 tmp_path = restart_path;
3207 restart_path = NULL; 3297 restart_path = NULL;
3208 3298
3209 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 3299 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3210 tmp_path, dealloc, 3300 tmp_path, dealloc,
3211 &restart_path, et); 3301 &restart_path);
3212 if (ret && ret != -EAGAIN) { 3302 if (ret && ret != -EAGAIN) {
3213 mlog_errno(ret); 3303 mlog_errno(ret);
3214 goto out; 3304 goto out;
@@ -3259,7 +3349,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3259 } 3349 }
3260} 3350}
3261 3351
3262static int ocfs2_get_right_path(struct inode *inode, 3352static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3263 struct ocfs2_path *left_path, 3353 struct ocfs2_path *left_path,
3264 struct ocfs2_path **ret_right_path) 3354 struct ocfs2_path **ret_right_path)
3265{ 3355{
@@ -3276,8 +3366,8 @@ static int ocfs2_get_right_path(struct inode *inode,
3276 left_el = path_leaf_el(left_path); 3366 left_el = path_leaf_el(left_path);
3277 BUG_ON(left_el->l_next_free_rec != left_el->l_count); 3367 BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3278 3368
3279 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 3369 ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3280 &right_cpos); 3370 left_path, &right_cpos);
3281 if (ret) { 3371 if (ret) {
3282 mlog_errno(ret); 3372 mlog_errno(ret);
3283 goto out; 3373 goto out;
@@ -3293,7 +3383,7 @@ static int ocfs2_get_right_path(struct inode *inode,
3293 goto out; 3383 goto out;
3294 } 3384 }
3295 3385
3296 ret = ocfs2_find_path(inode, right_path, right_cpos); 3386 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3297 if (ret) { 3387 if (ret) {
3298 mlog_errno(ret); 3388 mlog_errno(ret);
3299 goto out; 3389 goto out;
@@ -3313,9 +3403,9 @@ out:
3313 * For index == l_count - 1, the "next" means the 1st extent rec of the 3403 * For index == l_count - 1, the "next" means the 1st extent rec of the
3314 * next extent block. 3404 * next extent block.
3315 */ 3405 */
3316static int ocfs2_merge_rec_right(struct inode *inode, 3406static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3317 struct ocfs2_path *left_path,
3318 handle_t *handle, 3407 handle_t *handle,
3408 struct ocfs2_extent_tree *et,
3319 struct ocfs2_extent_rec *split_rec, 3409 struct ocfs2_extent_rec *split_rec,
3320 int index) 3410 int index)
3321{ 3411{
@@ -3336,7 +3426,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3336 if (index == le16_to_cpu(el->l_next_free_rec) - 1 && 3426 if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3337 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { 3427 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3338 /* we meet with a cross extent block merge. */ 3428 /* we meet with a cross extent block merge. */
3339 ret = ocfs2_get_right_path(inode, left_path, &right_path); 3429 ret = ocfs2_get_right_path(et, left_path, &right_path);
3340 if (ret) { 3430 if (ret) {
3341 mlog_errno(ret); 3431 mlog_errno(ret);
3342 goto out; 3432 goto out;
@@ -3355,8 +3445,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3355 le16_to_cpu(left_rec->e_leaf_clusters) != 3445 le16_to_cpu(left_rec->e_leaf_clusters) !=
3356 le32_to_cpu(right_rec->e_cpos)); 3446 le32_to_cpu(right_rec->e_cpos));
3357 3447
3358 subtree_index = ocfs2_find_subtree_root(inode, 3448 subtree_index = ocfs2_find_subtree_root(et, left_path,
3359 left_path, right_path); 3449 right_path);
3360 3450
3361 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3451 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3362 handle->h_buffer_credits, 3452 handle->h_buffer_credits,
@@ -3369,7 +3459,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3369 root_bh = left_path->p_node[subtree_index].bh; 3459 root_bh = left_path->p_node[subtree_index].bh;
3370 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3460 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3371 3461
3372 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 3462 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3373 subtree_index); 3463 subtree_index);
3374 if (ret) { 3464 if (ret) {
3375 mlog_errno(ret); 3465 mlog_errno(ret);
@@ -3378,14 +3468,14 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3378 3468
3379 for (i = subtree_index + 1; 3469 for (i = subtree_index + 1;
3380 i < path_num_items(right_path); i++) { 3470 i < path_num_items(right_path); i++) {
3381 ret = ocfs2_path_bh_journal_access(handle, inode, 3471 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3382 right_path, i); 3472 right_path, i);
3383 if (ret) { 3473 if (ret) {
3384 mlog_errno(ret); 3474 mlog_errno(ret);
3385 goto out; 3475 goto out;
3386 } 3476 }
3387 3477
3388 ret = ocfs2_path_bh_journal_access(handle, inode, 3478 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3389 left_path, i); 3479 left_path, i);
3390 if (ret) { 3480 if (ret) {
3391 mlog_errno(ret); 3481 mlog_errno(ret);
@@ -3398,7 +3488,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3398 right_rec = &el->l_recs[index + 1]; 3488 right_rec = &el->l_recs[index + 1];
3399 } 3489 }
3400 3490
3401 ret = ocfs2_path_bh_journal_access(handle, inode, left_path, 3491 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3402 path_num_items(left_path) - 1); 3492 path_num_items(left_path) - 1);
3403 if (ret) { 3493 if (ret) {
3404 mlog_errno(ret); 3494 mlog_errno(ret);
@@ -3409,7 +3499,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3409 3499
3410 le32_add_cpu(&right_rec->e_cpos, -split_clusters); 3500 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3411 le64_add_cpu(&right_rec->e_blkno, 3501 le64_add_cpu(&right_rec->e_blkno,
3412 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3502 -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3503 split_clusters));
3413 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); 3504 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3414 3505
3415 ocfs2_cleanup_merge(el, index); 3506 ocfs2_cleanup_merge(el, index);
@@ -3423,8 +3514,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3423 if (ret) 3514 if (ret)
3424 mlog_errno(ret); 3515 mlog_errno(ret);
3425 3516
3426 ocfs2_complete_edge_insert(inode, handle, left_path, 3517 ocfs2_complete_edge_insert(handle, left_path, right_path,
3427 right_path, subtree_index); 3518 subtree_index);
3428 } 3519 }
3429out: 3520out:
3430 if (right_path) 3521 if (right_path)
@@ -3432,7 +3523,7 @@ out:
3432 return ret; 3523 return ret;
3433} 3524}
3434 3525
3435static int ocfs2_get_left_path(struct inode *inode, 3526static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3436 struct ocfs2_path *right_path, 3527 struct ocfs2_path *right_path,
3437 struct ocfs2_path **ret_left_path) 3528 struct ocfs2_path **ret_left_path)
3438{ 3529{
@@ -3445,7 +3536,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3445 /* This function shouldn't be called for non-trees. */ 3536 /* This function shouldn't be called for non-trees. */
3446 BUG_ON(right_path->p_tree_depth == 0); 3537 BUG_ON(right_path->p_tree_depth == 0);
3447 3538
3448 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 3539 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3449 right_path, &left_cpos); 3540 right_path, &left_cpos);
3450 if (ret) { 3541 if (ret) {
3451 mlog_errno(ret); 3542 mlog_errno(ret);
@@ -3462,7 +3553,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3462 goto out; 3553 goto out;
3463 } 3554 }
3464 3555
3465 ret = ocfs2_find_path(inode, left_path, left_cpos); 3556 ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3466 if (ret) { 3557 if (ret) {
3467 mlog_errno(ret); 3558 mlog_errno(ret);
3468 goto out; 3559 goto out;
@@ -3485,12 +3576,11 @@ out:
3485 * remove the rightmost leaf extent block in the right_path and change 3576 * remove the rightmost leaf extent block in the right_path and change
3486 * the right path to indicate the new rightmost path. 3577 * the right path to indicate the new rightmost path.
3487 */ 3578 */
3488static int ocfs2_merge_rec_left(struct inode *inode, 3579static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3489 struct ocfs2_path *right_path,
3490 handle_t *handle, 3580 handle_t *handle,
3581 struct ocfs2_extent_tree *et,
3491 struct ocfs2_extent_rec *split_rec, 3582 struct ocfs2_extent_rec *split_rec,
3492 struct ocfs2_cached_dealloc_ctxt *dealloc, 3583 struct ocfs2_cached_dealloc_ctxt *dealloc,
3493 struct ocfs2_extent_tree *et,
3494 int index) 3584 int index)
3495{ 3585{
3496 int ret, i, subtree_index = 0, has_empty_extent = 0; 3586 int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3508,7 +3598,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3508 right_rec = &el->l_recs[index]; 3598 right_rec = &el->l_recs[index];
3509 if (index == 0) { 3599 if (index == 0) {
3510 /* we meet with a cross extent block merge. */ 3600 /* we meet with a cross extent block merge. */
3511 ret = ocfs2_get_left_path(inode, right_path, &left_path); 3601 ret = ocfs2_get_left_path(et, right_path, &left_path);
3512 if (ret) { 3602 if (ret) {
3513 mlog_errno(ret); 3603 mlog_errno(ret);
3514 goto out; 3604 goto out;
@@ -3524,8 +3614,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3524 le16_to_cpu(left_rec->e_leaf_clusters) != 3614 le16_to_cpu(left_rec->e_leaf_clusters) !=
3525 le32_to_cpu(split_rec->e_cpos)); 3615 le32_to_cpu(split_rec->e_cpos));
3526 3616
3527 subtree_index = ocfs2_find_subtree_root(inode, 3617 subtree_index = ocfs2_find_subtree_root(et, left_path,
3528 left_path, right_path); 3618 right_path);
3529 3619
3530 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3620 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3531 handle->h_buffer_credits, 3621 handle->h_buffer_credits,
@@ -3538,7 +3628,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3538 root_bh = left_path->p_node[subtree_index].bh; 3628 root_bh = left_path->p_node[subtree_index].bh;
3539 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3629 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3540 3630
3541 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 3631 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3542 subtree_index); 3632 subtree_index);
3543 if (ret) { 3633 if (ret) {
3544 mlog_errno(ret); 3634 mlog_errno(ret);
@@ -3547,14 +3637,14 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3547 3637
3548 for (i = subtree_index + 1; 3638 for (i = subtree_index + 1;
3549 i < path_num_items(right_path); i++) { 3639 i < path_num_items(right_path); i++) {
3550 ret = ocfs2_path_bh_journal_access(handle, inode, 3640 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3551 right_path, i); 3641 right_path, i);
3552 if (ret) { 3642 if (ret) {
3553 mlog_errno(ret); 3643 mlog_errno(ret);
3554 goto out; 3644 goto out;
3555 } 3645 }
3556 3646
3557 ret = ocfs2_path_bh_journal_access(handle, inode, 3647 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3558 left_path, i); 3648 left_path, i);
3559 if (ret) { 3649 if (ret) {
3560 mlog_errno(ret); 3650 mlog_errno(ret);
@@ -3567,7 +3657,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3567 has_empty_extent = 1; 3657 has_empty_extent = 1;
3568 } 3658 }
3569 3659
3570 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 3660 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3571 path_num_items(right_path) - 1); 3661 path_num_items(right_path) - 1);
3572 if (ret) { 3662 if (ret) {
3573 mlog_errno(ret); 3663 mlog_errno(ret);
@@ -3586,7 +3676,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3586 3676
3587 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3677 le32_add_cpu(&right_rec->e_cpos, split_clusters);
3588 le64_add_cpu(&right_rec->e_blkno, 3678 le64_add_cpu(&right_rec->e_blkno,
3589 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3679 ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3680 split_clusters));
3590 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); 3681 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3591 3682
3592 ocfs2_cleanup_merge(el, index); 3683 ocfs2_cleanup_merge(el, index);
@@ -3608,9 +3699,9 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3608 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3699 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3609 le16_to_cpu(el->l_next_free_rec) == 1) { 3700 le16_to_cpu(el->l_next_free_rec) == 1) {
3610 3701
3611 ret = ocfs2_remove_rightmost_path(inode, handle, 3702 ret = ocfs2_remove_rightmost_path(handle, et,
3612 right_path, 3703 right_path,
3613 dealloc, et); 3704 dealloc);
3614 if (ret) { 3705 if (ret) {
3615 mlog_errno(ret); 3706 mlog_errno(ret);
3616 goto out; 3707 goto out;
@@ -3622,7 +3713,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3622 ocfs2_mv_path(right_path, left_path); 3713 ocfs2_mv_path(right_path, left_path);
3623 left_path = NULL; 3714 left_path = NULL;
3624 } else 3715 } else
3625 ocfs2_complete_edge_insert(inode, handle, left_path, 3716 ocfs2_complete_edge_insert(handle, left_path,
3626 right_path, subtree_index); 3717 right_path, subtree_index);
3627 } 3718 }
3628out: 3719out:
@@ -3631,15 +3722,13 @@ out:
3631 return ret; 3722 return ret;
3632} 3723}
3633 3724
3634static int ocfs2_try_to_merge_extent(struct inode *inode, 3725static int ocfs2_try_to_merge_extent(handle_t *handle,
3635 handle_t *handle, 3726 struct ocfs2_extent_tree *et,
3636 struct ocfs2_path *path, 3727 struct ocfs2_path *path,
3637 int split_index, 3728 int split_index,
3638 struct ocfs2_extent_rec *split_rec, 3729 struct ocfs2_extent_rec *split_rec,
3639 struct ocfs2_cached_dealloc_ctxt *dealloc, 3730 struct ocfs2_cached_dealloc_ctxt *dealloc,
3640 struct ocfs2_merge_ctxt *ctxt, 3731 struct ocfs2_merge_ctxt *ctxt)
3641 struct ocfs2_extent_tree *et)
3642
3643{ 3732{
3644 int ret = 0; 3733 int ret = 0;
3645 struct ocfs2_extent_list *el = path_leaf_el(path); 3734 struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -3655,8 +3744,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3655 * extents - having more than one in a leaf is 3744 * extents - having more than one in a leaf is
3656 * illegal. 3745 * illegal.
3657 */ 3746 */
3658 ret = ocfs2_rotate_tree_left(inode, handle, path, 3747 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3659 dealloc, et);
3660 if (ret) { 3748 if (ret) {
3661 mlog_errno(ret); 3749 mlog_errno(ret);
3662 goto out; 3750 goto out;
@@ -3685,8 +3773,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3685 * prevoius extent block. It is more efficient and easier 3773 * prevoius extent block. It is more efficient and easier
3686 * if we do merge_right first and merge_left later. 3774 * if we do merge_right first and merge_left later.
3687 */ 3775 */
3688 ret = ocfs2_merge_rec_right(inode, path, 3776 ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3689 handle, split_rec,
3690 split_index); 3777 split_index);
3691 if (ret) { 3778 if (ret) {
3692 mlog_errno(ret); 3779 mlog_errno(ret);
@@ -3699,8 +3786,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3699 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3786 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3700 3787
3701 /* The merge left us with an empty extent, remove it. */ 3788 /* The merge left us with an empty extent, remove it. */
3702 ret = ocfs2_rotate_tree_left(inode, handle, path, 3789 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3703 dealloc, et);
3704 if (ret) { 3790 if (ret) {
3705 mlog_errno(ret); 3791 mlog_errno(ret);
3706 goto out; 3792 goto out;
@@ -3712,18 +3798,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3712 * Note that we don't pass split_rec here on purpose - 3798 * Note that we don't pass split_rec here on purpose -
3713 * we've merged it into the rec already. 3799 * we've merged it into the rec already.
3714 */ 3800 */
3715 ret = ocfs2_merge_rec_left(inode, path, 3801 ret = ocfs2_merge_rec_left(path, handle, et, rec,
3716 handle, rec, 3802 dealloc, split_index);
3717 dealloc, et,
3718 split_index);
3719 3803
3720 if (ret) { 3804 if (ret) {
3721 mlog_errno(ret); 3805 mlog_errno(ret);
3722 goto out; 3806 goto out;
3723 } 3807 }
3724 3808
3725 ret = ocfs2_rotate_tree_left(inode, handle, path, 3809 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3726 dealloc, et);
3727 /* 3810 /*
3728 * Error from this last rotate is not critical, so 3811 * Error from this last rotate is not critical, so
3729 * print but don't bubble it up. 3812 * print but don't bubble it up.
@@ -3740,19 +3823,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3740 * the record on the left (hence the left merge). 3823 * the record on the left (hence the left merge).
3741 */ 3824 */
3742 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3825 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3743 ret = ocfs2_merge_rec_left(inode, 3826 ret = ocfs2_merge_rec_left(path, handle, et,
3744 path, 3827 split_rec, dealloc,
3745 handle, split_rec,
3746 dealloc, et,
3747 split_index); 3828 split_index);
3748 if (ret) { 3829 if (ret) {
3749 mlog_errno(ret); 3830 mlog_errno(ret);
3750 goto out; 3831 goto out;
3751 } 3832 }
3752 } else { 3833 } else {
3753 ret = ocfs2_merge_rec_right(inode, 3834 ret = ocfs2_merge_rec_right(path, handle,
3754 path, 3835 et, split_rec,
3755 handle, split_rec,
3756 split_index); 3836 split_index);
3757 if (ret) { 3837 if (ret) {
3758 mlog_errno(ret); 3838 mlog_errno(ret);
@@ -3765,8 +3845,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3765 * The merge may have left an empty extent in 3845 * The merge may have left an empty extent in
3766 * our leaf. Try to rotate it away. 3846 * our leaf. Try to rotate it away.
3767 */ 3847 */
3768 ret = ocfs2_rotate_tree_left(inode, handle, path, 3848 ret = ocfs2_rotate_tree_left(handle, et, path,
3769 dealloc, et); 3849 dealloc);
3770 if (ret) 3850 if (ret)
3771 mlog_errno(ret); 3851 mlog_errno(ret);
3772 ret = 0; 3852 ret = 0;
@@ -3812,10 +3892,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb,
3812 * list. If this leaf is part of an allocation tree, it is assumed 3892 * list. If this leaf is part of an allocation tree, it is assumed
3813 * that the tree above has been prepared. 3893 * that the tree above has been prepared.
3814 */ 3894 */
3815static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 3895static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3896 struct ocfs2_extent_rec *insert_rec,
3816 struct ocfs2_extent_list *el, 3897 struct ocfs2_extent_list *el,
3817 struct ocfs2_insert_type *insert, 3898 struct ocfs2_insert_type *insert)
3818 struct inode *inode)
3819{ 3899{
3820 int i = insert->ins_contig_index; 3900 int i = insert->ins_contig_index;
3821 unsigned int range; 3901 unsigned int range;
@@ -3827,7 +3907,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3827 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); 3907 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3828 BUG_ON(i == -1); 3908 BUG_ON(i == -1);
3829 rec = &el->l_recs[i]; 3909 rec = &el->l_recs[i];
3830 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, 3910 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3911 insert->ins_split, rec,
3831 insert_rec); 3912 insert_rec);
3832 goto rotate; 3913 goto rotate;
3833 } 3914 }
@@ -3869,10 +3950,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3869 3950
3870 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 3951 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3871 le16_to_cpu(el->l_count), 3952 le16_to_cpu(el->l_count),
3872 "inode %lu, depth %u, count %u, next free %u, " 3953 "owner %llu, depth %u, count %u, next free %u, "
3873 "rec.cpos %u, rec.clusters %u, " 3954 "rec.cpos %u, rec.clusters %u, "
3874 "insert.cpos %u, insert.clusters %u\n", 3955 "insert.cpos %u, insert.clusters %u\n",
3875 inode->i_ino, 3956 ocfs2_metadata_cache_owner(et->et_ci),
3876 le16_to_cpu(el->l_tree_depth), 3957 le16_to_cpu(el->l_tree_depth),
3877 le16_to_cpu(el->l_count), 3958 le16_to_cpu(el->l_count),
3878 le16_to_cpu(el->l_next_free_rec), 3959 le16_to_cpu(el->l_next_free_rec),
@@ -3900,8 +3981,8 @@ rotate:
3900 ocfs2_rotate_leaf(el, insert_rec); 3981 ocfs2_rotate_leaf(el, insert_rec);
3901} 3982}
3902 3983
3903static void ocfs2_adjust_rightmost_records(struct inode *inode, 3984static void ocfs2_adjust_rightmost_records(handle_t *handle,
3904 handle_t *handle, 3985 struct ocfs2_extent_tree *et,
3905 struct ocfs2_path *path, 3986 struct ocfs2_path *path,
3906 struct ocfs2_extent_rec *insert_rec) 3987 struct ocfs2_extent_rec *insert_rec)
3907{ 3988{
@@ -3919,9 +4000,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
3919 4000
3920 next_free = le16_to_cpu(el->l_next_free_rec); 4001 next_free = le16_to_cpu(el->l_next_free_rec);
3921 if (next_free == 0) { 4002 if (next_free == 0) {
3922 ocfs2_error(inode->i_sb, 4003 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3923 "Dinode %llu has a bad extent list", 4004 "Owner %llu has a bad extent list",
3924 (unsigned long long)OCFS2_I(inode)->ip_blkno); 4005 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3925 ret = -EIO; 4006 ret = -EIO;
3926 return; 4007 return;
3927 } 4008 }
@@ -3941,7 +4022,8 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
3941 } 4022 }
3942} 4023}
3943 4024
3944static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 4025static int ocfs2_append_rec_to_path(handle_t *handle,
4026 struct ocfs2_extent_tree *et,
3945 struct ocfs2_extent_rec *insert_rec, 4027 struct ocfs2_extent_rec *insert_rec,
3946 struct ocfs2_path *right_path, 4028 struct ocfs2_path *right_path,
3947 struct ocfs2_path **ret_left_path) 4029 struct ocfs2_path **ret_left_path)
@@ -3969,8 +4051,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3969 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 4051 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3970 u32 left_cpos; 4052 u32 left_cpos;
3971 4053
3972 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 4054 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3973 &left_cpos); 4055 right_path, &left_cpos);
3974 if (ret) { 4056 if (ret) {
3975 mlog_errno(ret); 4057 mlog_errno(ret);
3976 goto out; 4058 goto out;
@@ -3992,7 +4074,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3992 goto out; 4074 goto out;
3993 } 4075 }
3994 4076
3995 ret = ocfs2_find_path(inode, left_path, left_cpos); 4077 ret = ocfs2_find_path(et->et_ci, left_path,
4078 left_cpos);
3996 if (ret) { 4079 if (ret) {
3997 mlog_errno(ret); 4080 mlog_errno(ret);
3998 goto out; 4081 goto out;
@@ -4005,13 +4088,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
4005 } 4088 }
4006 } 4089 }
4007 4090
4008 ret = ocfs2_journal_access_path(inode, handle, right_path); 4091 ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4009 if (ret) { 4092 if (ret) {
4010 mlog_errno(ret); 4093 mlog_errno(ret);
4011 goto out; 4094 goto out;
4012 } 4095 }
4013 4096
4014 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); 4097 ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4015 4098
4016 *ret_left_path = left_path; 4099 *ret_left_path = left_path;
4017 ret = 0; 4100 ret = 0;
@@ -4022,7 +4105,7 @@ out:
4022 return ret; 4105 return ret;
4023} 4106}
4024 4107
4025static void ocfs2_split_record(struct inode *inode, 4108static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4026 struct ocfs2_path *left_path, 4109 struct ocfs2_path *left_path,
4027 struct ocfs2_path *right_path, 4110 struct ocfs2_path *right_path,
4028 struct ocfs2_extent_rec *split_rec, 4111 struct ocfs2_extent_rec *split_rec,
@@ -4095,7 +4178,8 @@ static void ocfs2_split_record(struct inode *inode,
4095 } 4178 }
4096 4179
4097 rec = &el->l_recs[index]; 4180 rec = &el->l_recs[index];
4098 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); 4181 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4182 split, rec, split_rec);
4099 ocfs2_rotate_leaf(insert_el, split_rec); 4183 ocfs2_rotate_leaf(insert_el, split_rec);
4100} 4184}
4101 4185
@@ -4107,8 +4191,8 @@ static void ocfs2_split_record(struct inode *inode,
4107 * in. left_path should only be passed in if we need to update that 4191 * in. left_path should only be passed in if we need to update that
4108 * portion of the tree after an edge insert. 4192 * portion of the tree after an edge insert.
4109 */ 4193 */
4110static int ocfs2_insert_path(struct inode *inode, 4194static int ocfs2_insert_path(handle_t *handle,
4111 handle_t *handle, 4195 struct ocfs2_extent_tree *et,
4112 struct ocfs2_path *left_path, 4196 struct ocfs2_path *left_path,
4113 struct ocfs2_path *right_path, 4197 struct ocfs2_path *right_path,
4114 struct ocfs2_extent_rec *insert_rec, 4198 struct ocfs2_extent_rec *insert_rec,
@@ -4134,7 +4218,7 @@ static int ocfs2_insert_path(struct inode *inode,
4134 goto out; 4218 goto out;
4135 } 4219 }
4136 4220
4137 ret = ocfs2_journal_access_path(inode, handle, left_path); 4221 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4138 if (ret < 0) { 4222 if (ret < 0) {
4139 mlog_errno(ret); 4223 mlog_errno(ret);
4140 goto out; 4224 goto out;
@@ -4145,7 +4229,7 @@ static int ocfs2_insert_path(struct inode *inode,
4145 * Pass both paths to the journal. The majority of inserts 4229 * Pass both paths to the journal. The majority of inserts
4146 * will be touching all components anyway. 4230 * will be touching all components anyway.
4147 */ 4231 */
4148 ret = ocfs2_journal_access_path(inode, handle, right_path); 4232 ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4149 if (ret < 0) { 4233 if (ret < 0) {
4150 mlog_errno(ret); 4234 mlog_errno(ret);
4151 goto out; 4235 goto out;
@@ -4157,7 +4241,7 @@ static int ocfs2_insert_path(struct inode *inode,
4157 * of splits, but it's easier to just let one separate 4241 * of splits, but it's easier to just let one separate
4158 * function sort it all out. 4242 * function sort it all out.
4159 */ 4243 */
4160 ocfs2_split_record(inode, left_path, right_path, 4244 ocfs2_split_record(et, left_path, right_path,
4161 insert_rec, insert->ins_split); 4245 insert_rec, insert->ins_split);
4162 4246
4163 /* 4247 /*
@@ -4171,8 +4255,8 @@ static int ocfs2_insert_path(struct inode *inode,
4171 if (ret) 4255 if (ret)
4172 mlog_errno(ret); 4256 mlog_errno(ret);
4173 } else 4257 } else
4174 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 4258 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4175 insert, inode); 4259 insert);
4176 4260
4177 ret = ocfs2_journal_dirty(handle, leaf_bh); 4261 ret = ocfs2_journal_dirty(handle, leaf_bh);
4178 if (ret) 4262 if (ret)
@@ -4185,10 +4269,10 @@ static int ocfs2_insert_path(struct inode *inode,
4185 * 4269 *
4186 * XXX: Should we extend the transaction here? 4270 * XXX: Should we extend the transaction here?
4187 */ 4271 */
4188 subtree_index = ocfs2_find_subtree_root(inode, left_path, 4272 subtree_index = ocfs2_find_subtree_root(et, left_path,
4189 right_path); 4273 right_path);
4190 ocfs2_complete_edge_insert(inode, handle, left_path, 4274 ocfs2_complete_edge_insert(handle, left_path, right_path,
4191 right_path, subtree_index); 4275 subtree_index);
4192 } 4276 }
4193 4277
4194 ret = 0; 4278 ret = 0;
@@ -4196,8 +4280,7 @@ out:
4196 return ret; 4280 return ret;
4197} 4281}
4198 4282
4199static int ocfs2_do_insert_extent(struct inode *inode, 4283static int ocfs2_do_insert_extent(handle_t *handle,
4200 handle_t *handle,
4201 struct ocfs2_extent_tree *et, 4284 struct ocfs2_extent_tree *et,
4202 struct ocfs2_extent_rec *insert_rec, 4285 struct ocfs2_extent_rec *insert_rec,
4203 struct ocfs2_insert_type *type) 4286 struct ocfs2_insert_type *type)
@@ -4210,7 +4293,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4210 4293
4211 el = et->et_root_el; 4294 el = et->et_root_el;
4212 4295
4213 ret = ocfs2_et_root_journal_access(handle, inode, et, 4296 ret = ocfs2_et_root_journal_access(handle, et,
4214 OCFS2_JOURNAL_ACCESS_WRITE); 4297 OCFS2_JOURNAL_ACCESS_WRITE);
4215 if (ret) { 4298 if (ret) {
4216 mlog_errno(ret); 4299 mlog_errno(ret);
@@ -4218,7 +4301,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4218 } 4301 }
4219 4302
4220 if (le16_to_cpu(el->l_tree_depth) == 0) { 4303 if (le16_to_cpu(el->l_tree_depth) == 0) {
4221 ocfs2_insert_at_leaf(insert_rec, el, type, inode); 4304 ocfs2_insert_at_leaf(et, insert_rec, el, type);
4222 goto out_update_clusters; 4305 goto out_update_clusters;
4223 } 4306 }
4224 4307
@@ -4241,7 +4324,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4241 cpos = UINT_MAX; 4324 cpos = UINT_MAX;
4242 } 4325 }
4243 4326
4244 ret = ocfs2_find_path(inode, right_path, cpos); 4327 ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4245 if (ret) { 4328 if (ret) {
4246 mlog_errno(ret); 4329 mlog_errno(ret);
4247 goto out; 4330 goto out;
@@ -4260,7 +4343,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4260 * can wind up skipping both of these two special cases... 4343 * can wind up skipping both of these two special cases...
4261 */ 4344 */
4262 if (rotate) { 4345 if (rotate) {
4263 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, 4346 ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4264 le32_to_cpu(insert_rec->e_cpos), 4347 le32_to_cpu(insert_rec->e_cpos),
4265 right_path, &left_path); 4348 right_path, &left_path);
4266 if (ret) { 4349 if (ret) {
@@ -4272,7 +4355,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4272 * ocfs2_rotate_tree_right() might have extended the 4355 * ocfs2_rotate_tree_right() might have extended the
4273 * transaction without re-journaling our tree root. 4356 * transaction without re-journaling our tree root.
4274 */ 4357 */
4275 ret = ocfs2_et_root_journal_access(handle, inode, et, 4358 ret = ocfs2_et_root_journal_access(handle, et,
4276 OCFS2_JOURNAL_ACCESS_WRITE); 4359 OCFS2_JOURNAL_ACCESS_WRITE);
4277 if (ret) { 4360 if (ret) {
4278 mlog_errno(ret); 4361 mlog_errno(ret);
@@ -4280,7 +4363,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4280 } 4363 }
4281 } else if (type->ins_appending == APPEND_TAIL 4364 } else if (type->ins_appending == APPEND_TAIL
4282 && type->ins_contig != CONTIG_LEFT) { 4365 && type->ins_contig != CONTIG_LEFT) {
4283 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 4366 ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4284 right_path, &left_path); 4367 right_path, &left_path);
4285 if (ret) { 4368 if (ret) {
4286 mlog_errno(ret); 4369 mlog_errno(ret);
@@ -4288,7 +4371,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4288 } 4371 }
4289 } 4372 }
4290 4373
4291 ret = ocfs2_insert_path(inode, handle, left_path, right_path, 4374 ret = ocfs2_insert_path(handle, et, left_path, right_path,
4292 insert_rec, type); 4375 insert_rec, type);
4293 if (ret) { 4376 if (ret) {
4294 mlog_errno(ret); 4377 mlog_errno(ret);
@@ -4297,7 +4380,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4297 4380
4298out_update_clusters: 4381out_update_clusters:
4299 if (type->ins_split == SPLIT_NONE) 4382 if (type->ins_split == SPLIT_NONE)
4300 ocfs2_et_update_clusters(inode, et, 4383 ocfs2_et_update_clusters(et,
4301 le16_to_cpu(insert_rec->e_leaf_clusters)); 4384 le16_to_cpu(insert_rec->e_leaf_clusters));
4302 4385
4303 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4386 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4312,7 +4395,8 @@ out:
4312} 4395}
4313 4396
4314static enum ocfs2_contig_type 4397static enum ocfs2_contig_type
4315ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, 4398ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4399 struct ocfs2_path *path,
4316 struct ocfs2_extent_list *el, int index, 4400 struct ocfs2_extent_list *el, int index,
4317 struct ocfs2_extent_rec *split_rec) 4401 struct ocfs2_extent_rec *split_rec)
4318{ 4402{
@@ -4324,12 +4408,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4324 struct ocfs2_path *left_path = NULL, *right_path = NULL; 4408 struct ocfs2_path *left_path = NULL, *right_path = NULL;
4325 struct buffer_head *bh; 4409 struct buffer_head *bh;
4326 struct ocfs2_extent_block *eb; 4410 struct ocfs2_extent_block *eb;
4411 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4327 4412
4328 if (index > 0) { 4413 if (index > 0) {
4329 rec = &el->l_recs[index - 1]; 4414 rec = &el->l_recs[index - 1];
4330 } else if (path->p_tree_depth > 0) { 4415 } else if (path->p_tree_depth > 0) {
4331 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 4416 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4332 path, &left_cpos);
4333 if (status) 4417 if (status)
4334 goto out; 4418 goto out;
4335 4419
@@ -4338,7 +4422,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4338 if (!left_path) 4422 if (!left_path)
4339 goto out; 4423 goto out;
4340 4424
4341 status = ocfs2_find_path(inode, left_path, left_cpos); 4425 status = ocfs2_find_path(et->et_ci, left_path,
4426 left_cpos);
4342 if (status) 4427 if (status)
4343 goto out; 4428 goto out;
4344 4429
@@ -4348,7 +4433,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4348 le16_to_cpu(new_el->l_count)) { 4433 le16_to_cpu(new_el->l_count)) {
4349 bh = path_leaf_bh(left_path); 4434 bh = path_leaf_bh(left_path);
4350 eb = (struct ocfs2_extent_block *)bh->b_data; 4435 eb = (struct ocfs2_extent_block *)bh->b_data;
4351 ocfs2_error(inode->i_sb, 4436 ocfs2_error(sb,
4352 "Extent block #%llu has an " 4437 "Extent block #%llu has an "
4353 "invalid l_next_free_rec of " 4438 "invalid l_next_free_rec of "
4354 "%d. It should have " 4439 "%d. It should have "
@@ -4373,7 +4458,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4373 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 4458 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4374 ret = CONTIG_RIGHT; 4459 ret = CONTIG_RIGHT;
4375 } else { 4460 } else {
4376 ret = ocfs2_extent_contig(inode, rec, split_rec); 4461 ret = ocfs2_et_extent_contig(et, rec, split_rec);
4377 } 4462 }
4378 } 4463 }
4379 4464
@@ -4382,8 +4467,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4382 rec = &el->l_recs[index + 1]; 4467 rec = &el->l_recs[index + 1];
4383 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4468 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4384 path->p_tree_depth > 0) { 4469 path->p_tree_depth > 0) {
4385 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, 4470 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4386 path, &right_cpos);
4387 if (status) 4471 if (status)
4388 goto out; 4472 goto out;
4389 4473
@@ -4394,7 +4478,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4394 if (!right_path) 4478 if (!right_path)
4395 goto out; 4479 goto out;
4396 4480
4397 status = ocfs2_find_path(inode, right_path, right_cpos); 4481 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4398 if (status) 4482 if (status)
4399 goto out; 4483 goto out;
4400 4484
@@ -4404,7 +4488,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4404 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4488 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4405 bh = path_leaf_bh(right_path); 4489 bh = path_leaf_bh(right_path);
4406 eb = (struct ocfs2_extent_block *)bh->b_data; 4490 eb = (struct ocfs2_extent_block *)bh->b_data;
4407 ocfs2_error(inode->i_sb, 4491 ocfs2_error(sb,
4408 "Extent block #%llu has an " 4492 "Extent block #%llu has an "
4409 "invalid l_next_free_rec of %d", 4493 "invalid l_next_free_rec of %d",
4410 (unsigned long long)le64_to_cpu(eb->h_blkno), 4494 (unsigned long long)le64_to_cpu(eb->h_blkno),
@@ -4419,7 +4503,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4419 if (rec) { 4503 if (rec) {
4420 enum ocfs2_contig_type contig_type; 4504 enum ocfs2_contig_type contig_type;
4421 4505
4422 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 4506 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4423 4507
4424 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4508 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4425 ret = CONTIG_LEFTRIGHT; 4509 ret = CONTIG_LEFTRIGHT;
@@ -4436,11 +4520,10 @@ out:
4436 return ret; 4520 return ret;
4437} 4521}
4438 4522
4439static void ocfs2_figure_contig_type(struct inode *inode, 4523static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4440 struct ocfs2_insert_type *insert, 4524 struct ocfs2_insert_type *insert,
4441 struct ocfs2_extent_list *el, 4525 struct ocfs2_extent_list *el,
4442 struct ocfs2_extent_rec *insert_rec, 4526 struct ocfs2_extent_rec *insert_rec)
4443 struct ocfs2_extent_tree *et)
4444{ 4527{
4445 int i; 4528 int i;
4446 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4529 enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -4448,8 +4531,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
4448 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4531 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4449 4532
4450 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4533 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4451 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 4534 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4452 insert_rec); 4535 insert_rec);
4453 if (contig_type != CONTIG_NONE) { 4536 if (contig_type != CONTIG_NONE) {
4454 insert->ins_contig_index = i; 4537 insert->ins_contig_index = i;
4455 break; 4538 break;
@@ -4530,8 +4613,7 @@ set_tail_append:
4530 * All of the information is stored on the ocfs2_insert_type 4613 * All of the information is stored on the ocfs2_insert_type
4531 * structure. 4614 * structure.
4532 */ 4615 */
4533static int ocfs2_figure_insert_type(struct inode *inode, 4616static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4534 struct ocfs2_extent_tree *et,
4535 struct buffer_head **last_eb_bh, 4617 struct buffer_head **last_eb_bh,
4536 struct ocfs2_extent_rec *insert_rec, 4618 struct ocfs2_extent_rec *insert_rec,
4537 int *free_records, 4619 int *free_records,
@@ -4555,7 +4637,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4555 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4637 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4556 * may want it later. 4638 * may want it later.
4557 */ 4639 */
4558 ret = ocfs2_read_extent_block(inode, 4640 ret = ocfs2_read_extent_block(et->et_ci,
4559 ocfs2_et_get_last_eb_blk(et), 4641 ocfs2_et_get_last_eb_blk(et),
4560 &bh); 4642 &bh);
4561 if (ret) { 4643 if (ret) {
@@ -4578,7 +4660,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4578 le16_to_cpu(el->l_next_free_rec); 4660 le16_to_cpu(el->l_next_free_rec);
4579 4661
4580 if (!insert->ins_tree_depth) { 4662 if (!insert->ins_tree_depth) {
4581 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4663 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4582 ocfs2_figure_appending_type(insert, el, insert_rec); 4664 ocfs2_figure_appending_type(insert, el, insert_rec);
4583 return 0; 4665 return 0;
4584 } 4666 }
@@ -4596,7 +4678,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4596 * us the rightmost tree path. This is accounted for below in 4678 * us the rightmost tree path. This is accounted for below in
4597 * the appending code. 4679 * the appending code.
4598 */ 4680 */
4599 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 4681 ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4600 if (ret) { 4682 if (ret) {
4601 mlog_errno(ret); 4683 mlog_errno(ret);
4602 goto out; 4684 goto out;
@@ -4612,7 +4694,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4612 * into two types of appends: simple record append, or a 4694 * into two types of appends: simple record append, or a
4613 * rotate inside the tail leaf. 4695 * rotate inside the tail leaf.
4614 */ 4696 */
4615 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4697 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4616 4698
4617 /* 4699 /*
4618 * The insert code isn't quite ready to deal with all cases of 4700 * The insert code isn't quite ready to deal with all cases of
@@ -4657,13 +4739,11 @@ out:
4657} 4739}
4658 4740
4659/* 4741/*
4660 * Insert an extent into an inode btree. 4742 * Insert an extent into a btree.
4661 * 4743 *
4662 * The caller needs to update fe->i_clusters 4744 * The caller needs to update the owning btree's cluster count.
4663 */ 4745 */
4664int ocfs2_insert_extent(struct ocfs2_super *osb, 4746int ocfs2_insert_extent(handle_t *handle,
4665 handle_t *handle,
4666 struct inode *inode,
4667 struct ocfs2_extent_tree *et, 4747 struct ocfs2_extent_tree *et,
4668 u32 cpos, 4748 u32 cpos,
4669 u64 start_blk, 4749 u64 start_blk,
@@ -4677,21 +4757,22 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4677 struct ocfs2_insert_type insert = {0, }; 4757 struct ocfs2_insert_type insert = {0, };
4678 struct ocfs2_extent_rec rec; 4758 struct ocfs2_extent_rec rec;
4679 4759
4680 mlog(0, "add %u clusters at position %u to inode %llu\n", 4760 mlog(0, "add %u clusters at position %u to owner %llu\n",
4681 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4761 new_clusters, cpos,
4762 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4682 4763
4683 memset(&rec, 0, sizeof(rec)); 4764 memset(&rec, 0, sizeof(rec));
4684 rec.e_cpos = cpu_to_le32(cpos); 4765 rec.e_cpos = cpu_to_le32(cpos);
4685 rec.e_blkno = cpu_to_le64(start_blk); 4766 rec.e_blkno = cpu_to_le64(start_blk);
4686 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4767 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4687 rec.e_flags = flags; 4768 rec.e_flags = flags;
4688 status = ocfs2_et_insert_check(inode, et, &rec); 4769 status = ocfs2_et_insert_check(et, &rec);
4689 if (status) { 4770 if (status) {
4690 mlog_errno(status); 4771 mlog_errno(status);
4691 goto bail; 4772 goto bail;
4692 } 4773 }
4693 4774
4694 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, 4775 status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4695 &free_records, &insert); 4776 &free_records, &insert);
4696 if (status < 0) { 4777 if (status < 0) {
4697 mlog_errno(status); 4778 mlog_errno(status);
@@ -4705,7 +4786,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4705 free_records, insert.ins_tree_depth); 4786 free_records, insert.ins_tree_depth);
4706 4787
4707 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4788 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4708 status = ocfs2_grow_tree(inode, handle, et, 4789 status = ocfs2_grow_tree(handle, et,
4709 &insert.ins_tree_depth, &last_eb_bh, 4790 &insert.ins_tree_depth, &last_eb_bh,
4710 meta_ac); 4791 meta_ac);
4711 if (status) { 4792 if (status) {
@@ -4715,11 +4796,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4715 } 4796 }
4716 4797
4717 /* Finally, we can add clusters. This might rotate the tree for us. */ 4798 /* Finally, we can add clusters. This might rotate the tree for us. */
4718 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); 4799 status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4719 if (status < 0) 4800 if (status < 0)
4720 mlog_errno(status); 4801 mlog_errno(status);
4721 else if (et->et_ops == &ocfs2_dinode_et_ops) 4802 else
4722 ocfs2_extent_map_insert_rec(inode, &rec); 4803 ocfs2_et_extent_map_insert(et, &rec);
4723 4804
4724bail: 4805bail:
4725 brelse(last_eb_bh); 4806 brelse(last_eb_bh);
@@ -4735,13 +4816,11 @@ bail:
4735 * it is not limited to the file storage. Any extent tree can use this 4816 * it is not limited to the file storage. Any extent tree can use this
4736 * function if it implements the proper ocfs2_extent_tree. 4817 * function if it implements the proper ocfs2_extent_tree.
4737 */ 4818 */
4738int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 4819int ocfs2_add_clusters_in_btree(handle_t *handle,
4739 struct inode *inode, 4820 struct ocfs2_extent_tree *et,
4740 u32 *logical_offset, 4821 u32 *logical_offset,
4741 u32 clusters_to_add, 4822 u32 clusters_to_add,
4742 int mark_unwritten, 4823 int mark_unwritten,
4743 struct ocfs2_extent_tree *et,
4744 handle_t *handle,
4745 struct ocfs2_alloc_context *data_ac, 4824 struct ocfs2_alloc_context *data_ac,
4746 struct ocfs2_alloc_context *meta_ac, 4825 struct ocfs2_alloc_context *meta_ac,
4747 enum ocfs2_alloc_restarted *reason_ret) 4826 enum ocfs2_alloc_restarted *reason_ret)
@@ -4752,13 +4831,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4752 u32 bit_off, num_bits; 4831 u32 bit_off, num_bits;
4753 u64 block; 4832 u64 block;
4754 u8 flags = 0; 4833 u8 flags = 0;
4834 struct ocfs2_super *osb =
4835 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4755 4836
4756 BUG_ON(!clusters_to_add); 4837 BUG_ON(!clusters_to_add);
4757 4838
4758 if (mark_unwritten) 4839 if (mark_unwritten)
4759 flags = OCFS2_EXT_UNWRITTEN; 4840 flags = OCFS2_EXT_UNWRITTEN;
4760 4841
4761 free_extents = ocfs2_num_free_extents(osb, inode, et); 4842 free_extents = ocfs2_num_free_extents(osb, et);
4762 if (free_extents < 0) { 4843 if (free_extents < 0) {
4763 status = free_extents; 4844 status = free_extents;
4764 mlog_errno(status); 4845 mlog_errno(status);
@@ -4795,7 +4876,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4795 BUG_ON(num_bits > clusters_to_add); 4876 BUG_ON(num_bits > clusters_to_add);
4796 4877
4797 /* reserve our write early -- insert_extent may update the tree root */ 4878 /* reserve our write early -- insert_extent may update the tree root */
4798 status = ocfs2_et_root_journal_access(handle, inode, et, 4879 status = ocfs2_et_root_journal_access(handle, et,
4799 OCFS2_JOURNAL_ACCESS_WRITE); 4880 OCFS2_JOURNAL_ACCESS_WRITE);
4800 if (status < 0) { 4881 if (status < 0) {
4801 mlog_errno(status); 4882 mlog_errno(status);
@@ -4803,10 +4884,10 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4803 } 4884 }
4804 4885
4805 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4886 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4806 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 4887 mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
4807 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4888 num_bits, bit_off,
4808 status = ocfs2_insert_extent(osb, handle, inode, et, 4889 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4809 *logical_offset, block, 4890 status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4810 num_bits, flags, meta_ac); 4891 num_bits, flags, meta_ac);
4811 if (status < 0) { 4892 if (status < 0) {
4812 mlog_errno(status); 4893 mlog_errno(status);
@@ -4856,10 +4937,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
4856 split_rec->e_flags = rec->e_flags; 4937 split_rec->e_flags = rec->e_flags;
4857} 4938}
4858 4939
4859static int ocfs2_split_and_insert(struct inode *inode, 4940static int ocfs2_split_and_insert(handle_t *handle,
4860 handle_t *handle,
4861 struct ocfs2_path *path,
4862 struct ocfs2_extent_tree *et, 4941 struct ocfs2_extent_tree *et,
4942 struct ocfs2_path *path,
4863 struct buffer_head **last_eb_bh, 4943 struct buffer_head **last_eb_bh,
4864 int split_index, 4944 int split_index,
4865 struct ocfs2_extent_rec *orig_split_rec, 4945 struct ocfs2_extent_rec *orig_split_rec,
@@ -4892,7 +4972,7 @@ leftright:
4892 4972
4893 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4973 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4894 le16_to_cpu(rightmost_el->l_count)) { 4974 le16_to_cpu(rightmost_el->l_count)) {
4895 ret = ocfs2_grow_tree(inode, handle, et, 4975 ret = ocfs2_grow_tree(handle, et,
4896 &depth, last_eb_bh, meta_ac); 4976 &depth, last_eb_bh, meta_ac);
4897 if (ret) { 4977 if (ret) {
4898 mlog_errno(ret); 4978 mlog_errno(ret);
@@ -4921,8 +5001,8 @@ leftright:
4921 */ 5001 */
4922 insert.ins_split = SPLIT_RIGHT; 5002 insert.ins_split = SPLIT_RIGHT;
4923 5003
4924 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, 5004 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4925 &rec); 5005 &tmprec, insert_range, &rec);
4926 5006
4927 split_rec = tmprec; 5007 split_rec = tmprec;
4928 5008
@@ -4930,7 +5010,7 @@ leftright:
4930 do_leftright = 1; 5010 do_leftright = 1;
4931 } 5011 }
4932 5012
4933 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5013 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4934 if (ret) { 5014 if (ret) {
4935 mlog_errno(ret); 5015 mlog_errno(ret);
4936 goto out; 5016 goto out;
@@ -4946,7 +5026,7 @@ leftright:
4946 ocfs2_reinit_path(path, 1); 5026 ocfs2_reinit_path(path, 1);
4947 5027
4948 cpos = le32_to_cpu(split_rec.e_cpos); 5028 cpos = le32_to_cpu(split_rec.e_cpos);
4949 ret = ocfs2_find_path(inode, path, cpos); 5029 ret = ocfs2_find_path(et->et_ci, path, cpos);
4950 if (ret) { 5030 if (ret) {
4951 mlog_errno(ret); 5031 mlog_errno(ret);
4952 goto out; 5032 goto out;
@@ -4961,8 +5041,8 @@ out:
4961 return ret; 5041 return ret;
4962} 5042}
4963 5043
4964static int ocfs2_replace_extent_rec(struct inode *inode, 5044static int ocfs2_replace_extent_rec(handle_t *handle,
4965 handle_t *handle, 5045 struct ocfs2_extent_tree *et,
4966 struct ocfs2_path *path, 5046 struct ocfs2_path *path,
4967 struct ocfs2_extent_list *el, 5047 struct ocfs2_extent_list *el,
4968 int split_index, 5048 int split_index,
@@ -4970,7 +5050,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode,
4970{ 5050{
4971 int ret; 5051 int ret;
4972 5052
4973 ret = ocfs2_path_bh_journal_access(handle, inode, path, 5053 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4974 path_num_items(path) - 1); 5054 path_num_items(path) - 1);
4975 if (ret) { 5055 if (ret) {
4976 mlog_errno(ret); 5056 mlog_errno(ret);
@@ -4985,9 +5065,8 @@ out:
4985} 5065}
4986 5066
4987/* 5067/*
4988 * Mark part or all of the extent record at split_index in the leaf 5068 * Split part or all of the extent record at split_index in the leaf
4989 * pointed to by path as written. This removes the unwritten 5069 * pointed to by path. Merge with the contiguous extent record if needed.
4990 * extent flag.
4991 * 5070 *
4992 * Care is taken to handle contiguousness so as to not grow the tree. 5071 * Care is taken to handle contiguousness so as to not grow the tree.
4993 * 5072 *
@@ -5004,14 +5083,13 @@ out:
5004 * have been brought into cache (and pinned via the journal), so the 5083 * have been brought into cache (and pinned via the journal), so the
5005 * extra overhead is not expressed in terms of disk reads. 5084 * extra overhead is not expressed in terms of disk reads.
5006 */ 5085 */
5007static int __ocfs2_mark_extent_written(struct inode *inode, 5086int ocfs2_split_extent(handle_t *handle,
5008 struct ocfs2_extent_tree *et, 5087 struct ocfs2_extent_tree *et,
5009 handle_t *handle, 5088 struct ocfs2_path *path,
5010 struct ocfs2_path *path, 5089 int split_index,
5011 int split_index, 5090 struct ocfs2_extent_rec *split_rec,
5012 struct ocfs2_extent_rec *split_rec, 5091 struct ocfs2_alloc_context *meta_ac,
5013 struct ocfs2_alloc_context *meta_ac, 5092 struct ocfs2_cached_dealloc_ctxt *dealloc)
5014 struct ocfs2_cached_dealloc_ctxt *dealloc)
5015{ 5093{
5016 int ret = 0; 5094 int ret = 0;
5017 struct ocfs2_extent_list *el = path_leaf_el(path); 5095 struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5020,12 +5098,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5020 struct ocfs2_merge_ctxt ctxt; 5098 struct ocfs2_merge_ctxt ctxt;
5021 struct ocfs2_extent_list *rightmost_el; 5099 struct ocfs2_extent_list *rightmost_el;
5022 5100
5023 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
5024 ret = -EIO;
5025 mlog_errno(ret);
5026 goto out;
5027 }
5028
5029 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 5101 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5030 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 5102 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5031 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 5103 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -5034,19 +5106,19 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5034 goto out; 5106 goto out;
5035 } 5107 }
5036 5108
5037 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, 5109 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5038 split_index, 5110 split_index,
5039 split_rec); 5111 split_rec);
5040 5112
5041 /* 5113 /*
5042 * The core merge / split code wants to know how much room is 5114 * The core merge / split code wants to know how much room is
5043 * left in this inodes allocation tree, so we pass the 5115 * left in this allocation tree, so we pass the
5044 * rightmost extent list. 5116 * rightmost extent list.
5045 */ 5117 */
5046 if (path->p_tree_depth) { 5118 if (path->p_tree_depth) {
5047 struct ocfs2_extent_block *eb; 5119 struct ocfs2_extent_block *eb;
5048 5120
5049 ret = ocfs2_read_extent_block(inode, 5121 ret = ocfs2_read_extent_block(et->et_ci,
5050 ocfs2_et_get_last_eb_blk(et), 5122 ocfs2_et_get_last_eb_blk(et),
5051 &last_eb_bh); 5123 &last_eb_bh);
5052 if (ret) { 5124 if (ret) {
@@ -5073,19 +5145,18 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5073 5145
5074 if (ctxt.c_contig_type == CONTIG_NONE) { 5146 if (ctxt.c_contig_type == CONTIG_NONE) {
5075 if (ctxt.c_split_covers_rec) 5147 if (ctxt.c_split_covers_rec)
5076 ret = ocfs2_replace_extent_rec(inode, handle, 5148 ret = ocfs2_replace_extent_rec(handle, et, path, el,
5077 path, el,
5078 split_index, split_rec); 5149 split_index, split_rec);
5079 else 5150 else
5080 ret = ocfs2_split_and_insert(inode, handle, path, et, 5151 ret = ocfs2_split_and_insert(handle, et, path,
5081 &last_eb_bh, split_index, 5152 &last_eb_bh, split_index,
5082 split_rec, meta_ac); 5153 split_rec, meta_ac);
5083 if (ret) 5154 if (ret)
5084 mlog_errno(ret); 5155 mlog_errno(ret);
5085 } else { 5156 } else {
5086 ret = ocfs2_try_to_merge_extent(inode, handle, path, 5157 ret = ocfs2_try_to_merge_extent(handle, et, path,
5087 split_index, split_rec, 5158 split_index, split_rec,
5088 dealloc, &ctxt, et); 5159 dealloc, &ctxt);
5089 if (ret) 5160 if (ret)
5090 mlog_errno(ret); 5161 mlog_errno(ret);
5091 } 5162 }
@@ -5096,46 +5167,31 @@ out:
5096} 5167}
5097 5168
5098/* 5169/*
5099 * Mark the already-existing extent at cpos as written for len clusters. 5170 * Change the flags of the already-existing extent at cpos for len clusters.
5171 *
5172 * new_flags: the flags we want to set.
5173 * clear_flags: the flags we want to clear.
5174 * phys: the new physical offset we want this new extent starts from.
5100 * 5175 *
5101 * If the existing extent is larger than the request, initiate a 5176 * If the existing extent is larger than the request, initiate a
5102 * split. An attempt will be made at merging with adjacent extents. 5177 * split. An attempt will be made at merging with adjacent extents.
5103 * 5178 *
5104 * The caller is responsible for passing down meta_ac if we'll need it. 5179 * The caller is responsible for passing down meta_ac if we'll need it.
5105 */ 5180 */
5106int ocfs2_mark_extent_written(struct inode *inode, 5181int ocfs2_change_extent_flag(handle_t *handle,
5107 struct ocfs2_extent_tree *et, 5182 struct ocfs2_extent_tree *et,
5108 handle_t *handle, u32 cpos, u32 len, u32 phys, 5183 u32 cpos, u32 len, u32 phys,
5109 struct ocfs2_alloc_context *meta_ac, 5184 struct ocfs2_alloc_context *meta_ac,
5110 struct ocfs2_cached_dealloc_ctxt *dealloc) 5185 struct ocfs2_cached_dealloc_ctxt *dealloc,
5186 int new_flags, int clear_flags)
5111{ 5187{
5112 int ret, index; 5188 int ret, index;
5113 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); 5189 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5190 u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5114 struct ocfs2_extent_rec split_rec; 5191 struct ocfs2_extent_rec split_rec;
5115 struct ocfs2_path *left_path = NULL; 5192 struct ocfs2_path *left_path = NULL;
5116 struct ocfs2_extent_list *el; 5193 struct ocfs2_extent_list *el;
5117 5194 struct ocfs2_extent_rec *rec;
5118 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
5119 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
5120
5121 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5122 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5123 "that are being written to, but the feature bit "
5124 "is not set in the super block.",
5125 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5126 ret = -EROFS;
5127 goto out;
5128 }
5129
5130 /*
5131 * XXX: This should be fixed up so that we just re-insert the
5132 * next extent records.
5133 *
5134 * XXX: This is a hack on the extent tree, maybe it should be
5135 * an op?
5136 */
5137 if (et->et_ops == &ocfs2_dinode_et_ops)
5138 ocfs2_extent_map_trunc(inode, 0);
5139 5195
5140 left_path = ocfs2_new_path_from_et(et); 5196 left_path = ocfs2_new_path_from_et(et);
5141 if (!left_path) { 5197 if (!left_path) {
@@ -5144,7 +5200,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
5144 goto out; 5200 goto out;
5145 } 5201 }
5146 5202
5147 ret = ocfs2_find_path(inode, left_path, cpos); 5203 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5148 if (ret) { 5204 if (ret) {
5149 mlog_errno(ret); 5205 mlog_errno(ret);
5150 goto out; 5206 goto out;
@@ -5153,34 +5209,102 @@ int ocfs2_mark_extent_written(struct inode *inode,
5153 5209
5154 index = ocfs2_search_extent_list(el, cpos); 5210 index = ocfs2_search_extent_list(el, cpos);
5155 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5211 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5156 ocfs2_error(inode->i_sb, 5212 ocfs2_error(sb,
5157 "Inode %llu has an extent at cpos %u which can no " 5213 "Owner %llu has an extent at cpos %u which can no "
5158 "longer be found.\n", 5214 "longer be found.\n",
5159 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5215 (unsigned long long)
5216 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5160 ret = -EROFS; 5217 ret = -EROFS;
5161 goto out; 5218 goto out;
5162 } 5219 }
5163 5220
5221 ret = -EIO;
5222 rec = &el->l_recs[index];
5223 if (new_flags && (rec->e_flags & new_flags)) {
5224 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5225 "extent that already had them",
5226 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5227 new_flags);
5228 goto out;
5229 }
5230
5231 if (clear_flags && !(rec->e_flags & clear_flags)) {
5232 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5233 "extent that didn't have them",
5234 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5235 clear_flags);
5236 goto out;
5237 }
5238
5164 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 5239 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5165 split_rec.e_cpos = cpu_to_le32(cpos); 5240 split_rec.e_cpos = cpu_to_le32(cpos);
5166 split_rec.e_leaf_clusters = cpu_to_le16(len); 5241 split_rec.e_leaf_clusters = cpu_to_le16(len);
5167 split_rec.e_blkno = cpu_to_le64(start_blkno); 5242 split_rec.e_blkno = cpu_to_le64(start_blkno);
5168 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 5243 split_rec.e_flags = rec->e_flags;
5169 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 5244 if (new_flags)
5170 5245 split_rec.e_flags |= new_flags;
5171 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, 5246 if (clear_flags)
5172 index, &split_rec, meta_ac, 5247 split_rec.e_flags &= ~clear_flags;
5173 dealloc); 5248
5249 ret = ocfs2_split_extent(handle, et, left_path,
5250 index, &split_rec, meta_ac,
5251 dealloc);
5174 if (ret) 5252 if (ret)
5175 mlog_errno(ret); 5253 mlog_errno(ret);
5176 5254
5177out: 5255out:
5178 ocfs2_free_path(left_path); 5256 ocfs2_free_path(left_path);
5179 return ret; 5257 return ret;
5258
5180} 5259}
5181 5260
5182static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, 5261/*
5183 handle_t *handle, struct ocfs2_path *path, 5262 * Mark the already-existing extent at cpos as written for len clusters.
5263 * This removes the unwritten extent flag.
5264 *
5265 * If the existing extent is larger than the request, initiate a
5266 * split. An attempt will be made at merging with adjacent extents.
5267 *
5268 * The caller is responsible for passing down meta_ac if we'll need it.
5269 */
5270int ocfs2_mark_extent_written(struct inode *inode,
5271 struct ocfs2_extent_tree *et,
5272 handle_t *handle, u32 cpos, u32 len, u32 phys,
5273 struct ocfs2_alloc_context *meta_ac,
5274 struct ocfs2_cached_dealloc_ctxt *dealloc)
5275{
5276 int ret;
5277
5278 mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
5279 inode->i_ino, cpos, len, phys);
5280
5281 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5282 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5283 "that are being written to, but the feature bit "
5284 "is not set in the super block.",
5285 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5286 ret = -EROFS;
5287 goto out;
5288 }
5289
5290 /*
5291 * XXX: This should be fixed up so that we just re-insert the
5292 * next extent records.
5293 */
5294 ocfs2_et_extent_map_truncate(et, 0);
5295
5296 ret = ocfs2_change_extent_flag(handle, et, cpos,
5297 len, phys, meta_ac, dealloc,
5298 0, OCFS2_EXT_UNWRITTEN);
5299 if (ret)
5300 mlog_errno(ret);
5301
5302out:
5303 return ret;
5304}
5305
5306static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5307 struct ocfs2_path *path,
5184 int index, u32 new_range, 5308 int index, u32 new_range,
5185 struct ocfs2_alloc_context *meta_ac) 5309 struct ocfs2_alloc_context *meta_ac)
5186{ 5310{
@@ -5197,11 +5321,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5197 */ 5321 */
5198 el = path_leaf_el(path); 5322 el = path_leaf_el(path);
5199 rec = &el->l_recs[index]; 5323 rec = &el->l_recs[index];
5200 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); 5324 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5325 &split_rec, new_range, rec);
5201 5326
5202 depth = path->p_tree_depth; 5327 depth = path->p_tree_depth;
5203 if (depth > 0) { 5328 if (depth > 0) {
5204 ret = ocfs2_read_extent_block(inode, 5329 ret = ocfs2_read_extent_block(et->et_ci,
5205 ocfs2_et_get_last_eb_blk(et), 5330 ocfs2_et_get_last_eb_blk(et),
5206 &last_eb_bh); 5331 &last_eb_bh);
5207 if (ret < 0) { 5332 if (ret < 0) {
@@ -5224,7 +5349,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5224 5349
5225 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 5350 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5226 le16_to_cpu(rightmost_el->l_count)) { 5351 le16_to_cpu(rightmost_el->l_count)) {
5227 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, 5352 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5228 meta_ac); 5353 meta_ac);
5229 if (ret) { 5354 if (ret) {
5230 mlog_errno(ret); 5355 mlog_errno(ret);
@@ -5238,7 +5363,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5238 insert.ins_split = SPLIT_RIGHT; 5363 insert.ins_split = SPLIT_RIGHT;
5239 insert.ins_tree_depth = depth; 5364 insert.ins_tree_depth = depth;
5240 5365
5241 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5366 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5242 if (ret) 5367 if (ret)
5243 mlog_errno(ret); 5368 mlog_errno(ret);
5244 5369
@@ -5247,23 +5372,23 @@ out:
5247 return ret; 5372 return ret;
5248} 5373}
5249 5374
5250static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 5375static int ocfs2_truncate_rec(handle_t *handle,
5376 struct ocfs2_extent_tree *et,
5251 struct ocfs2_path *path, int index, 5377 struct ocfs2_path *path, int index,
5252 struct ocfs2_cached_dealloc_ctxt *dealloc, 5378 struct ocfs2_cached_dealloc_ctxt *dealloc,
5253 u32 cpos, u32 len, 5379 u32 cpos, u32 len)
5254 struct ocfs2_extent_tree *et)
5255{ 5380{
5256 int ret; 5381 int ret;
5257 u32 left_cpos, rec_range, trunc_range; 5382 u32 left_cpos, rec_range, trunc_range;
5258 int wants_rotate = 0, is_rightmost_tree_rec = 0; 5383 int wants_rotate = 0, is_rightmost_tree_rec = 0;
5259 struct super_block *sb = inode->i_sb; 5384 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5260 struct ocfs2_path *left_path = NULL; 5385 struct ocfs2_path *left_path = NULL;
5261 struct ocfs2_extent_list *el = path_leaf_el(path); 5386 struct ocfs2_extent_list *el = path_leaf_el(path);
5262 struct ocfs2_extent_rec *rec; 5387 struct ocfs2_extent_rec *rec;
5263 struct ocfs2_extent_block *eb; 5388 struct ocfs2_extent_block *eb;
5264 5389
5265 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5390 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5266 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5391 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5267 if (ret) { 5392 if (ret) {
5268 mlog_errno(ret); 5393 mlog_errno(ret);
5269 goto out; 5394 goto out;
@@ -5295,14 +5420,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5295 * by this leaf and the one to it's left. 5420 * by this leaf and the one to it's left.
5296 * 5421 *
5297 * There are two cases we can skip: 5422 * There are two cases we can skip:
5298 * 1) Path is the leftmost one in our inode tree. 5423 * 1) Path is the leftmost one in our btree.
5299 * 2) The leaf is rightmost and will be empty after 5424 * 2) The leaf is rightmost and will be empty after
5300 * we remove the extent record - the rotate code 5425 * we remove the extent record - the rotate code
5301 * knows how to update the newly formed edge. 5426 * knows how to update the newly formed edge.
5302 */ 5427 */
5303 5428
5304 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, 5429 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5305 &left_cpos);
5306 if (ret) { 5430 if (ret) {
5307 mlog_errno(ret); 5431 mlog_errno(ret);
5308 goto out; 5432 goto out;
@@ -5316,7 +5440,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5316 goto out; 5440 goto out;
5317 } 5441 }
5318 5442
5319 ret = ocfs2_find_path(inode, left_path, left_cpos); 5443 ret = ocfs2_find_path(et->et_ci, left_path,
5444 left_cpos);
5320 if (ret) { 5445 if (ret) {
5321 mlog_errno(ret); 5446 mlog_errno(ret);
5322 goto out; 5447 goto out;
@@ -5332,13 +5457,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5332 goto out; 5457 goto out;
5333 } 5458 }
5334 5459
5335 ret = ocfs2_journal_access_path(inode, handle, path); 5460 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5336 if (ret) { 5461 if (ret) {
5337 mlog_errno(ret); 5462 mlog_errno(ret);
5338 goto out; 5463 goto out;
5339 } 5464 }
5340 5465
5341 ret = ocfs2_journal_access_path(inode, handle, left_path); 5466 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5342 if (ret) { 5467 if (ret) {
5343 mlog_errno(ret); 5468 mlog_errno(ret);
5344 goto out; 5469 goto out;
@@ -5361,7 +5486,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5361 * be deleted by the rotate code. 5486 * be deleted by the rotate code.
5362 */ 5487 */
5363 rec = &el->l_recs[next_free - 1]; 5488 rec = &el->l_recs[next_free - 1];
5364 ocfs2_adjust_rightmost_records(inode, handle, path, 5489 ocfs2_adjust_rightmost_records(handle, et, path,
5365 rec); 5490 rec);
5366 } 5491 }
5367 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5492 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
@@ -5373,11 +5498,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5373 /* Remove rightmost portion of the record */ 5498 /* Remove rightmost portion of the record */
5374 le16_add_cpu(&rec->e_leaf_clusters, -len); 5499 le16_add_cpu(&rec->e_leaf_clusters, -len);
5375 if (is_rightmost_tree_rec) 5500 if (is_rightmost_tree_rec)
5376 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 5501 ocfs2_adjust_rightmost_records(handle, et, path, rec);
5377 } else { 5502 } else {
5378 /* Caller should have trapped this. */ 5503 /* Caller should have trapped this. */
5379 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " 5504 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5380 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, 5505 "(%u, %u)\n",
5506 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5381 le32_to_cpu(rec->e_cpos), 5507 le32_to_cpu(rec->e_cpos),
5382 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5508 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5383 BUG(); 5509 BUG();
@@ -5386,14 +5512,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5386 if (left_path) { 5512 if (left_path) {
5387 int subtree_index; 5513 int subtree_index;
5388 5514
5389 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 5515 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5390 ocfs2_complete_edge_insert(inode, handle, left_path, path, 5516 ocfs2_complete_edge_insert(handle, left_path, path,
5391 subtree_index); 5517 subtree_index);
5392 } 5518 }
5393 5519
5394 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5520 ocfs2_journal_dirty(handle, path_leaf_bh(path));
5395 5521
5396 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5522 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5397 if (ret) { 5523 if (ret) {
5398 mlog_errno(ret); 5524 mlog_errno(ret);
5399 goto out; 5525 goto out;
@@ -5404,9 +5530,9 @@ out:
5404 return ret; 5530 return ret;
5405} 5531}
5406 5532
5407int ocfs2_remove_extent(struct inode *inode, 5533int ocfs2_remove_extent(handle_t *handle,
5408 struct ocfs2_extent_tree *et, 5534 struct ocfs2_extent_tree *et,
5409 u32 cpos, u32 len, handle_t *handle, 5535 u32 cpos, u32 len,
5410 struct ocfs2_alloc_context *meta_ac, 5536 struct ocfs2_alloc_context *meta_ac,
5411 struct ocfs2_cached_dealloc_ctxt *dealloc) 5537 struct ocfs2_cached_dealloc_ctxt *dealloc)
5412{ 5538{
@@ -5416,7 +5542,11 @@ int ocfs2_remove_extent(struct inode *inode,
5416 struct ocfs2_extent_list *el; 5542 struct ocfs2_extent_list *el;
5417 struct ocfs2_path *path = NULL; 5543 struct ocfs2_path *path = NULL;
5418 5544
5419 ocfs2_extent_map_trunc(inode, 0); 5545 /*
5546 * XXX: Why are we truncating to 0 instead of wherever this
5547 * affects us?
5548 */
5549 ocfs2_et_extent_map_truncate(et, 0);
5420 5550
5421 path = ocfs2_new_path_from_et(et); 5551 path = ocfs2_new_path_from_et(et);
5422 if (!path) { 5552 if (!path) {
@@ -5425,7 +5555,7 @@ int ocfs2_remove_extent(struct inode *inode,
5425 goto out; 5555 goto out;
5426 } 5556 }
5427 5557
5428 ret = ocfs2_find_path(inode, path, cpos); 5558 ret = ocfs2_find_path(et->et_ci, path, cpos);
5429 if (ret) { 5559 if (ret) {
5430 mlog_errno(ret); 5560 mlog_errno(ret);
5431 goto out; 5561 goto out;
@@ -5434,10 +5564,11 @@ int ocfs2_remove_extent(struct inode *inode,
5434 el = path_leaf_el(path); 5564 el = path_leaf_el(path);
5435 index = ocfs2_search_extent_list(el, cpos); 5565 index = ocfs2_search_extent_list(el, cpos);
5436 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5566 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5437 ocfs2_error(inode->i_sb, 5567 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5438 "Inode %llu has an extent at cpos %u which can no " 5568 "Owner %llu has an extent at cpos %u which can no "
5439 "longer be found.\n", 5569 "longer be found.\n",
5440 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5570 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5571 cpos);
5441 ret = -EROFS; 5572 ret = -EROFS;
5442 goto out; 5573 goto out;
5443 } 5574 }
@@ -5464,20 +5595,21 @@ int ocfs2_remove_extent(struct inode *inode,
5464 5595
5465 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5596 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5466 5597
5467 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " 5598 mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5468 "(cpos %u, len %u)\n", 5599 "(cpos %u, len %u)\n",
5469 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, 5600 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5601 cpos, len, index,
5470 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5602 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5471 5603
5472 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5604 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5473 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5605 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5474 cpos, len, et); 5606 cpos, len);
5475 if (ret) { 5607 if (ret) {
5476 mlog_errno(ret); 5608 mlog_errno(ret);
5477 goto out; 5609 goto out;
5478 } 5610 }
5479 } else { 5611 } else {
5480 ret = ocfs2_split_tree(inode, et, handle, path, index, 5612 ret = ocfs2_split_tree(handle, et, path, index,
5481 trunc_range, meta_ac); 5613 trunc_range, meta_ac);
5482 if (ret) { 5614 if (ret) {
5483 mlog_errno(ret); 5615 mlog_errno(ret);
@@ -5490,7 +5622,7 @@ int ocfs2_remove_extent(struct inode *inode,
5490 */ 5622 */
5491 ocfs2_reinit_path(path, 1); 5623 ocfs2_reinit_path(path, 1);
5492 5624
5493 ret = ocfs2_find_path(inode, path, cpos); 5625 ret = ocfs2_find_path(et->et_ci, path, cpos);
5494 if (ret) { 5626 if (ret) {
5495 mlog_errno(ret); 5627 mlog_errno(ret);
5496 goto out; 5628 goto out;
@@ -5499,9 +5631,9 @@ int ocfs2_remove_extent(struct inode *inode,
5499 el = path_leaf_el(path); 5631 el = path_leaf_el(path);
5500 index = ocfs2_search_extent_list(el, cpos); 5632 index = ocfs2_search_extent_list(el, cpos);
5501 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5633 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5502 ocfs2_error(inode->i_sb, 5634 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5503 "Inode %llu: split at cpos %u lost record.", 5635 "Owner %llu: split at cpos %u lost record.",
5504 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5636 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5505 cpos); 5637 cpos);
5506 ret = -EROFS; 5638 ret = -EROFS;
5507 goto out; 5639 goto out;
@@ -5515,18 +5647,18 @@ int ocfs2_remove_extent(struct inode *inode,
5515 rec_range = le32_to_cpu(rec->e_cpos) + 5647 rec_range = le32_to_cpu(rec->e_cpos) +
5516 ocfs2_rec_clusters(el, rec); 5648 ocfs2_rec_clusters(el, rec);
5517 if (rec_range != trunc_range) { 5649 if (rec_range != trunc_range) {
5518 ocfs2_error(inode->i_sb, 5650 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5519 "Inode %llu: error after split at cpos %u" 5651 "Owner %llu: error after split at cpos %u"
5520 "trunc len %u, existing record is (%u,%u)", 5652 "trunc len %u, existing record is (%u,%u)",
5521 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5653 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5522 cpos, len, le32_to_cpu(rec->e_cpos), 5654 cpos, len, le32_to_cpu(rec->e_cpos),
5523 ocfs2_rec_clusters(el, rec)); 5655 ocfs2_rec_clusters(el, rec));
5524 ret = -EROFS; 5656 ret = -EROFS;
5525 goto out; 5657 goto out;
5526 } 5658 }
5527 5659
5528 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5660 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5529 cpos, len, et); 5661 cpos, len);
5530 if (ret) { 5662 if (ret) {
5531 mlog_errno(ret); 5663 mlog_errno(ret);
5532 goto out; 5664 goto out;
@@ -5573,7 +5705,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5573 goto out; 5705 goto out;
5574 } 5706 }
5575 5707
5576 ret = ocfs2_et_root_journal_access(handle, inode, et, 5708 ret = ocfs2_et_root_journal_access(handle, et,
5577 OCFS2_JOURNAL_ACCESS_WRITE); 5709 OCFS2_JOURNAL_ACCESS_WRITE);
5578 if (ret) { 5710 if (ret) {
5579 mlog_errno(ret); 5711 mlog_errno(ret);
@@ -5583,14 +5715,13 @@ int ocfs2_remove_btree_range(struct inode *inode,
5583 vfs_dq_free_space_nodirty(inode, 5715 vfs_dq_free_space_nodirty(inode,
5584 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5716 ocfs2_clusters_to_bytes(inode->i_sb, len));
5585 5717
5586 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5587 dealloc);
5588 if (ret) { 5719 if (ret) {
5589 mlog_errno(ret); 5720 mlog_errno(ret);
5590 goto out_commit; 5721 goto out_commit;
5591 } 5722 }
5592 5723
5593 ocfs2_et_update_clusters(inode, et, -len); 5724 ocfs2_et_update_clusters(et, -len);
5594 5725
5595 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5726 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5596 if (ret) { 5727 if (ret) {
@@ -5690,7 +5821,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5690 goto bail; 5821 goto bail;
5691 } 5822 }
5692 5823
5693 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, 5824 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5694 OCFS2_JOURNAL_ACCESS_WRITE); 5825 OCFS2_JOURNAL_ACCESS_WRITE);
5695 if (status < 0) { 5826 if (status < 0) {
5696 mlog_errno(status); 5827 mlog_errno(status);
@@ -5752,7 +5883,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5752 while (i >= 0) { 5883 while (i >= 0) {
5753 /* Caller has given us at least enough credits to 5884 /* Caller has given us at least enough credits to
5754 * update the truncate log dinode */ 5885 * update the truncate log dinode */
5755 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, 5886 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5756 OCFS2_JOURNAL_ACCESS_WRITE); 5887 OCFS2_JOURNAL_ACCESS_WRITE);
5757 if (status < 0) { 5888 if (status < 0) {
5758 mlog_errno(status); 5889 mlog_errno(status);
@@ -6010,7 +6141,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6010 tl->tl_used = 0; 6141 tl->tl_used = 0;
6011 6142
6012 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); 6143 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6013 status = ocfs2_write_block(osb, tl_bh, tl_inode); 6144 status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6014 if (status < 0) { 6145 if (status < 0) {
6015 mlog_errno(status); 6146 mlog_errno(status);
6016 goto bail; 6147 goto bail;
@@ -6400,9 +6531,9 @@ ocfs2_find_per_slot_free_list(int type,
6400 return fl; 6531 return fl;
6401} 6532}
6402 6533
6403static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6534int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6404 int type, int slot, u64 blkno, 6535 int type, int slot, u64 blkno,
6405 unsigned int bit) 6536 unsigned int bit)
6406{ 6537{
6407 int ret; 6538 int ret;
6408 struct ocfs2_per_slot_free_list *fl; 6539 struct ocfs2_per_slot_free_list *fl;
@@ -6518,7 +6649,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6518 goto out; 6649 goto out;
6519 } 6650 }
6520 6651
6521 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 6652 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6522 if (ret) { 6653 if (ret) {
6523 mlog_errno(ret); 6654 mlog_errno(ret);
6524 goto out; 6655 goto out;
@@ -6551,7 +6682,7 @@ out:
6551 */ 6682 */
6552static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 6683static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6553 handle_t *handle, struct ocfs2_truncate_context *tc, 6684 handle_t *handle, struct ocfs2_truncate_context *tc,
6554 u32 clusters_to_del, u64 *delete_start) 6685 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6555{ 6686{
6556 int ret, i, index = path->p_tree_depth; 6687 int ret, i, index = path->p_tree_depth;
6557 u32 new_edge = 0; 6688 u32 new_edge = 0;
@@ -6561,6 +6692,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6561 struct ocfs2_extent_rec *rec; 6692 struct ocfs2_extent_rec *rec;
6562 6693
6563 *delete_start = 0; 6694 *delete_start = 0;
6695 *flags = 0;
6564 6696
6565 while (index >= 0) { 6697 while (index >= 0) {
6566 bh = path->p_node[index].bh; 6698 bh = path->p_node[index].bh;
@@ -6648,6 +6780,7 @@ find_tail_record:
6648 *delete_start = le64_to_cpu(rec->e_blkno) 6780 *delete_start = le64_to_cpu(rec->e_blkno)
6649 + ocfs2_clusters_to_blocks(inode->i_sb, 6781 + ocfs2_clusters_to_blocks(inode->i_sb,
6650 le16_to_cpu(rec->e_leaf_clusters)); 6782 le16_to_cpu(rec->e_leaf_clusters));
6783 *flags = rec->e_flags;
6651 6784
6652 /* 6785 /*
6653 * If it's now empty, remove this record. 6786 * If it's now empty, remove this record.
@@ -6719,7 +6852,7 @@ delete:
6719 6852
6720 mlog(0, "deleting this extent block.\n"); 6853 mlog(0, "deleting this extent block.\n");
6721 6854
6722 ocfs2_remove_from_cache(inode, bh); 6855 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6723 6856
6724 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 6857 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6725 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 6858 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
@@ -6747,7 +6880,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6747 struct buffer_head *fe_bh, 6880 struct buffer_head *fe_bh,
6748 handle_t *handle, 6881 handle_t *handle,
6749 struct ocfs2_truncate_context *tc, 6882 struct ocfs2_truncate_context *tc,
6750 struct ocfs2_path *path) 6883 struct ocfs2_path *path,
6884 struct ocfs2_alloc_context *meta_ac)
6751{ 6885{
6752 int status; 6886 int status;
6753 struct ocfs2_dinode *fe; 6887 struct ocfs2_dinode *fe;
@@ -6755,6 +6889,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6755 struct ocfs2_extent_list *el; 6889 struct ocfs2_extent_list *el;
6756 struct buffer_head *last_eb_bh = NULL; 6890 struct buffer_head *last_eb_bh = NULL;
6757 u64 delete_blk = 0; 6891 u64 delete_blk = 0;
6892 u8 rec_flags;
6758 6893
6759 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6894 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6760 6895
@@ -6769,14 +6904,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6769 * Each component will be touched, so we might as well journal 6904 * Each component will be touched, so we might as well journal
6770 * here to avoid having to handle errors later. 6905 * here to avoid having to handle errors later.
6771 */ 6906 */
6772 status = ocfs2_journal_access_path(inode, handle, path); 6907 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6773 if (status < 0) { 6908 if (status < 0) {
6774 mlog_errno(status); 6909 mlog_errno(status);
6775 goto bail; 6910 goto bail;
6776 } 6911 }
6777 6912
6778 if (last_eb_bh) { 6913 if (last_eb_bh) {
6779 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, 6914 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6780 OCFS2_JOURNAL_ACCESS_WRITE); 6915 OCFS2_JOURNAL_ACCESS_WRITE);
6781 if (status < 0) { 6916 if (status < 0) {
6782 mlog_errno(status); 6917 mlog_errno(status);
@@ -6810,7 +6945,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6810 inode->i_blocks = ocfs2_inode_sector_count(inode); 6945 inode->i_blocks = ocfs2_inode_sector_count(inode);
6811 6946
6812 status = ocfs2_trim_tree(inode, path, handle, tc, 6947 status = ocfs2_trim_tree(inode, path, handle, tc,
6813 clusters_to_del, &delete_blk); 6948 clusters_to_del, &delete_blk, &rec_flags);
6814 if (status) { 6949 if (status) {
6815 mlog_errno(status); 6950 mlog_errno(status);
6816 goto bail; 6951 goto bail;
@@ -6842,8 +6977,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6842 } 6977 }
6843 6978
6844 if (delete_blk) { 6979 if (delete_blk) {
6845 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 6980 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6846 clusters_to_del); 6981 status = ocfs2_decrease_refcount(inode, handle,
6982 ocfs2_blocks_to_clusters(osb->sb,
6983 delete_blk),
6984 clusters_to_del, meta_ac,
6985 &tc->tc_dealloc, 1);
6986 else
6987 status = ocfs2_truncate_log_append(osb, handle,
6988 delete_blk,
6989 clusters_to_del);
6847 if (status < 0) { 6990 if (status < 0) {
6848 mlog_errno(status); 6991 mlog_errno(status);
6849 goto bail; 6992 goto bail;
@@ -6863,9 +7006,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6863 return 0; 7006 return 0;
6864} 7007}
6865 7008
6866static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 7009void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6867 unsigned int from, unsigned int to, 7010 unsigned int from, unsigned int to,
6868 struct page *page, int zero, u64 *phys) 7011 struct page *page, int zero, u64 *phys)
6869{ 7012{
6870 int ret, partial = 0; 7013 int ret, partial = 0;
6871 7014
@@ -6933,20 +7076,16 @@ out:
6933 ocfs2_unlock_and_free_pages(pages, numpages); 7076 ocfs2_unlock_and_free_pages(pages, numpages);
6934} 7077}
6935 7078
6936static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 7079int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6937 struct page **pages, int *num) 7080 struct page **pages, int *num)
6938{ 7081{
6939 int numpages, ret = 0; 7082 int numpages, ret = 0;
6940 struct super_block *sb = inode->i_sb;
6941 struct address_space *mapping = inode->i_mapping; 7083 struct address_space *mapping = inode->i_mapping;
6942 unsigned long index; 7084 unsigned long index;
6943 loff_t last_page_bytes; 7085 loff_t last_page_bytes;
6944 7086
6945 BUG_ON(start > end); 7087 BUG_ON(start > end);
6946 7088
6947 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6948 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6949
6950 numpages = 0; 7089 numpages = 0;
6951 last_page_bytes = PAGE_ALIGN(end); 7090 last_page_bytes = PAGE_ALIGN(end);
6952 index = start >> PAGE_CACHE_SHIFT; 7091 index = start >> PAGE_CACHE_SHIFT;
@@ -6974,6 +7113,17 @@ out:
6974 return ret; 7113 return ret;
6975} 7114}
6976 7115
7116static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
7117 struct page **pages, int *num)
7118{
7119 struct super_block *sb = inode->i_sb;
7120
7121 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
7122 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
7123
7124 return ocfs2_grab_pages(inode, start, end, pages, num);
7125}
7126
6977/* 7127/*
6978 * Zero the area past i_size but still within an allocated 7128 * Zero the area past i_size but still within an allocated
6979 * cluster. This avoids exposing nonzero data on subsequent file 7129 * cluster. This avoids exposing nonzero data on subsequent file
@@ -7138,7 +7288,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7138 goto out_unlock; 7288 goto out_unlock;
7139 } 7289 }
7140 7290
7141 ret = ocfs2_journal_access_di(handle, inode, di_bh, 7291 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7142 OCFS2_JOURNAL_ACCESS_WRITE); 7292 OCFS2_JOURNAL_ACCESS_WRITE);
7143 if (ret) { 7293 if (ret) {
7144 mlog_errno(ret); 7294 mlog_errno(ret);
@@ -7218,9 +7368,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7218 * this proves to be false, we could always re-build 7368 * this proves to be false, we could always re-build
7219 * the in-inode data from our pages. 7369 * the in-inode data from our pages.
7220 */ 7370 */
7221 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 7371 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7222 ret = ocfs2_insert_extent(osb, handle, inode, &et, 7372 ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7223 0, block, 1, 0, NULL);
7224 if (ret) { 7373 if (ret) {
7225 mlog_errno(ret); 7374 mlog_errno(ret);
7226 goto out_commit; 7375 goto out_commit;
@@ -7262,11 +7411,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
7262{ 7411{
7263 int status, i, credits, tl_sem = 0; 7412 int status, i, credits, tl_sem = 0;
7264 u32 clusters_to_del, new_highest_cpos, range; 7413 u32 clusters_to_del, new_highest_cpos, range;
7414 u64 blkno = 0;
7265 struct ocfs2_extent_list *el; 7415 struct ocfs2_extent_list *el;
7266 handle_t *handle = NULL; 7416 handle_t *handle = NULL;
7267 struct inode *tl_inode = osb->osb_tl_inode; 7417 struct inode *tl_inode = osb->osb_tl_inode;
7268 struct ocfs2_path *path = NULL; 7418 struct ocfs2_path *path = NULL;
7269 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL;
7421 struct ocfs2_refcount_tree *ref_tree = NULL;
7270 7422
7271 mlog_entry_void(); 7423 mlog_entry_void();
7272 7424
@@ -7292,10 +7444,12 @@ start:
7292 goto bail; 7444 goto bail;
7293 } 7445 }
7294 7446
7447 credits = 0;
7448
7295 /* 7449 /*
7296 * Truncate always works against the rightmost tree branch. 7450 * Truncate always works against the rightmost tree branch.
7297 */ 7451 */
7298 status = ocfs2_find_path(inode, path, UINT_MAX); 7452 status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7299 if (status) { 7453 if (status) {
7300 mlog_errno(status); 7454 mlog_errno(status);
7301 goto bail; 7455 goto bail;
@@ -7332,10 +7486,15 @@ start:
7332 clusters_to_del = 0; 7486 clusters_to_del = 0;
7333 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7334 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
7335 } else if (range > new_highest_cpos) { 7490 } else if (range > new_highest_cpos) {
7336 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7337 le32_to_cpu(el->l_recs[i].e_cpos)) - 7492 le32_to_cpu(el->l_recs[i].e_cpos)) -
7338 new_highest_cpos; 7493 new_highest_cpos;
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
7495 ocfs2_clusters_to_blocks(inode->i_sb,
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) -
7497 clusters_to_del);
7339 } else { 7498 } else {
7340 status = 0; 7499 status = 0;
7341 goto bail; 7500 goto bail;
@@ -7344,6 +7503,29 @@ start:
7344 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7345 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7346 7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7347 mutex_lock(&tl_inode->i_mutex); 7529 mutex_lock(&tl_inode->i_mutex);
7348 tl_sem = 1; 7530 tl_sem = 1;
7349 /* ocfs2_truncate_log_needs_flush guarantees us at least one 7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
@@ -7357,7 +7539,7 @@ start:
7357 } 7539 }
7358 } 7540 }
7359 7541
7360 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7361 (struct ocfs2_dinode *)fe_bh->b_data, 7543 (struct ocfs2_dinode *)fe_bh->b_data,
7362 el); 7544 el);
7363 handle = ocfs2_start_trans(osb, credits); 7545 handle = ocfs2_start_trans(osb, credits);
@@ -7369,7 +7551,7 @@ start:
7369 } 7551 }
7370 7552
7371 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7372 tc, path); 7554 tc, path, meta_ac);
7373 if (status < 0) { 7555 if (status < 0) {
7374 mlog_errno(status); 7556 mlog_errno(status);
7375 goto bail; 7557 goto bail;
@@ -7383,6 +7565,16 @@ start:
7383 7565
7384 ocfs2_reinit_path(path, 1); 7566 ocfs2_reinit_path(path, 1);
7385 7567
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7386 /* 7578 /*
7387 * The check above will catch the case where we've truncated 7579 * The check above will catch the case where we've truncated
7388 * away all allocation. 7580 * away all allocation.
@@ -7399,6 +7591,12 @@ bail:
7399 if (handle) 7591 if (handle)
7400 ocfs2_commit_trans(osb, handle); 7592 ocfs2_commit_trans(osb, handle);
7401 7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7402 ocfs2_run_deallocs(osb, &tc->tc_dealloc); 7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7403 7601
7404 ocfs2_free_path(path); 7602 ocfs2_free_path(path);
@@ -7445,7 +7643,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7445 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7643 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7446 7644
7447 if (fe->id2.i_list.l_tree_depth) { 7645 if (fe->id2.i_list.l_tree_depth) {
7448 status = ocfs2_read_extent_block(inode, 7646 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7449 le64_to_cpu(fe->i_last_eb_blk), 7647 le64_to_cpu(fe->i_last_eb_blk),
7450 &last_eb_bh); 7648 &last_eb_bh);
7451 if (status < 0) { 7649 if (status < 0) {
@@ -7507,7 +7705,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7507 goto out; 7705 goto out;
7508 } 7706 }
7509 7707
7510 ret = ocfs2_journal_access_di(handle, inode, di_bh, 7708 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7511 OCFS2_JOURNAL_ACCESS_WRITE); 7709 OCFS2_JOURNAL_ACCESS_WRITE);
7512 if (ret) { 7710 if (ret) {
7513 mlog_errno(ret); 7711 mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 353254ba29e1..9c122d574464 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,8 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. With metadata ecc, we now call different journal_access 48 * functions. It needs the ocfs2_caching_info structure associated with
49 * I/O on the tree. With metadata ecc, we now call different journal_access
49 * functions for each type of metadata, so it must have the 50 * functions for each type of metadata, so it must have the
50 * root_journal_access function. 51 * root_journal_access function.
51 * ocfs2_extent_tree_operations abstract the normal operations we do for 52 * ocfs2_extent_tree_operations abstract the normal operations we do for
@@ -56,6 +57,7 @@ struct ocfs2_extent_tree {
56 struct ocfs2_extent_tree_operations *et_ops; 57 struct ocfs2_extent_tree_operations *et_ops;
57 struct buffer_head *et_root_bh; 58 struct buffer_head *et_root_bh;
58 struct ocfs2_extent_list *et_root_el; 59 struct ocfs2_extent_list *et_root_el;
60 struct ocfs2_caching_info *et_ci;
59 ocfs2_journal_access_func et_root_journal_access; 61 ocfs2_journal_access_func et_root_journal_access;
60 void *et_object; 62 void *et_object;
61 unsigned int et_max_leaf_clusters; 63 unsigned int et_max_leaf_clusters;
@@ -66,31 +68,32 @@ struct ocfs2_extent_tree {
66 * specified object buffer. 68 * specified object buffer.
67 */ 69 */
68void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 70void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 71 struct ocfs2_caching_info *ci,
70 struct buffer_head *bh); 72 struct buffer_head *bh);
71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 73void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 74 struct ocfs2_caching_info *ci,
73 struct buffer_head *bh); 75 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf; 76struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 77void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 78 struct ocfs2_caching_info *ci,
77 struct ocfs2_xattr_value_buf *vb); 79 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 80void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode, 81 struct ocfs2_caching_info *ci,
80 struct buffer_head *bh); 82 struct buffer_head *bh);
83void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
84 struct ocfs2_caching_info *ci,
85 struct buffer_head *bh);
81 86
82/* 87/*
83 * Read an extent block into *bh. If *bh is NULL, a bh will be 88 * Read an extent block into *bh. If *bh is NULL, a bh will be
84 * allocated. This is a cached read. The extent block will be validated 89 * allocated. This is a cached read. The extent block will be validated
85 * with ocfs2_validate_extent_block(). 90 * with ocfs2_validate_extent_block().
86 */ 91 */
87int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 92int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
88 struct buffer_head **bh); 93 struct buffer_head **bh);
89 94
90struct ocfs2_alloc_context; 95struct ocfs2_alloc_context;
91int ocfs2_insert_extent(struct ocfs2_super *osb, 96int ocfs2_insert_extent(handle_t *handle,
92 handle_t *handle,
93 struct inode *inode,
94 struct ocfs2_extent_tree *et, 97 struct ocfs2_extent_tree *et,
95 u32 cpos, 98 u32 cpos,
96 u64 start_blk, 99 u64 start_blk,
@@ -103,25 +106,36 @@ enum ocfs2_alloc_restarted {
103 RESTART_TRANS, 106 RESTART_TRANS,
104 RESTART_META 107 RESTART_META
105}; 108};
106int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 109int ocfs2_add_clusters_in_btree(handle_t *handle,
107 struct inode *inode, 110 struct ocfs2_extent_tree *et,
108 u32 *logical_offset, 111 u32 *logical_offset,
109 u32 clusters_to_add, 112 u32 clusters_to_add,
110 int mark_unwritten, 113 int mark_unwritten,
111 struct ocfs2_extent_tree *et,
112 handle_t *handle,
113 struct ocfs2_alloc_context *data_ac, 114 struct ocfs2_alloc_context *data_ac,
114 struct ocfs2_alloc_context *meta_ac, 115 struct ocfs2_alloc_context *meta_ac,
115 enum ocfs2_alloc_restarted *reason_ret); 116 enum ocfs2_alloc_restarted *reason_ret);
116struct ocfs2_cached_dealloc_ctxt; 117struct ocfs2_cached_dealloc_ctxt;
118struct ocfs2_path;
119int ocfs2_split_extent(handle_t *handle,
120 struct ocfs2_extent_tree *et,
121 struct ocfs2_path *path,
122 int split_index,
123 struct ocfs2_extent_rec *split_rec,
124 struct ocfs2_alloc_context *meta_ac,
125 struct ocfs2_cached_dealloc_ctxt *dealloc);
117int ocfs2_mark_extent_written(struct inode *inode, 126int ocfs2_mark_extent_written(struct inode *inode,
118 struct ocfs2_extent_tree *et, 127 struct ocfs2_extent_tree *et,
119 handle_t *handle, u32 cpos, u32 len, u32 phys, 128 handle_t *handle, u32 cpos, u32 len, u32 phys,
120 struct ocfs2_alloc_context *meta_ac, 129 struct ocfs2_alloc_context *meta_ac,
121 struct ocfs2_cached_dealloc_ctxt *dealloc); 130 struct ocfs2_cached_dealloc_ctxt *dealloc);
122int ocfs2_remove_extent(struct inode *inode, 131int ocfs2_change_extent_flag(handle_t *handle,
123 struct ocfs2_extent_tree *et, 132 struct ocfs2_extent_tree *et,
124 u32 cpos, u32 len, handle_t *handle, 133 u32 cpos, u32 len, u32 phys,
134 struct ocfs2_alloc_context *meta_ac,
135 struct ocfs2_cached_dealloc_ctxt *dealloc,
136 int new_flags, int clear_flags);
137int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
138 u32 cpos, u32 len,
125 struct ocfs2_alloc_context *meta_ac, 139 struct ocfs2_alloc_context *meta_ac,
126 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
127int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
@@ -130,7 +144,6 @@ int ocfs2_remove_btree_range(struct inode *inode,
130 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc);
131 145
132int ocfs2_num_free_extents(struct ocfs2_super *osb, 146int ocfs2_num_free_extents(struct ocfs2_super *osb,
133 struct inode *inode,
134 struct ocfs2_extent_tree *et); 147 struct ocfs2_extent_tree *et);
135 148
136/* 149/*
@@ -195,6 +208,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
195} 208}
196int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
197 u64 blkno, unsigned int bit); 210 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno,
213 unsigned int bit);
198static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
199{ 215{
200 return c->c_global_allocator != NULL; 216 return c->c_global_allocator != NULL;
@@ -222,8 +238,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
222int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
223 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
224 240
225int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 241int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
226 u32 cpos, struct buffer_head **leaf_bh); 242 struct ocfs2_extent_list *root_el, u32 cpos,
243 struct buffer_head **leaf_bh);
227int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 244int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
228 245
229/* 246/*
@@ -254,4 +271,50 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
254 return !rec->e_leaf_clusters; 271 return !rec->e_leaf_clusters;
255} 272}
256 273
274int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
275 struct page **pages, int *num);
276void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
277 unsigned int from, unsigned int to,
278 struct page *page, int zero, u64 *phys);
279/*
280 * Structures which describe a path through a btree, and functions to
281 * manipulate them.
282 *
283 * The idea here is to be as generic as possible with the tree
284 * manipulation code.
285 */
286struct ocfs2_path_item {
287 struct buffer_head *bh;
288 struct ocfs2_extent_list *el;
289};
290
291#define OCFS2_MAX_PATH_DEPTH 5
292
293struct ocfs2_path {
294 int p_tree_depth;
295 ocfs2_journal_access_func p_root_access;
296 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
297};
298
299#define path_root_bh(_path) ((_path)->p_node[0].bh)
300#define path_root_el(_path) ((_path)->p_node[0].el)
301#define path_root_access(_path)((_path)->p_root_access)
302#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
303#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
304#define path_num_items(_path) ((_path)->p_tree_depth + 1)
305
306void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
307void ocfs2_free_path(struct ocfs2_path *path);
308int ocfs2_find_path(struct ocfs2_caching_info *ci,
309 struct ocfs2_path *path,
310 u32 cpos);
311struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
312struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
313int ocfs2_path_bh_journal_access(handle_t *handle,
314 struct ocfs2_caching_info *ci,
315 struct ocfs2_path *path,
316 int idx);
317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
318 handle_t *handle,
319 struct ocfs2_path *path);
257#endif /* OCFS2_ALLOC_H */ 320#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8a1e61545f41..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
44#include "suballoc.h" 44#include "suballoc.h"
45#include "super.h" 45#include "super.h"
46#include "symlink.h" 46#include "symlink.h"
47#include "refcounttree.h"
47 48
48#include "buffer_head_io.h" 49#include "buffer_head_io.h"
49 50
@@ -126,8 +127,8 @@ bail:
126 return err; 127 return err;
127} 128}
128 129
129static int ocfs2_get_block(struct inode *inode, sector_t iblock, 130int ocfs2_get_block(struct inode *inode, sector_t iblock,
130 struct buffer_head *bh_result, int create) 131 struct buffer_head *bh_result, int create)
131{ 132{
132 int err = 0; 133 int err = 0;
133 unsigned int ext_flags; 134 unsigned int ext_flags;
@@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
590 goto bail; 591 goto bail;
591 } 592 }
592 593
594 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
593 /* 596 /*
594 * get_more_blocks() expects us to describe a hole by clearing 597 * get_more_blocks() expects us to describe a hole by clearing
595 * the mapped bit on bh_result(). 598 * the mapped bit on bh_result().
@@ -687,6 +690,10 @@ static ssize_t ocfs2_direct_IO(int rw,
687 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 690 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
688 return 0; 691 return 0;
689 692
693 /* Fallback to buffered I/O if we are appending. */
694 if (i_size_read(inode) <= offset)
695 return 0;
696
690 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 697 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
691 inode->i_sb->s_bdev, iov, offset, 698 inode->i_sb->s_bdev, iov, offset,
692 nr_segs, 699 nr_segs,
@@ -1259,7 +1266,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1259 goto out; 1266 goto out;
1260 } 1267 }
1261 } else if (unwritten) { 1268 } else if (unwritten) {
1262 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1269 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1270 wc->w_di_bh);
1263 ret = ocfs2_mark_extent_written(inode, &et, 1271 ret = ocfs2_mark_extent_written(inode, &et,
1264 wc->w_handle, cpos, 1, phys, 1272 wc->w_handle, cpos, 1, phys,
1265 meta_ac, &wc->w_dealloc); 1273 meta_ac, &wc->w_dealloc);
@@ -1448,6 +1456,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1448 goto out; 1456 goto out;
1449 } 1457 }
1450 1458
1459 /* We should already CoW the refcountd extent. */
1460 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1461
1451 /* 1462 /*
1452 * Assume worst case - that we're writing in 1463 * Assume worst case - that we're writing in
1453 * the middle of the extent. 1464 * the middle of the extent.
@@ -1528,7 +1539,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1528 goto out; 1539 goto out;
1529 } 1540 }
1530 1541
1531 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, 1542 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1532 OCFS2_JOURNAL_ACCESS_WRITE); 1543 OCFS2_JOURNAL_ACCESS_WRITE);
1533 if (ret) { 1544 if (ret) {
1534 ocfs2_commit_trans(osb, handle); 1545 ocfs2_commit_trans(osb, handle);
@@ -1699,6 +1710,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1699 goto out; 1710 goto out;
1700 } 1711 }
1701 1712
1713 ret = ocfs2_check_range_for_refcount(inode, pos, len);
1714 if (ret < 0) {
1715 mlog_errno(ret);
1716 goto out;
1717 } else if (ret == 1) {
1718 ret = ocfs2_refcount_cow(inode, di_bh,
1719 wc->w_cpos, wc->w_clen, UINT_MAX);
1720 if (ret) {
1721 mlog_errno(ret);
1722 goto out;
1723 }
1724 }
1725
1702 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, 1726 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1703 &extents_to_split); 1727 &extents_to_split);
1704 if (ret) { 1728 if (ret) {
@@ -1726,7 +1750,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1726 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), 1750 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1727 clusters_to_alloc, extents_to_split); 1751 clusters_to_alloc, extents_to_split);
1728 1752
1729 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1753 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1754 wc->w_di_bh);
1730 ret = ocfs2_lock_allocators(inode, &et, 1755 ret = ocfs2_lock_allocators(inode, &et,
1731 clusters_to_alloc, extents_to_split, 1756 clusters_to_alloc, extents_to_split,
1732 &data_ac, &meta_ac); 1757 &data_ac, &meta_ac);
@@ -1773,7 +1798,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1773 * We don't want this to fail in ocfs2_write_end(), so do it 1798 * We don't want this to fail in ocfs2_write_end(), so do it
1774 * here. 1799 * here.
1775 */ 1800 */
1776 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, 1801 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1777 OCFS2_JOURNAL_ACCESS_WRITE); 1802 OCFS2_JOURNAL_ACCESS_WRITE);
1778 if (ret) { 1803 if (ret) {
1779 mlog_errno(ret); 1804 mlog_errno(ret);
@@ -1997,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
1997 .releasepage = ocfs2_releasepage, 2022 .releasepage = ocfs2_releasepage,
1998 .migratepage = buffer_migrate_page, 2023 .migratepage = buffer_migrate_page,
1999 .is_partially_uptodate = block_is_partially_uptodate, 2024 .is_partially_uptodate = block_is_partially_uptodate,
2025 .error_remove_page = generic_error_remove_page,
2000}; 2026};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e49232e11..c48e93ffc513 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
57 struct buffer_head *di_bh); 57 struct buffer_head *di_bh);
58int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); 58int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
59 59
60int ocfs2_get_block(struct inode *inode, sector_t iblock,
61 struct buffer_head *bh_result, int create);
60/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
61#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
62 test_bit(0, (unsigned long *)&iocb->private) 64 test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 15c8e6deee2e..d43d34a1dd31 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -52,12 +52,12 @@ enum ocfs2_state_bits {
52BUFFER_FNS(NeedsValidate, needs_validate); 52BUFFER_FNS(NeedsValidate, needs_validate);
53 53
54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, 54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
55 struct inode *inode) 55 struct ocfs2_caching_info *ci)
56{ 56{
57 int ret = 0; 57 int ret = 0;
58 58
59 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", 59 mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
60 (unsigned long long)bh->b_blocknr, inode); 60 (unsigned long long)bh->b_blocknr, ci);
61 61
62 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); 62 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
63 BUG_ON(buffer_jbd(bh)); 63 BUG_ON(buffer_jbd(bh));
@@ -70,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
70 goto out; 70 goto out;
71 } 71 }
72 72
73 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 73 ocfs2_metadata_cache_io_lock(ci);
74 74
75 lock_buffer(bh); 75 lock_buffer(bh);
76 set_buffer_uptodate(bh); 76 set_buffer_uptodate(bh);
@@ -85,7 +85,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
85 wait_on_buffer(bh); 85 wait_on_buffer(bh);
86 86
87 if (buffer_uptodate(bh)) { 87 if (buffer_uptodate(bh)) {
88 ocfs2_set_buffer_uptodate(inode, bh); 88 ocfs2_set_buffer_uptodate(ci, bh);
89 } else { 89 } else {
90 /* We don't need to remove the clustered uptodate 90 /* We don't need to remove the clustered uptodate
91 * information for this bh as it's not marked locally 91 * information for this bh as it's not marked locally
@@ -94,7 +94,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
94 put_bh(bh); 94 put_bh(bh);
95 } 95 }
96 96
97 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 97 ocfs2_metadata_cache_io_unlock(ci);
98out: 98out:
99 mlog_exit(ret); 99 mlog_exit(ret);
100 return ret; 100 return ret;
@@ -177,7 +177,7 @@ bail:
177 return status; 177 return status;
178} 178}
179 179
180int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 180int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
181 struct buffer_head *bhs[], int flags, 181 struct buffer_head *bhs[], int flags,
182 int (*validate)(struct super_block *sb, 182 int (*validate)(struct super_block *sb,
183 struct buffer_head *bh)) 183 struct buffer_head *bh))
@@ -185,11 +185,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
185 int status = 0; 185 int status = 0;
186 int i, ignore_cache = 0; 186 int i, ignore_cache = 0;
187 struct buffer_head *bh; 187 struct buffer_head *bh;
188 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
188 189
189 mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", 190 mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
190 inode, (unsigned long long)block, nr, flags); 191 ci, (unsigned long long)block, nr, flags);
191 192
192 BUG_ON(!inode); 193 BUG_ON(!ci);
193 BUG_ON((flags & OCFS2_BH_READAHEAD) && 194 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
194 (flags & OCFS2_BH_IGNORE_CACHE)); 195 (flags & OCFS2_BH_IGNORE_CACHE));
195 196
@@ -212,12 +213,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
212 goto bail; 213 goto bail;
213 } 214 }
214 215
215 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 216 ocfs2_metadata_cache_io_lock(ci);
216 for (i = 0 ; i < nr ; i++) { 217 for (i = 0 ; i < nr ; i++) {
217 if (bhs[i] == NULL) { 218 if (bhs[i] == NULL) {
218 bhs[i] = sb_getblk(inode->i_sb, block++); 219 bhs[i] = sb_getblk(sb, block++);
219 if (bhs[i] == NULL) { 220 if (bhs[i] == NULL) {
220 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 221 ocfs2_metadata_cache_io_unlock(ci);
221 status = -EIO; 222 status = -EIO;
222 mlog_errno(status); 223 mlog_errno(status);
223 goto bail; 224 goto bail;
@@ -250,11 +251,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
250 * before our is-it-in-flight check. 251 * before our is-it-in-flight check.
251 */ 252 */
252 253
253 if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { 254 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
254 mlog(ML_UPTODATE, 255 mlog(ML_UPTODATE,
255 "bh (%llu), inode %llu not uptodate\n", 256 "bh (%llu), owner %llu not uptodate\n",
256 (unsigned long long)bh->b_blocknr, 257 (unsigned long long)bh->b_blocknr,
257 (unsigned long long)OCFS2_I(inode)->ip_blkno); 258 (unsigned long long)ocfs2_metadata_cache_owner(ci));
258 /* We're using ignore_cache here to say 259 /* We're using ignore_cache here to say
259 * "go to disk" */ 260 * "go to disk" */
260 ignore_cache = 1; 261 ignore_cache = 1;
@@ -283,7 +284,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
283 * previously submitted request than we are 284 * previously submitted request than we are
284 * done here. */ 285 * done here. */
285 if ((flags & OCFS2_BH_READAHEAD) 286 if ((flags & OCFS2_BH_READAHEAD)
286 && ocfs2_buffer_read_ahead(inode, bh)) 287 && ocfs2_buffer_read_ahead(ci, bh))
287 continue; 288 continue;
288 289
289 lock_buffer(bh); 290 lock_buffer(bh);
@@ -305,7 +306,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
305 * buffer lock. */ 306 * buffer lock. */
306 if (!(flags & OCFS2_BH_IGNORE_CACHE) 307 if (!(flags & OCFS2_BH_IGNORE_CACHE)
307 && !(flags & OCFS2_BH_READAHEAD) 308 && !(flags & OCFS2_BH_READAHEAD)
308 && ocfs2_buffer_uptodate(inode, bh)) { 309 && ocfs2_buffer_uptodate(ci, bh)) {
309 unlock_buffer(bh); 310 unlock_buffer(bh);
310 continue; 311 continue;
311 } 312 }
@@ -327,7 +328,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
327 328
328 if (!(flags & OCFS2_BH_READAHEAD)) { 329 if (!(flags & OCFS2_BH_READAHEAD)) {
329 /* We know this can't have changed as we hold the 330 /* We know this can't have changed as we hold the
330 * inode sem. Avoid doing any work on the bh if the 331 * owner sem. Avoid doing any work on the bh if the
331 * journal has it. */ 332 * journal has it. */
332 if (!buffer_jbd(bh)) 333 if (!buffer_jbd(bh))
333 wait_on_buffer(bh); 334 wait_on_buffer(bh);
@@ -351,7 +352,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
351 * that better not have changed */ 352 * that better not have changed */
352 BUG_ON(buffer_jbd(bh)); 353 BUG_ON(buffer_jbd(bh));
353 clear_buffer_needs_validate(bh); 354 clear_buffer_needs_validate(bh);
354 status = validate(inode->i_sb, bh); 355 status = validate(sb, bh);
355 if (status) { 356 if (status) {
356 put_bh(bh); 357 put_bh(bh);
357 bhs[i] = NULL; 358 bhs[i] = NULL;
@@ -363,9 +364,9 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
363 /* Always set the buffer in the cache, even if it was 364 /* Always set the buffer in the cache, even if it was
364 * a forced read, or read-ahead which hasn't yet 365 * a forced read, or read-ahead which hasn't yet
365 * completed. */ 366 * completed. */
366 ocfs2_set_buffer_uptodate(inode, bh); 367 ocfs2_set_buffer_uptodate(ci, bh);
367 } 368 }
368 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 369 ocfs2_metadata_cache_io_unlock(ci);
369 370
370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
371 (unsigned long long)block, nr, 372 (unsigned long long)block, nr,
@@ -399,7 +400,7 @@ static void ocfs2_check_super_or_backup(struct super_block *sb,
399 400
400/* 401/*
401 * Write super block and backups doesn't need to collaborate with journal, 402 * Write super block and backups doesn't need to collaborate with journal,
402 * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed 403 * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
403 * into this function. 404 * into this function.
404 */ 405 */
405int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 406int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c75d682dadd8..b97bcc6dde7c 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
33 33
34int ocfs2_write_block(struct ocfs2_super *osb, 34int ocfs2_write_block(struct ocfs2_super *osb,
35 struct buffer_head *bh, 35 struct buffer_head *bh,
36 struct inode *inode); 36 struct ocfs2_caching_info *ci);
37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
38 unsigned int nr, struct buffer_head *bhs[]); 38 unsigned int nr, struct buffer_head *bhs[]);
39 39
@@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
44 * be set even for a READAHEAD call, as it marks the buffer for later 44 * be set even for a READAHEAD call, as it marks the buffer for later
45 * validation. 45 * validation.
46 */ 46 */
47int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 47int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
48 struct buffer_head *bhs[], int flags, 48 struct buffer_head *bhs[], int flags,
49 int (*validate)(struct super_block *sb, 49 int (*validate)(struct super_block *sb,
50 struct buffer_head *bh)); 50 struct buffer_head *bh));
@@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
55#define OCFS2_BH_IGNORE_CACHE 1 55#define OCFS2_BH_IGNORE_CACHE 1
56#define OCFS2_BH_READAHEAD 8 56#define OCFS2_BH_READAHEAD 8
57 57
58static inline int ocfs2_read_block(struct inode *inode, u64 off, 58static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
59 struct buffer_head **bh, 59 struct buffer_head **bh,
60 int (*validate)(struct super_block *sb, 60 int (*validate)(struct super_block *sb,
61 struct buffer_head *bh)) 61 struct buffer_head *bh))
@@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
68 goto bail; 68 goto bail;
69 } 69 }
70 70
71 status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); 71 status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
72 72
73bail: 73bail:
74 return status; 74 return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
966} 966}
967#endif /* CONFIG_DEBUG_FS */ 967#endif /* CONFIG_DEBUG_FS */
968 968
969static struct file_operations o2hb_debug_fops = { 969static const struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open, 970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release, 971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read, 972 .read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df5416993e..1cd2934de615 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT),
114 define_mask(ERROR), 115 define_mask(ERROR),
115 define_mask(NOTICE), 116 define_mask(NOTICE),
116 define_mask(KTHREAD), 117 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 696c32e50716..9b4d11726cf2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
116/* bits that are infrequently given and frequently matched in the high word */ 117/* bits that are infrequently given and frequently matched in the high word */
117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
118#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 119#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index f8424874fa07..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v)
163{ 163{
164} 164}
165 165
166static struct seq_operations nst_seq_ops = { 166static const struct seq_operations nst_seq_ops = {
167 .start = nst_seq_start, 167 .start = nst_seq_start,
168 .next = nst_seq_next, 168 .next = nst_seq_next,
169 .stop = nst_seq_stop, 169 .stop = nst_seq_stop,
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
207 return seq_release_private(inode, file); 207 return seq_release_private(inode, file);
208} 208}
209 209
210static struct file_operations nst_seq_fops = { 210static const struct file_operations nst_seq_fops = {
211 .open = nst_fop_open, 211 .open = nst_fop_open,
212 .read = seq_read, 212 .read = seq_read,
213 .llseek = seq_lseek, 213 .llseek = seq_lseek,
@@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v)
344{ 344{
345} 345}
346 346
347static struct seq_operations sc_seq_ops = { 347static const struct seq_operations sc_seq_ops = {
348 .start = sc_seq_start, 348 .start = sc_seq_start,
349 .next = sc_seq_next, 349 .next = sc_seq_next,
350 .stop = sc_seq_stop, 350 .stop = sc_seq_stop,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
388 return seq_release_private(inode, file); 388 return seq_release_private(inode, file);
389} 389}
390 390
391static struct file_operations sc_seq_fops = { 391static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 392 .open = sc_fop_open,
393 .read = seq_read, 393 .read = seq_read,
394 .llseek = seq_lseek, 394 .llseek = seq_lseek,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b358f3bf896d..28c3ec238796 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -176,7 +176,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
176 struct ocfs2_dx_root_block *dx_root; 176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer; 177 struct ocfs2_dir_block_trailer *trailer;
178 178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 179 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE); 180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) { 181 if (ret) {
182 mlog_errno(ret); 182 mlog_errno(ret);
@@ -564,7 +564,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
564 int ret; 564 int ret;
565 struct buffer_head *tmp = *bh; 565 struct buffer_head *tmp = *bh;
566 566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); 567 ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
568 ocfs2_validate_dir_block);
568 if (ret) { 569 if (ret) {
569 mlog_errno(ret); 570 mlog_errno(ret);
570 goto out; 571 goto out;
@@ -622,7 +623,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
622 u64 blkno = le64_to_cpu(di->i_dx_root); 623 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh; 624 struct buffer_head *tmp = *dx_root_bh;
624 625
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); 626 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
627 ocfs2_validate_dx_root);
626 628
627 /* If ocfs2_read_block() got us a new bh, pass it up. */ 629 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh) 630 if (!ret && !*dx_root_bh)
@@ -662,7 +664,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
662 int ret; 664 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh; 665 struct buffer_head *tmp = *dx_leaf_bh;
664 666
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); 667 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
668 ocfs2_validate_dx_leaf);
666 669
667 /* If ocfs2_read_block() got us a new bh, pass it up. */ 670 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh) 671 if (!ret && !*dx_leaf_bh)
@@ -680,7 +683,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
680{ 683{
681 int ret; 684 int ret;
682 685
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, 686 ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf); 687 ocfs2_validate_dx_leaf);
685 if (ret) 688 if (ret)
686 mlog_errno(ret); 689 mlog_errno(ret);
@@ -802,7 +805,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
802 struct ocfs2_extent_rec *rec = NULL; 805 struct ocfs2_extent_rec *rec = NULL;
803 806
804 if (el->l_tree_depth) { 807 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); 808 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
809 &eb_bh);
806 if (ret) { 810 if (ret) {
807 mlog_errno(ret); 811 mlog_errno(ret);
808 goto out; 812 goto out;
@@ -1133,7 +1137,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
1133 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1137 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1134 access = ocfs2_journal_access_di; 1138 access = ocfs2_journal_access_di;
1135 1139
1136 ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1140 ret = access(handle, INODE_CACHE(dir), de_bh,
1141 OCFS2_JOURNAL_ACCESS_WRITE);
1137 if (ret) { 1142 if (ret) {
1138 mlog_errno(ret); 1143 mlog_errno(ret);
1139 goto out; 1144 goto out;
@@ -1176,7 +1181,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1176 goto bail; 1181 goto bail;
1177 } 1182 }
1178 if (de == de_del) { 1183 if (de == de_del) {
1179 status = access(handle, dir, bh, 1184 status = access(handle, INODE_CACHE(dir), bh,
1180 OCFS2_JOURNAL_ACCESS_WRITE); 1185 OCFS2_JOURNAL_ACCESS_WRITE);
1181 if (status < 0) { 1186 if (status < 0) {
1182 status = -EIO; 1187 status = -EIO;
@@ -1326,7 +1331,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1326 * the entry count needs to be updated. Also, we might be 1331 * the entry count needs to be updated. Also, we might be
1327 * adding to the start of the free list. 1332 * adding to the start of the free list.
1328 */ 1333 */
1329 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1334 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1335 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (ret) { 1336 if (ret) {
1332 mlog_errno(ret); 1337 mlog_errno(ret);
@@ -1334,7 +1339,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1334 } 1339 }
1335 1340
1336 if (!ocfs2_dx_root_inline(dx_root)) { 1341 if (!ocfs2_dx_root_inline(dx_root)) {
1337 ret = ocfs2_journal_access_dl(handle, dir, 1342 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
1338 lookup->dl_dx_leaf_bh, 1343 lookup->dl_dx_leaf_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE); 1344 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) { 1345 if (ret) {
@@ -1493,7 +1498,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1493 int ret; 1498 int ret;
1494 struct ocfs2_dx_leaf *dx_leaf; 1499 struct ocfs2_dx_leaf *dx_leaf;
1495 1500
1496 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, 1501 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
1497 OCFS2_JOURNAL_ACCESS_WRITE); 1502 OCFS2_JOURNAL_ACCESS_WRITE);
1498 if (ret) { 1503 if (ret) {
1499 mlog_errno(ret); 1504 mlog_errno(ret);
@@ -1523,7 +1528,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1523 struct ocfs2_dx_root_block *dx_root; 1528 struct ocfs2_dx_root_block *dx_root;
1524 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 1529 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1525 1530
1526 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1531 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1527 OCFS2_JOURNAL_ACCESS_WRITE); 1532 OCFS2_JOURNAL_ACCESS_WRITE);
1528 if (ret) { 1533 if (ret) {
1529 mlog_errno(ret); 1534 mlog_errno(ret);
@@ -1645,11 +1650,13 @@ int __ocfs2_add_entry(handle_t *handle,
1645 */ 1650 */
1646 if (ocfs2_free_list_at_root(lookup)) { 1651 if (ocfs2_free_list_at_root(lookup)) {
1647 bh = lookup->dl_dx_root_bh; 1652 bh = lookup->dl_dx_root_bh;
1648 retval = ocfs2_journal_access_dr(handle, dir, bh, 1653 retval = ocfs2_journal_access_dr(handle,
1654 INODE_CACHE(dir), bh,
1649 OCFS2_JOURNAL_ACCESS_WRITE); 1655 OCFS2_JOURNAL_ACCESS_WRITE);
1650 } else { 1656 } else {
1651 bh = lookup->dl_prev_leaf_bh; 1657 bh = lookup->dl_prev_leaf_bh;
1652 retval = ocfs2_journal_access_db(handle, dir, bh, 1658 retval = ocfs2_journal_access_db(handle,
1659 INODE_CACHE(dir), bh,
1653 OCFS2_JOURNAL_ACCESS_WRITE); 1660 OCFS2_JOURNAL_ACCESS_WRITE);
1654 } 1661 }
1655 if (retval) { 1662 if (retval) {
@@ -1700,11 +1707,13 @@ int __ocfs2_add_entry(handle_t *handle,
1700 } 1707 }
1701 1708
1702 if (insert_bh == parent_fe_bh) 1709 if (insert_bh == parent_fe_bh)
1703 status = ocfs2_journal_access_di(handle, dir, 1710 status = ocfs2_journal_access_di(handle,
1711 INODE_CACHE(dir),
1704 insert_bh, 1712 insert_bh,
1705 OCFS2_JOURNAL_ACCESS_WRITE); 1713 OCFS2_JOURNAL_ACCESS_WRITE);
1706 else { 1714 else {
1707 status = ocfs2_journal_access_db(handle, dir, 1715 status = ocfs2_journal_access_db(handle,
1716 INODE_CACHE(dir),
1708 insert_bh, 1717 insert_bh,
1709 OCFS2_JOURNAL_ACCESS_WRITE); 1718 OCFS2_JOURNAL_ACCESS_WRITE);
1710 1719
@@ -2280,7 +2289,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2280 struct ocfs2_inline_data *data = &di->id2.i_data; 2289 struct ocfs2_inline_data *data = &di->id2.i_data;
2281 unsigned int size = le16_to_cpu(data->id_count); 2290 unsigned int size = le16_to_cpu(data->id_count);
2282 2291
2283 ret = ocfs2_journal_access_di(handle, inode, di_bh, 2292 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2284 OCFS2_JOURNAL_ACCESS_WRITE); 2293 OCFS2_JOURNAL_ACCESS_WRITE);
2285 if (ret) { 2294 if (ret) {
2286 mlog_errno(ret); 2295 mlog_errno(ret);
@@ -2332,9 +2341,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2332 goto bail; 2341 goto bail;
2333 } 2342 }
2334 2343
2335 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2344 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2336 2345
2337 status = ocfs2_journal_access_db(handle, inode, new_bh, 2346 status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
2338 OCFS2_JOURNAL_ACCESS_CREATE); 2347 OCFS2_JOURNAL_ACCESS_CREATE);
2339 if (status < 0) { 2348 if (status < 0) {
2340 mlog_errno(status); 2349 mlog_errno(status);
@@ -2418,9 +2427,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2418 ret = -EIO; 2427 ret = -EIO;
2419 goto out; 2428 goto out;
2420 } 2429 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); 2430 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
2422 2431
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 2432 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE); 2433 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) { 2434 if (ret < 0) {
2426 mlog_errno(ret); 2435 mlog_errno(ret);
@@ -2454,7 +2463,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2454 if (ret) 2463 if (ret)
2455 mlog_errno(ret); 2464 mlog_errno(ret);
2456 2465
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh, 2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE); 2467 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) { 2468 if (ret) {
2460 mlog_errno(ret); 2469 mlog_errno(ret);
@@ -2495,9 +2504,9 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2495 } 2504 }
2496 dx_leaves[i] = bh; 2505 dx_leaves[i] = bh;
2497 2506
2498 ocfs2_set_new_buffer_uptodate(dir, bh); 2507 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
2499 2508
2500 ret = ocfs2_journal_access_dl(handle, dir, bh, 2509 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE); 2510 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) { 2511 if (ret < 0) {
2503 mlog_errno(ret); 2512 mlog_errno(ret);
@@ -2582,7 +2591,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2582{ 2591{
2583 int ret; 2592 int ret;
2584 u64 phys_blkno; 2593 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586 2594
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, 2595 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno); 2596 num_dx_leaves, &phys_blkno);
@@ -2591,7 +2599,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2591 goto out; 2599 goto out;
2592 } 2600 }
2593 2601
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, 2602 ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
2595 meta_ac); 2603 meta_ac);
2596 if (ret) 2604 if (ret)
2597 mlog_errno(ret); 2605 mlog_errno(ret);
@@ -2895,7 +2903,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2895 struct ocfs2_extent_tree dx_et; 2903 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0; 2904 int did_quota = 0, bytes_allocated = 0;
2897 2905
2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2906 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
2899 2907
2900 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2908 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0; 2909 dx_alloc = 0;
@@ -3005,9 +3013,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3005 goto out_commit; 3013 goto out_commit;
3006 } 3014 }
3007 3015
3008 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); 3016 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
3009 3017
3010 ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, 3018 ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
3011 OCFS2_JOURNAL_ACCESS_CREATE); 3019 OCFS2_JOURNAL_ACCESS_CREATE);
3012 if (ret) { 3020 if (ret) {
3013 mlog_errno(ret); 3021 mlog_errno(ret);
@@ -3060,7 +3068,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3060 * We let the later dirent insert modify c/mtime - to the user 3068 * We let the later dirent insert modify c/mtime - to the user
3061 * the data hasn't changed. 3069 * the data hasn't changed.
3062 */ 3070 */
3063 ret = ocfs2_journal_access_di(handle, dir, di_bh, 3071 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
3064 OCFS2_JOURNAL_ACCESS_CREATE); 3072 OCFS2_JOURNAL_ACCESS_CREATE);
3065 if (ret) { 3073 if (ret) {
3066 mlog_errno(ret); 3074 mlog_errno(ret);
@@ -3085,7 +3093,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3085 * This should never fail as our extent list is empty and all 3093 * This should never fail as our extent list is empty and all
3086 * related blocks have been journaled already. 3094 * related blocks have been journaled already.
3087 */ 3095 */
3088 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, 3096 ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
3089 0, NULL); 3097 0, NULL);
3090 if (ret) { 3098 if (ret) {
3091 mlog_errno(ret); 3099 mlog_errno(ret);
@@ -3117,8 +3125,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh, 3125 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh); 3126 dirdata_bh);
3119 } else { 3127 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); 3128 ocfs2_init_dx_root_extent_tree(&dx_et,
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, 3129 INODE_CACHE(dir),
3130 dx_root_bh);
3131 ret = ocfs2_insert_extent(handle, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL); 3132 dx_insert_blkno, 1, 0, NULL);
3123 if (ret) 3133 if (ret)
3124 mlog_errno(ret); 3134 mlog_errno(ret);
@@ -3138,7 +3148,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 } 3148 }
3139 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 3149 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
3140 3150
3141 ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, 3151 ret = ocfs2_insert_extent(handle, &et, 1,
3142 blkno, len, 0, NULL); 3152 blkno, len, 0, NULL);
3143 if (ret) { 3153 if (ret) {
3144 mlog_errno(ret); 3154 mlog_errno(ret);
@@ -3337,8 +3347,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3337 spin_lock(&OCFS2_I(dir)->ip_lock); 3347 spin_lock(&OCFS2_I(dir)->ip_lock);
3338 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 3348 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
3339 spin_unlock(&OCFS2_I(dir)->ip_lock); 3349 spin_unlock(&OCFS2_I(dir)->ip_lock);
3340 ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); 3350 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3341 num_free_extents = ocfs2_num_free_extents(osb, dir, &et); 3351 parent_fe_bh);
3352 num_free_extents = ocfs2_num_free_extents(osb, &et);
3342 if (num_free_extents < 0) { 3353 if (num_free_extents < 0) {
3343 status = num_free_extents; 3354 status = num_free_extents;
3344 mlog_errno(status); 3355 mlog_errno(status);
@@ -3387,9 +3398,9 @@ do_extend:
3387 goto bail; 3398 goto bail;
3388 } 3399 }
3389 3400
3390 ocfs2_set_new_buffer_uptodate(dir, new_bh); 3401 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
3391 3402
3392 status = ocfs2_journal_access_db(handle, dir, new_bh, 3403 status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
3393 OCFS2_JOURNAL_ACCESS_CREATE); 3404 OCFS2_JOURNAL_ACCESS_CREATE);
3394 if (status < 0) { 3405 if (status < 0) {
3395 mlog_errno(status); 3406 mlog_errno(status);
@@ -3829,7 +3840,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3829 (unsigned long long)OCFS2_I(dir)->ip_blkno, 3840 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3830 (unsigned long long)leaf_blkno, insert_hash); 3841 (unsigned long long)leaf_blkno, insert_hash);
3831 3842
3832 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 3843 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3833 3844
3834 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3845 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3835 /* 3846 /*
@@ -3885,7 +3896,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3885 } 3896 }
3886 did_quota = 1; 3897 did_quota = 1;
3887 3898
3888 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, 3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
3889 OCFS2_JOURNAL_ACCESS_WRITE); 3900 OCFS2_JOURNAL_ACCESS_WRITE);
3890 if (ret) { 3901 if (ret) {
3891 mlog_errno(ret); 3902 mlog_errno(ret);
@@ -3949,7 +3960,8 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3949 } 3960 }
3950 3961
3951 for (i = 0; i < num_dx_leaves; i++) { 3962 for (i = 0; i < num_dx_leaves; i++) {
3952 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], 3963 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3964 orig_dx_leaves[i],
3953 OCFS2_JOURNAL_ACCESS_WRITE); 3965 OCFS2_JOURNAL_ACCESS_WRITE);
3954 if (ret) { 3966 if (ret) {
3955 mlog_errno(ret); 3967 mlog_errno(ret);
@@ -4165,7 +4177,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 * failure to add the dx_root_bh to the journal won't result 4177 * failure to add the dx_root_bh to the journal won't result
4166 * us losing clusters. 4178 * us losing clusters.
4167 */ 4179 */
4168 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 4180 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
4169 OCFS2_JOURNAL_ACCESS_WRITE); 4181 OCFS2_JOURNAL_ACCESS_WRITE);
4170 if (ret) { 4182 if (ret) {
4171 mlog_errno(ret); 4183 mlog_errno(ret);
@@ -4207,9 +4219,8 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4207 4219
4208 /* This should never fail considering we start with an empty 4220 /* This should never fail considering we start with an empty
4209 * dx_root. */ 4221 * dx_root. */
4210 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4222 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4211 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, 4223 ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
4212 insert_blkno, 1, 0, NULL);
4213 if (ret) 4224 if (ret)
4214 mlog_errno(ret); 4225 mlog_errno(ret);
4215 did_quota = 0; 4226 did_quota = 0;
@@ -4469,7 +4480,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4469 goto out_unlock; 4480 goto out_unlock;
4470 } 4481 }
4471 4482
4472 ret = ocfs2_journal_access_di(handle, dir, di_bh, 4483 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
4473 OCFS2_JOURNAL_ACCESS_WRITE); 4484 OCFS2_JOURNAL_ACCESS_WRITE);
4474 if (ret) { 4485 if (ret) {
4475 mlog_errno(ret); 4486 mlog_errno(ret);
@@ -4532,7 +4543,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4532 if (ocfs2_dx_root_inline(dx_root)) 4543 if (ocfs2_dx_root_inline(dx_root))
4533 goto remove_index; 4544 goto remove_index;
4534 4545
4535 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4546 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4536 4547
4537 /* XXX: What if dr_clusters is too large? */ 4548 /* XXX: What if dr_clusters is too large? */
4538 while (le32_to_cpu(dx_root->dr_clusters)) { 4549 while (le32_to_cpu(dx_root->dr_clusters)) {
@@ -4565,7 +4576,7 @@ remove_index:
4565 goto out; 4576 goto out;
4566 } 4577 }
4567 4578
4568 ocfs2_remove_from_cache(dir, dx_root_bh); 4579 ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
4569out: 4580out:
4570 ocfs2_schedule_truncate_log_flush(osb, 1); 4581 ocfs2_schedule_truncate_log_flush(osb, 1);
4571 ocfs2_run_deallocs(osb, &dealloc); 4582 ocfs2_run_deallocs(osb, &dealloc);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e58322..01cf8cc3d286 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf3..ca96bce50e18 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index df52f706f669..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/utsname.h>
31#include <linux/sysctl.h> 30#include <linux/sysctl.h>
32#include <linux/spinlock.h> 31#include <linux/spinlock.h>
33#include <linux/debugfs.h> 32#include <linux/debugfs.h>
@@ -479,7 +478,7 @@ bail:
479 return -ENOMEM; 478 return -ENOMEM;
480} 479}
481 480
482static struct file_operations debug_purgelist_fops = { 481static const struct file_operations debug_purgelist_fops = {
483 .open = debug_purgelist_open, 482 .open = debug_purgelist_open,
484 .release = debug_buffer_release, 483 .release = debug_buffer_release,
485 .read = debug_buffer_read, 484 .read = debug_buffer_read,
@@ -539,7 +538,7 @@ bail:
539 return -ENOMEM; 538 return -ENOMEM;
540} 539}
541 540
542static struct file_operations debug_mle_fops = { 541static const struct file_operations debug_mle_fops = {
543 .open = debug_mle_open, 542 .open = debug_mle_open,
544 .release = debug_buffer_release, 543 .release = debug_buffer_release,
545 .read = debug_buffer_read, 544 .read = debug_buffer_read,
@@ -683,7 +682,7 @@ static int lockres_seq_show(struct seq_file *s, void *v)
683 return 0; 682 return 0;
684} 683}
685 684
686static struct seq_operations debug_lockres_ops = { 685static const struct seq_operations debug_lockres_ops = {
687 .start = lockres_seq_start, 686 .start = lockres_seq_start,
688 .stop = lockres_seq_stop, 687 .stop = lockres_seq_stop,
689 .next = lockres_seq_next, 688 .next = lockres_seq_next,
@@ -742,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
742 return seq_release_private(inode, file); 741 return seq_release_private(inode, file);
743} 742}
744 743
745static struct file_operations debug_lockres_fops = { 744static const struct file_operations debug_lockres_fops = {
746 .open = debug_lockres_open, 745 .open = debug_lockres_open,
747 .release = debug_lockres_release, 746 .release = debug_lockres_release,
748 .read = seq_read, 747 .read = seq_read,
@@ -926,7 +925,7 @@ bail:
926 return -ENOMEM; 925 return -ENOMEM;
927} 926}
928 927
929static struct file_operations debug_state_fops = { 928static const struct file_operations debug_state_fops = {
930 .open = debug_state_open, 929 .open = debug_state_open,
931 .release = debug_buffer_release, 930 .release = debug_buffer_release,
932 .read = debug_buffer_read, 931 .read = debug_buffer_read,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd8..0334000676d3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/delay.h> 33#include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac8..437698e9465f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4dd..83bcaf266b35 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e3280569..d9fa3d22e17c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d490b66ad9d7..52ec020ea78b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
@@ -212,14 +211,18 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
212 spin_lock(&dlm->spinlock); 211 spin_lock(&dlm->spinlock);
213 } 212 }
214 213
214 spin_lock(&res->spinlock);
215 if (!list_empty(&res->purge)) { 215 if (!list_empty(&res->purge)) {
216 mlog(0, "removing lockres %.*s:%p from purgelist, " 216 mlog(0, "removing lockres %.*s:%p from purgelist, "
217 "master = %d\n", res->lockname.len, res->lockname.name, 217 "master = %d\n", res->lockname.len, res->lockname.name,
218 res, master); 218 res, master);
219 list_del_init(&res->purge); 219 list_del_init(&res->purge);
220 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 221 dlm_lockres_put(res);
221 dlm->purge_count--; 222 dlm->purge_count--;
222 } 223 } else
224 spin_unlock(&res->spinlock);
225
223 __dlm_unhash_lockres(res); 226 __dlm_unhash_lockres(res);
224 227
225 /* lockres is not in the hash now. drop the flag and wake up 228 /* lockres is not in the hash now. drop the flag and wake up
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e0..00f53b2aea76 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 110bb57c46ab..0d38d67194cb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
53#include "super.h" 53#include "super.h"
54#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h" 55#include "quota.h"
56#include "refcounttree.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
110 111
111static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); 112static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
112 113
114static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
115 int new_level);
116static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
117 int blocking);
118
113#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 119#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
114 120
115/* This aids in debugging situations where a bad LVB might be involved. */ 121/* This aids in debugging situations where a bad LVB might be involved. */
@@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
278 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, 284 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
279}; 285};
280 286
287static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
288 .check_downconvert = ocfs2_check_refcount_downconvert,
289 .downconvert_worker = ocfs2_refcount_convert_worker,
290 .flags = 0,
291};
292
281static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 293static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
282{ 294{
283 return lockres->l_type == OCFS2_LOCK_TYPE_META || 295 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re
306 return (struct ocfs2_mem_dqinfo *)lockres->l_priv; 318 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
307} 319}
308 320
321static inline struct ocfs2_refcount_tree *
322ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
323{
324 return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
325}
326
309static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) 327static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
310{ 328{
311 if (lockres->l_ops->get_osb) 329 if (lockres->l_ops->get_osb)
@@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
693 info); 711 info);
694} 712}
695 713
714void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
715 struct ocfs2_super *osb, u64 ref_blkno,
716 unsigned int generation)
717{
718 ocfs2_lock_res_init_once(lockres);
719 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
720 generation, lockres->l_name);
721 ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
722 &ocfs2_refcount_block_lops, osb);
723}
724
696void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 725void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
697{ 726{
698 mlog_entry_void(); 727 mlog_entry_void();
@@ -1548,8 +1577,10 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1548 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1577 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1549 write ? "EXMODE" : "PRMODE"); 1578 write ? "EXMODE" : "PRMODE");
1550 1579
1551 if (ocfs2_mount_local(osb)) 1580 if (ocfs2_mount_local(osb)) {
1581 mlog_exit(0);
1552 return 0; 1582 return 0;
1583 }
1553 1584
1554 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1585 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1555 1586
@@ -2127,7 +2158,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2127 2158
2128 /* This will discard any caching information we might have had 2159 /* This will discard any caching information we might have had
2129 * for the inode metadata. */ 2160 * for the inode metadata. */
2130 ocfs2_metadata_cache_purge(inode); 2161 ocfs2_metadata_cache_purge(INODE_CACHE(inode));
2131 2162
2132 ocfs2_extent_map_trunc(inode, 0); 2163 ocfs2_extent_map_trunc(inode, 0);
2133 2164
@@ -3009,6 +3040,7 @@ static void ocfs2_unlock_ast(void *opaque, int error)
3009 "unlock_action %d\n", error, lockres->l_name, 3040 "unlock_action %d\n", error, lockres->l_name,
3010 lockres->l_unlock_action); 3041 lockres->l_unlock_action);
3011 spin_unlock_irqrestore(&lockres->l_lock, flags); 3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3012 return; 3044 return;
3013 } 3045 }
3014 3046
@@ -3495,11 +3527,11 @@ out:
3495 return UNBLOCK_CONTINUE; 3527 return UNBLOCK_CONTINUE;
3496} 3528}
3497 3529
3498static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, 3530static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
3499 int new_level) 3531 struct ocfs2_lock_res *lockres,
3532 int new_level)
3500{ 3533{
3501 struct inode *inode = ocfs2_lock_res_inode(lockres); 3534 int checkpointed = ocfs2_ci_fully_checkpointed(ci);
3502 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
3503 3535
3504 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); 3536 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
3505 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); 3537 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3507,10 +3539,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3507 if (checkpointed) 3539 if (checkpointed)
3508 return 1; 3540 return 1;
3509 3541
3510 ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb)); 3542 ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
3511 return 0; 3543 return 0;
3512} 3544}
3513 3545
3546static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3547 int new_level)
3548{
3549 struct inode *inode = ocfs2_lock_res_inode(lockres);
3550
3551 return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
3552}
3553
3514static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) 3554static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
3515{ 3555{
3516 struct inode *inode = ocfs2_lock_res_inode(lockres); 3556 struct inode *inode = ocfs2_lock_res_inode(lockres);
@@ -3640,6 +3680,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3640 return UNBLOCK_CONTINUE_POST; 3680 return UNBLOCK_CONTINUE_POST;
3641} 3681}
3642 3682
3683static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
3684 int new_level)
3685{
3686 struct ocfs2_refcount_tree *tree =
3687 ocfs2_lock_res_refcount_tree(lockres);
3688
3689 return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
3690}
3691
3692static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
3693 int blocking)
3694{
3695 struct ocfs2_refcount_tree *tree =
3696 ocfs2_lock_res_refcount_tree(lockres);
3697
3698 ocfs2_metadata_cache_purge(&tree->rf_ci);
3699
3700 return UNBLOCK_CONTINUE;
3701}
3702
3643static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) 3703static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3644{ 3704{
3645 struct ocfs2_qinfo_lvb *lvb; 3705 struct ocfs2_qinfo_lvb *lvb;
@@ -3752,6 +3812,37 @@ bail:
3752 return status; 3812 return status;
3753} 3813}
3754 3814
3815int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
3816{
3817 int status;
3818 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3819 struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
3820 struct ocfs2_super *osb = lockres->l_priv;
3821
3822
3823 if (ocfs2_is_hard_readonly(osb))
3824 return -EROFS;
3825
3826 if (ocfs2_mount_local(osb))
3827 return 0;
3828
3829 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3830 if (status < 0)
3831 mlog_errno(status);
3832
3833 return status;
3834}
3835
3836void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3837{
3838 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3839 struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
3840 struct ocfs2_super *osb = lockres->l_priv;
3841
3842 if (!ocfs2_mount_local(osb))
3843 ocfs2_cluster_unlock(osb, lockres, level);
3844}
3845
3755/* 3846/*
3756 * This is the filesystem locking protocol. It provides the lock handling 3847 * This is the filesystem locking protocol. It provides the lock handling
3757 * hooks for the underlying DLM. It has a maximum version number. 3848 * hooks for the underlying DLM. It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 7553836931de..d1ce48e1b3d6 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
101struct ocfs2_mem_dqinfo; 101struct ocfs2_mem_dqinfo;
102void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, 102void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
103 struct ocfs2_mem_dqinfo *info); 103 struct ocfs2_mem_dqinfo *info);
104void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
105 struct ocfs2_super *osb, u64 ref_blkno,
106 unsigned int generation);
104void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 107void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
105int ocfs2_create_new_inode_locks(struct inode *inode); 108int ocfs2_create_new_inode_locks(struct inode *inode);
106int ocfs2_drop_inode_locks(struct inode *inode); 109int ocfs2_drop_inode_locks(struct inode *inode);
@@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
148void ocfs2_file_unlock(struct file *file); 151void ocfs2_file_unlock(struct file *file);
149int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); 152int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
150void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); 153void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
154struct ocfs2_refcount_tree;
155int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
156void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
151 157
152 158
153void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 159void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2bb1a04d253..843db64e9d4a 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
293 struct ocfs2_extent_block *eb; 293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el; 294 struct ocfs2_extent_list *el;
295 295
296 ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); 296 ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
297 if (ret) { 297 if (ret) {
298 mlog_errno(ret); 298 mlog_errno(ret);
299 goto out; 299 goto out;
@@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
353 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block 353 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
354 * containing el. 354 * containing el.
355 */ 355 */
356static int ocfs2_figure_hole_clusters(struct inode *inode, 356int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
357 struct ocfs2_extent_list *el, 357 struct ocfs2_extent_list *el,
358 struct buffer_head *eb_bh, 358 struct buffer_head *eb_bh,
359 u32 v_cluster, 359 u32 v_cluster,
360 u32 *num_clusters) 360 u32 *num_clusters)
361{ 361{
362 int ret, i; 362 int ret, i;
363 struct buffer_head *next_eb_bh = NULL; 363 struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
376 goto no_more_extents; 376 goto no_more_extents;
377 377
378 ret = ocfs2_read_extent_block(inode, 378 ret = ocfs2_read_extent_block(ci,
379 le64_to_cpu(eb->h_next_leaf_blk), 379 le64_to_cpu(eb->h_next_leaf_blk),
380 &next_eb_bh); 380 &next_eb_bh);
381 if (ret) { 381 if (ret) {
@@ -428,7 +428,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
428 tree_height = le16_to_cpu(el->l_tree_depth); 428 tree_height = le16_to_cpu(el->l_tree_depth);
429 429
430 if (tree_height > 0) { 430 if (tree_height > 0) {
431 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 431 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
432 &eb_bh);
432 if (ret) { 433 if (ret) {
433 mlog_errno(ret); 434 mlog_errno(ret);
434 goto out; 435 goto out;
@@ -455,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
455 * field. 456 * field.
456 */ 457 */
457 if (hole_len) { 458 if (hole_len) {
458 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 459 ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
460 el, eb_bh,
459 v_cluster, &len); 461 v_cluster, &len);
460 if (ret) { 462 if (ret) {
461 mlog_errno(ret); 463 mlog_errno(ret);
@@ -539,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
539 541
540int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 542int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
541 u32 *p_cluster, u32 *num_clusters, 543 u32 *p_cluster, u32 *num_clusters,
542 struct ocfs2_extent_list *el) 544 struct ocfs2_extent_list *el,
545 unsigned int *extent_flags)
543{ 546{
544 int ret = 0, i; 547 int ret = 0, i;
545 struct buffer_head *eb_bh = NULL; 548 struct buffer_head *eb_bh = NULL;
@@ -548,7 +551,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
548 u32 coff; 551 u32 coff;
549 552
550 if (el->l_tree_depth) { 553 if (el->l_tree_depth) {
551 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 554 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
555 &eb_bh);
552 if (ret) { 556 if (ret) {
553 mlog_errno(ret); 557 mlog_errno(ret);
554 goto out; 558 goto out;
@@ -590,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
590 *p_cluster = *p_cluster + coff; 594 *p_cluster = *p_cluster + coff;
591 if (num_clusters) 595 if (num_clusters)
592 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 596 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
597
598 if (extent_flags)
599 *extent_flags = rec->e_flags;
593 } 600 }
594out: 601out:
595 if (eb_bh) 602 if (eb_bh)
@@ -862,8 +869,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
862 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); 869 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
863 } 870 }
864 871
865 rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, 872 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
866 flags, validate); 873 bhs + done, flags, validate);
867 if (rc) { 874 if (rc) {
868 mlog_errno(rc); 875 mlog_errno(rc);
869 break; 876 break;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd9731b462..e79d41c2c909 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -55,12 +55,18 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
55 55
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el); 58 struct ocfs2_extent_list *el,
59 unsigned int *extent_flags);
59 60
60int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 61int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
61 struct buffer_head *bhs[], int flags, 62 struct buffer_head *bhs[], int flags,
62 int (*validate)(struct super_block *sb, 63 int (*validate)(struct super_block *sb,
63 struct buffer_head *bh)); 64 struct buffer_head *bh));
65int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
66 struct ocfs2_extent_list *el,
67 struct buffer_head *eb_bh,
68 u32 v_cluster,
69 u32 *num_clusters);
64static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, 70static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
65 struct buffer_head **bh, 71 struct buffer_head **bh,
66 int (*validate)(struct super_block *sb, 72 int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 221c5e98957b..89fc8ee1f5a5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
59#include "xattr.h" 59#include "xattr.h"
60#include "acl.h" 60#include "acl.h"
61#include "quota.h" 61#include "quota.h"
62#include "refcounttree.h"
62 63
63#include "buffer_head_io.h" 64#include "buffer_head_io.h"
64 65
@@ -259,7 +260,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
259 goto out; 260 goto out;
260 } 261 }
261 262
262 ret = ocfs2_journal_access_di(handle, inode, bh, 263 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
263 OCFS2_JOURNAL_ACCESS_WRITE); 264 OCFS2_JOURNAL_ACCESS_WRITE);
264 if (ret) { 265 if (ret) {
265 mlog_errno(ret); 266 mlog_errno(ret);
@@ -334,6 +335,39 @@ out:
334 return ret; 335 return ret;
335} 336}
336 337
338static int ocfs2_cow_file_pos(struct inode *inode,
339 struct buffer_head *fe_bh,
340 u64 offset)
341{
342 int status;
343 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
344 unsigned int num_clusters = 0;
345 unsigned int ext_flags = 0;
346
347 /*
348 * If the new offset is aligned to the range of the cluster, there is
349 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
350 * CoW either.
351 */
352 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
353 return 0;
354
355 status = ocfs2_get_clusters(inode, cpos, &phys,
356 &num_clusters, &ext_flags);
357 if (status) {
358 mlog_errno(status);
359 goto out;
360 }
361
362 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
363 goto out;
364
365 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
366
367out:
368 return status;
369}
370
337static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 371static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
338 struct inode *inode, 372 struct inode *inode,
339 struct buffer_head *fe_bh, 373 struct buffer_head *fe_bh,
@@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
346 380
347 mlog_entry_void(); 381 mlog_entry_void();
348 382
383 /*
384 * We need to CoW the cluster contains the offset if it is reflinked
385 * since we will call ocfs2_zero_range_for_truncate later which will
386 * write "0" from offset to the end of the cluster.
387 */
388 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
389 if (status) {
390 mlog_errno(status);
391 return status;
392 }
393
349 /* TODO: This needs to actually orphan the inode in this 394 /* TODO: This needs to actually orphan the inode in this
350 * transaction. */ 395 * transaction. */
351 396
@@ -356,7 +401,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
356 goto out; 401 goto out;
357 } 402 }
358 403
359 status = ocfs2_journal_access_di(handle, inode, fe_bh, 404 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
360 OCFS2_JOURNAL_ACCESS_WRITE); 405 OCFS2_JOURNAL_ACCESS_WRITE);
361 if (status < 0) { 406 if (status < 0) {
362 mlog_errno(status); 407 mlog_errno(status);
@@ -486,6 +531,8 @@ bail_unlock_sem:
486 up_write(&OCFS2_I(inode)->ip_alloc_sem); 531 up_write(&OCFS2_I(inode)->ip_alloc_sem);
487 532
488bail: 533bail:
534 if (!status && OCFS2_I(inode)->ip_clusters == 0)
535 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
489 536
490 mlog_exit(status); 537 mlog_exit(status);
491 return status; 538 return status;
@@ -515,11 +562,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
515 int ret; 562 int ret;
516 struct ocfs2_extent_tree et; 563 struct ocfs2_extent_tree et;
517 564
518 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); 565 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
519 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, 566 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
520 clusters_to_add, mark_unwritten, 567 clusters_to_add, mark_unwritten,
521 &et, handle, 568 data_ac, meta_ac, reason_ret);
522 data_ac, meta_ac, reason_ret);
523 569
524 return ret; 570 return ret;
525} 571}
@@ -564,7 +610,7 @@ restart_all:
564 (unsigned long long)OCFS2_I(inode)->ip_blkno, 610 (unsigned long long)OCFS2_I(inode)->ip_blkno,
565 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), 611 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
566 clusters_to_add); 612 clusters_to_add);
567 ocfs2_init_dinode_extent_tree(&et, inode, bh); 613 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
568 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 614 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
569 &data_ac, &meta_ac); 615 &data_ac, &meta_ac);
570 if (status) { 616 if (status) {
@@ -593,7 +639,7 @@ restarted_transaction:
593 /* reserve a write to the file entry early on - that we if we 639 /* reserve a write to the file entry early on - that we if we
594 * run out of credits in the allocation path, we can still 640 * run out of credits in the allocation path, we can still
595 * update i_size. */ 641 * update i_size. */
596 status = ocfs2_journal_access_di(handle, inode, bh, 642 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
597 OCFS2_JOURNAL_ACCESS_WRITE); 643 OCFS2_JOURNAL_ACCESS_WRITE);
598 if (status < 0) { 644 if (status < 0) {
599 mlog_errno(status); 645 mlog_errno(status);
@@ -1131,7 +1177,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1131 goto out; 1177 goto out;
1132 } 1178 }
1133 1179
1134 ret = ocfs2_journal_access_di(handle, inode, bh, 1180 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1135 OCFS2_JOURNAL_ACCESS_WRITE); 1181 OCFS2_JOURNAL_ACCESS_WRITE);
1136 if (ret < 0) { 1182 if (ret < 0) {
1137 mlog_errno(ret); 1183 mlog_errno(ret);
@@ -1395,7 +1441,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1395 struct address_space *mapping = inode->i_mapping; 1441 struct address_space *mapping = inode->i_mapping;
1396 struct ocfs2_extent_tree et; 1442 struct ocfs2_extent_tree et;
1397 1443
1398 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 1444 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1399 ocfs2_init_dealloc_ctxt(&dealloc); 1445 ocfs2_init_dealloc_ctxt(&dealloc);
1400 1446
1401 if (byte_len == 0) 1447 if (byte_len == 0)
@@ -1657,6 +1703,70 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1657 OCFS2_IOC_RESVSP64, &sr, change_size); 1703 OCFS2_IOC_RESVSP64, &sr, change_size);
1658} 1704}
1659 1705
1706int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
1707 size_t count)
1708{
1709 int ret = 0;
1710 unsigned int extent_flags;
1711 u32 cpos, clusters, extent_len, phys_cpos;
1712 struct super_block *sb = inode->i_sb;
1713
1714 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
1715 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
1716 return 0;
1717
1718 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1719 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1720
1721 while (clusters) {
1722 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1723 &extent_flags);
1724 if (ret < 0) {
1725 mlog_errno(ret);
1726 goto out;
1727 }
1728
1729 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
1730 ret = 1;
1731 break;
1732 }
1733
1734 if (extent_len > clusters)
1735 extent_len = clusters;
1736
1737 clusters -= extent_len;
1738 cpos += extent_len;
1739 }
1740out:
1741 return ret;
1742}
1743
1744static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
1745 loff_t pos, size_t count,
1746 int *meta_level)
1747{
1748 int ret;
1749 struct buffer_head *di_bh = NULL;
1750 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1751 u32 clusters =
1752 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
1753
1754 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1755 if (ret) {
1756 mlog_errno(ret);
1757 goto out;
1758 }
1759
1760 *meta_level = 1;
1761
1762 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
1763 if (ret)
1764 mlog_errno(ret);
1765out:
1766 brelse(di_bh);
1767 return ret;
1768}
1769
1660static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1770static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1661 loff_t *ppos, 1771 loff_t *ppos,
1662 size_t count, 1772 size_t count,
@@ -1713,6 +1823,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1713 1823
1714 end = saved_pos + count; 1824 end = saved_pos + count;
1715 1825
1826 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
1827 if (ret == 1) {
1828 ocfs2_inode_unlock(inode, meta_level);
1829 meta_level = -1;
1830
1831 ret = ocfs2_prepare_inode_for_refcount(inode,
1832 saved_pos,
1833 count,
1834 &meta_level);
1835 }
1836
1837 if (ret < 0) {
1838 mlog_errno(ret);
1839 goto out_unlock;
1840 }
1841
1716 /* 1842 /*
1717 * Skip the O_DIRECT checks if we don't need 1843 * Skip the O_DIRECT checks if we don't need
1718 * them. 1844 * them.
@@ -1759,7 +1885,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1759 *ppos = saved_pos; 1885 *ppos = saved_pos;
1760 1886
1761out_unlock: 1887out_unlock:
1762 ocfs2_inode_unlock(inode, meta_level); 1888 if (meta_level >= 0)
1889 ocfs2_inode_unlock(inode, meta_level);
1763 1890
1764out: 1891out:
1765 return ret; 1892 return ret;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 172f9fbc9fc7..d66cf4f7c70e 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
69int ocfs2_change_file_space(struct file *file, unsigned int cmd, 69int ocfs2_change_file_space(struct file *file, unsigned int cmd,
70 struct ocfs2_space_resv *sr); 70 struct ocfs2_space_resv *sr);
71 71
72int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
73 size_t count);
72#endif /* OCFS2_FILE_H */ 74#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4dc8890ba316..0297fb8982b8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
53#include "sysfile.h" 53#include "sysfile.h"
54#include "uptodate.h" 54#include "uptodate.h"
55#include "xattr.h" 55#include "xattr.h"
56#include "refcounttree.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -562,7 +563,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
562 goto out; 563 goto out;
563 } 564 }
564 565
565 status = ocfs2_journal_access_di(handle, inode, fe_bh, 566 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
567 fe_bh,
566 OCFS2_JOURNAL_ACCESS_WRITE); 568 OCFS2_JOURNAL_ACCESS_WRITE);
567 if (status < 0) { 569 if (status < 0) {
568 mlog_errno(status); 570 mlog_errno(status);
@@ -646,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
646 } 648 }
647 649
648 /* set the inodes dtime */ 650 /* set the inodes dtime */
649 status = ocfs2_journal_access_di(handle, inode, di_bh, 651 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
650 OCFS2_JOURNAL_ACCESS_WRITE); 652 OCFS2_JOURNAL_ACCESS_WRITE);
651 if (status < 0) { 653 if (status < 0) {
652 mlog_errno(status); 654 mlog_errno(status);
@@ -662,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
662 goto bail_commit; 664 goto bail_commit;
663 } 665 }
664 666
665 ocfs2_remove_from_cache(inode, di_bh); 667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
666 vfs_dq_free_inode(inode); 668 vfs_dq_free_inode(inode);
667 669
668 status = ocfs2_free_dinode(handle, inode_alloc_inode, 670 status = ocfs2_free_dinode(handle, inode_alloc_inode,
@@ -781,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode,
781 goto bail_unlock_dir; 783 goto bail_unlock_dir;
782 } 784 }
783 785
786 status = ocfs2_remove_refcount_tree(inode, di_bh);
787 if (status < 0) {
788 mlog_errno(status);
789 goto bail_unlock_dir;
790 }
791
784 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 792 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
785 orphan_dir_bh); 793 orphan_dir_bh);
786 if (status < 0) 794 if (status < 0)
@@ -1112,13 +1120,14 @@ void ocfs2_clear_inode(struct inode *inode)
1112 ocfs2_lock_res_free(&oi->ip_inode_lockres); 1120 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1113 ocfs2_lock_res_free(&oi->ip_open_lockres); 1121 ocfs2_lock_res_free(&oi->ip_open_lockres);
1114 1122
1115 ocfs2_metadata_cache_purge(inode); 1123 ocfs2_metadata_cache_exit(INODE_CACHE(inode));
1116 1124
1117 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, 1125 mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
1118 "Clear inode of %llu, inode has %u cache items\n", 1126 "Clear inode of %llu, inode has %u cache items\n",
1119 (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); 1127 (unsigned long long)oi->ip_blkno,
1128 INODE_CACHE(inode)->ci_num_cached);
1120 1129
1121 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), 1130 mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
1122 "Clear inode of %llu, inode has a bad flag\n", 1131 "Clear inode of %llu, inode has a bad flag\n",
1123 (unsigned long long)oi->ip_blkno); 1132 (unsigned long long)oi->ip_blkno);
1124 1133
@@ -1145,9 +1154,7 @@ void ocfs2_clear_inode(struct inode *inode)
1145 (unsigned long long)oi->ip_blkno, oi->ip_open_count); 1154 (unsigned long long)oi->ip_blkno, oi->ip_open_count);
1146 1155
1147 /* Clear all other flags. */ 1156 /* Clear all other flags. */
1148 oi->ip_flags = OCFS2_INODE_CACHE_INLINE; 1157 oi->ip_flags = 0;
1149 oi->ip_created_trans = 0;
1150 oi->ip_last_trans = 0;
1151 oi->ip_dir_start_lookup = 0; 1158 oi->ip_dir_start_lookup = 0;
1152 oi->ip_blkno = 0ULL; 1159 oi->ip_blkno = 0ULL;
1153 1160
@@ -1239,7 +1246,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1239 mlog_entry("(inode %llu)\n", 1246 mlog_entry("(inode %llu)\n",
1240 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1247 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1241 1248
1242 status = ocfs2_journal_access_di(handle, inode, bh, 1249 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1243 OCFS2_JOURNAL_ACCESS_WRITE); 1250 OCFS2_JOURNAL_ACCESS_WRITE);
1244 if (status < 0) { 1251 if (status < 0) {
1245 mlog_errno(status); 1252 mlog_errno(status);
@@ -1380,8 +1387,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1380 int rc; 1387 int rc;
1381 struct buffer_head *tmp = *bh; 1388 struct buffer_head *tmp = *bh;
1382 1389
1383 rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, 1390 rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
1384 flags, ocfs2_validate_inode_block); 1391 1, &tmp, flags, ocfs2_validate_inode_block);
1385 1392
1386 /* If ocfs2_read_blocks() got us a new bh, pass it up. */ 1393 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1387 if (!rc && !*bh) 1394 if (!rc && !*bh)
@@ -1394,3 +1401,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
1394{ 1401{
1395 return ocfs2_read_inode_block_full(inode, bh, 0); 1402 return ocfs2_read_inode_block_full(inode, bh, 0);
1396} 1403}
1404
1405
1406static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
1407{
1408 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1409
1410 return oi->ip_blkno;
1411}
1412
1413static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
1414{
1415 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1416
1417 return oi->vfs_inode.i_sb;
1418}
1419
1420static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
1421{
1422 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1423
1424 spin_lock(&oi->ip_lock);
1425}
1426
1427static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
1428{
1429 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1430
1431 spin_unlock(&oi->ip_lock);
1432}
1433
1434static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
1435{
1436 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1437
1438 mutex_lock(&oi->ip_io_mutex);
1439}
1440
1441static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
1442{
1443 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1444
1445 mutex_unlock(&oi->ip_io_mutex);
1446}
1447
1448const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
1449 .co_owner = ocfs2_inode_cache_owner,
1450 .co_get_super = ocfs2_inode_cache_get_super,
1451 .co_cache_lock = ocfs2_inode_cache_lock,
1452 .co_cache_unlock = ocfs2_inode_cache_unlock,
1453 .co_io_lock = ocfs2_inode_cache_io_lock,
1454 .co_io_unlock = ocfs2_inode_cache_io_unlock,
1455};
1456
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ea71525aad41..ba4fe07b293c 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -60,12 +60,6 @@ struct ocfs2_inode_info
60 60
61 u32 ip_dir_start_lookup; 61 u32 ip_dir_start_lookup;
62 62
63 /* next two are protected by trans_inc_lock */
64 /* which transaction were we created on? Zero if none. */
65 unsigned long ip_created_trans;
66 /* last transaction we were a part of. */
67 unsigned long ip_last_trans;
68
69 struct ocfs2_caching_info ip_metadata_cache; 63 struct ocfs2_caching_info ip_metadata_cache;
70 64
71 struct ocfs2_extent_map ip_extent_map; 65 struct ocfs2_extent_map ip_extent_map;
@@ -106,8 +100,6 @@ struct ocfs2_inode_info
106#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 100#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
107/* Does someone have the file open O_DIRECT */ 101/* Does someone have the file open O_DIRECT */
108#define OCFS2_INODE_OPEN_DIRECT 0x00000040 102#define OCFS2_INODE_OPEN_DIRECT 0x00000040
109/* Indicates that the metadata cache should be used as an array. */
110#define OCFS2_INODE_CACHE_INLINE 0x00000080
111 103
112static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 104static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
113{ 105{
@@ -120,6 +112,12 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
120extern struct kmem_cache *ocfs2_inode_cache; 112extern struct kmem_cache *ocfs2_inode_cache;
121 113
122extern const struct address_space_operations ocfs2_aops; 114extern const struct address_space_operations ocfs2_aops;
115extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
116
117static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
118{
119 return &OCFS2_I(inode)->ip_metadata_cache;
120}
123 121
124void ocfs2_clear_inode(struct inode *inode); 122void ocfs2_clear_inode(struct inode *inode);
125void ocfs2_delete_inode(struct inode *inode); 123void ocfs2_delete_inode(struct inode *inode);
@@ -172,4 +170,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
172/* The same, but can be passed OCFS2_BH_* flags */ 170/* The same, but can be passed OCFS2_BH_* flags */
173int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, 171int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
174 int flags); 172 int flags);
173
174static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
175{
176 return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
177}
178
175#endif /* OCFS2_INODE_H */ 179#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 467b413bec21..31fbb0619510 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -21,6 +21,7 @@
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h"
24 25
25#include <linux/ext2_fs.h> 26#include <linux/ext2_fs.h>
26 27
@@ -115,6 +116,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
115 int status; 116 int status;
116 struct ocfs2_space_resv sr; 117 struct ocfs2_space_resv sr;
117 struct ocfs2_new_group_input input; 118 struct ocfs2_new_group_input input;
119 struct reflink_arguments args;
120 const char *old_path, *new_path;
121 bool preserve;
118 122
119 switch (cmd) { 123 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 124 case OCFS2_IOC_GETFLAGS:
@@ -160,6 +164,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
160 return -EFAULT; 164 return -EFAULT;
161 165
162 return ocfs2_group_add(inode, &input); 166 return ocfs2_group_add(inode, &input);
167 case OCFS2_IOC_REFLINK:
168 if (copy_from_user(&args, (struct reflink_arguments *)arg,
169 sizeof(args)))
170 return -EFAULT;
171 old_path = (const char *)(unsigned long)args.old_path;
172 new_path = (const char *)(unsigned long)args.new_path;
173 preserve = (args.preserve != 0);
174
175 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
163 default: 176 default:
164 return -ENOTTY; 177 return -ENOTTY;
165 } 178 }
@@ -182,6 +195,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
182 case OCFS2_IOC_GROUP_EXTEND: 195 case OCFS2_IOC_GROUP_EXTEND:
183 case OCFS2_IOC_GROUP_ADD: 196 case OCFS2_IOC_GROUP_ADD:
184 case OCFS2_IOC_GROUP_ADD64: 197 case OCFS2_IOC_GROUP_ADD64:
198 case OCFS2_IOC_REFLINK:
185 break; 199 break;
186 default: 200 default:
187 return -ENOIOCTLCMD; 201 return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c48b93ac6b65..54c16b66327e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -48,6 +48,7 @@
48#include "slot_map.h" 48#include "slot_map.h"
49#include "super.h" 49#include "super.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h"
51#include "quota.h" 52#include "quota.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
@@ -554,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = {
554 .ot_offset = offsetof(struct ocfs2_extent_block, h_check), 555 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
555}; 556};
556 557
558static struct ocfs2_triggers rb_triggers = {
559 .ot_triggers = {
560 .t_commit = ocfs2_commit_trigger,
561 .t_abort = ocfs2_abort_trigger,
562 },
563 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
564};
565
557static struct ocfs2_triggers gd_triggers = { 566static struct ocfs2_triggers gd_triggers = {
558 .ot_triggers = { 567 .ot_triggers = {
559 .t_commit = ocfs2_commit_trigger, 568 .t_commit = ocfs2_commit_trigger,
@@ -601,14 +610,16 @@ static struct ocfs2_triggers dl_triggers = {
601}; 610};
602 611
603static int __ocfs2_journal_access(handle_t *handle, 612static int __ocfs2_journal_access(handle_t *handle,
604 struct inode *inode, 613 struct ocfs2_caching_info *ci,
605 struct buffer_head *bh, 614 struct buffer_head *bh,
606 struct ocfs2_triggers *triggers, 615 struct ocfs2_triggers *triggers,
607 int type) 616 int type)
608{ 617{
609 int status; 618 int status;
619 struct ocfs2_super *osb =
620 OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
610 621
611 BUG_ON(!inode); 622 BUG_ON(!ci || !ci->ci_ops);
612 BUG_ON(!handle); 623 BUG_ON(!handle);
613 BUG_ON(!bh); 624 BUG_ON(!bh);
614 625
@@ -627,15 +638,15 @@ static int __ocfs2_journal_access(handle_t *handle,
627 BUG(); 638 BUG();
628 } 639 }
629 640
630 /* Set the current transaction information on the inode so 641 /* Set the current transaction information on the ci so
631 * that the locking code knows whether it can drop it's locks 642 * that the locking code knows whether it can drop it's locks
632 * on this inode or not. We're protected from the commit 643 * on this ci or not. We're protected from the commit
633 * thread updating the current transaction id until 644 * thread updating the current transaction id until
634 * ocfs2_commit_trans() because ocfs2_start_trans() took 645 * ocfs2_commit_trans() because ocfs2_start_trans() took
635 * j_trans_barrier for us. */ 646 * j_trans_barrier for us. */
636 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); 647 ocfs2_set_ci_lock_trans(osb->journal, ci);
637 648
638 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 649 ocfs2_metadata_cache_io_lock(ci);
639 switch (type) { 650 switch (type) {
640 case OCFS2_JOURNAL_ACCESS_CREATE: 651 case OCFS2_JOURNAL_ACCESS_CREATE:
641 case OCFS2_JOURNAL_ACCESS_WRITE: 652 case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -650,9 +661,9 @@ static int __ocfs2_journal_access(handle_t *handle,
650 status = -EINVAL; 661 status = -EINVAL;
651 mlog(ML_ERROR, "Uknown access type!\n"); 662 mlog(ML_ERROR, "Uknown access type!\n");
652 } 663 }
653 if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) 664 if (!status && ocfs2_meta_ecc(osb) && triggers)
654 jbd2_journal_set_triggers(bh, &triggers->ot_triggers); 665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
655 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 666 ocfs2_metadata_cache_io_unlock(ci);
656 667
657 if (status < 0) 668 if (status < 0)
658 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 669 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -662,66 +673,65 @@ static int __ocfs2_journal_access(handle_t *handle,
662 return status; 673 return status;
663} 674}
664 675
665int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, 676int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
666 struct buffer_head *bh, int type) 677 struct buffer_head *bh, int type)
667{ 678{
668 return __ocfs2_journal_access(handle, inode, bh, &di_triggers, 679 return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
669 type);
670} 680}
671 681
672int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, 682int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
673 struct buffer_head *bh, int type) 683 struct buffer_head *bh, int type)
674{ 684{
675 return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, 685 return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
676 type);
677} 686}
678 687
679int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, 688int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
680 struct buffer_head *bh, int type) 689 struct buffer_head *bh, int type)
681{ 690{
682 return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, 691 return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
683 type); 692 type);
684} 693}
685 694
686int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 695int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
687 struct buffer_head *bh, int type) 696 struct buffer_head *bh, int type)
688{ 697{
689 return __ocfs2_journal_access(handle, inode, bh, &db_triggers, 698 return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
690 type);
691} 699}
692 700
693int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, 701int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
694 struct buffer_head *bh, int type) 702 struct buffer_head *bh, int type)
695{ 703{
696 return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, 704 return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
697 type);
698} 705}
699 706
700int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, 707int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
701 struct buffer_head *bh, int type) 708 struct buffer_head *bh, int type)
702{ 709{
703 return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, 710 return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
704 type);
705} 711}
706 712
707int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, 713int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
708 struct buffer_head *bh, int type) 714 struct buffer_head *bh, int type)
709{ 715{
710 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers, 716 return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
711 type);
712} 717}
713 718
714int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, 719int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
715 struct buffer_head *bh, int type) 720 struct buffer_head *bh, int type)
716{ 721{
717 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers, 722 return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
718 type); 723}
724
725int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
726 struct buffer_head *bh, int type)
727{
728 return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
719} 729}
720 730
721int ocfs2_journal_access(handle_t *handle, struct inode *inode, 731int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
722 struct buffer_head *bh, int type) 732 struct buffer_head *bh, int type)
723{ 733{
724 return __ocfs2_journal_access(handle, inode, bh, NULL, type); 734 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
725} 735}
726 736
727int ocfs2_journal_dirty(handle_t *handle, 737int ocfs2_journal_dirty(handle_t *handle,
@@ -898,7 +908,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
898 ocfs2_bump_recovery_generation(fe); 908 ocfs2_bump_recovery_generation(fe);
899 909
900 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); 910 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
901 status = ocfs2_write_block(osb, bh, journal->j_inode); 911 status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
902 if (status < 0) 912 if (status < 0)
903 mlog_errno(status); 913 mlog_errno(status);
904 914
@@ -1642,7 +1652,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1642 ocfs2_get_recovery_generation(fe); 1652 ocfs2_get_recovery_generation(fe);
1643 1653
1644 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); 1654 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
1645 status = ocfs2_write_block(osb, bh, inode); 1655 status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
1646 if (status < 0) 1656 if (status < 0)
1647 mlog_errno(status); 1657 mlog_errno(status);
1648 1658
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2c3222aec622..3f74e09b0d80 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -90,56 +90,66 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
90 return old_id; 90 return old_id;
91} 91}
92 92
93static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, 93static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
94 struct inode *inode) 94 struct ocfs2_caching_info *ci)
95{ 95{
96 spin_lock(&trans_inc_lock); 96 spin_lock(&trans_inc_lock);
97 OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; 97 ci->ci_last_trans = journal->j_trans_id;
98 spin_unlock(&trans_inc_lock); 98 spin_unlock(&trans_inc_lock);
99} 99}
100 100
101/* Used to figure out whether it's safe to drop a metadata lock on an 101/* Used to figure out whether it's safe to drop a metadata lock on an
102 * inode. Returns true if all the inodes changes have been 102 * cached object. Returns true if all the object's changes have been
103 * checkpointed to disk. You should be holding the spinlock on the 103 * checkpointed to disk. You should be holding the spinlock on the
104 * metadata lock while calling this to be sure that nobody can take 104 * metadata lock while calling this to be sure that nobody can take
105 * the lock and put it on another transaction. */ 105 * the lock and put it on another transaction. */
106static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) 106static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
107{ 107{
108 int ret; 108 int ret;
109 struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; 109 struct ocfs2_journal *journal =
110 OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
110 111
111 spin_lock(&trans_inc_lock); 112 spin_lock(&trans_inc_lock);
112 ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); 113 ret = time_after(journal->j_trans_id, ci->ci_last_trans);
113 spin_unlock(&trans_inc_lock); 114 spin_unlock(&trans_inc_lock);
114 return ret; 115 return ret;
115} 116}
116 117
117/* convenience function to check if an inode is still new (has never 118/* convenience function to check if an object backed by struct
118 * hit disk) Will do you a favor and set created_trans = 0 when you've 119 * ocfs2_caching_info is still new (has never hit disk) Will do you a
119 * been checkpointed. returns '1' if the inode is still new. */ 120 * favor and set created_trans = 0 when you've
120static inline int ocfs2_inode_is_new(struct inode *inode) 121 * been checkpointed. returns '1' if the ci is still new. */
122static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
121{ 123{
122 int ret; 124 int ret;
125 struct ocfs2_journal *journal =
126 OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
123 127
128 spin_lock(&trans_inc_lock);
129 ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
130 if (!ret)
131 ci->ci_created_trans = 0;
132 spin_unlock(&trans_inc_lock);
133 return ret;
134}
135
136/* Wrapper for inodes so we can check system files */
137static inline int ocfs2_inode_is_new(struct inode *inode)
138{
124 /* System files are never "new" as they're written out by 139 /* System files are never "new" as they're written out by
125 * mkfs. This helps us early during mount, before we have the 140 * mkfs. This helps us early during mount, before we have the
126 * journal open and j_trans_id could be junk. */ 141 * journal open and j_trans_id could be junk. */
127 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
128 return 0; 143 return 0;
129 spin_lock(&trans_inc_lock); 144
130 ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, 145 return ocfs2_ci_is_new(INODE_CACHE(inode));
131 OCFS2_I(inode)->ip_created_trans));
132 if (!ret)
133 OCFS2_I(inode)->ip_created_trans = 0;
134 spin_unlock(&trans_inc_lock);
135 return ret;
136} 146}
137 147
138static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, 148static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
139 struct inode *inode) 149 struct ocfs2_caching_info *ci)
140{ 150{
141 spin_lock(&trans_inc_lock); 151 spin_lock(&trans_inc_lock);
142 OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; 152 ci->ci_created_trans = osb->journal->j_trans_id;
143 spin_unlock(&trans_inc_lock); 153 spin_unlock(&trans_inc_lock);
144} 154}
145 155
@@ -200,7 +210,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
200 if (ocfs2_mount_local(osb)) 210 if (ocfs2_mount_local(osb))
201 return; 211 return;
202 212
203 if (!ocfs2_inode_fully_checkpointed(inode)) { 213 if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
204 /* WARNING: This only kicks off a single 214 /* WARNING: This only kicks off a single
205 * checkpoint. If someone races you and adds more 215 * checkpoint. If someone races you and adds more
206 * metadata to the journal, you won't know, and will 216 * metadata to the journal, you won't know, and will
@@ -210,7 +220,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
210 ocfs2_start_checkpoint(osb); 220 ocfs2_start_checkpoint(osb);
211 221
212 wait_event(osb->journal->j_checkpointed, 222 wait_event(osb->journal->j_checkpointed,
213 ocfs2_inode_fully_checkpointed(inode)); 223 ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
214 } 224 }
215} 225}
216 226
@@ -266,31 +276,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
266 276
267 277
268/* ocfs2_inode */ 278/* ocfs2_inode */
269int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, 279int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
270 struct buffer_head *bh, int type); 280 struct buffer_head *bh, int type);
271/* ocfs2_extent_block */ 281/* ocfs2_extent_block */
272int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, 282int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
283 struct buffer_head *bh, int type);
284/* ocfs2_refcount_block */
285int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
273 struct buffer_head *bh, int type); 286 struct buffer_head *bh, int type);
274/* ocfs2_group_desc */ 287/* ocfs2_group_desc */
275int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, 288int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
276 struct buffer_head *bh, int type); 289 struct buffer_head *bh, int type);
277/* ocfs2_xattr_block */ 290/* ocfs2_xattr_block */
278int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, 291int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
279 struct buffer_head *bh, int type); 292 struct buffer_head *bh, int type);
280/* quota blocks */ 293/* quota blocks */
281int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, 294int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
282 struct buffer_head *bh, int type); 295 struct buffer_head *bh, int type);
283/* dirblock */ 296/* dirblock */
284int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 297int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
285 struct buffer_head *bh, int type); 298 struct buffer_head *bh, int type);
286/* ocfs2_dx_root_block */ 299/* ocfs2_dx_root_block */
287int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, 300int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
288 struct buffer_head *bh, int type); 301 struct buffer_head *bh, int type);
289/* ocfs2_dx_leaf */ 302/* ocfs2_dx_leaf */
290int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, 303int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
291 struct buffer_head *bh, int type); 304 struct buffer_head *bh, int type);
292/* Anything that has no ecc */ 305/* Anything that has no ecc */
293int ocfs2_journal_access(handle_t *handle, struct inode *inode, 306int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
294 struct buffer_head *bh, int type); 307 struct buffer_head *bh, int type);
295 308
296/* 309/*
@@ -477,6 +490,23 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
477 return credits; 490 return credits;
478} 491}
479 492
493/* inode update, new refcount block and its allocation credits. */
494#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
495 + OCFS2_SUBALLOC_ALLOC)
496
497/* inode and the refcount block update. */
498#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
499
500/*
501 * inode and the refcount block update.
502 * It doesn't include the credits for sub alloc change.
503 * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
504 */
505#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
506
507/* 2 metadata alloc, 2 new blocks and root refcount block */
508#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
509
480/* 510/*
481 * Please note that the caller must make sure that root_el is the root 511 * Please note that the caller must make sure that root_el is the root
482 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 512 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index bac7e6abaf47..ac10f83edb95 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -297,8 +297,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
297 } 297 }
298 memcpy(alloc_copy, alloc, bh->b_size); 298 memcpy(alloc_copy, alloc, bh->b_size);
299 299
300 status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, 300 status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
301 OCFS2_JOURNAL_ACCESS_WRITE); 301 bh, OCFS2_JOURNAL_ACCESS_WRITE);
302 if (status < 0) { 302 if (status < 0) {
303 mlog_errno(status); 303 mlog_errno(status);
304 goto out_commit; 304 goto out_commit;
@@ -392,7 +392,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
392 ocfs2_clear_local_alloc(alloc); 392 ocfs2_clear_local_alloc(alloc);
393 393
394 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); 394 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
395 status = ocfs2_write_block(osb, alloc_bh, inode); 395 status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
396 if (status < 0) 396 if (status < 0)
397 mlog_errno(status); 397 mlog_errno(status);
398 398
@@ -678,7 +678,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
678 * delete bits from it! */ 678 * delete bits from it! */
679 *num_bits = bits_wanted; 679 *num_bits = bits_wanted;
680 680
681 status = ocfs2_journal_access_di(handle, local_alloc_inode, 681 status = ocfs2_journal_access_di(handle,
682 INODE_CACHE(local_alloc_inode),
682 osb->local_alloc_bh, 683 osb->local_alloc_bh,
683 OCFS2_JOURNAL_ACCESS_WRITE); 684 OCFS2_JOURNAL_ACCESS_WRITE);
684 if (status < 0) { 685 if (status < 0) {
@@ -1156,7 +1157,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1156 } 1157 }
1157 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); 1158 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1158 1159
1159 status = ocfs2_journal_access_di(handle, local_alloc_inode, 1160 status = ocfs2_journal_access_di(handle,
1161 INODE_CACHE(local_alloc_inode),
1160 osb->local_alloc_bh, 1162 osb->local_alloc_bh,
1161 OCFS2_JOURNAL_ACCESS_WRITE); 1163 OCFS2_JOURNAL_ACCESS_WRITE);
1162 if (status < 0) { 1164 if (status < 0) {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b606496b72ec..39737613424a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -202,7 +202,7 @@ out:
202 return ret; 202 return ret;
203} 203}
204 204
205static struct vm_operations_struct ocfs2_file_vm_ops = { 205static const struct vm_operations_struct ocfs2_file_vm_ops = {
206 .fault = ocfs2_fault, 206 .fault = ocfs2_fault,
207 .page_mkwrite = ocfs2_page_mkwrite, 207 .page_mkwrite = ocfs2_page_mkwrite,
208}; 208};
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8601f934010b..f010b22b1c44 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -69,7 +69,6 @@
69static int ocfs2_mknod_locked(struct ocfs2_super *osb, 69static int ocfs2_mknod_locked(struct ocfs2_super *osb,
70 struct inode *dir, 70 struct inode *dir,
71 struct inode *inode, 71 struct inode *inode,
72 struct dentry *dentry,
73 dev_t dev, 72 dev_t dev,
74 struct buffer_head **new_fe_bh, 73 struct buffer_head **new_fe_bh,
75 struct buffer_head *parent_fe_bh, 74 struct buffer_head *parent_fe_bh,
@@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
78 77
79static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 78static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
80 struct inode **ret_orphan_dir, 79 struct inode **ret_orphan_dir,
81 struct inode *inode, 80 u64 blkno,
82 char *name, 81 char *name,
83 struct ocfs2_dir_lookup_result *lookup); 82 struct ocfs2_dir_lookup_result *lookup);
84 83
@@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir,
358 } 357 }
359 did_quota_inode = 1; 358 did_quota_inode = 1;
360 359
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
361 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
362 dentry->d_name.name);
363
361 /* do the real work now. */ 364 /* do the real work now. */
362 status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, 365 status = ocfs2_mknod_locked(osb, dir, inode, dev,
363 &new_fe_bh, parent_fe_bh, handle, 366 &new_fe_bh, parent_fe_bh, handle,
364 inode_ac); 367 inode_ac);
365 if (status < 0) { 368 if (status < 0) {
@@ -375,7 +378,8 @@ static int ocfs2_mknod(struct inode *dir,
375 goto leave; 378 goto leave;
376 } 379 }
377 380
378 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, 381 status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
382 parent_fe_bh,
379 OCFS2_JOURNAL_ACCESS_WRITE); 383 OCFS2_JOURNAL_ACCESS_WRITE);
380 if (status < 0) { 384 if (status < 0) {
381 mlog_errno(status); 385 mlog_errno(status);
@@ -465,7 +469,6 @@ leave:
465static int ocfs2_mknod_locked(struct ocfs2_super *osb, 469static int ocfs2_mknod_locked(struct ocfs2_super *osb,
466 struct inode *dir, 470 struct inode *dir,
467 struct inode *inode, 471 struct inode *inode,
468 struct dentry *dentry,
469 dev_t dev, 472 dev_t dev,
470 struct buffer_head **new_fe_bh, 473 struct buffer_head **new_fe_bh,
471 struct buffer_head *parent_fe_bh, 474 struct buffer_head *parent_fe_bh,
@@ -479,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
479 u16 suballoc_bit; 482 u16 suballoc_bit;
480 u16 feat; 483 u16 feat;
481 484
482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
484 dentry->d_name.name);
485
486 *new_fe_bh = NULL; 485 *new_fe_bh = NULL;
487 486
488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 487 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
@@ -507,9 +506,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
507 mlog_errno(status); 506 mlog_errno(status);
508 goto leave; 507 goto leave;
509 } 508 }
510 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 509 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
511 510
512 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, 511 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
512 *new_fe_bh,
513 OCFS2_JOURNAL_ACCESS_CREATE); 513 OCFS2_JOURNAL_ACCESS_CREATE);
514 if (status < 0) { 514 if (status < 0) {
515 mlog_errno(status); 515 mlog_errno(status);
@@ -565,7 +565,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
565 } 565 }
566 566
567 ocfs2_populate_inode(inode, fe, 1); 567 ocfs2_populate_inode(inode, fe, 1);
568 ocfs2_inode_set_new(osb, inode); 568 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
569 if (!ocfs2_mount_local(osb)) { 569 if (!ocfs2_mount_local(osb)) {
570 status = ocfs2_create_new_inode_locks(inode); 570 status = ocfs2_create_new_inode_locks(inode);
571 if (status < 0) 571 if (status < 0)
@@ -682,7 +682,7 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 682 goto out_unlock_inode;
683 } 683 }
684 684
685 err = ocfs2_journal_access_di(handle, inode, fe_bh, 685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 686 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 687 if (err < 0) {
688 mlog_errno(err); 688 mlog_errno(err);
@@ -850,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir,
850 } 850 }
851 851
852 if (inode_is_unlinkable(inode)) { 852 if (inode_is_unlinkable(inode)) {
853 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 853 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
854 OCFS2_I(inode)->ip_blkno,
854 orphan_name, &orphan_insert); 855 orphan_name, &orphan_insert);
855 if (status < 0) { 856 if (status < 0) {
856 mlog_errno(status); 857 mlog_errno(status);
@@ -866,7 +867,7 @@ static int ocfs2_unlink(struct inode *dir,
866 goto leave; 867 goto leave;
867 } 868 }
868 869
869 status = ocfs2_journal_access_di(handle, inode, fe_bh, 870 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
870 OCFS2_JOURNAL_ACCESS_WRITE); 871 OCFS2_JOURNAL_ACCESS_WRITE);
871 if (status < 0) { 872 if (status < 0) {
872 mlog_errno(status); 873 mlog_errno(status);
@@ -1241,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir,
1241 1242
1242 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1243 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1243 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1244 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1244 new_inode, 1245 OCFS2_I(new_inode)->ip_blkno,
1245 orphan_name, 1246 orphan_name, &orphan_insert);
1246 &orphan_insert);
1247 if (status < 0) { 1247 if (status < 0) {
1248 mlog_errno(status); 1248 mlog_errno(status);
1249 goto bail; 1249 goto bail;
@@ -1284,7 +1284,8 @@ static int ocfs2_rename(struct inode *old_dir,
1284 goto bail; 1284 goto bail;
1285 } 1285 }
1286 } 1286 }
1287 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, 1287 status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
1288 newfe_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE); 1289 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) { 1290 if (status < 0) {
1290 mlog_errno(status); 1291 mlog_errno(status);
@@ -1331,7 +1332,8 @@ static int ocfs2_rename(struct inode *old_dir,
1331 old_inode->i_ctime = CURRENT_TIME; 1332 old_inode->i_ctime = CURRENT_TIME;
1332 mark_inode_dirty(old_inode); 1333 mark_inode_dirty(old_inode);
1333 1334
1334 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, 1335 status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
1336 old_inode_bh,
1335 OCFS2_JOURNAL_ACCESS_WRITE); 1337 OCFS2_JOURNAL_ACCESS_WRITE);
1336 if (status >= 0) { 1338 if (status >= 0) {
1337 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1339 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1407,9 +1409,10 @@ static int ocfs2_rename(struct inode *old_dir,
1407 (int)old_dir_nlink, old_dir->i_nlink); 1409 (int)old_dir_nlink, old_dir->i_nlink);
1408 } else { 1410 } else {
1409 struct ocfs2_dinode *fe; 1411 struct ocfs2_dinode *fe;
1410 status = ocfs2_journal_access_di(handle, old_dir, 1412 status = ocfs2_journal_access_di(handle,
1411 old_dir_bh, 1413 INODE_CACHE(old_dir),
1412 OCFS2_JOURNAL_ACCESS_WRITE); 1414 old_dir_bh,
1415 OCFS2_JOURNAL_ACCESS_WRITE);
1413 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1416 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1414 ocfs2_set_links_count(fe, old_dir->i_nlink); 1417 ocfs2_set_links_count(fe, old_dir->i_nlink);
1415 status = ocfs2_journal_dirty(handle, old_dir_bh); 1418 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1527,9 +1530,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1527 mlog_errno(status); 1530 mlog_errno(status);
1528 goto bail; 1531 goto bail;
1529 } 1532 }
1530 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); 1533 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
1534 bhs[virtual]);
1531 1535
1532 status = ocfs2_journal_access(handle, inode, bhs[virtual], 1536 status = ocfs2_journal_access(handle, INODE_CACHE(inode),
1537 bhs[virtual],
1533 OCFS2_JOURNAL_ACCESS_CREATE); 1538 OCFS2_JOURNAL_ACCESS_CREATE);
1534 if (status < 0) { 1539 if (status < 0) {
1535 mlog_errno(status); 1540 mlog_errno(status);
@@ -1692,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir,
1692 } 1697 }
1693 did_quota_inode = 1; 1698 did_quota_inode = 1;
1694 1699
1695 status = ocfs2_mknod_locked(osb, dir, inode, dentry, 1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
1701 inode->i_mode, dentry->d_name.len,
1702 dentry->d_name.name);
1703
1704 status = ocfs2_mknod_locked(osb, dir, inode,
1696 0, &new_fe_bh, parent_fe_bh, handle, 1705 0, &new_fe_bh, parent_fe_bh, handle,
1697 inode_ac); 1706 inode_ac);
1698 if (status < 0) { 1707 if (status < 0) {
@@ -1842,7 +1851,7 @@ bail:
1842 1851
1843static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1852static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1844 struct inode **ret_orphan_dir, 1853 struct inode **ret_orphan_dir,
1845 struct inode *inode, 1854 u64 blkno,
1846 char *name, 1855 char *name,
1847 struct ocfs2_dir_lookup_result *lookup) 1856 struct ocfs2_dir_lookup_result *lookup)
1848{ 1857{
@@ -1850,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1850 struct buffer_head *orphan_dir_bh = NULL; 1859 struct buffer_head *orphan_dir_bh = NULL;
1851 int status = 0; 1860 int status = 0;
1852 1861
1853 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 1862 status = ocfs2_blkno_stringify(blkno, name);
1854 if (status < 0) { 1863 if (status < 0) {
1855 mlog_errno(status); 1864 mlog_errno(status);
1856 return status; 1865 return status;
@@ -1917,7 +1926,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 goto leave; 1926 goto leave;
1918 } 1927 }
1919 1928
1920 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, 1929 status = ocfs2_journal_access_di(handle,
1930 INODE_CACHE(orphan_dir_inode),
1931 orphan_dir_bh,
1921 OCFS2_JOURNAL_ACCESS_WRITE); 1932 OCFS2_JOURNAL_ACCESS_WRITE);
1922 if (status < 0) { 1933 if (status < 0) {
1923 mlog_errno(status); 1934 mlog_errno(status);
@@ -2002,7 +2013,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2002 goto leave; 2013 goto leave;
2003 } 2014 }
2004 2015
2005 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, 2016 status = ocfs2_journal_access_di(handle,
2017 INODE_CACHE(orphan_dir_inode),
2018 orphan_dir_bh,
2006 OCFS2_JOURNAL_ACCESS_WRITE); 2019 OCFS2_JOURNAL_ACCESS_WRITE);
2007 if (status < 0) { 2020 if (status < 0) {
2008 mlog_errno(status); 2021 mlog_errno(status);
@@ -2028,6 +2041,274 @@ leave:
2028 return status; 2041 return status;
2029} 2042}
2030 2043
2044int ocfs2_create_inode_in_orphan(struct inode *dir,
2045 int mode,
2046 struct inode **new_inode)
2047{
2048 int status, did_quota_inode = 0;
2049 struct inode *inode = NULL;
2050 struct inode *orphan_dir = NULL;
2051 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2052 struct ocfs2_dinode *di = NULL;
2053 handle_t *handle = NULL;
2054 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
2055 struct buffer_head *parent_di_bh = NULL;
2056 struct buffer_head *new_di_bh = NULL;
2057 struct ocfs2_alloc_context *inode_ac = NULL;
2058 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2059
2060 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2061 if (status < 0) {
2062 if (status != -ENOENT)
2063 mlog_errno(status);
2064 return status;
2065 }
2066
2067 /*
2068 * We give the orphan dir the root blkno to fake an orphan name,
2069 * and allocate enough space for our insertion.
2070 */
2071 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2072 osb->root_blkno,
2073 orphan_name, &orphan_insert);
2074 if (status < 0) {
2075 mlog_errno(status);
2076 goto leave;
2077 }
2078
2079 /* reserve an inode spot */
2080 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2081 if (status < 0) {
2082 if (status != -ENOSPC)
2083 mlog_errno(status);
2084 goto leave;
2085 }
2086
2087 inode = ocfs2_get_init_inode(dir, mode);
2088 if (!inode) {
2089 status = -ENOMEM;
2090 mlog_errno(status);
2091 goto leave;
2092 }
2093
2094 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
2095 if (IS_ERR(handle)) {
2096 status = PTR_ERR(handle);
2097 handle = NULL;
2098 mlog_errno(status);
2099 goto leave;
2100 }
2101
2102 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
2103 * to be called. */
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave;
2108 }
2109 did_quota_inode = 1;
2110
2111 /* do the real work now. */
2112 status = ocfs2_mknod_locked(osb, dir, inode,
2113 0, &new_di_bh, parent_di_bh, handle,
2114 inode_ac);
2115 if (status < 0) {
2116 mlog_errno(status);
2117 goto leave;
2118 }
2119
2120 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
2121 if (status < 0) {
2122 mlog_errno(status);
2123 goto leave;
2124 }
2125
2126 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2127 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
2128 &orphan_insert, orphan_dir);
2129 if (status < 0) {
2130 mlog_errno(status);
2131 goto leave;
2132 }
2133
2134 /* get open lock so that only nodes can't remove it from orphan dir. */
2135 status = ocfs2_open_lock(inode);
2136 if (status < 0)
2137 mlog_errno(status);
2138
2139leave:
2140 if (status < 0 && did_quota_inode)
2141 vfs_dq_free_inode(inode);
2142 if (handle)
2143 ocfs2_commit_trans(osb, handle);
2144
2145 if (orphan_dir) {
2146 /* This was locked for us in ocfs2_prepare_orphan_dir() */
2147 ocfs2_inode_unlock(orphan_dir, 1);
2148 mutex_unlock(&orphan_dir->i_mutex);
2149 iput(orphan_dir);
2150 }
2151
2152 if (status == -ENOSPC)
2153 mlog(0, "Disk is full\n");
2154
2155 if ((status < 0) && inode) {
2156 clear_nlink(inode);
2157 iput(inode);
2158 }
2159
2160 if (inode_ac)
2161 ocfs2_free_alloc_context(inode_ac);
2162
2163 brelse(new_di_bh);
2164
2165 if (!status)
2166 *new_inode = inode;
2167
2168 ocfs2_free_dir_lookup_result(&orphan_insert);
2169
2170 ocfs2_inode_unlock(dir, 1);
2171 brelse(parent_di_bh);
2172 return status;
2173}
2174
2175int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2176 struct inode *inode,
2177 struct dentry *dentry)
2178{
2179 int status = 0;
2180 struct buffer_head *parent_di_bh = NULL;
2181 handle_t *handle = NULL;
2182 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2183 struct ocfs2_dinode *dir_di, *di;
2184 struct inode *orphan_dir_inode = NULL;
2185 struct buffer_head *orphan_dir_bh = NULL;
2186 struct buffer_head *di_bh = NULL;
2187 struct ocfs2_dir_lookup_result lookup = { NULL, };
2188
2189 mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
2190 dentry->d_name.len, dentry->d_name.name);
2191
2192 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2193 if (status < 0) {
2194 if (status != -ENOENT)
2195 mlog_errno(status);
2196 return status;
2197 }
2198
2199 dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
2200 if (!dir_di->i_links_count) {
2201 /* can't make a file in a deleted directory. */
2202 status = -ENOENT;
2203 goto leave;
2204 }
2205
2206 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
2207 dentry->d_name.len);
2208 if (status)
2209 goto leave;
2210
2211 /* get a spot inside the dir. */
2212 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
2213 dentry->d_name.name,
2214 dentry->d_name.len, &lookup);
2215 if (status < 0) {
2216 mlog_errno(status);
2217 goto leave;
2218 }
2219
2220 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2221 ORPHAN_DIR_SYSTEM_INODE,
2222 osb->slot_num);
2223 if (!orphan_dir_inode) {
2224 status = -EEXIST;
2225 mlog_errno(status);
2226 goto leave;
2227 }
2228
2229 mutex_lock(&orphan_dir_inode->i_mutex);
2230
2231 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2232 if (status < 0) {
2233 mlog_errno(status);
2234 mutex_unlock(&orphan_dir_inode->i_mutex);
2235 iput(orphan_dir_inode);
2236 goto leave;
2237 }
2238
2239 status = ocfs2_read_inode_block(inode, &di_bh);
2240 if (status < 0) {
2241 mlog_errno(status);
2242 goto orphan_unlock;
2243 }
2244
2245 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
2246 if (IS_ERR(handle)) {
2247 status = PTR_ERR(handle);
2248 handle = NULL;
2249 mlog_errno(status);
2250 goto orphan_unlock;
2251 }
2252
2253 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2254 di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2255 if (status < 0) {
2256 mlog_errno(status);
2257 goto out_commit;
2258 }
2259
2260 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2261 orphan_dir_bh);
2262 if (status < 0) {
2263 mlog_errno(status);
2264 goto out_commit;
2265 }
2266
2267 di = (struct ocfs2_dinode *)di_bh->b_data;
2268 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2269 di->i_orphaned_slot = 0;
2270 ocfs2_journal_dirty(handle, di_bh);
2271
2272 status = ocfs2_add_entry(handle, dentry, inode,
2273 OCFS2_I(inode)->ip_blkno, parent_di_bh,
2274 &lookup);
2275 if (status < 0) {
2276 mlog_errno(status);
2277 goto out_commit;
2278 }
2279
2280 status = ocfs2_dentry_attach_lock(dentry, inode,
2281 OCFS2_I(dir)->ip_blkno);
2282 if (status) {
2283 mlog_errno(status);
2284 goto out_commit;
2285 }
2286
2287 insert_inode_hash(inode);
2288 dentry->d_op = &ocfs2_dentry_ops;
2289 d_instantiate(dentry, inode);
2290 status = 0;
2291out_commit:
2292 ocfs2_commit_trans(osb, handle);
2293orphan_unlock:
2294 ocfs2_inode_unlock(orphan_dir_inode, 1);
2295 mutex_unlock(&orphan_dir_inode->i_mutex);
2296 iput(orphan_dir_inode);
2297leave:
2298
2299 ocfs2_inode_unlock(dir, 1);
2300
2301 brelse(di_bh);
2302 brelse(parent_di_bh);
2303 brelse(orphan_dir_bh);
2304
2305 ocfs2_free_dir_lookup_result(&lookup);
2306
2307 mlog_exit(status);
2308
2309 return status;
2310}
2311
2031const struct inode_operations ocfs2_dir_iops = { 2312const struct inode_operations ocfs2_dir_iops = {
2032 .create = ocfs2_create, 2313 .create = ocfs2_create,
2033 .lookup = ocfs2_lookup, 2314 .lookup = ocfs2_lookup,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef64c879..e5d059d4f115 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh);
38int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode,
40 struct inode **new_inode);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode,
43 struct dentry *new_dentry);
38 44
39#endif /* OCFS2_NAMEI_H */ 45#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 39e1d5a39505..eae404602424 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -51,20 +51,51 @@
51/* For struct ocfs2_blockcheck_stats */ 51/* For struct ocfs2_blockcheck_stats */
52#include "blockcheck.h" 52#include "blockcheck.h"
53 53
54
55/* Caching of metadata buffers */
56
54/* Most user visible OCFS2 inodes will have very few pieces of 57/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 58 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 59 * into account when designing an access scheme. We allow a small
57 * amount of inlined blocks to be stored on an array and grow the 60 * amount of inlined blocks to be stored on an array and grow the
58 * structure into a rb tree when necessary. */ 61 * structure into a rb tree when necessary. */
59#define OCFS2_INODE_MAX_CACHE_ARRAY 2 62#define OCFS2_CACHE_INFO_MAX_ARRAY 2
63
64/* Flags for ocfs2_caching_info */
65
66enum ocfs2_caching_info_flags {
67 /* Indicates that the metadata cache is using the inline array */
68 OCFS2_CACHE_FL_INLINE = 1<<1,
69};
60 70
71struct ocfs2_caching_operations;
61struct ocfs2_caching_info { 72struct ocfs2_caching_info {
73 /*
74 * The parent structure provides the locks, but because the
75 * parent structure can differ, it provides locking operations
76 * to struct ocfs2_caching_info.
77 */
78 const struct ocfs2_caching_operations *ci_ops;
79
80 /* next two are protected by trans_inc_lock */
81 /* which transaction were we created on? Zero if none. */
82 unsigned long ci_created_trans;
83 /* last transaction we were a part of. */
84 unsigned long ci_last_trans;
85
86 /* Cache structures */
87 unsigned int ci_flags;
62 unsigned int ci_num_cached; 88 unsigned int ci_num_cached;
63 union { 89 union {
64 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; 90 sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
65 struct rb_root ci_tree; 91 struct rb_root ci_tree;
66 } ci_cache; 92 } ci_cache;
67}; 93};
94/*
95 * Need this prototype here instead of in uptodate.h because journal.h
96 * uses it.
97 */
98struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
68 99
69/* this limits us to 256 nodes 100/* this limits us to 256 nodes
70 * if we need more, we can do a kmalloc for the map */ 101 * if we need more, we can do a kmalloc for the map */
@@ -377,12 +408,17 @@ struct ocfs2_super
377 408
378 /* the group we used to allocate inodes. */ 409 /* the group we used to allocate inodes. */
379 u64 osb_inode_alloc_group; 410 u64 osb_inode_alloc_group;
411
412 /* rb tree root for refcount lock. */
413 struct rb_root osb_rf_lock_tree;
414 struct ocfs2_refcount_tree *osb_ref_tree_lru;
380}; 415};
381 416
382#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 417#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
383 418
384/* Useful typedef for passing around journal access functions */ 419/* Useful typedef for passing around journal access functions */
385typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, 420typedef int (*ocfs2_journal_access_func)(handle_t *handle,
421 struct ocfs2_caching_info *ci,
386 struct buffer_head *bh, int type); 422 struct buffer_head *bh, int type);
387 423
388static inline int ocfs2_should_order_data(struct inode *inode) 424static inline int ocfs2_should_order_data(struct inode *inode)
@@ -480,6 +516,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
480 ocfs2_set_links_count(di, links); 516 ocfs2_set_links_count(di, links);
481} 517}
482 518
519static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
520{
521 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
522 return 1;
523 return 0;
524}
525
483/* set / clear functions because cluster events can make these happen 526/* set / clear functions because cluster events can make these happen
484 * in parallel so we want the transitions to be atomic. this also 527 * in parallel so we want the transitions to be atomic. this also
485 * means that any future flags osb_flags must be protected by spinlock 528 * means that any future flags osb_flags must be protected by spinlock
@@ -578,6 +621,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
578#define OCFS2_IS_VALID_DX_LEAF(ptr) \ 621#define OCFS2_IS_VALID_DX_LEAF(ptr) \
579 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) 622 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
580 623
624#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
625 (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
626
581static inline unsigned long ino_from_blkno(struct super_block *sb, 627static inline unsigned long ino_from_blkno(struct super_block *sb,
582 u64 blkno) 628 u64 blkno)
583{ 629{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e5e77c..e9431e4a5e7c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" 69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" 70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
71#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1"
71 72
72/* Compatibility flags */ 73/* Compatibility flags */
73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 74#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -98,7 +99,8 @@
98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 99 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
99 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
100 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -160,6 +162,9 @@
160/* Metadata checksum and error correction */ 162/* Metadata checksum and error correction */
161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 163#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
162 164
165/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167
163/* 168/*
164 * backup superblock flag is used to indicate that this volume 169 * backup superblock flag is used to indicate that this volume
165 * has backup superblocks. 170 * has backup superblocks.
@@ -223,6 +228,7 @@
223#define OCFS2_HAS_XATTR_FL (0x0002) 228#define OCFS2_HAS_XATTR_FL (0x0002)
224#define OCFS2_INLINE_XATTR_FL (0x0004) 229#define OCFS2_INLINE_XATTR_FL (0x0004)
225#define OCFS2_INDEXED_DIR_FL (0x0008) 230#define OCFS2_INDEXED_DIR_FL (0x0008)
231#define OCFS2_HAS_REFCOUNT_FL (0x0010)
226 232
227/* Inode attributes, keep in sync with EXT2 */ 233/* Inode attributes, keep in sync with EXT2 */
228#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 234#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
@@ -241,8 +247,11 @@
241/* 247/*
242 * Extent record flags (e_node.leaf.flags) 248 * Extent record flags (e_node.leaf.flags)
243 */ 249 */
244#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but 250#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
245 * unwritten */ 251 * unwritten */
252#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference
253 * counted in an associated
254 * refcount tree */
246 255
247/* 256/*
248 * ioctl commands 257 * ioctl commands
@@ -292,6 +301,15 @@ struct ocfs2_new_group_input {
292#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) 301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
293#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) 302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
294 303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
295/* 313/*
296 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
297 */ 315 */
@@ -717,7 +735,8 @@ struct ocfs2_dinode {
717 __le64 i_xattr_loc; 735 __le64 i_xattr_loc;
718/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 736/*80*/ struct ocfs2_block_check i_check; /* Error checking */
719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 737/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5]; 738/*90*/ __le64 i_refcount_loc;
739 __le64 i_reserved2[4];
721/*B8*/ union { 740/*B8*/ union {
722 __le64 i_pad1; /* Generic way to refer to this 741 __le64 i_pad1; /* Generic way to refer to this
723 64bit union */ 742 64bit union */
@@ -901,6 +920,60 @@ struct ocfs2_group_desc
901/*40*/ __u8 bg_bitmap[0]; 920/*40*/ __u8 bg_bitmap[0];
902}; 921};
903 922
923struct ocfs2_refcount_rec {
924/*00*/ __le64 r_cpos; /* Physical offset, in clusters */
925 __le32 r_clusters; /* Clusters covered by this extent */
926 __le32 r_refcount; /* Reference count of this extent */
927/*10*/
928};
929#define OCFS2_32BIT_POS_MASK (0xffffffffULL)
930
931#define OCFS2_REFCOUNT_LEAF_FL (0x00000001)
932#define OCFS2_REFCOUNT_TREE_FL (0x00000002)
933
934struct ocfs2_refcount_list {
935/*00*/ __le16 rl_count; /* Maximum number of entries possible
936 in rl_records */
937 __le16 rl_used; /* Current number of used records */
938 __le32 rl_reserved2;
939 __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
940/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
941};
942
943
944struct ocfs2_refcount_block {
945/*00*/ __u8 rf_signature[8]; /* Signature for verification */
946 __le16 rf_suballoc_slot; /* Slot suballocator this block
947 belongs to */
948 __le16 rf_suballoc_bit; /* Bit offset in suballocator
949 block group */
950 __le32 rf_fs_generation; /* Must match superblock */
951/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */
952 __le64 rf_parent; /* Parent block, only valid if
953 OCFS2_REFCOUNT_LEAF_FL is set in
954 rf_flags */
955/*20*/ struct ocfs2_block_check rf_check; /* Error checking */
956 __le64 rf_last_eb_blk; /* Pointer to last extent block */
957/*30*/ __le32 rf_count; /* Number of inodes sharing this
958 refcount tree */
959 __le32 rf_flags; /* See the flags above */
960 __le32 rf_clusters; /* clusters covered by refcount tree. */
961 __le32 rf_cpos; /* cluster offset in refcount tree.*/
962/*40*/ __le32 rf_generation; /* generation number. all be the same
963 * for the same refcount tree. */
964 __le32 rf_reserved0;
965 __le64 rf_reserved1[7];
966/*80*/ union {
967 struct ocfs2_refcount_list rf_records; /* List of refcount
968 records */
969 struct ocfs2_extent_list rf_list; /* Extent record list,
970 only valid if
971 OCFS2_REFCOUNT_TREE_FL
972 is set in rf_flags */
973 };
974/* Actual on-disk size is one block */
975};
976
904/* 977/*
905 * On disk extended attribute structure for OCFS2. 978 * On disk extended attribute structure for OCFS2.
906 */ 979 */
@@ -1312,6 +1385,32 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
1312 1385
1313 return size / sizeof(struct ocfs2_extent_rec); 1386 return size / sizeof(struct ocfs2_extent_rec);
1314} 1387}
1388
1389static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
1390{
1391 int size;
1392
1393 size = sb->s_blocksize -
1394 offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
1395
1396 return size / sizeof(struct ocfs2_extent_rec);
1397}
1398
1399static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
1400{
1401 int size;
1402
1403 size = sb->s_blocksize -
1404 offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
1405
1406 return size / sizeof(struct ocfs2_refcount_rec);
1407}
1408
1409static inline u32
1410ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
1411{
1412 return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1413}
1315#else 1414#else
1316static inline int ocfs2_fast_symlink_chars(int blocksize) 1415static inline int ocfs2_fast_symlink_chars(int blocksize)
1317{ 1416{
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index c212cf5a2bdf..d277aabf5dfb 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -49,6 +49,7 @@ enum ocfs2_lock_type {
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC, 50 OCFS2_LOCK_TYPE_NFS_SYNC,
51 OCFS2_LOCK_TYPE_ORPHAN_SCAN, 51 OCFS2_LOCK_TYPE_ORPHAN_SCAN,
52 OCFS2_LOCK_TYPE_REFCOUNT,
52 OCFS2_NUM_LOCK_TYPES 53 OCFS2_NUM_LOCK_TYPES
53}; 54};
54 55
@@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
89 case OCFS2_LOCK_TYPE_ORPHAN_SCAN: 90 case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
90 c = 'P'; 91 c = 'P';
91 break; 92 break;
93 case OCFS2_LOCK_TYPE_REFCOUNT:
94 c = 'T';
95 break;
92 default: 96 default:
93 c = '\0'; 97 c = '\0';
94 } 98 }
@@ -110,6 +114,7 @@ static char *ocfs2_lock_type_strings[] = {
110 [OCFS2_LOCK_TYPE_QINFO] = "Quota", 114 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
111 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", 115 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
112 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", 116 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
117 [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
113}; 118};
114 119
115static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 120static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 3fb96fcd4c81..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 109int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
110 struct buffer_head **bh); 110 struct buffer_head **bh);
111 111
112extern struct dquot_operations ocfs2_quota_operations; 112extern const struct dquot_operations ocfs2_quota_operations;
113extern struct quota_format_type ocfs2_quota_format; 113extern struct quota_format_type ocfs2_quota_format;
114 114
115int ocfs2_quota_setup(void); 115int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 44f2a5e1d042..b437dc0c4cad 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block,
154 err = -EIO; 154 err = -EIO;
155 mlog_errno(err); 155 mlog_errno(err);
156 } 156 }
157 return err;; 157 return err;
158} 158}
159 159
160/* Read data from global quotafile - avoid pagecache and such because we cannot 160/* Read data from global quotafile - avoid pagecache and such because we cannot
@@ -253,8 +253,9 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
253 flush_dcache_page(bh->b_page); 253 flush_dcache_page(bh->b_page);
254 set_buffer_uptodate(bh); 254 set_buffer_uptodate(bh);
255 unlock_buffer(bh); 255 unlock_buffer(bh);
256 ocfs2_set_buffer_uptodate(gqinode, bh); 256 ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
257 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); 257 err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
258 ja_type);
258 if (err < 0) { 259 if (err < 0) {
259 brelse(bh); 260 brelse(bh);
260 goto out; 261 goto out;
@@ -849,7 +850,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
849 kmem_cache_free(ocfs2_dquot_cachep, dquot); 850 kmem_cache_free(ocfs2_dquot_cachep, dquot);
850} 851}
851 852
852struct dquot_operations ocfs2_quota_operations = { 853const struct dquot_operations ocfs2_quota_operations = {
853 .initialize = dquot_initialize, 854 .initialize = dquot_initialize,
854 .drop = dquot_drop, 855 .drop = dquot_drop,
855 .alloc_space = dquot_alloc_space, 856 .alloc_space = dquot_alloc_space,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bdb09cb6e1fe..1a2c50a759fa 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -108,7 +108,7 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
108 mlog_errno(status); 108 mlog_errno(status);
109 return status; 109 return status;
110 } 110 }
111 status = ocfs2_journal_access_dq(handle, inode, bh, 111 status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
112 OCFS2_JOURNAL_ACCESS_WRITE); 112 OCFS2_JOURNAL_ACCESS_WRITE);
113 if (status < 0) { 113 if (status < 0) {
114 mlog_errno(status); 114 mlog_errno(status);
@@ -510,7 +510,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
510 goto out_commit; 510 goto out_commit;
511 } 511 }
512 /* Release local quota file entry */ 512 /* Release local quota file entry */
513 status = ocfs2_journal_access_dq(handle, lqinode, 513 status = ocfs2_journal_access_dq(handle,
514 INODE_CACHE(lqinode),
514 qbh, OCFS2_JOURNAL_ACCESS_WRITE); 515 qbh, OCFS2_JOURNAL_ACCESS_WRITE);
515 if (status < 0) { 516 if (status < 0) {
516 mlog_errno(status); 517 mlog_errno(status);
@@ -619,7 +620,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
619 mlog_errno(status); 620 mlog_errno(status);
620 goto out_bh; 621 goto out_bh;
621 } 622 }
622 status = ocfs2_journal_access_dq(handle, lqinode, bh, 623 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
624 bh,
623 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
624 if (status < 0) { 626 if (status < 0) {
625 mlog_errno(status); 627 mlog_errno(status);
@@ -993,8 +995,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
993 goto out_trans; 995 goto out_trans;
994 } 996 }
995 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 997 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
996 ocfs2_set_new_buffer_uptodate(lqinode, bh); 998 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
997 status = ocfs2_journal_access_dq(handle, lqinode, bh, 999 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
998 OCFS2_JOURNAL_ACCESS_CREATE); 1000 OCFS2_JOURNAL_ACCESS_CREATE);
999 if (status < 0) { 1001 if (status < 0) {
1000 mlog_errno(status); 1002 mlog_errno(status);
@@ -1027,8 +1029,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1027 mlog_errno(status); 1029 mlog_errno(status);
1028 goto out_trans; 1030 goto out_trans;
1029 } 1031 }
1030 ocfs2_set_new_buffer_uptodate(lqinode, dbh); 1032 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
1031 status = ocfs2_journal_access_dq(handle, lqinode, dbh, 1033 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
1032 OCFS2_JOURNAL_ACCESS_CREATE); 1034 OCFS2_JOURNAL_ACCESS_CREATE);
1033 if (status < 0) { 1035 if (status < 0) {
1034 mlog_errno(status); 1036 mlog_errno(status);
@@ -1131,7 +1133,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1131 mlog_errno(status); 1133 mlog_errno(status);
1132 goto out; 1134 goto out;
1133 } 1135 }
1134 ocfs2_set_new_buffer_uptodate(lqinode, bh); 1136 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
1135 1137
1136 /* Local quota info, chunk header and the new block we initialize */ 1138 /* Local quota info, chunk header and the new block we initialize */
1137 handle = ocfs2_start_trans(OCFS2_SB(sb), 1139 handle = ocfs2_start_trans(OCFS2_SB(sb),
@@ -1143,7 +1145,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1143 goto out; 1145 goto out;
1144 } 1146 }
1145 /* Zero created block */ 1147 /* Zero created block */
1146 status = ocfs2_journal_access_dq(handle, lqinode, bh, 1148 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
1147 OCFS2_JOURNAL_ACCESS_CREATE); 1149 OCFS2_JOURNAL_ACCESS_CREATE);
1148 if (status < 0) { 1150 if (status < 0) {
1149 mlog_errno(status); 1151 mlog_errno(status);
@@ -1158,7 +1160,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1158 goto out_trans; 1160 goto out_trans;
1159 } 1161 }
1160 /* Update chunk header */ 1162 /* Update chunk header */
1161 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, 1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh,
1162 OCFS2_JOURNAL_ACCESS_WRITE); 1165 OCFS2_JOURNAL_ACCESS_WRITE);
1163 if (status < 0) { 1166 if (status < 0) {
1164 mlog_errno(status); 1167 mlog_errno(status);
@@ -1292,7 +1295,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1292 goto out; 1295 goto out;
1293 } 1296 }
1294 1297
1295 status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], 1298 status = ocfs2_journal_access_dq(handle,
1299 INODE_CACHE(sb_dqopt(sb)->files[type]),
1296 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); 1300 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
1297 if (status < 0) { 1301 if (status < 0) {
1298 mlog_errno(status); 1302 mlog_errno(status);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 000000000000..60287fc56bcb
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4313 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.c
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#include <linux/sort.h>
19#define MLOG_MASK_PREFIX ML_REFCOUNT
20#include <cluster/masklog.h>
21#include "ocfs2.h"
22#include "inode.h"
23#include "alloc.h"
24#include "suballoc.h"
25#include "journal.h"
26#include "uptodate.h"
27#include "super.h"
28#include "buffer_head_io.h"
29#include "blockcheck.h"
30#include "refcounttree.h"
31#include "sysfile.h"
32#include "dlmglue.h"
33#include "extent_map.h"
34#include "aops.h"
35#include "xattr.h"
36#include "namei.h"
37
38#include <linux/bio.h>
39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h>
42#include <linux/writeback.h>
43#include <linux/pagevec.h>
44#include <linux/swap.h>
45#include <linux/security.h>
46#include <linux/fsnotify.h>
47#include <linux/quotaops.h>
48#include <linux/namei.h>
49#include <linux/mount.h>
50
51struct ocfs2_cow_context {
52 struct inode *inode;
53 u32 cow_start;
54 u32 cow_len;
55 struct ocfs2_extent_tree data_et;
56 struct ocfs2_refcount_tree *ref_tree;
57 struct buffer_head *ref_root_bh;
58 struct ocfs2_alloc_context *meta_ac;
59 struct ocfs2_alloc_context *data_ac;
60 struct ocfs2_cached_dealloc_ctxt dealloc;
61 void *cow_object;
62 struct ocfs2_post_refcount *post_refcount;
63 int extra_credits;
64 int (*get_clusters)(struct ocfs2_cow_context *context,
65 u32 v_cluster, u32 *p_cluster,
66 u32 *num_clusters,
67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context,
70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len);
72};
73
74static inline struct ocfs2_refcount_tree *
75cache_info_to_refcount(struct ocfs2_caching_info *ci)
76{
77 return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
78}
79
80static int ocfs2_validate_refcount_block(struct super_block *sb,
81 struct buffer_head *bh)
82{
83 int rc;
84 struct ocfs2_refcount_block *rb =
85 (struct ocfs2_refcount_block *)bh->b_data;
86
87 mlog(0, "Validating refcount block %llu\n",
88 (unsigned long long)bh->b_blocknr);
89
90 BUG_ON(!buffer_uptodate(bh));
91
92 /*
93 * If the ecc fails, we return the error but otherwise
94 * leave the filesystem running. We know any error is
95 * local to this block.
96 */
97 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
98 if (rc) {
99 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
100 (unsigned long long)bh->b_blocknr);
101 return rc;
102 }
103
104
105 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
106 ocfs2_error(sb,
107 "Refcount block #%llu has bad signature %.*s",
108 (unsigned long long)bh->b_blocknr, 7,
109 rb->rf_signature);
110 return -EINVAL;
111 }
112
113 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
114 ocfs2_error(sb,
115 "Refcount block #%llu has an invalid rf_blkno "
116 "of %llu",
117 (unsigned long long)bh->b_blocknr,
118 (unsigned long long)le64_to_cpu(rb->rf_blkno));
119 return -EINVAL;
120 }
121
122 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
123 ocfs2_error(sb,
124 "Refcount block #%llu has an invalid "
125 "rf_fs_generation of #%u",
126 (unsigned long long)bh->b_blocknr,
127 le32_to_cpu(rb->rf_fs_generation));
128 return -EINVAL;
129 }
130
131 return 0;
132}
133
134static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
135 u64 rb_blkno,
136 struct buffer_head **bh)
137{
138 int rc;
139 struct buffer_head *tmp = *bh;
140
141 rc = ocfs2_read_block(ci, rb_blkno, &tmp,
142 ocfs2_validate_refcount_block);
143
144 /* If ocfs2_read_block() got us a new bh, pass it up. */
145 if (!rc && !*bh)
146 *bh = tmp;
147
148 return rc;
149}
150
151static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
152{
153 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
154
155 return rf->rf_blkno;
156}
157
158static struct super_block *
159ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
160{
161 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
162
163 return rf->rf_sb;
164}
165
166static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
167{
168 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
169
170 spin_lock(&rf->rf_lock);
171}
172
173static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
174{
175 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
176
177 spin_unlock(&rf->rf_lock);
178}
179
180static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
181{
182 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
183
184 mutex_lock(&rf->rf_io_mutex);
185}
186
187static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
188{
189 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
190
191 mutex_unlock(&rf->rf_io_mutex);
192}
193
194static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
195 .co_owner = ocfs2_refcount_cache_owner,
196 .co_get_super = ocfs2_refcount_cache_get_super,
197 .co_cache_lock = ocfs2_refcount_cache_lock,
198 .co_cache_unlock = ocfs2_refcount_cache_unlock,
199 .co_io_lock = ocfs2_refcount_cache_io_lock,
200 .co_io_unlock = ocfs2_refcount_cache_io_unlock,
201};
202
203static struct ocfs2_refcount_tree *
204ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
205{
206 struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
207 struct ocfs2_refcount_tree *tree = NULL;
208
209 while (n) {
210 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
211
212 if (blkno < tree->rf_blkno)
213 n = n->rb_left;
214 else if (blkno > tree->rf_blkno)
215 n = n->rb_right;
216 else
217 return tree;
218 }
219
220 return NULL;
221}
222
223/* osb_lock is already locked. */
224static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
225 struct ocfs2_refcount_tree *new)
226{
227 u64 rf_blkno = new->rf_blkno;
228 struct rb_node *parent = NULL;
229 struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
230 struct ocfs2_refcount_tree *tmp;
231
232 while (*p) {
233 parent = *p;
234
235 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
236 rf_node);
237
238 if (rf_blkno < tmp->rf_blkno)
239 p = &(*p)->rb_left;
240 else if (rf_blkno > tmp->rf_blkno)
241 p = &(*p)->rb_right;
242 else {
243 /* This should never happen! */
244 mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
245 (unsigned long long)rf_blkno);
246 BUG();
247 }
248 }
249
250 rb_link_node(&new->rf_node, parent, p);
251 rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
252}
253
254static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
255{
256 ocfs2_metadata_cache_exit(&tree->rf_ci);
257 ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
258 ocfs2_lock_res_free(&tree->rf_lockres);
259 kfree(tree);
260}
261
262static inline void
263ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
264 struct ocfs2_refcount_tree *tree)
265{
266 rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
267 if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
268 osb->osb_ref_tree_lru = NULL;
269}
270
271static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
272 struct ocfs2_refcount_tree *tree)
273{
274 spin_lock(&osb->osb_lock);
275 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
276 spin_unlock(&osb->osb_lock);
277}
278
279void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280{
281 struct ocfs2_refcount_tree *tree =
282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
283
284 ocfs2_free_refcount_tree(tree);
285}
286
287static inline void
288ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
289{
290 kref_get(&tree->rf_getcnt);
291}
292
293static inline void
294ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
295{
296 kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
297}
298
299static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
300 struct super_block *sb)
301{
302 ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
303 mutex_init(&new->rf_io_mutex);
304 new->rf_sb = sb;
305 spin_lock_init(&new->rf_lock);
306}
307
308static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
309 struct ocfs2_refcount_tree *new,
310 u64 rf_blkno, u32 generation)
311{
312 init_rwsem(&new->rf_sem);
313 ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
314 rf_blkno, generation);
315}
316
317static struct ocfs2_refcount_tree*
318ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
319{
320 struct ocfs2_refcount_tree *new;
321
322 new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
323 if (!new)
324 return NULL;
325
326 new->rf_blkno = rf_blkno;
327 kref_init(&new->rf_getcnt);
328 ocfs2_init_refcount_tree_ci(new, osb->sb);
329
330 return new;
331}
332
333static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
334 struct ocfs2_refcount_tree **ret_tree)
335{
336 int ret = 0;
337 struct ocfs2_refcount_tree *tree, *new = NULL;
338 struct buffer_head *ref_root_bh = NULL;
339 struct ocfs2_refcount_block *ref_rb;
340
341 spin_lock(&osb->osb_lock);
342 if (osb->osb_ref_tree_lru &&
343 osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
344 tree = osb->osb_ref_tree_lru;
345 else
346 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
347 if (tree)
348 goto out;
349
350 spin_unlock(&osb->osb_lock);
351
352 new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
353 if (!new) {
354 ret = -ENOMEM;
355 mlog_errno(ret);
356 return ret;
357 }
358 /*
359 * We need the generation to create the refcount tree lock and since
360 * it isn't changed during the tree modification, we are safe here to
361 * read without protection.
362 * We also have to purge the cache after we create the lock since the
363 * refcount block may have the stale data. It can only be trusted when
364 * we hold the refcount lock.
365 */
366 ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
367 if (ret) {
368 mlog_errno(ret);
369 ocfs2_metadata_cache_exit(&new->rf_ci);
370 kfree(new);
371 return ret;
372 }
373
374 ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
375 new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
376 ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
377 new->rf_generation);
378 ocfs2_metadata_cache_purge(&new->rf_ci);
379
380 spin_lock(&osb->osb_lock);
381 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
382 if (tree)
383 goto out;
384
385 ocfs2_insert_refcount_tree(osb, new);
386
387 tree = new;
388 new = NULL;
389
390out:
391 *ret_tree = tree;
392
393 osb->osb_ref_tree_lru = tree;
394
395 spin_unlock(&osb->osb_lock);
396
397 if (new)
398 ocfs2_free_refcount_tree(new);
399
400 brelse(ref_root_bh);
401 return ret;
402}
403
404static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
405{
406 int ret;
407 struct buffer_head *di_bh = NULL;
408 struct ocfs2_dinode *di;
409
410 ret = ocfs2_read_inode_block(inode, &di_bh);
411 if (ret) {
412 mlog_errno(ret);
413 goto out;
414 }
415
416 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
417
418 di = (struct ocfs2_dinode *)di_bh->b_data;
419 *ref_blkno = le64_to_cpu(di->i_refcount_loc);
420 brelse(di_bh);
421out:
422 return ret;
423}
424
425static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
426 struct ocfs2_refcount_tree *tree, int rw)
427{
428 int ret;
429
430 ret = ocfs2_refcount_lock(tree, rw);
431 if (ret) {
432 mlog_errno(ret);
433 goto out;
434 }
435
436 if (rw)
437 down_write(&tree->rf_sem);
438 else
439 down_read(&tree->rf_sem);
440
441out:
442 return ret;
443}
444
445/*
446 * Lock the refcount tree pointed by ref_blkno and return the tree.
447 * In most case, we lock the tree and read the refcount block.
448 * So read it here if the caller really needs it.
449 *
450 * If the tree has been re-created by other node, it will free the
451 * old one and re-create it.
452 */
453int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
454 u64 ref_blkno, int rw,
455 struct ocfs2_refcount_tree **ret_tree,
456 struct buffer_head **ref_bh)
457{
458 int ret, delete_tree = 0;
459 struct ocfs2_refcount_tree *tree = NULL;
460 struct buffer_head *ref_root_bh = NULL;
461 struct ocfs2_refcount_block *rb;
462
463again:
464 ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
465 if (ret) {
466 mlog_errno(ret);
467 return ret;
468 }
469
470 ocfs2_refcount_tree_get(tree);
471
472 ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
473 if (ret) {
474 mlog_errno(ret);
475 ocfs2_refcount_tree_put(tree);
476 goto out;
477 }
478
479 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
480 &ref_root_bh);
481 if (ret) {
482 mlog_errno(ret);
483 ocfs2_unlock_refcount_tree(osb, tree, rw);
484 ocfs2_refcount_tree_put(tree);
485 goto out;
486 }
487
488 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
489 /*
490 * If the refcount block has been freed and re-created, we may need
491 * to recreate the refcount tree also.
492 *
493 * Here we just remove the tree from the rb-tree, and the last
494 * kref holder will unlock and delete this refcount_tree.
495 * Then we goto "again" and ocfs2_get_refcount_tree will create
496 * the new refcount tree for us.
497 */
498 if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
499 if (!tree->rf_removed) {
500 ocfs2_erase_refcount_tree_from_list(osb, tree);
501 tree->rf_removed = 1;
502 delete_tree = 1;
503 }
504
505 ocfs2_unlock_refcount_tree(osb, tree, rw);
506 /*
507 * We get an extra reference when we create the refcount
508 * tree, so another put will destroy it.
509 */
510 if (delete_tree)
511 ocfs2_refcount_tree_put(tree);
512 brelse(ref_root_bh);
513 ref_root_bh = NULL;
514 goto again;
515 }
516
517 *ret_tree = tree;
518 if (ref_bh) {
519 *ref_bh = ref_root_bh;
520 ref_root_bh = NULL;
521 }
522out:
523 brelse(ref_root_bh);
524 return ret;
525}
526
527int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
528 struct ocfs2_refcount_tree **ret_tree,
529 struct buffer_head **ref_bh)
530{
531 int ret;
532 u64 ref_blkno;
533
534 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
535 if (ret) {
536 mlog_errno(ret);
537 return ret;
538 }
539
540 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
541 rw, ret_tree, ref_bh);
542}
543
544void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
545 struct ocfs2_refcount_tree *tree, int rw)
546{
547 if (rw)
548 up_write(&tree->rf_sem);
549 else
550 up_read(&tree->rf_sem);
551
552 ocfs2_refcount_unlock(tree, rw);
553 ocfs2_refcount_tree_put(tree);
554}
555
556void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
557{
558 struct rb_node *node;
559 struct ocfs2_refcount_tree *tree;
560 struct rb_root *root = &osb->osb_rf_lock_tree;
561
562 while ((node = rb_last(root)) != NULL) {
563 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
564
565 mlog(0, "Purge tree %llu\n",
566 (unsigned long long) tree->rf_blkno);
567
568 rb_erase(&tree->rf_node, root);
569 ocfs2_free_refcount_tree(tree);
570 }
571}
572
573/*
574 * Create a refcount tree for an inode.
575 * We take for granted that the inode is already locked.
576 */
577static int ocfs2_create_refcount_tree(struct inode *inode,
578 struct buffer_head *di_bh)
579{
580 int ret;
581 handle_t *handle = NULL;
582 struct ocfs2_alloc_context *meta_ac = NULL;
583 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
584 struct ocfs2_inode_info *oi = OCFS2_I(inode);
585 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
586 struct buffer_head *new_bh = NULL;
587 struct ocfs2_refcount_block *rb;
588 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
589 u16 suballoc_bit_start;
590 u32 num_got;
591 u64 first_blkno;
592
593 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
594
595 mlog(0, "create tree for inode %lu\n", inode->i_ino);
596
597 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
598 if (ret) {
599 mlog_errno(ret);
600 goto out;
601 }
602
603 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
604 if (IS_ERR(handle)) {
605 ret = PTR_ERR(handle);
606 mlog_errno(ret);
607 goto out;
608 }
609
610 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
611 OCFS2_JOURNAL_ACCESS_WRITE);
612 if (ret) {
613 mlog_errno(ret);
614 goto out_commit;
615 }
616
617 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
618 &suballoc_bit_start, &num_got,
619 &first_blkno);
620 if (ret) {
621 mlog_errno(ret);
622 goto out_commit;
623 }
624
625 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
626 if (!new_tree) {
627 ret = -ENOMEM;
628 mlog_errno(ret);
629 goto out_commit;
630 }
631
632 new_bh = sb_getblk(inode->i_sb, first_blkno);
633 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
634
635 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
636 OCFS2_JOURNAL_ACCESS_CREATE);
637 if (ret) {
638 mlog_errno(ret);
639 goto out_commit;
640 }
641
642 /* Initialize ocfs2_refcount_block. */
643 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
644 memset(rb, 0, inode->i_sb->s_blocksize);
645 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
646 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
647 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
648 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
649 rb->rf_blkno = cpu_to_le64(first_blkno);
650 rb->rf_count = cpu_to_le32(1);
651 rb->rf_records.rl_count =
652 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
653 spin_lock(&osb->osb_lock);
654 rb->rf_generation = osb->s_next_generation++;
655 spin_unlock(&osb->osb_lock);
656
657 ocfs2_journal_dirty(handle, new_bh);
658
659 spin_lock(&oi->ip_lock);
660 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
661 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
662 di->i_refcount_loc = cpu_to_le64(first_blkno);
663 spin_unlock(&oi->ip_lock);
664
665 mlog(0, "created tree for inode %lu, refblock %llu\n",
666 inode->i_ino, (unsigned long long)first_blkno);
667
668 ocfs2_journal_dirty(handle, di_bh);
669
670 /*
671 * We have to init the tree lock here since it will use
672 * the generation number to create it.
673 */
674 new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
675 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
676 new_tree->rf_generation);
677
678 spin_lock(&osb->osb_lock);
679 tree = ocfs2_find_refcount_tree(osb, first_blkno);
680
681 /*
682 * We've just created a new refcount tree in this block. If
683 * we found a refcount tree on the ocfs2_super, it must be
684 * one we just deleted. We free the old tree before
685 * inserting the new tree.
686 */
687 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
688 if (tree)
689 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
690 ocfs2_insert_refcount_tree(osb, new_tree);
691 spin_unlock(&osb->osb_lock);
692 new_tree = NULL;
693 if (tree)
694 ocfs2_refcount_tree_put(tree);
695
696out_commit:
697 ocfs2_commit_trans(osb, handle);
698
699out:
700 if (new_tree) {
701 ocfs2_metadata_cache_exit(&new_tree->rf_ci);
702 kfree(new_tree);
703 }
704
705 brelse(new_bh);
706 if (meta_ac)
707 ocfs2_free_alloc_context(meta_ac);
708
709 return ret;
710}
711
712static int ocfs2_set_refcount_tree(struct inode *inode,
713 struct buffer_head *di_bh,
714 u64 refcount_loc)
715{
716 int ret;
717 handle_t *handle = NULL;
718 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
719 struct ocfs2_inode_info *oi = OCFS2_I(inode);
720 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
721 struct buffer_head *ref_root_bh = NULL;
722 struct ocfs2_refcount_block *rb;
723 struct ocfs2_refcount_tree *ref_tree;
724
725 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
726
727 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
728 &ref_tree, &ref_root_bh);
729 if (ret) {
730 mlog_errno(ret);
731 return ret;
732 }
733
734 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
735 if (IS_ERR(handle)) {
736 ret = PTR_ERR(handle);
737 mlog_errno(ret);
738 goto out;
739 }
740
741 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
742 OCFS2_JOURNAL_ACCESS_WRITE);
743 if (ret) {
744 mlog_errno(ret);
745 goto out_commit;
746 }
747
748 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
749 OCFS2_JOURNAL_ACCESS_WRITE);
750 if (ret) {
751 mlog_errno(ret);
752 goto out_commit;
753 }
754
755 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
756 le32_add_cpu(&rb->rf_count, 1);
757
758 ocfs2_journal_dirty(handle, ref_root_bh);
759
760 spin_lock(&oi->ip_lock);
761 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
762 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
763 di->i_refcount_loc = cpu_to_le64(refcount_loc);
764 spin_unlock(&oi->ip_lock);
765 ocfs2_journal_dirty(handle, di_bh);
766
767out_commit:
768 ocfs2_commit_trans(osb, handle);
769out:
770 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
771 brelse(ref_root_bh);
772
773 return ret;
774}
775
776int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
777{
778 int ret, delete_tree = 0;
779 handle_t *handle = NULL;
780 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
781 struct ocfs2_inode_info *oi = OCFS2_I(inode);
782 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
783 struct ocfs2_refcount_block *rb;
784 struct inode *alloc_inode = NULL;
785 struct buffer_head *alloc_bh = NULL;
786 struct buffer_head *blk_bh = NULL;
787 struct ocfs2_refcount_tree *ref_tree;
788 int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
789 u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
790 u16 bit = 0;
791
792 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
793 return 0;
794
795 BUG_ON(!ref_blkno);
796 ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
797 if (ret) {
798 mlog_errno(ret);
799 return ret;
800 }
801
802 rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
803
804 /*
805 * If we are the last user, we need to free the block.
806 * So lock the allocator ahead.
807 */
808 if (le32_to_cpu(rb->rf_count) == 1) {
809 blk = le64_to_cpu(rb->rf_blkno);
810 bit = le16_to_cpu(rb->rf_suballoc_bit);
811 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
812
813 alloc_inode = ocfs2_get_system_file_inode(osb,
814 EXTENT_ALLOC_SYSTEM_INODE,
815 le16_to_cpu(rb->rf_suballoc_slot));
816 if (!alloc_inode) {
817 ret = -ENOMEM;
818 mlog_errno(ret);
819 goto out;
820 }
821 mutex_lock(&alloc_inode->i_mutex);
822
823 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
824 if (ret) {
825 mlog_errno(ret);
826 goto out_mutex;
827 }
828
829 credits += OCFS2_SUBALLOC_FREE;
830 }
831
832 handle = ocfs2_start_trans(osb, credits);
833 if (IS_ERR(handle)) {
834 ret = PTR_ERR(handle);
835 mlog_errno(ret);
836 goto out_unlock;
837 }
838
839 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
840 OCFS2_JOURNAL_ACCESS_WRITE);
841 if (ret) {
842 mlog_errno(ret);
843 goto out_commit;
844 }
845
846 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
847 OCFS2_JOURNAL_ACCESS_WRITE);
848 if (ret) {
849 mlog_errno(ret);
850 goto out_commit;
851 }
852
853 spin_lock(&oi->ip_lock);
854 oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
855 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
856 di->i_refcount_loc = 0;
857 spin_unlock(&oi->ip_lock);
858 ocfs2_journal_dirty(handle, di_bh);
859
860 le32_add_cpu(&rb->rf_count , -1);
861 ocfs2_journal_dirty(handle, blk_bh);
862
863 if (!rb->rf_count) {
864 delete_tree = 1;
865 ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
866 ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
867 alloc_bh, bit, bg_blkno, 1);
868 if (ret)
869 mlog_errno(ret);
870 }
871
872out_commit:
873 ocfs2_commit_trans(osb, handle);
874out_unlock:
875 if (alloc_inode) {
876 ocfs2_inode_unlock(alloc_inode, 1);
877 brelse(alloc_bh);
878 }
879out_mutex:
880 if (alloc_inode) {
881 mutex_unlock(&alloc_inode->i_mutex);
882 iput(alloc_inode);
883 }
884out:
885 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
886 if (delete_tree)
887 ocfs2_refcount_tree_put(ref_tree);
888 brelse(blk_bh);
889
890 return ret;
891}
892
893static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
894 struct buffer_head *ref_leaf_bh,
895 u64 cpos, unsigned int len,
896 struct ocfs2_refcount_rec *ret_rec,
897 int *index)
898{
899 int i = 0;
900 struct ocfs2_refcount_block *rb =
901 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
902 struct ocfs2_refcount_rec *rec = NULL;
903
904 for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
905 rec = &rb->rf_records.rl_recs[i];
906
907 if (le64_to_cpu(rec->r_cpos) +
908 le32_to_cpu(rec->r_clusters) <= cpos)
909 continue;
910 else if (le64_to_cpu(rec->r_cpos) > cpos)
911 break;
912
913 /* ok, cpos fail in this rec. Just return. */
914 if (ret_rec)
915 *ret_rec = *rec;
916 goto out;
917 }
918
919 if (ret_rec) {
920 /* We meet with a hole here, so fake the rec. */
921 ret_rec->r_cpos = cpu_to_le64(cpos);
922 ret_rec->r_refcount = 0;
923 if (i < le16_to_cpu(rb->rf_records.rl_used) &&
924 le64_to_cpu(rec->r_cpos) < cpos + len)
925 ret_rec->r_clusters =
926 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
927 else
928 ret_rec->r_clusters = cpu_to_le32(len);
929 }
930
931out:
932 *index = i;
933}
934
935/*
936 * Try to remove refcount tree. The mechanism is:
937 * 1) Check whether i_clusters == 0, if no, exit.
938 * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
939 * 3) Check whether we have inline xattr stored outside, if yes, exit.
940 * 4) Remove the tree.
941 */
942int ocfs2_try_remove_refcount_tree(struct inode *inode,
943 struct buffer_head *di_bh)
944{
945 int ret;
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
948
949 down_write(&oi->ip_xattr_sem);
950 down_write(&oi->ip_alloc_sem);
951
952 if (oi->ip_clusters)
953 goto out;
954
955 if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
956 goto out;
957
958 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
959 ocfs2_has_inline_xattr_value_outside(inode, di))
960 goto out;
961
962 ret = ocfs2_remove_refcount_tree(inode, di_bh);
963 if (ret)
964 mlog_errno(ret);
965out:
966 up_write(&oi->ip_alloc_sem);
967 up_write(&oi->ip_xattr_sem);
968 return 0;
969}
970
971/*
972 * Given a cpos and len, try to find the refcount record which contains cpos.
973 * 1. If cpos can be found in one refcount record, return the record.
974 * 2. If cpos can't be found, return a fake record which start from cpos
975 * and end at a small value between cpos+len and start of the next record.
976 * This fake record has r_refcount = 0.
977 */
978static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
979 struct buffer_head *ref_root_bh,
980 u64 cpos, unsigned int len,
981 struct ocfs2_refcount_rec *ret_rec,
982 int *index,
983 struct buffer_head **ret_bh)
984{
985 int ret = 0, i, found;
986 u32 low_cpos;
987 struct ocfs2_extent_list *el;
988 struct ocfs2_extent_rec *tmp, *rec = NULL;
989 struct ocfs2_extent_block *eb;
990 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
991 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
992 struct ocfs2_refcount_block *rb =
993 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
994
995 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
996 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
997 ret_rec, index);
998 *ret_bh = ref_root_bh;
999 get_bh(ref_root_bh);
1000 return 0;
1001 }
1002
1003 el = &rb->rf_list;
1004 low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1005
1006 if (el->l_tree_depth) {
1007 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1008 if (ret) {
1009 mlog_errno(ret);
1010 goto out;
1011 }
1012
1013 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1014 el = &eb->h_list;
1015
1016 if (el->l_tree_depth) {
1017 ocfs2_error(sb,
1018 "refcount tree %llu has non zero tree "
1019 "depth in leaf btree tree block %llu\n",
1020 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1021 (unsigned long long)eb_bh->b_blocknr);
1022 ret = -EROFS;
1023 goto out;
1024 }
1025 }
1026
1027 found = 0;
1028 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1029 rec = &el->l_recs[i];
1030
1031 if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1032 found = 1;
1033 break;
1034 }
1035 }
1036
1037 /* adjust len when we have ocfs2_extent_rec after it. */
1038 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
1039 tmp = &el->l_recs[i+1];
1040
1041 if (le32_to_cpu(tmp->e_cpos) < cpos + len)
1042 len = le32_to_cpu(tmp->e_cpos) - cpos;
1043 }
1044
1045 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1046 &ref_leaf_bh);
1047 if (ret) {
1048 mlog_errno(ret);
1049 goto out;
1050 }
1051
1052 ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1053 ret_rec, index);
1054 *ret_bh = ref_leaf_bh;
1055out:
1056 brelse(eb_bh);
1057 return ret;
1058}
1059
1060enum ocfs2_ref_rec_contig {
1061 REF_CONTIG_NONE = 0,
1062 REF_CONTIG_LEFT,
1063 REF_CONTIG_RIGHT,
1064 REF_CONTIG_LEFTRIGHT,
1065};
1066
1067static enum ocfs2_ref_rec_contig
1068 ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1069 int index)
1070{
1071 if ((rb->rf_records.rl_recs[index].r_refcount ==
1072 rb->rf_records.rl_recs[index + 1].r_refcount) &&
1073 (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1074 le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1075 le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1076 return REF_CONTIG_RIGHT;
1077
1078 return REF_CONTIG_NONE;
1079}
1080
1081static enum ocfs2_ref_rec_contig
1082 ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1083 int index)
1084{
1085 enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1086
1087 if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1088 ret = ocfs2_refcount_rec_adjacent(rb, index);
1089
1090 if (index > 0) {
1091 enum ocfs2_ref_rec_contig tmp;
1092
1093 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1094
1095 if (tmp == REF_CONTIG_RIGHT) {
1096 if (ret == REF_CONTIG_RIGHT)
1097 ret = REF_CONTIG_LEFTRIGHT;
1098 else
1099 ret = REF_CONTIG_LEFT;
1100 }
1101 }
1102
1103 return ret;
1104}
1105
1106static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1107 int index)
1108{
1109 BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1110 rb->rf_records.rl_recs[index+1].r_refcount);
1111
1112 le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1113 le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1114
1115 if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1116 memmove(&rb->rf_records.rl_recs[index + 1],
1117 &rb->rf_records.rl_recs[index + 2],
1118 sizeof(struct ocfs2_refcount_rec) *
1119 (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1120
1121 memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1122 0, sizeof(struct ocfs2_refcount_rec));
1123 le16_add_cpu(&rb->rf_records.rl_used, -1);
1124}
1125
1126/*
1127 * Merge the refcount rec if we are contiguous with the adjacent recs.
1128 */
1129static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1130 int index)
1131{
1132 enum ocfs2_ref_rec_contig contig =
1133 ocfs2_refcount_rec_contig(rb, index);
1134
1135 if (contig == REF_CONTIG_NONE)
1136 return;
1137
1138 if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1139 BUG_ON(index == 0);
1140 index--;
1141 }
1142
1143 ocfs2_rotate_refcount_rec_left(rb, index);
1144
1145 if (contig == REF_CONTIG_LEFTRIGHT)
1146 ocfs2_rotate_refcount_rec_left(rb, index);
1147}
1148
1149/*
1150 * Change the refcount indexed by "index" in ref_bh.
1151 * If refcount reaches 0, remove it.
1152 */
1153static int ocfs2_change_refcount_rec(handle_t *handle,
1154 struct ocfs2_caching_info *ci,
1155 struct buffer_head *ref_leaf_bh,
1156 int index, int merge, int change)
1157{
1158 int ret;
1159 struct ocfs2_refcount_block *rb =
1160 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1161 struct ocfs2_refcount_list *rl = &rb->rf_records;
1162 struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1163
1164 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1165 OCFS2_JOURNAL_ACCESS_WRITE);
1166 if (ret) {
1167 mlog_errno(ret);
1168 goto out;
1169 }
1170
1171 mlog(0, "change index %d, old count %u, change %d\n", index,
1172 le32_to_cpu(rec->r_refcount), change);
1173 le32_add_cpu(&rec->r_refcount, change);
1174
1175 if (!rec->r_refcount) {
1176 if (index != le16_to_cpu(rl->rl_used) - 1) {
1177 memmove(rec, rec + 1,
1178 (le16_to_cpu(rl->rl_used) - index - 1) *
1179 sizeof(struct ocfs2_refcount_rec));
1180 memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1181 0, sizeof(struct ocfs2_refcount_rec));
1182 }
1183
1184 le16_add_cpu(&rl->rl_used, -1);
1185 } else if (merge)
1186 ocfs2_refcount_rec_merge(rb, index);
1187
1188 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1189 if (ret)
1190 mlog_errno(ret);
1191out:
1192 return ret;
1193}
1194
1195static int ocfs2_expand_inline_ref_root(handle_t *handle,
1196 struct ocfs2_caching_info *ci,
1197 struct buffer_head *ref_root_bh,
1198 struct buffer_head **ref_leaf_bh,
1199 struct ocfs2_alloc_context *meta_ac)
1200{
1201 int ret;
1202 u16 suballoc_bit_start;
1203 u32 num_got;
1204 u64 blkno;
1205 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1206 struct buffer_head *new_bh = NULL;
1207 struct ocfs2_refcount_block *new_rb;
1208 struct ocfs2_refcount_block *root_rb =
1209 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1210
1211 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1212 OCFS2_JOURNAL_ACCESS_WRITE);
1213 if (ret) {
1214 mlog_errno(ret);
1215 goto out;
1216 }
1217
1218 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1219 &suballoc_bit_start, &num_got,
1220 &blkno);
1221 if (ret) {
1222 mlog_errno(ret);
1223 goto out;
1224 }
1225
1226 new_bh = sb_getblk(sb, blkno);
1227 if (new_bh == NULL) {
1228 ret = -EIO;
1229 mlog_errno(ret);
1230 goto out;
1231 }
1232 ocfs2_set_new_buffer_uptodate(ci, new_bh);
1233
1234 ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1235 OCFS2_JOURNAL_ACCESS_CREATE);
1236 if (ret) {
1237 mlog_errno(ret);
1238 goto out;
1239 }
1240
1241 /*
1242 * Initialize ocfs2_refcount_block.
1243 * It should contain the same information as the old root.
1244 * so just memcpy it and change the corresponding field.
1245 */
1246 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1247
1248 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1249 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
1250 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1251 new_rb->rf_blkno = cpu_to_le64(blkno);
1252 new_rb->rf_cpos = cpu_to_le32(0);
1253 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1254 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1255 ocfs2_journal_dirty(handle, new_bh);
1256
1257 /* Now change the root. */
1258 memset(&root_rb->rf_list, 0, sb->s_blocksize -
1259 offsetof(struct ocfs2_refcount_block, rf_list));
1260 root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1261 root_rb->rf_clusters = cpu_to_le32(1);
1262 root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1263 root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1264 root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1265 root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1266
1267 ocfs2_journal_dirty(handle, ref_root_bh);
1268
1269 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
1270 le16_to_cpu(new_rb->rf_records.rl_used));
1271
1272 *ref_leaf_bh = new_bh;
1273 new_bh = NULL;
1274out:
1275 brelse(new_bh);
1276 return ret;
1277}
1278
1279static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1280 struct ocfs2_refcount_rec *next)
1281{
1282 if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1283 ocfs2_get_ref_rec_low_cpos(next))
1284 return 1;
1285
1286 return 0;
1287}
1288
1289static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1290{
1291 const struct ocfs2_refcount_rec *l = a, *r = b;
1292 u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1293 u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1294
1295 if (l_cpos > r_cpos)
1296 return 1;
1297 if (l_cpos < r_cpos)
1298 return -1;
1299 return 0;
1300}
1301
1302static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1303{
1304 const struct ocfs2_refcount_rec *l = a, *r = b;
1305 u64 l_cpos = le64_to_cpu(l->r_cpos);
1306 u64 r_cpos = le64_to_cpu(r->r_cpos);
1307
1308 if (l_cpos > r_cpos)
1309 return 1;
1310 if (l_cpos < r_cpos)
1311 return -1;
1312 return 0;
1313}
1314
1315static void swap_refcount_rec(void *a, void *b, int size)
1316{
1317 struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1318
1319 tmp = *(struct ocfs2_refcount_rec *)l;
1320 *(struct ocfs2_refcount_rec *)l =
1321 *(struct ocfs2_refcount_rec *)r;
1322 *(struct ocfs2_refcount_rec *)r = tmp;
1323}
1324
1325/*
1326 * The refcount cpos are ordered by their 64bit cpos,
1327 * But we will use the low 32 bit to be the e_cpos in the b-tree.
1328 * So we need to make sure that this pos isn't intersected with others.
1329 *
1330 * Note: The refcount block is already sorted by their low 32 bit cpos,
1331 * So just try the middle pos first, and we will exit when we find
1332 * the good position.
1333 */
1334static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1335 u32 *split_pos, int *split_index)
1336{
1337 int num_used = le16_to_cpu(rl->rl_used);
1338 int delta, middle = num_used / 2;
1339
1340 for (delta = 0; delta < middle; delta++) {
1341 /* Let's check delta earlier than middle */
1342 if (ocfs2_refcount_rec_no_intersect(
1343 &rl->rl_recs[middle - delta - 1],
1344 &rl->rl_recs[middle - delta])) {
1345 *split_index = middle - delta;
1346 break;
1347 }
1348
1349 /* For even counts, don't walk off the end */
1350 if ((middle + delta + 1) == num_used)
1351 continue;
1352
1353 /* Now try delta past middle */
1354 if (ocfs2_refcount_rec_no_intersect(
1355 &rl->rl_recs[middle + delta],
1356 &rl->rl_recs[middle + delta + 1])) {
1357 *split_index = middle + delta + 1;
1358 break;
1359 }
1360 }
1361
1362 if (delta >= middle)
1363 return -ENOSPC;
1364
1365 *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1366 return 0;
1367}
1368
1369static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1370 struct buffer_head *new_bh,
1371 u32 *split_cpos)
1372{
1373 int split_index = 0, num_moved, ret;
1374 u32 cpos = 0;
1375 struct ocfs2_refcount_block *rb =
1376 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1377 struct ocfs2_refcount_list *rl = &rb->rf_records;
1378 struct ocfs2_refcount_block *new_rb =
1379 (struct ocfs2_refcount_block *)new_bh->b_data;
1380 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1381
1382 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
1383 (unsigned long long)ref_leaf_bh->b_blocknr,
1384 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
1385
1386 /*
1387 * XXX: Improvement later.
1388 * If we know all the high 32 bit cpos is the same, no need to sort.
1389 *
1390 * In order to make the whole process safe, we do:
1391 * 1. sort the entries by their low 32 bit cpos first so that we can
1392 * find the split cpos easily.
1393 * 2. call ocfs2_insert_extent to insert the new refcount block.
1394 * 3. move the refcount rec to the new block.
1395 * 4. sort the entries by their 64 bit cpos.
1396 * 5. dirty the new_rb and rb.
1397 */
1398 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1399 sizeof(struct ocfs2_refcount_rec),
1400 cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1401
1402 ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1403 if (ret) {
1404 mlog_errno(ret);
1405 return ret;
1406 }
1407
1408 new_rb->rf_cpos = cpu_to_le32(cpos);
1409
1410 /* move refcount records starting from split_index to the new block. */
1411 num_moved = le16_to_cpu(rl->rl_used) - split_index;
1412 memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1413 num_moved * sizeof(struct ocfs2_refcount_rec));
1414
1415 /*ok, remove the entries we just moved over to the other block. */
1416 memset(&rl->rl_recs[split_index], 0,
1417 num_moved * sizeof(struct ocfs2_refcount_rec));
1418
1419 /* change old and new rl_used accordingly. */
1420 le16_add_cpu(&rl->rl_used, -num_moved);
1421 new_rl->rl_used = cpu_to_le32(num_moved);
1422
1423 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1424 sizeof(struct ocfs2_refcount_rec),
1425 cmp_refcount_rec_by_cpos, swap_refcount_rec);
1426
1427 sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1428 sizeof(struct ocfs2_refcount_rec),
1429 cmp_refcount_rec_by_cpos, swap_refcount_rec);
1430
1431 *split_cpos = cpos;
1432 return 0;
1433}
1434
1435static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1436 struct ocfs2_caching_info *ci,
1437 struct buffer_head *ref_root_bh,
1438 struct buffer_head *ref_leaf_bh,
1439 struct ocfs2_alloc_context *meta_ac)
1440{
1441 int ret;
1442 u16 suballoc_bit_start;
1443 u32 num_got, new_cpos;
1444 u64 blkno;
1445 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1446 struct ocfs2_refcount_block *root_rb =
1447 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1448 struct buffer_head *new_bh = NULL;
1449 struct ocfs2_refcount_block *new_rb;
1450 struct ocfs2_extent_tree ref_et;
1451
1452 BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1453
1454 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1455 OCFS2_JOURNAL_ACCESS_WRITE);
1456 if (ret) {
1457 mlog_errno(ret);
1458 goto out;
1459 }
1460
1461 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1462 OCFS2_JOURNAL_ACCESS_WRITE);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1469 &suballoc_bit_start, &num_got,
1470 &blkno);
1471 if (ret) {
1472 mlog_errno(ret);
1473 goto out;
1474 }
1475
1476 new_bh = sb_getblk(sb, blkno);
1477 if (new_bh == NULL) {
1478 ret = -EIO;
1479 mlog_errno(ret);
1480 goto out;
1481 }
1482 ocfs2_set_new_buffer_uptodate(ci, new_bh);
1483
1484 ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1485 OCFS2_JOURNAL_ACCESS_CREATE);
1486 if (ret) {
1487 mlog_errno(ret);
1488 goto out;
1489 }
1490
1491 /* Initialize ocfs2_refcount_block. */
1492 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1493 memset(new_rb, 0, sb->s_blocksize);
1494 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1495 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
1496 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1497 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1498 new_rb->rf_blkno = cpu_to_le64(blkno);
1499 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1500 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1501 new_rb->rf_records.rl_count =
1502 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1503 new_rb->rf_generation = root_rb->rf_generation;
1504
1505 ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1506 if (ret) {
1507 mlog_errno(ret);
1508 goto out;
1509 }
1510
1511 ocfs2_journal_dirty(handle, ref_leaf_bh);
1512 ocfs2_journal_dirty(handle, new_bh);
1513
1514 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1515
1516 mlog(0, "insert new leaf block %llu at %u\n",
1517 (unsigned long long)new_bh->b_blocknr, new_cpos);
1518
1519 /* Insert the new leaf block with the specific offset cpos. */
1520 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1521 1, 0, meta_ac);
1522 if (ret)
1523 mlog_errno(ret);
1524
1525out:
1526 brelse(new_bh);
1527 return ret;
1528}
1529
1530static int ocfs2_expand_refcount_tree(handle_t *handle,
1531 struct ocfs2_caching_info *ci,
1532 struct buffer_head *ref_root_bh,
1533 struct buffer_head *ref_leaf_bh,
1534 struct ocfs2_alloc_context *meta_ac)
1535{
1536 int ret;
1537 struct buffer_head *expand_bh = NULL;
1538
1539 if (ref_root_bh == ref_leaf_bh) {
1540 /*
1541 * the old root bh hasn't been expanded to a b-tree,
1542 * so expand it first.
1543 */
1544 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1545 &expand_bh, meta_ac);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out;
1549 }
1550 } else {
1551 expand_bh = ref_leaf_bh;
1552 get_bh(expand_bh);
1553 }
1554
1555
1556 /* Now add a new refcount block into the tree.*/
1557 ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1558 expand_bh, meta_ac);
1559 if (ret)
1560 mlog_errno(ret);
1561out:
1562 brelse(expand_bh);
1563 return ret;
1564}
1565
1566/*
1567 * Adjust the extent rec in b-tree representing ref_leaf_bh.
1568 *
1569 * Only called when we have inserted a new refcount rec at index 0
1570 * which means ocfs2_extent_rec.e_cpos may need some change.
1571 */
1572static int ocfs2_adjust_refcount_rec(handle_t *handle,
1573 struct ocfs2_caching_info *ci,
1574 struct buffer_head *ref_root_bh,
1575 struct buffer_head *ref_leaf_bh,
1576 struct ocfs2_refcount_rec *rec)
1577{
1578 int ret = 0, i;
1579 u32 new_cpos, old_cpos;
1580 struct ocfs2_path *path = NULL;
1581 struct ocfs2_extent_tree et;
1582 struct ocfs2_refcount_block *rb =
1583 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1584 struct ocfs2_extent_list *el;
1585
1586 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1587 goto out;
1588
1589 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1590 old_cpos = le32_to_cpu(rb->rf_cpos);
1591 new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1592 if (old_cpos <= new_cpos)
1593 goto out;
1594
1595 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1596
1597 path = ocfs2_new_path_from_et(&et);
1598 if (!path) {
1599 ret = -ENOMEM;
1600 mlog_errno(ret);
1601 goto out;
1602 }
1603
1604 ret = ocfs2_find_path(ci, path, old_cpos);
1605 if (ret) {
1606 mlog_errno(ret);
1607 goto out;
1608 }
1609
1610 /*
1611 * 2 more credits, one for the leaf refcount block, one for
1612 * the extent block contains the extent rec.
1613 */
1614 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
1615 if (ret < 0) {
1616 mlog_errno(ret);
1617 goto out;
1618 }
1619
1620 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1621 OCFS2_JOURNAL_ACCESS_WRITE);
1622 if (ret < 0) {
1623 mlog_errno(ret);
1624 goto out;
1625 }
1626
1627 ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1628 OCFS2_JOURNAL_ACCESS_WRITE);
1629 if (ret < 0) {
1630 mlog_errno(ret);
1631 goto out;
1632 }
1633
1634 /* change the leaf extent block first. */
1635 el = path_leaf_el(path);
1636
1637 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1638 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1639 break;
1640
1641 BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1642
1643 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1644
1645 /* change the r_cpos in the leaf block. */
1646 rb->rf_cpos = cpu_to_le32(new_cpos);
1647
1648 ocfs2_journal_dirty(handle, path_leaf_bh(path));
1649 ocfs2_journal_dirty(handle, ref_leaf_bh);
1650
1651out:
1652 ocfs2_free_path(path);
1653 return ret;
1654}
1655
1656static int ocfs2_insert_refcount_rec(handle_t *handle,
1657 struct ocfs2_caching_info *ci,
1658 struct buffer_head *ref_root_bh,
1659 struct buffer_head *ref_leaf_bh,
1660 struct ocfs2_refcount_rec *rec,
1661 int index, int merge,
1662 struct ocfs2_alloc_context *meta_ac)
1663{
1664 int ret;
1665 struct ocfs2_refcount_block *rb =
1666 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1667 struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1668 struct buffer_head *new_bh = NULL;
1669
1670 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1671
1672 if (rf_list->rl_used == rf_list->rl_count) {
1673 u64 cpos = le64_to_cpu(rec->r_cpos);
1674 u32 len = le32_to_cpu(rec->r_clusters);
1675
1676 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1677 ref_leaf_bh, meta_ac);
1678 if (ret) {
1679 mlog_errno(ret);
1680 goto out;
1681 }
1682
1683 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1684 cpos, len, NULL, &index,
1685 &new_bh);
1686 if (ret) {
1687 mlog_errno(ret);
1688 goto out;
1689 }
1690
1691 ref_leaf_bh = new_bh;
1692 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1693 rf_list = &rb->rf_records;
1694 }
1695
1696 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1697 OCFS2_JOURNAL_ACCESS_WRITE);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 if (index < le16_to_cpu(rf_list->rl_used))
1704 memmove(&rf_list->rl_recs[index + 1],
1705 &rf_list->rl_recs[index],
1706 (le16_to_cpu(rf_list->rl_used) - index) *
1707 sizeof(struct ocfs2_refcount_rec));
1708
1709 mlog(0, "insert refcount record start %llu, len %u, count %u "
1710 "to leaf block %llu at index %d\n",
1711 (unsigned long long)le64_to_cpu(rec->r_cpos),
1712 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
1713 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1714
1715 rf_list->rl_recs[index] = *rec;
1716
1717 le16_add_cpu(&rf_list->rl_used, 1);
1718
1719 if (merge)
1720 ocfs2_refcount_rec_merge(rb, index);
1721
1722 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1723 if (ret) {
1724 mlog_errno(ret);
1725 goto out;
1726 }
1727
1728 if (index == 0) {
1729 ret = ocfs2_adjust_refcount_rec(handle, ci,
1730 ref_root_bh,
1731 ref_leaf_bh, rec);
1732 if (ret)
1733 mlog_errno(ret);
1734 }
1735out:
1736 brelse(new_bh);
1737 return ret;
1738}
1739
1740/*
1741 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1742 * This is much simple than our b-tree code.
1743 * split_rec is the new refcount rec we want to insert.
1744 * If split_rec->r_refcount > 0, we are changing the refcount(in case we
1745 * increase refcount or decrease a refcount to non-zero).
1746 * If split_rec->r_refcount == 0, we are punching a hole in current refcount
1747 * rec( in case we decrease a refcount to zero).
1748 */
1749static int ocfs2_split_refcount_rec(handle_t *handle,
1750 struct ocfs2_caching_info *ci,
1751 struct buffer_head *ref_root_bh,
1752 struct buffer_head *ref_leaf_bh,
1753 struct ocfs2_refcount_rec *split_rec,
1754 int index, int merge,
1755 struct ocfs2_alloc_context *meta_ac,
1756 struct ocfs2_cached_dealloc_ctxt *dealloc)
1757{
1758 int ret, recs_need;
1759 u32 len;
1760 struct ocfs2_refcount_block *rb =
1761 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1762 struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1763 struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1764 struct ocfs2_refcount_rec *tail_rec = NULL;
1765 struct buffer_head *new_bh = NULL;
1766
1767 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1768
1769 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
1770 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
1771 le64_to_cpu(split_rec->r_cpos),
1772 le32_to_cpu(split_rec->r_clusters));
1773
1774 /*
1775 * If we just need to split the header or tail clusters,
1776 * no more recs are needed, just split is OK.
1777 * Otherwise we at least need one new recs.
1778 */
1779 if (!split_rec->r_refcount &&
1780 (split_rec->r_cpos == orig_rec->r_cpos ||
1781 le64_to_cpu(split_rec->r_cpos) +
1782 le32_to_cpu(split_rec->r_clusters) ==
1783 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1784 recs_need = 0;
1785 else
1786 recs_need = 1;
1787
1788 /*
1789 * We need one more rec if we split in the middle and the new rec have
1790 * some refcount in it.
1791 */
1792 if (split_rec->r_refcount &&
1793 (split_rec->r_cpos != orig_rec->r_cpos &&
1794 le64_to_cpu(split_rec->r_cpos) +
1795 le32_to_cpu(split_rec->r_clusters) !=
1796 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1797 recs_need++;
1798
1799 /* If the leaf block don't have enough record, expand it. */
1800 if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) {
1801 struct ocfs2_refcount_rec tmp_rec;
1802 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1803 len = le32_to_cpu(orig_rec->r_clusters);
1804 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1805 ref_leaf_bh, meta_ac);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810
1811 /*
1812 * We have to re-get it since now cpos may be moved to
1813 * another leaf block.
1814 */
1815 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1816 cpos, len, &tmp_rec, &index,
1817 &new_bh);
1818 if (ret) {
1819 mlog_errno(ret);
1820 goto out;
1821 }
1822
1823 ref_leaf_bh = new_bh;
1824 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1825 rf_list = &rb->rf_records;
1826 orig_rec = &rf_list->rl_recs[index];
1827 }
1828
1829 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1830 OCFS2_JOURNAL_ACCESS_WRITE);
1831 if (ret) {
1832 mlog_errno(ret);
1833 goto out;
1834 }
1835
1836 /*
1837 * We have calculated out how many new records we need and store
1838 * in recs_need, so spare enough space first by moving the records
1839 * after "index" to the end.
1840 */
1841 if (index != le16_to_cpu(rf_list->rl_used) - 1)
1842 memmove(&rf_list->rl_recs[index + 1 + recs_need],
1843 &rf_list->rl_recs[index + 1],
1844 (le16_to_cpu(rf_list->rl_used) - index - 1) *
1845 sizeof(struct ocfs2_refcount_rec));
1846
1847 len = (le64_to_cpu(orig_rec->r_cpos) +
1848 le32_to_cpu(orig_rec->r_clusters)) -
1849 (le64_to_cpu(split_rec->r_cpos) +
1850 le32_to_cpu(split_rec->r_clusters));
1851
1852 /*
1853 * If we have "len", the we will split in the tail and move it
1854 * to the end of the space we have just spared.
1855 */
1856 if (len) {
1857 tail_rec = &rf_list->rl_recs[index + recs_need];
1858
1859 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1860 le64_add_cpu(&tail_rec->r_cpos,
1861 le32_to_cpu(tail_rec->r_clusters) - len);
1862 tail_rec->r_clusters = le32_to_cpu(len);
1863 }
1864
1865 /*
1866 * If the split pos isn't the same as the original one, we need to
1867 * split in the head.
1868 *
1869 * Note: We have the chance that split_rec.r_refcount = 0,
1870 * recs_need = 0 and len > 0, which means we just cut the head from
1871 * the orig_rec and in that case we have done some modification in
1872 * orig_rec above, so the check for r_cpos is faked.
1873 */
1874 if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1875 len = le64_to_cpu(split_rec->r_cpos) -
1876 le64_to_cpu(orig_rec->r_cpos);
1877 orig_rec->r_clusters = cpu_to_le32(len);
1878 index++;
1879 }
1880
1881 le16_add_cpu(&rf_list->rl_used, recs_need);
1882
1883 if (split_rec->r_refcount) {
1884 rf_list->rl_recs[index] = *split_rec;
1885 mlog(0, "insert refcount record start %llu, len %u, count %u "
1886 "to leaf block %llu at index %d\n",
1887 (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1888 le32_to_cpu(split_rec->r_clusters),
1889 le32_to_cpu(split_rec->r_refcount),
1890 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1891
1892 if (merge)
1893 ocfs2_refcount_rec_merge(rb, index);
1894 }
1895
1896 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1897 if (ret)
1898 mlog_errno(ret);
1899
1900out:
1901 brelse(new_bh);
1902 return ret;
1903}
1904
1905static int __ocfs2_increase_refcount(handle_t *handle,
1906 struct ocfs2_caching_info *ci,
1907 struct buffer_head *ref_root_bh,
1908 u64 cpos, u32 len, int merge,
1909 struct ocfs2_alloc_context *meta_ac,
1910 struct ocfs2_cached_dealloc_ctxt *dealloc)
1911{
1912 int ret = 0, index;
1913 struct buffer_head *ref_leaf_bh = NULL;
1914 struct ocfs2_refcount_rec rec;
1915 unsigned int set_len = 0;
1916
1917 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
1918 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1919 (unsigned long long)cpos, len);
1920
1921 while (len) {
1922 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1923 cpos, len, &rec, &index,
1924 &ref_leaf_bh);
1925 if (ret) {
1926 mlog_errno(ret);
1927 goto out;
1928 }
1929
1930 set_len = le32_to_cpu(rec.r_clusters);
1931
1932 /*
1933 * Here we may meet with 3 situations:
1934 *
1935 * 1. If we find an already existing record, and the length
1936 * is the same, cool, we just need to increase the r_refcount
1937 * and it is OK.
1938 * 2. If we find a hole, just insert it with r_refcount = 1.
1939 * 3. If we are in the middle of one extent record, split
1940 * it.
1941 */
1942 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
1943 set_len <= len) {
1944 mlog(0, "increase refcount rec, start %llu, len %u, "
1945 "count %u\n", (unsigned long long)cpos, set_len,
1946 le32_to_cpu(rec.r_refcount));
1947 ret = ocfs2_change_refcount_rec(handle, ci,
1948 ref_leaf_bh, index,
1949 merge, 1);
1950 if (ret) {
1951 mlog_errno(ret);
1952 goto out;
1953 }
1954 } else if (!rec.r_refcount) {
1955 rec.r_refcount = cpu_to_le32(1);
1956
1957 mlog(0, "insert refcount rec, start %llu, len %u\n",
1958 (unsigned long long)le64_to_cpu(rec.r_cpos),
1959 set_len);
1960 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
1961 ref_leaf_bh,
1962 &rec, index,
1963 merge, meta_ac);
1964 if (ret) {
1965 mlog_errno(ret);
1966 goto out;
1967 }
1968 } else {
1969 set_len = min((u64)(cpos + len),
1970 le64_to_cpu(rec.r_cpos) + set_len) - cpos;
1971 rec.r_cpos = cpu_to_le64(cpos);
1972 rec.r_clusters = cpu_to_le32(set_len);
1973 le32_add_cpu(&rec.r_refcount, 1);
1974
1975 mlog(0, "split refcount rec, start %llu, "
1976 "len %u, count %u\n",
1977 (unsigned long long)le64_to_cpu(rec.r_cpos),
1978 set_len, le32_to_cpu(rec.r_refcount));
1979 ret = ocfs2_split_refcount_rec(handle, ci,
1980 ref_root_bh, ref_leaf_bh,
1981 &rec, index, merge,
1982 meta_ac, dealloc);
1983 if (ret) {
1984 mlog_errno(ret);
1985 goto out;
1986 }
1987 }
1988
1989 cpos += set_len;
1990 len -= set_len;
1991 brelse(ref_leaf_bh);
1992 ref_leaf_bh = NULL;
1993 }
1994
1995out:
1996 brelse(ref_leaf_bh);
1997 return ret;
1998}
1999
2000static int ocfs2_remove_refcount_extent(handle_t *handle,
2001 struct ocfs2_caching_info *ci,
2002 struct buffer_head *ref_root_bh,
2003 struct buffer_head *ref_leaf_bh,
2004 struct ocfs2_alloc_context *meta_ac,
2005 struct ocfs2_cached_dealloc_ctxt *dealloc)
2006{
2007 int ret;
2008 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2009 struct ocfs2_refcount_block *rb =
2010 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2011 struct ocfs2_extent_tree et;
2012
2013 BUG_ON(rb->rf_records.rl_used);
2014
2015 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2016 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2017 1, meta_ac, dealloc);
2018 if (ret) {
2019 mlog_errno(ret);
2020 goto out;
2021 }
2022
2023 ocfs2_remove_from_cache(ci, ref_leaf_bh);
2024
2025 /*
2026 * add the freed block to the dealloc so that it will be freed
2027 * when we run dealloc.
2028 */
2029 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2030 le16_to_cpu(rb->rf_suballoc_slot),
2031 le64_to_cpu(rb->rf_blkno),
2032 le16_to_cpu(rb->rf_suballoc_bit));
2033 if (ret) {
2034 mlog_errno(ret);
2035 goto out;
2036 }
2037
2038 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2039 OCFS2_JOURNAL_ACCESS_WRITE);
2040 if (ret) {
2041 mlog_errno(ret);
2042 goto out;
2043 }
2044
2045 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2046
2047 le32_add_cpu(&rb->rf_clusters, -1);
2048
2049 /*
2050 * check whether we need to restore the root refcount block if
2051 * there is no leaf extent block at atll.
2052 */
2053 if (!rb->rf_list.l_next_free_rec) {
2054 BUG_ON(rb->rf_clusters);
2055
2056 mlog(0, "reset refcount tree root %llu to be a record block.\n",
2057 (unsigned long long)ref_root_bh->b_blocknr);
2058
2059 rb->rf_flags = 0;
2060 rb->rf_parent = 0;
2061 rb->rf_cpos = 0;
2062 memset(&rb->rf_records, 0, sb->s_blocksize -
2063 offsetof(struct ocfs2_refcount_block, rf_records));
2064 rb->rf_records.rl_count =
2065 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2066 }
2067
2068 ocfs2_journal_dirty(handle, ref_root_bh);
2069
2070out:
2071 return ret;
2072}
2073
2074int ocfs2_increase_refcount(handle_t *handle,
2075 struct ocfs2_caching_info *ci,
2076 struct buffer_head *ref_root_bh,
2077 u64 cpos, u32 len,
2078 struct ocfs2_alloc_context *meta_ac,
2079 struct ocfs2_cached_dealloc_ctxt *dealloc)
2080{
2081 return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2082 cpos, len, 1,
2083 meta_ac, dealloc);
2084}
2085
2086static int ocfs2_decrease_refcount_rec(handle_t *handle,
2087 struct ocfs2_caching_info *ci,
2088 struct buffer_head *ref_root_bh,
2089 struct buffer_head *ref_leaf_bh,
2090 int index, u64 cpos, unsigned int len,
2091 struct ocfs2_alloc_context *meta_ac,
2092 struct ocfs2_cached_dealloc_ctxt *dealloc)
2093{
2094 int ret;
2095 struct ocfs2_refcount_block *rb =
2096 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2097 struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2098
2099 BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2100 BUG_ON(cpos + len >
2101 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2102
2103 if (cpos == le64_to_cpu(rec->r_cpos) &&
2104 len == le32_to_cpu(rec->r_clusters))
2105 ret = ocfs2_change_refcount_rec(handle, ci,
2106 ref_leaf_bh, index, 1, -1);
2107 else {
2108 struct ocfs2_refcount_rec split = *rec;
2109 split.r_cpos = cpu_to_le64(cpos);
2110 split.r_clusters = cpu_to_le32(len);
2111
2112 le32_add_cpu(&split.r_refcount, -1);
2113
2114 mlog(0, "split refcount rec, start %llu, "
2115 "len %u, count %u, original start %llu, len %u\n",
2116 (unsigned long long)le64_to_cpu(split.r_cpos),
2117 len, le32_to_cpu(split.r_refcount),
2118 (unsigned long long)le64_to_cpu(rec->r_cpos),
2119 le32_to_cpu(rec->r_clusters));
2120 ret = ocfs2_split_refcount_rec(handle, ci,
2121 ref_root_bh, ref_leaf_bh,
2122 &split, index, 1,
2123 meta_ac, dealloc);
2124 }
2125
2126 if (ret) {
2127 mlog_errno(ret);
2128 goto out;
2129 }
2130
2131 /* Remove the leaf refcount block if it contains no refcount record. */
2132 if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2133 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2134 ref_leaf_bh, meta_ac,
2135 dealloc);
2136 if (ret)
2137 mlog_errno(ret);
2138 }
2139
2140out:
2141 return ret;
2142}
2143
2144static int __ocfs2_decrease_refcount(handle_t *handle,
2145 struct ocfs2_caching_info *ci,
2146 struct buffer_head *ref_root_bh,
2147 u64 cpos, u32 len,
2148 struct ocfs2_alloc_context *meta_ac,
2149 struct ocfs2_cached_dealloc_ctxt *dealloc,
2150 int delete)
2151{
2152 int ret = 0, index = 0;
2153 struct ocfs2_refcount_rec rec;
2154 unsigned int r_count = 0, r_len;
2155 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2156 struct buffer_head *ref_leaf_bh = NULL;
2157
2158 mlog(0, "Tree owner %llu, decrease refcount start %llu, "
2159 "len %u, delete %u\n",
2160 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2161 (unsigned long long)cpos, len, delete);
2162
2163 while (len) {
2164 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2165 cpos, len, &rec, &index,
2166 &ref_leaf_bh);
2167 if (ret) {
2168 mlog_errno(ret);
2169 goto out;
2170 }
2171
2172 r_count = le32_to_cpu(rec.r_refcount);
2173 BUG_ON(r_count == 0);
2174 if (!delete)
2175 BUG_ON(r_count > 1);
2176
2177 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2178 le32_to_cpu(rec.r_clusters)) - cpos;
2179
2180 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2181 ref_leaf_bh, index,
2182 cpos, r_len,
2183 meta_ac, dealloc);
2184 if (ret) {
2185 mlog_errno(ret);
2186 goto out;
2187 }
2188
2189 if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2190 ret = ocfs2_cache_cluster_dealloc(dealloc,
2191 ocfs2_clusters_to_blocks(sb, cpos),
2192 r_len);
2193 if (ret) {
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197 }
2198
2199 cpos += r_len;
2200 len -= r_len;
2201 brelse(ref_leaf_bh);
2202 ref_leaf_bh = NULL;
2203 }
2204
2205out:
2206 brelse(ref_leaf_bh);
2207 return ret;
2208}
2209
2210/* Caller must hold refcount tree lock. */
2211int ocfs2_decrease_refcount(struct inode *inode,
2212 handle_t *handle, u32 cpos, u32 len,
2213 struct ocfs2_alloc_context *meta_ac,
2214 struct ocfs2_cached_dealloc_ctxt *dealloc,
2215 int delete)
2216{
2217 int ret;
2218 u64 ref_blkno;
2219 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2220 struct buffer_head *ref_root_bh = NULL;
2221 struct ocfs2_refcount_tree *tree;
2222
2223 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2224
2225 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2226 if (ret) {
2227 mlog_errno(ret);
2228 goto out;
2229 }
2230
2231 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2232 if (ret) {
2233 mlog_errno(ret);
2234 goto out;
2235 }
2236
2237 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2238 &ref_root_bh);
2239 if (ret) {
2240 mlog_errno(ret);
2241 goto out;
2242 }
2243
2244 ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2245 cpos, len, meta_ac, dealloc, delete);
2246 if (ret)
2247 mlog_errno(ret);
2248out:
2249 brelse(ref_root_bh);
2250 return ret;
2251}
2252
2253/*
2254 * Mark the already-existing extent at cpos as refcounted for len clusters.
2255 * This adds the refcount extent flag.
2256 *
2257 * If the existing extent is larger than the request, initiate a
2258 * split. An attempt will be made at merging with adjacent extents.
2259 *
2260 * The caller is responsible for passing down meta_ac if we'll need it.
2261 */
2262static int ocfs2_mark_extent_refcounted(struct inode *inode,
2263 struct ocfs2_extent_tree *et,
2264 handle_t *handle, u32 cpos,
2265 u32 len, u32 phys,
2266 struct ocfs2_alloc_context *meta_ac,
2267 struct ocfs2_cached_dealloc_ctxt *dealloc)
2268{
2269 int ret;
2270
2271 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
2272 inode->i_ino, cpos, len, phys);
2273
2274 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2275 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2276 "tree, but the feature bit is not set in the "
2277 "super block.", inode->i_ino);
2278 ret = -EROFS;
2279 goto out;
2280 }
2281
2282 ret = ocfs2_change_extent_flag(handle, et, cpos,
2283 len, phys, meta_ac, dealloc,
2284 OCFS2_EXT_REFCOUNTED, 0);
2285 if (ret)
2286 mlog_errno(ret);
2287
2288out:
2289 return ret;
2290}
2291
2292/*
2293 * Given some contiguous physical clusters, calculate what we need
2294 * for modifying their refcount.
2295 */
2296static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2297 struct ocfs2_caching_info *ci,
2298 struct buffer_head *ref_root_bh,
2299 u64 start_cpos,
2300 u32 clusters,
2301 int *meta_add,
2302 int *credits)
2303{
2304 int ret = 0, index, ref_blocks = 0, recs_add = 0;
2305 u64 cpos = start_cpos;
2306 struct ocfs2_refcount_block *rb;
2307 struct ocfs2_refcount_rec rec;
2308 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2309 u32 len;
2310
2311 mlog(0, "start_cpos %llu, clusters %u\n",
2312 (unsigned long long)start_cpos, clusters);
2313 while (clusters) {
2314 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2315 cpos, clusters, &rec,
2316 &index, &ref_leaf_bh);
2317 if (ret) {
2318 mlog_errno(ret);
2319 goto out;
2320 }
2321
2322 if (ref_leaf_bh != prev_bh) {
2323 /*
2324 * Now we encounter a new leaf block, so calculate
2325 * whether we need to extend the old leaf.
2326 */
2327 if (prev_bh) {
2328 rb = (struct ocfs2_refcount_block *)
2329 prev_bh->b_data;
2330
2331 if (le64_to_cpu(rb->rf_records.rl_used) +
2332 recs_add >
2333 le16_to_cpu(rb->rf_records.rl_count))
2334 ref_blocks++;
2335 }
2336
2337 recs_add = 0;
2338 *credits += 1;
2339 brelse(prev_bh);
2340 prev_bh = ref_leaf_bh;
2341 get_bh(prev_bh);
2342 }
2343
2344 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2345
2346 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
2347 "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
2348 recs_add, (unsigned long long)cpos, clusters,
2349 (unsigned long long)le64_to_cpu(rec.r_cpos),
2350 le32_to_cpu(rec.r_clusters),
2351 le32_to_cpu(rec.r_refcount), index);
2352
2353 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2354 le32_to_cpu(rec.r_clusters)) - cpos;
2355 /*
2356 * If the refcount rec already exist, cool. We just need
2357 * to check whether there is a split. Otherwise we just need
2358 * to increase the refcount.
2359 * If we will insert one, increases recs_add.
2360 *
2361 * We record all the records which will be inserted to the
2362 * same refcount block, so that we can tell exactly whether
2363 * we need a new refcount block or not.
2364 */
2365 if (rec.r_refcount) {
2366 /* Check whether we need a split at the beginning. */
2367 if (cpos == start_cpos &&
2368 cpos != le64_to_cpu(rec.r_cpos))
2369 recs_add++;
2370
2371 /* Check whether we need a split in the end. */
2372 if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2373 le32_to_cpu(rec.r_clusters))
2374 recs_add++;
2375 } else
2376 recs_add++;
2377
2378 brelse(ref_leaf_bh);
2379 ref_leaf_bh = NULL;
2380 clusters -= len;
2381 cpos += len;
2382 }
2383
2384 if (prev_bh) {
2385 rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2386
2387 if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
2388 le16_to_cpu(rb->rf_records.rl_count))
2389 ref_blocks++;
2390
2391 *credits += 1;
2392 }
2393
2394 if (!ref_blocks)
2395 goto out;
2396
2397 mlog(0, "we need ref_blocks %d\n", ref_blocks);
2398 *meta_add += ref_blocks;
2399 *credits += ref_blocks;
2400
2401 /*
2402 * So we may need ref_blocks to insert into the tree.
2403 * That also means we need to change the b-tree and add that number
2404 * of records since we never merge them.
2405 * We need one more block for expansion since the new created leaf
2406 * block is also full and needs split.
2407 */
2408 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2409 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2410 struct ocfs2_extent_tree et;
2411
2412 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2413 *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2414 *credits += ocfs2_calc_extend_credits(sb,
2415 et.et_root_el,
2416 ref_blocks);
2417 } else {
2418 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2419 *meta_add += 1;
2420 }
2421
2422out:
2423 brelse(ref_leaf_bh);
2424 brelse(prev_bh);
2425 return ret;
2426}
2427
2428/*
2429 * For refcount tree, we will decrease some contiguous clusters
2430 * refcount count, so just go through it to see how many blocks
2431 * we gonna touch and whether we need to create new blocks.
2432 *
2433 * Normally the refcount blocks store these refcount should be
2434 * continguous also, so that we can get the number easily.
2435 * As for meta_ac, we will at most add split 2 refcount record and
2436 * 2 more refcount block, so just check it in a rough way.
2437 *
2438 * Caller must hold refcount tree lock.
2439 */
2440int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2441 struct buffer_head *di_bh,
2442 u64 phys_blkno,
2443 u32 clusters,
2444 int *credits,
2445 struct ocfs2_alloc_context **meta_ac)
2446{
2447 int ret, ref_blocks = 0;
2448 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2449 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2450 struct buffer_head *ref_root_bh = NULL;
2451 struct ocfs2_refcount_tree *tree;
2452 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2453
2454 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2455 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2456 "tree, but the feature bit is not set in the "
2457 "super block.", inode->i_ino);
2458 ret = -EROFS;
2459 goto out;
2460 }
2461
2462 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2463
2464 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2465 le64_to_cpu(di->i_refcount_loc), &tree);
2466 if (ret) {
2467 mlog_errno(ret);
2468 goto out;
2469 }
2470
2471 ret = ocfs2_read_refcount_block(&tree->rf_ci,
2472 le64_to_cpu(di->i_refcount_loc),
2473 &ref_root_bh);
2474 if (ret) {
2475 mlog_errno(ret);
2476 goto out;
2477 }
2478
2479 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2480 &tree->rf_ci,
2481 ref_root_bh,
2482 start_cpos, clusters,
2483 &ref_blocks, credits);
2484 if (ret) {
2485 mlog_errno(ret);
2486 goto out;
2487 }
2488
2489 mlog(0, "reserve new metadata %d, credits = %d\n",
2490 ref_blocks, *credits);
2491
2492 if (ref_blocks) {
2493 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2494 ref_blocks, meta_ac);
2495 if (ret)
2496 mlog_errno(ret);
2497 }
2498
2499out:
2500 brelse(ref_root_bh);
2501 return ret;
2502}
2503
2504#define MAX_CONTIG_BYTES 1048576
2505
2506static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
2507{
2508 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
2509}
2510
2511static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
2512{
2513 return ~(ocfs2_cow_contig_clusters(sb) - 1);
2514}
2515
2516/*
2517 * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
2518 * find an offset (start + (n * contig_clusters)) that is closest to cpos
2519 * while still being less than or equal to it.
2520 *
2521 * The goal is to break the extent at a multiple of contig_clusters.
2522 */
2523static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
2524 unsigned int start,
2525 unsigned int cpos)
2526{
2527 BUG_ON(start > cpos);
2528
2529 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
2530}
2531
2532/*
2533 * Given a cluster count of len, pad it out so that it is a multiple
2534 * of contig_clusters.
2535 */
2536static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2537 unsigned int len)
2538{
2539 unsigned int padded =
2540 (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2541 ocfs2_cow_contig_mask(sb);
2542
2543 /* Did we wrap? */
2544 if (padded < len)
2545 padded = UINT_MAX;
2546
2547 return padded;
2548}
2549
2550/*
2551 * Calculate out the start and number of virtual clusters we need to to CoW.
2552 *
2553 * cpos is vitual start cluster position we want to do CoW in a
2554 * file and write_len is the cluster length.
2555 * max_cpos is the place where we want to stop CoW intentionally.
2556 *
2557 * Normal we will start CoW from the beginning of extent record cotaining cpos.
2558 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2559 * get good I/O from the resulting extent tree.
2560 */
2561static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2562 struct ocfs2_extent_list *el,
2563 u32 cpos,
2564 u32 write_len,
2565 u32 max_cpos,
2566 u32 *cow_start,
2567 u32 *cow_len)
2568{
2569 int ret = 0;
2570 int tree_height = le16_to_cpu(el->l_tree_depth), i;
2571 struct buffer_head *eb_bh = NULL;
2572 struct ocfs2_extent_block *eb = NULL;
2573 struct ocfs2_extent_rec *rec;
2574 unsigned int want_clusters, rec_end = 0;
2575 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2576 int leaf_clusters;
2577
2578 BUG_ON(cpos + write_len > max_cpos);
2579
2580 if (tree_height > 0) {
2581 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2582 if (ret) {
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2588 el = &eb->h_list;
2589
2590 if (el->l_tree_depth) {
2591 ocfs2_error(inode->i_sb,
2592 "Inode %lu has non zero tree depth in "
2593 "leaf block %llu\n", inode->i_ino,
2594 (unsigned long long)eb_bh->b_blocknr);
2595 ret = -EROFS;
2596 goto out;
2597 }
2598 }
2599
2600 *cow_len = 0;
2601 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2602 rec = &el->l_recs[i];
2603
2604 if (ocfs2_is_empty_extent(rec)) {
2605 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2606 "index %d\n", inode->i_ino, i);
2607 continue;
2608 }
2609
2610 if (le32_to_cpu(rec->e_cpos) +
2611 le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2612 continue;
2613
2614 if (*cow_len == 0) {
2615 /*
2616 * We should find a refcounted record in the
2617 * first pass.
2618 */
2619 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2620 *cow_start = le32_to_cpu(rec->e_cpos);
2621 }
2622
2623 /*
2624 * If we encounter a hole, a non-refcounted record or
2625 * pass the max_cpos, stop the search.
2626 */
2627 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2628 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2629 (max_cpos <= le32_to_cpu(rec->e_cpos)))
2630 break;
2631
2632 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2633 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2634 if (rec_end > max_cpos) {
2635 rec_end = max_cpos;
2636 leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2637 }
2638
2639 /*
2640 * How many clusters do we actually need from
2641 * this extent? First we see how many we actually
2642 * need to complete the write. If that's smaller
2643 * than contig_clusters, we try for contig_clusters.
2644 */
2645 if (!*cow_len)
2646 want_clusters = write_len;
2647 else
2648 want_clusters = (cpos + write_len) -
2649 (*cow_start + *cow_len);
2650 if (want_clusters < contig_clusters)
2651 want_clusters = contig_clusters;
2652
2653 /*
2654 * If the write does not cover the whole extent, we
2655 * need to calculate how we're going to split the extent.
2656 * We try to do it on contig_clusters boundaries.
2657 *
2658 * Any extent smaller than contig_clusters will be
2659 * CoWed in its entirety.
2660 */
2661 if (leaf_clusters <= contig_clusters)
2662 *cow_len += leaf_clusters;
2663 else if (*cow_len || (*cow_start == cpos)) {
2664 /*
2665 * This extent needs to be CoW'd from its
2666 * beginning, so all we have to do is compute
2667 * how many clusters to grab. We align
2668 * want_clusters to the edge of contig_clusters
2669 * to get better I/O.
2670 */
2671 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2672 want_clusters);
2673
2674 if (leaf_clusters < want_clusters)
2675 *cow_len += leaf_clusters;
2676 else
2677 *cow_len += want_clusters;
2678 } else if ((*cow_start + contig_clusters) >=
2679 (cpos + write_len)) {
2680 /*
2681 * Breaking off contig_clusters at the front
2682 * of the extent will cover our write. That's
2683 * easy.
2684 */
2685 *cow_len = contig_clusters;
2686 } else if ((rec_end - cpos) <= contig_clusters) {
2687 /*
2688 * Breaking off contig_clusters at the tail of
2689 * this extent will cover cpos.
2690 */
2691 *cow_start = rec_end - contig_clusters;
2692 *cow_len = contig_clusters;
2693 } else if ((rec_end - cpos) <= want_clusters) {
2694 /*
2695 * While we can't fit the entire write in this
2696 * extent, we know that the write goes from cpos
2697 * to the end of the extent. Break that off.
2698 * We try to break it at some multiple of
2699 * contig_clusters from the front of the extent.
2700 * Failing that (ie, cpos is within
2701 * contig_clusters of the front), we'll CoW the
2702 * entire extent.
2703 */
2704 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2705 *cow_start, cpos);
2706 *cow_len = rec_end - *cow_start;
2707 } else {
2708 /*
2709 * Ok, the entire write lives in the middle of
2710 * this extent. Let's try to slice the extent up
2711 * nicely. Optimally, our CoW region starts at
2712 * m*contig_clusters from the beginning of the
2713 * extent and goes for n*contig_clusters,
2714 * covering the entire write.
2715 */
2716 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2717 *cow_start, cpos);
2718
2719 want_clusters = (cpos + write_len) - *cow_start;
2720 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2721 want_clusters);
2722 if (*cow_start + want_clusters <= rec_end)
2723 *cow_len = want_clusters;
2724 else
2725 *cow_len = rec_end - *cow_start;
2726 }
2727
2728 /* Have we covered our entire write yet? */
2729 if ((*cow_start + *cow_len) >= (cpos + write_len))
2730 break;
2731
2732 /*
2733 * If we reach the end of the extent block and don't get enough
2734 * clusters, continue with the next extent block if possible.
2735 */
2736 if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2737 eb && eb->h_next_leaf_blk) {
2738 brelse(eb_bh);
2739 eb_bh = NULL;
2740
2741 ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2742 le64_to_cpu(eb->h_next_leaf_blk),
2743 &eb_bh);
2744 if (ret) {
2745 mlog_errno(ret);
2746 goto out;
2747 }
2748
2749 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2750 el = &eb->h_list;
2751 i = -1;
2752 }
2753 }
2754
2755out:
2756 brelse(eb_bh);
2757 return ret;
2758}
2759
2760/*
2761 * Prepare meta_ac, data_ac and calculate credits when we want to add some
2762 * num_clusters in data_tree "et" and change the refcount for the old
2763 * clusters(starting form p_cluster) in the refcount tree.
2764 *
2765 * Note:
2766 * 1. since we may split the old tree, so we at most will need num_clusters + 2
2767 * more new leaf records.
2768 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
2769 * just give data_ac = NULL.
2770 */
2771static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2772 u32 p_cluster, u32 num_clusters,
2773 struct ocfs2_extent_tree *et,
2774 struct ocfs2_caching_info *ref_ci,
2775 struct buffer_head *ref_root_bh,
2776 struct ocfs2_alloc_context **meta_ac,
2777 struct ocfs2_alloc_context **data_ac,
2778 int *credits)
2779{
2780 int ret = 0, meta_add = 0;
2781 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2782
2783 if (num_free_extents < 0) {
2784 ret = num_free_extents;
2785 mlog_errno(ret);
2786 goto out;
2787 }
2788
2789 if (num_free_extents < num_clusters + 2)
2790 meta_add =
2791 ocfs2_extend_meta_needed(et->et_root_el);
2792
2793 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2794 num_clusters + 2);
2795
2796 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2797 p_cluster, num_clusters,
2798 &meta_add, credits);
2799 if (ret) {
2800 mlog_errno(ret);
2801 goto out;
2802 }
2803
2804 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
2805 meta_add, num_clusters, *credits);
2806 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2807 meta_ac);
2808 if (ret) {
2809 mlog_errno(ret);
2810 goto out;
2811 }
2812
2813 if (data_ac) {
2814 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2815 data_ac);
2816 if (ret)
2817 mlog_errno(ret);
2818 }
2819
2820out:
2821 if (ret) {
2822 if (*meta_ac) {
2823 ocfs2_free_alloc_context(*meta_ac);
2824 *meta_ac = NULL;
2825 }
2826 }
2827
2828 return ret;
2829}
2830
2831static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2832{
2833 BUG_ON(buffer_dirty(bh));
2834
2835 clear_buffer_mapped(bh);
2836
2837 return 0;
2838}
2839
2840static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2841 struct ocfs2_cow_context *context,
2842 u32 cpos, u32 old_cluster,
2843 u32 new_cluster, u32 new_len)
2844{
2845 int ret = 0, partial;
2846 struct ocfs2_caching_info *ci = context->data_et.et_ci;
2847 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2848 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2849 struct page *page;
2850 pgoff_t page_index;
2851 unsigned int from, to;
2852 loff_t offset, end, map_end;
2853 struct address_space *mapping = context->inode->i_mapping;
2854
2855 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2856 new_cluster, new_len, cpos);
2857
2858 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2859 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2860
2861 while (offset < end) {
2862 page_index = offset >> PAGE_CACHE_SHIFT;
2863 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
2864 if (map_end > end)
2865 map_end = end;
2866
2867 /* from, to is the offset within the page. */
2868 from = offset & (PAGE_CACHE_SIZE - 1);
2869 to = PAGE_CACHE_SIZE;
2870 if (map_end & (PAGE_CACHE_SIZE - 1))
2871 to = map_end & (PAGE_CACHE_SIZE - 1);
2872
2873 page = grab_cache_page(mapping, page_index);
2874
2875 /* This page can't be dirtied before we CoW it out. */
2876 BUG_ON(PageDirty(page));
2877
2878 if (!PageUptodate(page)) {
2879 ret = block_read_full_page(page, ocfs2_get_block);
2880 if (ret) {
2881 mlog_errno(ret);
2882 goto unlock;
2883 }
2884 lock_page(page);
2885 }
2886
2887 if (page_has_buffers(page)) {
2888 ret = walk_page_buffers(handle, page_buffers(page),
2889 from, to, &partial,
2890 ocfs2_clear_cow_buffer);
2891 if (ret) {
2892 mlog_errno(ret);
2893 goto unlock;
2894 }
2895 }
2896
2897 ocfs2_map_and_dirty_page(context->inode,
2898 handle, from, to,
2899 page, 0, &new_block);
2900 mark_page_accessed(page);
2901unlock:
2902 unlock_page(page);
2903 page_cache_release(page);
2904 page = NULL;
2905 offset = map_end;
2906 if (ret)
2907 break;
2908 }
2909
2910 return ret;
2911}
2912
2913static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
2914 struct ocfs2_cow_context *context,
2915 u32 cpos, u32 old_cluster,
2916 u32 new_cluster, u32 new_len)
2917{
2918 int ret = 0;
2919 struct super_block *sb = context->inode->i_sb;
2920 struct ocfs2_caching_info *ci = context->data_et.et_ci;
2921 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
2922 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
2923 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2924 struct ocfs2_super *osb = OCFS2_SB(sb);
2925 struct buffer_head *old_bh = NULL;
2926 struct buffer_head *new_bh = NULL;
2927
2928 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
2929 new_cluster, new_len);
2930
2931 for (i = 0; i < blocks; i++, old_block++, new_block++) {
2932 new_bh = sb_getblk(osb->sb, new_block);
2933 if (new_bh == NULL) {
2934 ret = -EIO;
2935 mlog_errno(ret);
2936 break;
2937 }
2938
2939 ocfs2_set_new_buffer_uptodate(ci, new_bh);
2940
2941 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
2942 if (ret) {
2943 mlog_errno(ret);
2944 break;
2945 }
2946
2947 ret = ocfs2_journal_access(handle, ci, new_bh,
2948 OCFS2_JOURNAL_ACCESS_CREATE);
2949 if (ret) {
2950 mlog_errno(ret);
2951 break;
2952 }
2953
2954 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
2955 ret = ocfs2_journal_dirty(handle, new_bh);
2956 if (ret) {
2957 mlog_errno(ret);
2958 break;
2959 }
2960
2961 brelse(new_bh);
2962 brelse(old_bh);
2963 new_bh = NULL;
2964 old_bh = NULL;
2965 }
2966
2967 brelse(new_bh);
2968 brelse(old_bh);
2969 return ret;
2970}
2971
2972static int ocfs2_clear_ext_refcount(handle_t *handle,
2973 struct ocfs2_extent_tree *et,
2974 u32 cpos, u32 p_cluster, u32 len,
2975 unsigned int ext_flags,
2976 struct ocfs2_alloc_context *meta_ac,
2977 struct ocfs2_cached_dealloc_ctxt *dealloc)
2978{
2979 int ret, index;
2980 struct ocfs2_extent_rec replace_rec;
2981 struct ocfs2_path *path = NULL;
2982 struct ocfs2_extent_list *el;
2983 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2984 u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
2985
2986 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
2987 (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
2988
2989 memset(&replace_rec, 0, sizeof(replace_rec));
2990 replace_rec.e_cpos = cpu_to_le32(cpos);
2991 replace_rec.e_leaf_clusters = cpu_to_le16(len);
2992 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
2993 p_cluster));
2994 replace_rec.e_flags = ext_flags;
2995 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
2996
2997 path = ocfs2_new_path_from_et(et);
2998 if (!path) {
2999 ret = -ENOMEM;
3000 mlog_errno(ret);
3001 goto out;
3002 }
3003
3004 ret = ocfs2_find_path(et->et_ci, path, cpos);
3005 if (ret) {
3006 mlog_errno(ret);
3007 goto out;
3008 }
3009
3010 el = path_leaf_el(path);
3011
3012 index = ocfs2_search_extent_list(el, cpos);
3013 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3014 ocfs2_error(sb,
3015 "Inode %llu has an extent at cpos %u which can no "
3016 "longer be found.\n",
3017 (unsigned long long)ino, cpos);
3018 ret = -EROFS;
3019 goto out;
3020 }
3021
3022 ret = ocfs2_split_extent(handle, et, path, index,
3023 &replace_rec, meta_ac, dealloc);
3024 if (ret)
3025 mlog_errno(ret);
3026
3027out:
3028 ocfs2_free_path(path);
3029 return ret;
3030}
3031
3032static int ocfs2_replace_clusters(handle_t *handle,
3033 struct ocfs2_cow_context *context,
3034 u32 cpos, u32 old,
3035 u32 new, u32 len,
3036 unsigned int ext_flags)
3037{
3038 int ret;
3039 struct ocfs2_caching_info *ci = context->data_et.et_ci;
3040 u64 ino = ocfs2_metadata_cache_owner(ci);
3041
3042 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
3043 (unsigned long long)ino, cpos, old, new, len, ext_flags);
3044
3045 /*If the old clusters is unwritten, no need to duplicate. */
3046 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3047 ret = context->cow_duplicate_clusters(handle, context, cpos,
3048 old, new, len);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out;
3052 }
3053 }
3054
3055 ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3056 cpos, new, len, ext_flags,
3057 context->meta_ac, &context->dealloc);
3058 if (ret)
3059 mlog_errno(ret);
3060out:
3061 return ret;
3062}
3063
3064static int ocfs2_cow_sync_writeback(struct super_block *sb,
3065 struct ocfs2_cow_context *context,
3066 u32 cpos, u32 num_clusters)
3067{
3068 int ret = 0;
3069 loff_t offset, end, map_end;
3070 pgoff_t page_index;
3071 struct page *page;
3072
3073 if (ocfs2_should_order_data(context->inode))
3074 return 0;
3075
3076 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3077 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3078
3079 ret = filemap_fdatawrite_range(context->inode->i_mapping,
3080 offset, end - 1);
3081 if (ret < 0) {
3082 mlog_errno(ret);
3083 return ret;
3084 }
3085
3086 while (offset < end) {
3087 page_index = offset >> PAGE_CACHE_SHIFT;
3088 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
3089 if (map_end > end)
3090 map_end = end;
3091
3092 page = grab_cache_page(context->inode->i_mapping, page_index);
3093 BUG_ON(!page);
3094
3095 wait_on_page_writeback(page);
3096 if (PageError(page)) {
3097 ret = -EIO;
3098 mlog_errno(ret);
3099 } else
3100 mark_page_accessed(page);
3101
3102 unlock_page(page);
3103 page_cache_release(page);
3104 page = NULL;
3105 offset = map_end;
3106 if (ret)
3107 break;
3108 }
3109
3110 return ret;
3111}
3112
3113static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3114 u32 v_cluster, u32 *p_cluster,
3115 u32 *num_clusters,
3116 unsigned int *extent_flags)
3117{
3118 return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3119 num_clusters, extent_flags);
3120}
3121
3122static int ocfs2_make_clusters_writable(struct super_block *sb,
3123 struct ocfs2_cow_context *context,
3124 u32 cpos, u32 p_cluster,
3125 u32 num_clusters, unsigned int e_flags)
3126{
3127 int ret, delete, index, credits = 0;
3128 u32 new_bit, new_len;
3129 unsigned int set_len;
3130 struct ocfs2_super *osb = OCFS2_SB(sb);
3131 handle_t *handle;
3132 struct buffer_head *ref_leaf_bh = NULL;
3133 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3134 struct ocfs2_refcount_rec rec;
3135
3136 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
3137 cpos, p_cluster, num_clusters, e_flags);
3138
3139 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3140 &context->data_et,
3141 ref_ci,
3142 context->ref_root_bh,
3143 &context->meta_ac,
3144 &context->data_ac, &credits);
3145 if (ret) {
3146 mlog_errno(ret);
3147 return ret;
3148 }
3149
3150 if (context->post_refcount)
3151 credits += context->post_refcount->credits;
3152
3153 credits += context->extra_credits;
3154 handle = ocfs2_start_trans(osb, credits);
3155 if (IS_ERR(handle)) {
3156 ret = PTR_ERR(handle);
3157 mlog_errno(ret);
3158 goto out;
3159 }
3160
3161 while (num_clusters) {
3162 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3163 p_cluster, num_clusters,
3164 &rec, &index, &ref_leaf_bh);
3165 if (ret) {
3166 mlog_errno(ret);
3167 goto out_commit;
3168 }
3169
3170 BUG_ON(!rec.r_refcount);
3171 set_len = min((u64)p_cluster + num_clusters,
3172 le64_to_cpu(rec.r_cpos) +
3173 le32_to_cpu(rec.r_clusters)) - p_cluster;
3174
3175 /*
3176 * There are many different situation here.
3177 * 1. If refcount == 1, remove the flag and don't COW.
3178 * 2. If refcount > 1, allocate clusters.
3179 * Here we may not allocate r_len once at a time, so continue
3180 * until we reach num_clusters.
3181 */
3182 if (le32_to_cpu(rec.r_refcount) == 1) {
3183 delete = 0;
3184 ret = ocfs2_clear_ext_refcount(handle,
3185 &context->data_et,
3186 cpos, p_cluster,
3187 set_len, e_flags,
3188 context->meta_ac,
3189 &context->dealloc);
3190 if (ret) {
3191 mlog_errno(ret);
3192 goto out_commit;
3193 }
3194 } else {
3195 delete = 1;
3196
3197 ret = __ocfs2_claim_clusters(osb, handle,
3198 context->data_ac,
3199 1, set_len,
3200 &new_bit, &new_len);
3201 if (ret) {
3202 mlog_errno(ret);
3203 goto out_commit;
3204 }
3205
3206 ret = ocfs2_replace_clusters(handle, context,
3207 cpos, p_cluster, new_bit,
3208 new_len, e_flags);
3209 if (ret) {
3210 mlog_errno(ret);
3211 goto out_commit;
3212 }
3213 set_len = new_len;
3214 }
3215
3216 ret = __ocfs2_decrease_refcount(handle, ref_ci,
3217 context->ref_root_bh,
3218 p_cluster, set_len,
3219 context->meta_ac,
3220 &context->dealloc, delete);
3221 if (ret) {
3222 mlog_errno(ret);
3223 goto out_commit;
3224 }
3225
3226 cpos += set_len;
3227 p_cluster += set_len;
3228 num_clusters -= set_len;
3229 brelse(ref_leaf_bh);
3230 ref_leaf_bh = NULL;
3231 }
3232
3233 /* handle any post_cow action. */
3234 if (context->post_refcount && context->post_refcount->func) {
3235 ret = context->post_refcount->func(context->inode, handle,
3236 context->post_refcount->para);
3237 if (ret) {
3238 mlog_errno(ret);
3239 goto out_commit;
3240 }
3241 }
3242
3243 /*
3244 * Here we should write the new page out first if we are
3245 * in write-back mode.
3246 */
3247 if (context->get_clusters == ocfs2_di_get_clusters) {
3248 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
3249 if (ret)
3250 mlog_errno(ret);
3251 }
3252
3253out_commit:
3254 ocfs2_commit_trans(osb, handle);
3255
3256out:
3257 if (context->data_ac) {
3258 ocfs2_free_alloc_context(context->data_ac);
3259 context->data_ac = NULL;
3260 }
3261 if (context->meta_ac) {
3262 ocfs2_free_alloc_context(context->meta_ac);
3263 context->meta_ac = NULL;
3264 }
3265 brelse(ref_leaf_bh);
3266
3267 return ret;
3268}
3269
3270static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3271{
3272 int ret = 0;
3273 struct inode *inode = context->inode;
3274 u32 cow_start = context->cow_start, cow_len = context->cow_len;
3275 u32 p_cluster, num_clusters;
3276 unsigned int ext_flags;
3277 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3278
3279 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3280 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
3281 "tree, but the feature bit is not set in the "
3282 "super block.", inode->i_ino);
3283 return -EROFS;
3284 }
3285
3286 ocfs2_init_dealloc_ctxt(&context->dealloc);
3287
3288 while (cow_len) {
3289 ret = context->get_clusters(context, cow_start, &p_cluster,
3290 &num_clusters, &ext_flags);
3291 if (ret) {
3292 mlog_errno(ret);
3293 break;
3294 }
3295
3296 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3297
3298 if (cow_len < num_clusters)
3299 num_clusters = cow_len;
3300
3301 ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3302 cow_start, p_cluster,
3303 num_clusters, ext_flags);
3304 if (ret) {
3305 mlog_errno(ret);
3306 break;
3307 }
3308
3309 cow_len -= num_clusters;
3310 cow_start += num_clusters;
3311 }
3312
3313 if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3314 ocfs2_schedule_truncate_log_flush(osb, 1);
3315 ocfs2_run_deallocs(osb, &context->dealloc);
3316 }
3317
3318 return ret;
3319}
3320
3321/*
3322 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3323 * past max_cpos. This will stop when it runs into a hole or an
3324 * unrefcounted extent.
3325 */
3326static int ocfs2_refcount_cow_hunk(struct inode *inode,
3327 struct buffer_head *di_bh,
3328 u32 cpos, u32 write_len, u32 max_cpos)
3329{
3330 int ret;
3331 u32 cow_start = 0, cow_len = 0;
3332 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3333 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3334 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3335 struct buffer_head *ref_root_bh = NULL;
3336 struct ocfs2_refcount_tree *ref_tree;
3337 struct ocfs2_cow_context *context = NULL;
3338
3339 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3340
3341 ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3342 cpos, write_len, max_cpos,
3343 &cow_start, &cow_len);
3344 if (ret) {
3345 mlog_errno(ret);
3346 goto out;
3347 }
3348
3349 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
3350 "cow_len %u\n", inode->i_ino,
3351 cpos, write_len, cow_start, cow_len);
3352
3353 BUG_ON(cow_len == 0);
3354
3355 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3356 if (!context) {
3357 ret = -ENOMEM;
3358 mlog_errno(ret);
3359 goto out;
3360 }
3361
3362 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3363 1, &ref_tree, &ref_root_bh);
3364 if (ret) {
3365 mlog_errno(ret);
3366 goto out;
3367 }
3368
3369 context->inode = inode;
3370 context->cow_start = cow_start;
3371 context->cow_len = cow_len;
3372 context->ref_tree = ref_tree;
3373 context->ref_root_bh = ref_root_bh;
3374 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3375 context->get_clusters = ocfs2_di_get_clusters;
3376
3377 ocfs2_init_dinode_extent_tree(&context->data_et,
3378 INODE_CACHE(inode), di_bh);
3379
3380 ret = ocfs2_replace_cow(context);
3381 if (ret)
3382 mlog_errno(ret);
3383
3384 /*
3385 * truncate the extent map here since no matter whether we meet with
3386 * any error during the action, we shouldn't trust cached extent map
3387 * any more.
3388 */
3389 ocfs2_extent_map_trunc(inode, cow_start);
3390
3391 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3392 brelse(ref_root_bh);
3393out:
3394 kfree(context);
3395 return ret;
3396}
3397
3398/*
3399 * CoW any and all clusters between cpos and cpos+write_len.
3400 * Don't CoW past max_cpos. If this returns successfully, all
3401 * clusters between cpos and cpos+write_len are safe to modify.
3402 */
3403int ocfs2_refcount_cow(struct inode *inode,
3404 struct buffer_head *di_bh,
3405 u32 cpos, u32 write_len, u32 max_cpos)
3406{
3407 int ret = 0;
3408 u32 p_cluster, num_clusters;
3409 unsigned int ext_flags;
3410
3411 while (write_len) {
3412 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3413 &num_clusters, &ext_flags);
3414 if (ret) {
3415 mlog_errno(ret);
3416 break;
3417 }
3418
3419 if (write_len < num_clusters)
3420 num_clusters = write_len;
3421
3422 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3423 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3424 num_clusters, max_cpos);
3425 if (ret) {
3426 mlog_errno(ret);
3427 break;
3428 }
3429 }
3430
3431 write_len -= num_clusters;
3432 cpos += num_clusters;
3433 }
3434
3435 return ret;
3436}
3437
3438static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3439 u32 v_cluster, u32 *p_cluster,
3440 u32 *num_clusters,
3441 unsigned int *extent_flags)
3442{
3443 struct inode *inode = context->inode;
3444 struct ocfs2_xattr_value_root *xv = context->cow_object;
3445
3446 return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3447 num_clusters, &xv->xr_list,
3448 extent_flags);
3449}
3450
3451/*
3452 * Given a xattr value root, calculate the most meta/credits we need for
3453 * refcount tree change if we truncate it to 0.
3454 */
3455int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3456 struct ocfs2_caching_info *ref_ci,
3457 struct buffer_head *ref_root_bh,
3458 struct ocfs2_xattr_value_root *xv,
3459 int *meta_add, int *credits)
3460{
3461 int ret = 0, index, ref_blocks = 0;
3462 u32 p_cluster, num_clusters;
3463 u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3464 struct ocfs2_refcount_block *rb;
3465 struct ocfs2_refcount_rec rec;
3466 struct buffer_head *ref_leaf_bh = NULL;
3467
3468 while (cpos < clusters) {
3469 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3470 &num_clusters, &xv->xr_list,
3471 NULL);
3472 if (ret) {
3473 mlog_errno(ret);
3474 goto out;
3475 }
3476
3477 cpos += num_clusters;
3478
3479 while (num_clusters) {
3480 ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3481 p_cluster, num_clusters,
3482 &rec, &index,
3483 &ref_leaf_bh);
3484 if (ret) {
3485 mlog_errno(ret);
3486 goto out;
3487 }
3488
3489 BUG_ON(!rec.r_refcount);
3490
3491 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3492
3493 /*
3494 * We really don't know whether the other clusters is in
3495 * this refcount block or not, so just take the worst
3496 * case that all the clusters are in this block and each
3497 * one will split a refcount rec, so totally we need
3498 * clusters * 2 new refcount rec.
3499 */
3500 if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3501 le16_to_cpu(rb->rf_records.rl_count))
3502 ref_blocks++;
3503
3504 *credits += 1;
3505 brelse(ref_leaf_bh);
3506 ref_leaf_bh = NULL;
3507
3508 if (num_clusters <= le32_to_cpu(rec.r_clusters))
3509 break;
3510 else
3511 num_clusters -= le32_to_cpu(rec.r_clusters);
3512 p_cluster += num_clusters;
3513 }
3514 }
3515
3516 *meta_add += ref_blocks;
3517 if (!ref_blocks)
3518 goto out;
3519
3520 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3521 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3522 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3523 else {
3524 struct ocfs2_extent_tree et;
3525
3526 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3527 *credits += ocfs2_calc_extend_credits(inode->i_sb,
3528 et.et_root_el,
3529 ref_blocks);
3530 }
3531
3532out:
3533 brelse(ref_leaf_bh);
3534 return ret;
3535}
3536
3537/*
3538 * Do CoW for xattr.
3539 */
3540int ocfs2_refcount_cow_xattr(struct inode *inode,
3541 struct ocfs2_dinode *di,
3542 struct ocfs2_xattr_value_buf *vb,
3543 struct ocfs2_refcount_tree *ref_tree,
3544 struct buffer_head *ref_root_bh,
3545 u32 cpos, u32 write_len,
3546 struct ocfs2_post_refcount *post)
3547{
3548 int ret;
3549 struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3550 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3551 struct ocfs2_cow_context *context = NULL;
3552 u32 cow_start, cow_len;
3553
3554 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3555
3556 ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3557 cpos, write_len, UINT_MAX,
3558 &cow_start, &cow_len);
3559 if (ret) {
3560 mlog_errno(ret);
3561 goto out;
3562 }
3563
3564 BUG_ON(cow_len == 0);
3565
3566 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3567 if (!context) {
3568 ret = -ENOMEM;
3569 mlog_errno(ret);
3570 goto out;
3571 }
3572
3573 context->inode = inode;
3574 context->cow_start = cow_start;
3575 context->cow_len = cow_len;
3576 context->ref_tree = ref_tree;
3577 context->ref_root_bh = ref_root_bh;;
3578 context->cow_object = xv;
3579
3580 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3581 /* We need the extra credits for duplicate_clusters by jbd. */
3582 context->extra_credits =
3583 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3584 context->get_clusters = ocfs2_xattr_value_get_clusters;
3585 context->post_refcount = post;
3586
3587 ocfs2_init_xattr_value_extent_tree(&context->data_et,
3588 INODE_CACHE(inode), vb);
3589
3590 ret = ocfs2_replace_cow(context);
3591 if (ret)
3592 mlog_errno(ret);
3593
3594out:
3595 kfree(context);
3596 return ret;
3597}
3598
3599/*
3600 * Insert a new extent into refcount tree and mark a extent rec
3601 * as refcounted in the dinode tree.
3602 */
3603int ocfs2_add_refcount_flag(struct inode *inode,
3604 struct ocfs2_extent_tree *data_et,
3605 struct ocfs2_caching_info *ref_ci,
3606 struct buffer_head *ref_root_bh,
3607 u32 cpos, u32 p_cluster, u32 num_clusters,
3608 struct ocfs2_cached_dealloc_ctxt *dealloc,
3609 struct ocfs2_post_refcount *post)
3610{
3611 int ret;
3612 handle_t *handle;
3613 int credits = 1, ref_blocks = 0;
3614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3615 struct ocfs2_alloc_context *meta_ac = NULL;
3616
3617 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3618 ref_ci, ref_root_bh,
3619 p_cluster, num_clusters,
3620 &ref_blocks, &credits);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3625
3626 mlog(0, "reserve new metadata %d, credits = %d\n",
3627 ref_blocks, credits);
3628
3629 if (ref_blocks) {
3630 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3631 ref_blocks, &meta_ac);
3632 if (ret) {
3633 mlog_errno(ret);
3634 goto out;
3635 }
3636 }
3637
3638 if (post)
3639 credits += post->credits;
3640
3641 handle = ocfs2_start_trans(osb, credits);
3642 if (IS_ERR(handle)) {
3643 ret = PTR_ERR(handle);
3644 mlog_errno(ret);
3645 goto out;
3646 }
3647
3648 ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3649 cpos, num_clusters, p_cluster,
3650 meta_ac, dealloc);
3651 if (ret) {
3652 mlog_errno(ret);
3653 goto out_commit;
3654 }
3655
3656 ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3657 p_cluster, num_clusters, 0,
3658 meta_ac, dealloc);
3659 if (ret) {
3660 mlog_errno(ret);
3661 goto out_commit;
3662 }
3663
3664 if (post && post->func) {
3665 ret = post->func(inode, handle, post->para);
3666 if (ret)
3667 mlog_errno(ret);
3668 }
3669
3670out_commit:
3671 ocfs2_commit_trans(osb, handle);
3672out:
3673 if (meta_ac)
3674 ocfs2_free_alloc_context(meta_ac);
3675 return ret;
3676}
3677
3678static int ocfs2_change_ctime(struct inode *inode,
3679 struct buffer_head *di_bh)
3680{
3681 int ret;
3682 handle_t *handle;
3683 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3684
3685 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3686 OCFS2_INODE_UPDATE_CREDITS);
3687 if (IS_ERR(handle)) {
3688 ret = PTR_ERR(handle);
3689 mlog_errno(ret);
3690 goto out;
3691 }
3692
3693 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3694 OCFS2_JOURNAL_ACCESS_WRITE);
3695 if (ret) {
3696 mlog_errno(ret);
3697 goto out_commit;
3698 }
3699
3700 inode->i_ctime = CURRENT_TIME;
3701 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3702 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3703
3704 ocfs2_journal_dirty(handle, di_bh);
3705
3706out_commit:
3707 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3708out:
3709 return ret;
3710}
3711
3712static int ocfs2_attach_refcount_tree(struct inode *inode,
3713 struct buffer_head *di_bh)
3714{
3715 int ret, data_changed = 0;
3716 struct buffer_head *ref_root_bh = NULL;
3717 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3718 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3719 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3720 struct ocfs2_refcount_tree *ref_tree;
3721 unsigned int ext_flags;
3722 loff_t size;
3723 u32 cpos, num_clusters, clusters, p_cluster;
3724 struct ocfs2_cached_dealloc_ctxt dealloc;
3725 struct ocfs2_extent_tree di_et;
3726
3727 ocfs2_init_dealloc_ctxt(&dealloc);
3728
3729 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3730 ret = ocfs2_create_refcount_tree(inode, di_bh);
3731 if (ret) {
3732 mlog_errno(ret);
3733 goto out;
3734 }
3735 }
3736
3737 BUG_ON(!di->i_refcount_loc);
3738 ret = ocfs2_lock_refcount_tree(osb,
3739 le64_to_cpu(di->i_refcount_loc), 1,
3740 &ref_tree, &ref_root_bh);
3741 if (ret) {
3742 mlog_errno(ret);
3743 goto out;
3744 }
3745
3746 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3747
3748 size = i_size_read(inode);
3749 clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3750
3751 cpos = 0;
3752 while (cpos < clusters) {
3753 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3754 &num_clusters, &ext_flags);
3755
3756 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3757 ret = ocfs2_add_refcount_flag(inode, &di_et,
3758 &ref_tree->rf_ci,
3759 ref_root_bh, cpos,
3760 p_cluster, num_clusters,
3761 &dealloc, NULL);
3762 if (ret) {
3763 mlog_errno(ret);
3764 goto unlock;
3765 }
3766
3767 data_changed = 1;
3768 }
3769 cpos += num_clusters;
3770 }
3771
3772 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3773 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3774 &ref_tree->rf_ci,
3775 ref_root_bh,
3776 &dealloc);
3777 if (ret) {
3778 mlog_errno(ret);
3779 goto unlock;
3780 }
3781 }
3782
3783 if (data_changed) {
3784 ret = ocfs2_change_ctime(inode, di_bh);
3785 if (ret)
3786 mlog_errno(ret);
3787 }
3788
3789unlock:
3790 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3791 brelse(ref_root_bh);
3792
3793 if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3794 ocfs2_schedule_truncate_log_flush(osb, 1);
3795 ocfs2_run_deallocs(osb, &dealloc);
3796 }
3797out:
3798 /*
3799 * Empty the extent map so that we may get the right extent
3800 * record from the disk.
3801 */
3802 ocfs2_extent_map_trunc(inode, 0);
3803
3804 return ret;
3805}
3806
3807static int ocfs2_add_refcounted_extent(struct inode *inode,
3808 struct ocfs2_extent_tree *et,
3809 struct ocfs2_caching_info *ref_ci,
3810 struct buffer_head *ref_root_bh,
3811 u32 cpos, u32 p_cluster, u32 num_clusters,
3812 unsigned int ext_flags,
3813 struct ocfs2_cached_dealloc_ctxt *dealloc)
3814{
3815 int ret;
3816 handle_t *handle;
3817 int credits = 0;
3818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3819 struct ocfs2_alloc_context *meta_ac = NULL;
3820
3821 ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3822 p_cluster, num_clusters,
3823 et, ref_ci,
3824 ref_root_bh, &meta_ac,
3825 NULL, &credits);
3826 if (ret) {
3827 mlog_errno(ret);
3828 goto out;
3829 }
3830
3831 handle = ocfs2_start_trans(osb, credits);
3832 if (IS_ERR(handle)) {
3833 ret = PTR_ERR(handle);
3834 mlog_errno(ret);
3835 goto out;
3836 }
3837
3838 ret = ocfs2_insert_extent(handle, et, cpos,
3839 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
3840 p_cluster)),
3841 num_clusters, ext_flags, meta_ac);
3842 if (ret) {
3843 mlog_errno(ret);
3844 goto out_commit;
3845 }
3846
3847 ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3848 p_cluster, num_clusters,
3849 meta_ac, dealloc);
3850 if (ret)
3851 mlog_errno(ret);
3852
3853out_commit:
3854 ocfs2_commit_trans(osb, handle);
3855out:
3856 if (meta_ac)
3857 ocfs2_free_alloc_context(meta_ac);
3858 return ret;
3859}
3860
3861static int ocfs2_duplicate_extent_list(struct inode *s_inode,
3862 struct inode *t_inode,
3863 struct buffer_head *t_bh,
3864 struct ocfs2_caching_info *ref_ci,
3865 struct buffer_head *ref_root_bh,
3866 struct ocfs2_cached_dealloc_ctxt *dealloc)
3867{
3868 int ret = 0;
3869 u32 p_cluster, num_clusters, clusters, cpos;
3870 loff_t size;
3871 unsigned int ext_flags;
3872 struct ocfs2_extent_tree et;
3873
3874 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
3875
3876 size = i_size_read(s_inode);
3877 clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
3878
3879 cpos = 0;
3880 while (cpos < clusters) {
3881 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
3882 &num_clusters, &ext_flags);
3883
3884 if (p_cluster) {
3885 ret = ocfs2_add_refcounted_extent(t_inode, &et,
3886 ref_ci, ref_root_bh,
3887 cpos, p_cluster,
3888 num_clusters,
3889 ext_flags,
3890 dealloc);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out;
3894 }
3895 }
3896
3897 cpos += num_clusters;
3898 }
3899
3900out:
3901 return ret;
3902}
3903
3904/*
3905 * change the new file's attributes to the src.
3906 *
3907 * reflink creates a snapshot of a file, that means the attributes
3908 * must be identical except for three exceptions - nlink, ino, and ctime.
3909 */
3910static int ocfs2_complete_reflink(struct inode *s_inode,
3911 struct buffer_head *s_bh,
3912 struct inode *t_inode,
3913 struct buffer_head *t_bh,
3914 bool preserve)
3915{
3916 int ret;
3917 handle_t *handle;
3918 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3919 struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
3920 loff_t size = i_size_read(s_inode);
3921
3922 handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
3923 OCFS2_INODE_UPDATE_CREDITS);
3924 if (IS_ERR(handle)) {
3925 ret = PTR_ERR(handle);
3926 mlog_errno(ret);
3927 return ret;
3928 }
3929
3930 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3931 OCFS2_JOURNAL_ACCESS_WRITE);
3932 if (ret) {
3933 mlog_errno(ret);
3934 goto out_commit;
3935 }
3936
3937 spin_lock(&OCFS2_I(t_inode)->ip_lock);
3938 OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
3939 OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
3940 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
3941 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3942 i_size_write(t_inode, size);
3943
3944 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
3945 di->i_clusters = s_di->i_clusters;
3946 di->i_size = s_di->i_size;
3947 di->i_dyn_features = s_di->i_dyn_features;
3948 di->i_attr = s_di->i_attr;
3949
3950 if (preserve) {
3951 di->i_uid = s_di->i_uid;
3952 di->i_gid = s_di->i_gid;
3953 di->i_mode = s_di->i_mode;
3954
3955 /*
3956 * update time.
3957 * we want mtime to appear identical to the source and
3958 * update ctime.
3959 */
3960 t_inode->i_ctime = CURRENT_TIME;
3961
3962 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
3963 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
3964
3965 t_inode->i_mtime = s_inode->i_mtime;
3966 di->i_mtime = s_di->i_mtime;
3967 di->i_mtime_nsec = s_di->i_mtime_nsec;
3968 }
3969
3970 ocfs2_journal_dirty(handle, t_bh);
3971
3972out_commit:
3973 ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
3974 return ret;
3975}
3976
3977static int ocfs2_create_reflink_node(struct inode *s_inode,
3978 struct buffer_head *s_bh,
3979 struct inode *t_inode,
3980 struct buffer_head *t_bh,
3981 bool preserve)
3982{
3983 int ret;
3984 struct buffer_head *ref_root_bh = NULL;
3985 struct ocfs2_cached_dealloc_ctxt dealloc;
3986 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3987 struct ocfs2_refcount_block *rb;
3988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
3989 struct ocfs2_refcount_tree *ref_tree;
3990
3991 ocfs2_init_dealloc_ctxt(&dealloc);
3992
3993 ret = ocfs2_set_refcount_tree(t_inode, t_bh,
3994 le64_to_cpu(di->i_refcount_loc));
3995 if (ret) {
3996 mlog_errno(ret);
3997 goto out;
3998 }
3999
4000 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4001 1, &ref_tree, &ref_root_bh);
4002 if (ret) {
4003 mlog_errno(ret);
4004 goto out;
4005 }
4006 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4007
4008 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4009 &ref_tree->rf_ci, ref_root_bh,
4010 &dealloc);
4011 if (ret) {
4012 mlog_errno(ret);
4013 goto out_unlock_refcount;
4014 }
4015
4016 ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
4017 if (ret)
4018 mlog_errno(ret);
4019
4020out_unlock_refcount:
4021 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4022 brelse(ref_root_bh);
4023out:
4024 if (ocfs2_dealloc_has_cluster(&dealloc)) {
4025 ocfs2_schedule_truncate_log_flush(osb, 1);
4026 ocfs2_run_deallocs(osb, &dealloc);
4027 }
4028
4029 return ret;
4030}
4031
4032static int __ocfs2_reflink(struct dentry *old_dentry,
4033 struct buffer_head *old_bh,
4034 struct inode *new_inode,
4035 bool preserve)
4036{
4037 int ret;
4038 struct inode *inode = old_dentry->d_inode;
4039 struct buffer_head *new_bh = NULL;
4040
4041 ret = filemap_fdatawrite(inode->i_mapping);
4042 if (ret) {
4043 mlog_errno(ret);
4044 goto out;
4045 }
4046
4047 ret = ocfs2_attach_refcount_tree(inode, old_bh);
4048 if (ret) {
4049 mlog_errno(ret);
4050 goto out;
4051 }
4052
4053 mutex_lock(&new_inode->i_mutex);
4054 ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
4055 if (ret) {
4056 mlog_errno(ret);
4057 goto out_unlock;
4058 }
4059
4060 ret = ocfs2_create_reflink_node(inode, old_bh,
4061 new_inode, new_bh, preserve);
4062 if (ret) {
4063 mlog_errno(ret);
4064 goto inode_unlock;
4065 }
4066
4067 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4068 ret = ocfs2_reflink_xattrs(inode, old_bh,
4069 new_inode, new_bh,
4070 preserve);
4071 if (ret)
4072 mlog_errno(ret);
4073 }
4074inode_unlock:
4075 ocfs2_inode_unlock(new_inode, 1);
4076 brelse(new_bh);
4077out_unlock:
4078 mutex_unlock(&new_inode->i_mutex);
4079out:
4080 if (!ret) {
4081 ret = filemap_fdatawait(inode->i_mapping);
4082 if (ret)
4083 mlog_errno(ret);
4084 }
4085 return ret;
4086}
4087
4088static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4089 struct dentry *new_dentry, bool preserve)
4090{
4091 int error;
4092 struct inode *inode = old_dentry->d_inode;
4093 struct buffer_head *old_bh = NULL;
4094 struct inode *new_orphan_inode = NULL;
4095
4096 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4097 return -EOPNOTSUPP;
4098
4099 error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4100 &new_orphan_inode);
4101 if (error) {
4102 mlog_errno(error);
4103 goto out;
4104 }
4105
4106 error = ocfs2_inode_lock(inode, &old_bh, 1);
4107 if (error) {
4108 mlog_errno(error);
4109 goto out;
4110 }
4111
4112 down_write(&OCFS2_I(inode)->ip_xattr_sem);
4113 down_write(&OCFS2_I(inode)->ip_alloc_sem);
4114 error = __ocfs2_reflink(old_dentry, old_bh,
4115 new_orphan_inode, preserve);
4116 up_write(&OCFS2_I(inode)->ip_alloc_sem);
4117 up_write(&OCFS2_I(inode)->ip_xattr_sem);
4118
4119 ocfs2_inode_unlock(inode, 1);
4120 brelse(old_bh);
4121
4122 if (error) {
4123 mlog_errno(error);
4124 goto out;
4125 }
4126
4127 /* If the security isn't preserved, we need to re-initialize them. */
4128 if (!preserve) {
4129 error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
4130 if (error)
4131 mlog_errno(error);
4132 }
4133out:
4134 if (!error) {
4135 error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4136 new_dentry);
4137 if (error)
4138 mlog_errno(error);
4139 }
4140
4141 if (new_orphan_inode) {
4142 /*
4143 * We need to open_unlock the inode no matter whether we
4144 * succeed or not, so that other nodes can delete it later.
4145 */
4146 ocfs2_open_unlock(new_orphan_inode);
4147 if (error)
4148 iput(new_orphan_inode);
4149 }
4150
4151 return error;
4152}
4153
4154/*
4155 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4156 * sys_reflink(). This will go away when vfs_reflink() exists in
4157 * fs/namei.c.
4158 */
4159
4160/* copied from may_create in VFS. */
4161static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4162{
4163 if (child->d_inode)
4164 return -EEXIST;
4165 if (IS_DEADDIR(dir))
4166 return -ENOENT;
4167 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4168}
4169
4170/* copied from user_path_parent. */
4171static int ocfs2_user_path_parent(const char __user *path,
4172 struct nameidata *nd, char **name)
4173{
4174 char *s = getname(path);
4175 int error;
4176
4177 if (IS_ERR(s))
4178 return PTR_ERR(s);
4179
4180 error = path_lookup(s, LOOKUP_PARENT, nd);
4181 if (error)
4182 putname(s);
4183 else
4184 *name = s;
4185
4186 return error;
4187}
4188
4189/**
4190 * ocfs2_vfs_reflink - Create a reference-counted link
4191 *
4192 * @old_dentry: source dentry + inode
4193 * @dir: directory to create the target
4194 * @new_dentry: target dentry
4195 * @preserve: if true, preserve all file attributes
4196 */
4197int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4198 struct dentry *new_dentry, bool preserve)
4199{
4200 struct inode *inode = old_dentry->d_inode;
4201 int error;
4202
4203 if (!inode)
4204 return -ENOENT;
4205
4206 error = ocfs2_may_create(dir, new_dentry);
4207 if (error)
4208 return error;
4209
4210 if (dir->i_sb != inode->i_sb)
4211 return -EXDEV;
4212
4213 /*
4214 * A reflink to an append-only or immutable file cannot be created.
4215 */
4216 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4217 return -EPERM;
4218
4219 /* Only regular files can be reflinked. */
4220 if (!S_ISREG(inode->i_mode))
4221 return -EPERM;
4222
4223 /*
4224 * If the caller wants to preserve ownership, they require the
4225 * rights to do so.
4226 */
4227 if (preserve) {
4228 if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4229 return -EPERM;
4230 if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4231 return -EPERM;
4232 }
4233
4234 /*
4235 * If the caller is modifying any aspect of the attributes, they
4236 * are not creating a snapshot. They need read permission on the
4237 * file.
4238 */
4239 if (!preserve) {
4240 error = inode_permission(inode, MAY_READ);
4241 if (error)
4242 return error;
4243 }
4244
4245 mutex_lock(&inode->i_mutex);
4246 vfs_dq_init(dir);
4247 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4248 mutex_unlock(&inode->i_mutex);
4249 if (!error)
4250 fsnotify_create(dir, new_dentry);
4251 return error;
4252}
4253/*
4254 * Most codes are copied from sys_linkat.
4255 */
4256int ocfs2_reflink_ioctl(struct inode *inode,
4257 const char __user *oldname,
4258 const char __user *newname,
4259 bool preserve)
4260{
4261 struct dentry *new_dentry;
4262 struct nameidata nd;
4263 struct path old_path;
4264 int error;
4265 char *to = NULL;
4266
4267 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4268 return -EOPNOTSUPP;
4269
4270 error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4271 if (error) {
4272 mlog_errno(error);
4273 return error;
4274 }
4275
4276 error = ocfs2_user_path_parent(newname, &nd, &to);
4277 if (error) {
4278 mlog_errno(error);
4279 goto out;
4280 }
4281
4282 error = -EXDEV;
4283 if (old_path.mnt != nd.path.mnt)
4284 goto out_release;
4285 new_dentry = lookup_create(&nd, 0);
4286 error = PTR_ERR(new_dentry);
4287 if (IS_ERR(new_dentry)) {
4288 mlog_errno(error);
4289 goto out_unlock;
4290 }
4291
4292 error = mnt_want_write(nd.path.mnt);
4293 if (error) {
4294 mlog_errno(error);
4295 goto out_dput;
4296 }
4297
4298 error = ocfs2_vfs_reflink(old_path.dentry,
4299 nd.path.dentry->d_inode,
4300 new_dentry, preserve);
4301 mnt_drop_write(nd.path.mnt);
4302out_dput:
4303 dput(new_dentry);
4304out_unlock:
4305 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
4306out_release:
4307 path_put(&nd.path);
4308 putname(to);
4309out:
4310 path_put(&old_path);
4311
4312 return error;
4313}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 000000000000..c1d19b1d3ecc
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,106 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.h
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_REFCOUNTTREE_H
18#define OCFS2_REFCOUNTTREE_H
19
20struct ocfs2_refcount_tree {
21 struct rb_node rf_node;
22 u64 rf_blkno;
23 u32 rf_generation;
24 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed;
28
29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock;
32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb;
34};
35
36void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
37int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
38 struct ocfs2_refcount_tree **tree,
39 struct buffer_head **ref_bh);
40void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
41 struct ocfs2_refcount_tree *tree,
42 int rw);
43
44int ocfs2_decrease_refcount(struct inode *inode,
45 handle_t *handle, u32 cpos, u32 len,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh,
51 u64 phys_blkno,
52 u32 clusters,
53 int *credits,
54 struct ocfs2_alloc_context **meta_ac);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos);
57
58typedef int (ocfs2_post_refcount_func)(struct inode *inode,
59 handle_t *handle,
60 void *para);
61/*
62 * Some refcount caller need to do more work after we modify the data b-tree
63 * during refcount operation(including CoW and add refcount flag), and make the
64 * transaction complete. So it must give us this structure so that we can do it
65 * within our transaction.
66 *
67 */
68struct ocfs2_post_refcount {
69 int credits; /* credits it need for journal. */
70 ocfs2_post_refcount_func *func; /* real function. */
71 void *para;
72};
73
74int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
75 struct ocfs2_caching_info *ref_ci,
76 struct buffer_head *ref_root_bh,
77 struct ocfs2_xattr_value_root *xv,
78 int *meta_add, int *credits);
79int ocfs2_refcount_cow_xattr(struct inode *inode,
80 struct ocfs2_dinode *di,
81 struct ocfs2_xattr_value_buf *vb,
82 struct ocfs2_refcount_tree *ref_tree,
83 struct buffer_head *ref_root_bh,
84 u32 cpos, u32 write_len,
85 struct ocfs2_post_refcount *post);
86int ocfs2_add_refcount_flag(struct inode *inode,
87 struct ocfs2_extent_tree *data_et,
88 struct ocfs2_caching_info *ref_ci,
89 struct buffer_head *ref_root_bh,
90 u32 cpos, u32 p_cluster, u32 num_clusters,
91 struct ocfs2_cached_dealloc_ctxt *dealloc,
92 struct ocfs2_post_refcount *post);
93int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
94int ocfs2_try_remove_refcount_tree(struct inode *inode,
95 struct buffer_head *di_bh);
96int ocfs2_increase_refcount(handle_t *handle,
97 struct ocfs2_caching_info *ci,
98 struct buffer_head *ref_root_bh,
99 u64 cpos, u32 len,
100 struct ocfs2_alloc_context *meta_ac,
101 struct ocfs2_cached_dealloc_ctxt *dealloc);
102int ocfs2_reflink_ioctl(struct inode *inode,
103 const char __user *oldname,
104 const char __user *newname,
105 bool preserve);
106#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 424adaa5f900..3c3d673a4d20 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", 106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster); 107 new_clusters, first_new_cluster);
108 108
109 ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh, 109 ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
110 OCFS2_JOURNAL_ACCESS_WRITE); 110 group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) { 111 if (ret < 0) {
112 mlog_errno(ret); 112 mlog_errno(ret);
113 goto out; 113 goto out;
@@ -141,7 +141,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
141 } 141 }
142 142
143 /* update the inode accordingly. */ 143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh, 144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE); 145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) { 146 if (ret < 0) {
147 mlog_errno(ret); 147 mlog_errno(ret);
@@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
514 goto out_unlock; 514 goto out_unlock;
515 } 515 }
516 516
517 ocfs2_set_new_buffer_uptodate(inode, group_bh); 517 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
518 518
519 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); 519 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
520 if (ret) { 520 if (ret) {
@@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
536 cl = &fe->id2.i_chain; 536 cl = &fe->id2.i_chain;
537 cr = &cl->cl_recs[input->chain]; 537 cr = &cl->cl_recs[input->chain];
538 538
539 ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh, 539 ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
540 OCFS2_JOURNAL_ACCESS_WRITE); 540 group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
541 if (ret < 0) { 541 if (ret < 0) {
542 mlog_errno(ret); 542 mlog_errno(ret);
543 goto out_commit; 543 goto out_commit;
@@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
552 goto out_commit; 552 goto out_commit;
553 } 553 }
554 554
555 ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh, 555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 OCFS2_JOURNAL_ACCESS_WRITE); 556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
557 if (ret < 0) { 557 if (ret < 0) {
558 mlog_errno(ret); 558 mlog_errno(ret);
559 goto out_commit; 559 goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 40661e7824e9..bfbd7e9e949f 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If 150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail. 151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */ 152 */
153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, 153 ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
154 OCFS2_BH_IGNORE_CACHE, NULL); 154 si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
155 if (ret == 0) { 155 if (ret == 0) {
156 spin_lock(&osb->osb_lock); 156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si); 157 ocfs2_update_slot_info(si);
@@ -213,7 +213,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
213 ocfs2_update_disk_slot_old(si, slot_num, &bh); 213 ocfs2_update_disk_slot_old(si, slot_num, &bh);
214 spin_unlock(&osb->osb_lock); 214 spin_unlock(&osb->osb_lock);
215 215
216 status = ocfs2_write_block(osb, bh, si->si_inode); 216 status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
217 if (status < 0) 217 if (status < 0)
218 mlog_errno(status); 218 mlog_errno(status);
219 219
@@ -404,8 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
404 (unsigned long long)blkno); 404 (unsigned long long)blkno);
405 405
406 bh = NULL; /* Acquire a fresh bh */ 406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, 407 status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
408 OCFS2_BH_IGNORE_CACHE, NULL); 408 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
409 if (status < 0) { 409 if (status < 0) {
410 mlog_errno(status); 410 mlog_errno(status);
411 goto bail; 411 goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 73a16d4666dc..c30b644d9572 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -310,7 +310,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
310 int rc; 310 int rc;
311 struct buffer_head *tmp = *bh; 311 struct buffer_head *tmp = *bh;
312 312
313 rc = ocfs2_read_block(inode, gd_blkno, &tmp, 313 rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
314 ocfs2_validate_group_descriptor); 314 ocfs2_validate_group_descriptor);
315 if (rc) 315 if (rc)
316 goto out; 316 goto out;
@@ -352,7 +352,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
352 } 352 }
353 353
354 status = ocfs2_journal_access_gd(handle, 354 status = ocfs2_journal_access_gd(handle,
355 alloc_inode, 355 INODE_CACHE(alloc_inode),
356 bg_bh, 356 bg_bh,
357 OCFS2_JOURNAL_ACCESS_CREATE); 357 OCFS2_JOURNAL_ACCESS_CREATE);
358 if (status < 0) { 358 if (status < 0) {
@@ -476,7 +476,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
476 mlog_errno(status); 476 mlog_errno(status);
477 goto bail; 477 goto bail;
478 } 478 }
479 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); 479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
480 480
481 status = ocfs2_block_group_fill(handle, 481 status = ocfs2_block_group_fill(handle,
482 alloc_inode, 482 alloc_inode,
@@ -491,7 +491,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
491 491
492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 492 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
493 493
494 status = ocfs2_journal_access_di(handle, alloc_inode, 494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
495 bh, OCFS2_JOURNAL_ACCESS_WRITE); 495 bh, OCFS2_JOURNAL_ACCESS_WRITE);
496 if (status < 0) { 496 if (status < 0) {
497 mlog_errno(status); 497 mlog_errno(status);
@@ -1033,7 +1033,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1033 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1033 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1034 1034
1035 status = ocfs2_journal_access_gd(handle, 1035 status = ocfs2_journal_access_gd(handle,
1036 alloc_inode, 1036 INODE_CACHE(alloc_inode),
1037 group_bh, 1037 group_bh,
1038 journal_type); 1038 journal_type);
1039 if (status < 0) { 1039 if (status < 0) {
@@ -1106,7 +1106,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1106 bg_ptr = le64_to_cpu(bg->bg_next_group); 1106 bg_ptr = le64_to_cpu(bg->bg_next_group);
1107 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1107 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1108 1108
1109 status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, 1109 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1110 prev_bg_bh,
1110 OCFS2_JOURNAL_ACCESS_WRITE); 1111 OCFS2_JOURNAL_ACCESS_WRITE);
1111 if (status < 0) { 1112 if (status < 0) {
1112 mlog_errno(status); 1113 mlog_errno(status);
@@ -1121,8 +1122,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1121 goto out_rollback; 1122 goto out_rollback;
1122 } 1123 }
1123 1124
1124 status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, 1125 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1125 OCFS2_JOURNAL_ACCESS_WRITE); 1126 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1126 if (status < 0) { 1127 if (status < 0) {
1127 mlog_errno(status); 1128 mlog_errno(status);
1128 goto out_rollback; 1129 goto out_rollback;
@@ -1136,8 +1137,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1136 goto out_rollback; 1137 goto out_rollback;
1137 } 1138 }
1138 1139
1139 status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, 1140 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1140 OCFS2_JOURNAL_ACCESS_WRITE); 1141 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1141 if (status < 0) { 1142 if (status < 0) {
1142 mlog_errno(status); 1143 mlog_errno(status);
1143 goto out_rollback; 1144 goto out_rollback;
@@ -1288,7 +1289,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1288 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1289 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1289 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1290 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1290 1291
1291 ret = ocfs2_journal_access_di(handle, inode, di_bh, 1292 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1292 OCFS2_JOURNAL_ACCESS_WRITE); 1293 OCFS2_JOURNAL_ACCESS_WRITE);
1293 if (ret < 0) { 1294 if (ret < 0) {
1294 mlog_errno(ret); 1295 mlog_errno(ret);
@@ -1461,7 +1462,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1461 /* Ok, claim our bits now: set the info on dinode, chainlist 1462 /* Ok, claim our bits now: set the info on dinode, chainlist
1462 * and then the group */ 1463 * and then the group */
1463 status = ocfs2_journal_access_di(handle, 1464 status = ocfs2_journal_access_di(handle,
1464 alloc_inode, 1465 INODE_CACHE(alloc_inode),
1465 ac->ac_bh, 1466 ac->ac_bh,
1466 OCFS2_JOURNAL_ACCESS_WRITE); 1467 OCFS2_JOURNAL_ACCESS_WRITE);
1467 if (status < 0) { 1468 if (status < 0) {
@@ -1907,8 +1908,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1907 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1908 if (ocfs2_is_cluster_bitmap(alloc_inode))
1908 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1909 1910
1910 status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, 1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1911 journal_type); 1912 group_bh, journal_type);
1912 if (status < 0) { 1913 if (status < 0) {
1913 mlog_errno(status); 1914 mlog_errno(status);
1914 goto bail; 1915 goto bail;
@@ -1993,8 +1994,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1993 goto bail; 1994 goto bail;
1994 } 1995 }
1995 1996
1996 status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, 1997 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1997 OCFS2_JOURNAL_ACCESS_WRITE); 1998 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1998 if (status < 0) { 1999 if (status < 0) {
1999 mlog_errno(status); 2000 mlog_errno(status);
2000 goto bail; 2001 goto bail;
@@ -2151,7 +2152,7 @@ int ocfs2_lock_allocators(struct inode *inode,
2151 2152
2152 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2153 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2153 2154
2154 num_free_extents = ocfs2_num_free_extents(osb, inode, et); 2155 num_free_extents = ocfs2_num_free_extents(osb, et);
2155 if (num_free_extents < 0) { 2156 if (num_free_extents < 0) {
2156 ret = num_free_extents; 2157 ret = num_free_extents;
2157 mlog_errno(ret); 2158 mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a3f8871d21fd..c0e48aeebb1c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/random.h> 32#include <linux/random.h>
34#include <linux/statfs.h> 33#include <linux/statfs.h>
@@ -69,6 +68,7 @@
69#include "ver.h" 68#include "ver.h"
70#include "xattr.h" 69#include "xattr.h"
71#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h"
72 72
73#include "buffer_head_io.h" 73#include "buffer_head_io.h"
74 74
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
373} 373}
374#endif /* CONFIG_DEBUG_FS */ 374#endif /* CONFIG_DEBUG_FS */
375 375
376static struct file_operations ocfs2_osb_debug_fops = { 376static const struct file_operations ocfs2_osb_debug_fops = {
377 .open = ocfs2_osb_debug_open, 377 .open = ocfs2_osb_debug_open,
378 .release = ocfs2_debug_release, 378 .release = ocfs2_debug_release,
379 .read = ocfs2_debug_read, 379 .read = ocfs2_debug_read,
@@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
965 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); 965 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
966} 966}
967 967
968static struct quotactl_ops ocfs2_quotactl_ops = { 968static const struct quotactl_ops ocfs2_quotactl_ops = {
969 .quota_on = ocfs2_quota_on, 969 .quota_on = ocfs2_quota_on,
970 .quota_off = ocfs2_quota_off, 970 .quota_off = ocfs2_quota_off,
971 .quota_sync = vfs_quota_sync, 971 .quota_sync = vfs_quota_sync,
@@ -1668,8 +1668,6 @@ static void ocfs2_inode_init_once(void *data)
1668 spin_lock_init(&oi->ip_lock); 1668 spin_lock_init(&oi->ip_lock);
1669 ocfs2_extent_map_init(&oi->vfs_inode); 1669 ocfs2_extent_map_init(&oi->vfs_inode);
1670 INIT_LIST_HEAD(&oi->ip_io_markers); 1670 INIT_LIST_HEAD(&oi->ip_io_markers);
1671 oi->ip_created_trans = 0;
1672 oi->ip_last_trans = 0;
1673 oi->ip_dir_start_lookup = 0; 1671 oi->ip_dir_start_lookup = 0;
1674 1672
1675 init_rwsem(&oi->ip_alloc_sem); 1673 init_rwsem(&oi->ip_alloc_sem);
@@ -1683,7 +1681,8 @@ static void ocfs2_inode_init_once(void *data)
1683 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1681 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1684 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1682 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1685 1683
1686 ocfs2_metadata_cache_init(&oi->vfs_inode); 1684 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1685 &ocfs2_inode_caching_ops);
1687 1686
1688 inode_init_once(&oi->vfs_inode); 1687 inode_init_once(&oi->vfs_inode);
1689} 1688}
@@ -1859,6 +1858,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1859 1858
1860 ocfs2_sync_blockdev(sb); 1859 ocfs2_sync_blockdev(sb);
1861 1860
1861 ocfs2_purge_refcount_trees(osb);
1862
1862 /* No cluster connection means we've failed during mount, so skip 1863 /* No cluster connection means we've failed during mount, so skip
1863 * all the steps which depended on that to complete. */ 1864 * all the steps which depended on that to complete. */
1864 if (osb->cconn) { 1865 if (osb->cconn) {
@@ -2065,6 +2066,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2065 goto bail; 2066 goto bail;
2066 } 2067 }
2067 2068
2069 osb->osb_rf_lock_tree = RB_ROOT;
2070
2068 osb->s_feature_compat = 2071 osb->s_feature_compat =
2069 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); 2072 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
2070 osb->s_feature_ro_compat = 2073 osb->s_feature_ro_compat =
@@ -2490,7 +2493,8 @@ void __ocfs2_abort(struct super_block* sb,
2490 /* Force a panic(). This stinks, but it's better than letting 2493 /* Force a panic(). This stinks, but it's better than letting
2491 * things continue without having a proper hard readonly 2494 * things continue without having a proper hard readonly
2492 * here. */ 2495 * here. */
2493 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 2496 if (!ocfs2_mount_local(OCFS2_SB(sb)))
2497 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
2494 ocfs2_handle_error(sb); 2498 ocfs2_handle_error(sb);
2495} 2499}
2496 2500
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110f..e3421030a69f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
38#include <linux/types.h> 38#include <linux/types.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/utsname.h>
42#include <linux/namei.h> 41#include <linux/namei.h>
43 42
44#define MLOG_MASK_PREFIX ML_NAMEI 43#define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 187b99ff0368..b6284f235d2f 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -75,15 +75,77 @@ struct ocfs2_meta_cache_item {
75 75
76static struct kmem_cache *ocfs2_uptodate_cachep = NULL; 76static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
77 77
78void ocfs2_metadata_cache_init(struct inode *inode) 78u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
79{ 79{
80 struct ocfs2_inode_info *oi = OCFS2_I(inode); 80 BUG_ON(!ci || !ci->ci_ops);
81 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
82 81
83 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; 82 return ci->ci_ops->co_owner(ci);
83}
84
85struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
86{
87 BUG_ON(!ci || !ci->ci_ops);
88
89 return ci->ci_ops->co_get_super(ci);
90}
91
92static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
93{
94 BUG_ON(!ci || !ci->ci_ops);
95
96 ci->ci_ops->co_cache_lock(ci);
97}
98
99static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
100{
101 BUG_ON(!ci || !ci->ci_ops);
102
103 ci->ci_ops->co_cache_unlock(ci);
104}
105
106void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
107{
108 BUG_ON(!ci || !ci->ci_ops);
109
110 ci->ci_ops->co_io_lock(ci);
111}
112
113void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
114{
115 BUG_ON(!ci || !ci->ci_ops);
116
117 ci->ci_ops->co_io_unlock(ci);
118}
119
120
121static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
122 int clear)
123{
124 ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
84 ci->ci_num_cached = 0; 125 ci->ci_num_cached = 0;
126
127 if (clear) {
128 ci->ci_created_trans = 0;
129 ci->ci_last_trans = 0;
130 }
131}
132
133void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
134 const struct ocfs2_caching_operations *ops)
135{
136 BUG_ON(!ops);
137
138 ci->ci_ops = ops;
139 ocfs2_metadata_cache_reset(ci, 1);
85} 140}
86 141
142void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
143{
144 ocfs2_metadata_cache_purge(ci);
145 ocfs2_metadata_cache_reset(ci, 1);
146}
147
148
87/* No lock taken here as 'root' is not expected to be visible to other 149/* No lock taken here as 'root' is not expected to be visible to other
88 * processes. */ 150 * processes. */
89static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) 151static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
@@ -112,19 +174,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
112 * This function is a few more lines longer than necessary due to some 174 * This function is a few more lines longer than necessary due to some
113 * accounting done here, but I think it's worth tracking down those 175 * accounting done here, but I think it's worth tracking down those
114 * bugs sooner -- Mark */ 176 * bugs sooner -- Mark */
115void ocfs2_metadata_cache_purge(struct inode *inode) 177void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
116{ 178{
117 struct ocfs2_inode_info *oi = OCFS2_I(inode);
118 unsigned int tree, to_purge, purged; 179 unsigned int tree, to_purge, purged;
119 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
120 struct rb_root root = RB_ROOT; 180 struct rb_root root = RB_ROOT;
121 181
122 spin_lock(&oi->ip_lock); 182 BUG_ON(!ci || !ci->ci_ops);
123 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); 183
184 ocfs2_metadata_cache_lock(ci);
185 tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
124 to_purge = ci->ci_num_cached; 186 to_purge = ci->ci_num_cached;
125 187
126 mlog(0, "Purge %u %s items from Inode %llu\n", to_purge, 188 mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
127 tree ? "array" : "tree", (unsigned long long)oi->ip_blkno); 189 tree ? "array" : "tree",
190 (unsigned long long)ocfs2_metadata_cache_owner(ci));
128 191
129 /* If we're a tree, save off the root so that we can safely 192 /* If we're a tree, save off the root so that we can safely
130 * initialize the cache. We do the work to free tree members 193 * initialize the cache. We do the work to free tree members
@@ -132,16 +195,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode)
132 if (tree) 195 if (tree)
133 root = ci->ci_cache.ci_tree; 196 root = ci->ci_cache.ci_tree;
134 197
135 ocfs2_metadata_cache_init(inode); 198 ocfs2_metadata_cache_reset(ci, 0);
136 spin_unlock(&oi->ip_lock); 199 ocfs2_metadata_cache_unlock(ci);
137 200
138 purged = ocfs2_purge_copied_metadata_tree(&root); 201 purged = ocfs2_purge_copied_metadata_tree(&root);
139 /* If possible, track the number wiped so that we can more 202 /* If possible, track the number wiped so that we can more
140 * easily detect counting errors. Unfortunately, this is only 203 * easily detect counting errors. Unfortunately, this is only
141 * meaningful for trees. */ 204 * meaningful for trees. */
142 if (tree && purged != to_purge) 205 if (tree && purged != to_purge)
143 mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n", 206 mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
144 (unsigned long long)oi->ip_blkno, to_purge, purged); 207 (unsigned long long)ocfs2_metadata_cache_owner(ci),
208 to_purge, purged);
145} 209}
146 210
147/* Returns the index in the cache array, -1 if not found. 211/* Returns the index in the cache array, -1 if not found.
@@ -182,27 +246,25 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
182 return NULL; 246 return NULL;
183} 247}
184 248
185static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, 249static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
186 struct buffer_head *bh) 250 struct buffer_head *bh)
187{ 251{
188 int index = -1; 252 int index = -1;
189 struct ocfs2_meta_cache_item *item = NULL; 253 struct ocfs2_meta_cache_item *item = NULL;
190 254
191 spin_lock(&oi->ip_lock); 255 ocfs2_metadata_cache_lock(ci);
192 256
193 mlog(0, "Inode %llu, query block %llu (inline = %u)\n", 257 mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
194 (unsigned long long)oi->ip_blkno, 258 (unsigned long long)ocfs2_metadata_cache_owner(ci),
195 (unsigned long long) bh->b_blocknr, 259 (unsigned long long) bh->b_blocknr,
196 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); 260 !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
197 261
198 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) 262 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
199 index = ocfs2_search_cache_array(&oi->ip_metadata_cache, 263 index = ocfs2_search_cache_array(ci, bh->b_blocknr);
200 bh->b_blocknr);
201 else 264 else
202 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, 265 item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
203 bh->b_blocknr);
204 266
205 spin_unlock(&oi->ip_lock); 267 ocfs2_metadata_cache_unlock(ci);
206 268
207 mlog(0, "index = %d, item = %p\n", index, item); 269 mlog(0, "index = %d, item = %p\n", index, item);
208 270
@@ -214,7 +276,7 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
214 * 276 *
215 * This can be called under lock_buffer() 277 * This can be called under lock_buffer()
216 */ 278 */
217int ocfs2_buffer_uptodate(struct inode *inode, 279int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
218 struct buffer_head *bh) 280 struct buffer_head *bh)
219{ 281{
220 /* Doesn't matter if the bh is in our cache or not -- if it's 282 /* Doesn't matter if the bh is in our cache or not -- if it's
@@ -230,24 +292,24 @@ int ocfs2_buffer_uptodate(struct inode *inode,
230 292
231 /* Ok, locally the buffer is marked as up to date, now search 293 /* Ok, locally the buffer is marked as up to date, now search
232 * our cache to see if we can trust that. */ 294 * our cache to see if we can trust that. */
233 return ocfs2_buffer_cached(OCFS2_I(inode), bh); 295 return ocfs2_buffer_cached(ci, bh);
234} 296}
235 297
236/* 298/*
237 * Determine whether a buffer is currently out on a read-ahead request. 299 * Determine whether a buffer is currently out on a read-ahead request.
238 * ip_io_sem should be held to serialize submitters with the logic here. 300 * ci_io_sem should be held to serialize submitters with the logic here.
239 */ 301 */
240int ocfs2_buffer_read_ahead(struct inode *inode, 302int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
241 struct buffer_head *bh) 303 struct buffer_head *bh)
242{ 304{
243 return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh); 305 return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
244} 306}
245 307
246/* Requires ip_lock */ 308/* Requires ip_lock */
247static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, 309static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
248 sector_t block) 310 sector_t block)
249{ 311{
250 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); 312 BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
251 313
252 mlog(0, "block %llu takes position %u\n", (unsigned long long) block, 314 mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
253 ci->ci_num_cached); 315 ci->ci_num_cached);
@@ -292,66 +354,64 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
292 ci->ci_num_cached++; 354 ci->ci_num_cached++;
293} 355}
294 356
295static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, 357/* co_cache_lock() must be held */
296 struct ocfs2_caching_info *ci) 358static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
297{ 359{
298 assert_spin_locked(&oi->ip_lock); 360 return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
299 361 (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
300 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
301 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
302} 362}
303 363
304/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the 364/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
305 * pointers in tree after we use them - this allows caller to detect 365 * pointers in tree after we use them - this allows caller to detect
306 * when to free in case of error. */ 366 * when to free in case of error.
307static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, 367 *
368 * The co_cache_lock() must be held. */
369static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
308 struct ocfs2_meta_cache_item **tree) 370 struct ocfs2_meta_cache_item **tree)
309{ 371{
310 int i; 372 int i;
311 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
312 373
313 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, 374 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
314 "Inode %llu, num cached = %u, should be %u\n", 375 "Owner %llu, num cached = %u, should be %u\n",
315 (unsigned long long)oi->ip_blkno, ci->ci_num_cached, 376 (unsigned long long)ocfs2_metadata_cache_owner(ci),
316 OCFS2_INODE_MAX_CACHE_ARRAY); 377 ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
317 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), 378 mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
318 "Inode %llu not marked as inline anymore!\n", 379 "Owner %llu not marked as inline anymore!\n",
319 (unsigned long long)oi->ip_blkno); 380 (unsigned long long)ocfs2_metadata_cache_owner(ci));
320 assert_spin_locked(&oi->ip_lock);
321 381
322 /* Be careful to initialize the tree members *first* because 382 /* Be careful to initialize the tree members *first* because
323 * once the ci_tree is used, the array is junk... */ 383 * once the ci_tree is used, the array is junk... */
324 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) 384 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
325 tree[i]->c_block = ci->ci_cache.ci_array[i]; 385 tree[i]->c_block = ci->ci_cache.ci_array[i];
326 386
327 oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; 387 ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
328 ci->ci_cache.ci_tree = RB_ROOT; 388 ci->ci_cache.ci_tree = RB_ROOT;
329 /* this will be set again by __ocfs2_insert_cache_tree */ 389 /* this will be set again by __ocfs2_insert_cache_tree */
330 ci->ci_num_cached = 0; 390 ci->ci_num_cached = 0;
331 391
332 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { 392 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
333 __ocfs2_insert_cache_tree(ci, tree[i]); 393 __ocfs2_insert_cache_tree(ci, tree[i]);
334 tree[i] = NULL; 394 tree[i] = NULL;
335 } 395 }
336 396
337 mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", 397 mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
338 (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); 398 (unsigned long long)ocfs2_metadata_cache_owner(ci),
399 ci->ci_flags, ci->ci_num_cached);
339} 400}
340 401
341/* Slow path function - memory allocation is necessary. See the 402/* Slow path function - memory allocation is necessary. See the
342 * comment above ocfs2_set_buffer_uptodate for more information. */ 403 * comment above ocfs2_set_buffer_uptodate for more information. */
343static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, 404static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
344 sector_t block, 405 sector_t block,
345 int expand_tree) 406 int expand_tree)
346{ 407{
347 int i; 408 int i;
348 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
349 struct ocfs2_meta_cache_item *new = NULL; 409 struct ocfs2_meta_cache_item *new = NULL;
350 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = 410 struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
351 { NULL, }; 411 { NULL, };
352 412
353 mlog(0, "Inode %llu, block %llu, expand = %d\n", 413 mlog(0, "Owner %llu, block %llu, expand = %d\n",
354 (unsigned long long)oi->ip_blkno, 414 (unsigned long long)ocfs2_metadata_cache_owner(ci),
355 (unsigned long long)block, expand_tree); 415 (unsigned long long)block, expand_tree);
356 416
357 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); 417 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
@@ -364,7 +424,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
364 if (expand_tree) { 424 if (expand_tree) {
365 /* Do *not* allocate an array here - the removal code 425 /* Do *not* allocate an array here - the removal code
366 * has no way of tracking that. */ 426 * has no way of tracking that. */
367 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { 427 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
368 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, 428 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
369 GFP_NOFS); 429 GFP_NOFS);
370 if (!tree[i]) { 430 if (!tree[i]) {
@@ -376,21 +436,21 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
376 } 436 }
377 } 437 }
378 438
379 spin_lock(&oi->ip_lock); 439 ocfs2_metadata_cache_lock(ci);
380 if (ocfs2_insert_can_use_array(oi, ci)) { 440 if (ocfs2_insert_can_use_array(ci)) {
381 mlog(0, "Someone cleared the tree underneath us\n"); 441 mlog(0, "Someone cleared the tree underneath us\n");
382 /* Ok, items were removed from the cache in between 442 /* Ok, items were removed from the cache in between
383 * locks. Detect this and revert back to the fast path */ 443 * locks. Detect this and revert back to the fast path */
384 ocfs2_append_cache_array(ci, block); 444 ocfs2_append_cache_array(ci, block);
385 spin_unlock(&oi->ip_lock); 445 ocfs2_metadata_cache_unlock(ci);
386 goto out_free; 446 goto out_free;
387 } 447 }
388 448
389 if (expand_tree) 449 if (expand_tree)
390 ocfs2_expand_cache(oi, tree); 450 ocfs2_expand_cache(ci, tree);
391 451
392 __ocfs2_insert_cache_tree(ci, new); 452 __ocfs2_insert_cache_tree(ci, new);
393 spin_unlock(&oi->ip_lock); 453 ocfs2_metadata_cache_unlock(ci);
394 454
395 new = NULL; 455 new = NULL;
396out_free: 456out_free:
@@ -400,14 +460,14 @@ out_free:
400 /* If these were used, then ocfs2_expand_cache re-set them to 460 /* If these were used, then ocfs2_expand_cache re-set them to
401 * NULL for us. */ 461 * NULL for us. */
402 if (tree[0]) { 462 if (tree[0]) {
403 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) 463 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
404 if (tree[i]) 464 if (tree[i])
405 kmem_cache_free(ocfs2_uptodate_cachep, 465 kmem_cache_free(ocfs2_uptodate_cachep,
406 tree[i]); 466 tree[i]);
407 } 467 }
408} 468}
409 469
410/* Item insertion is guarded by ip_io_mutex, so the insertion path takes 470/* Item insertion is guarded by co_io_lock(), so the insertion path takes
411 * advantage of this by not rechecking for a duplicate insert during 471 * advantage of this by not rechecking for a duplicate insert during
412 * the slow case. Additionally, if the cache needs to be bumped up to 472 * the slow case. Additionally, if the cache needs to be bumped up to
413 * a tree, the code will not recheck after acquiring the lock -- 473 * a tree, the code will not recheck after acquiring the lock --
@@ -425,59 +485,55 @@ out_free:
425 * Readahead buffers can be passed in here before the I/O request is 485 * Readahead buffers can be passed in here before the I/O request is
426 * completed. 486 * completed.
427 */ 487 */
428void ocfs2_set_buffer_uptodate(struct inode *inode, 488void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
429 struct buffer_head *bh) 489 struct buffer_head *bh)
430{ 490{
431 int expand; 491 int expand;
432 struct ocfs2_inode_info *oi = OCFS2_I(inode);
433 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
434 492
435 /* The block may very well exist in our cache already, so avoid 493 /* The block may very well exist in our cache already, so avoid
436 * doing any more work in that case. */ 494 * doing any more work in that case. */
437 if (ocfs2_buffer_cached(oi, bh)) 495 if (ocfs2_buffer_cached(ci, bh))
438 return; 496 return;
439 497
440 mlog(0, "Inode %llu, inserting block %llu\n", 498 mlog(0, "Owner %llu, inserting block %llu\n",
441 (unsigned long long)oi->ip_blkno, 499 (unsigned long long)ocfs2_metadata_cache_owner(ci),
442 (unsigned long long)bh->b_blocknr); 500 (unsigned long long)bh->b_blocknr);
443 501
444 /* No need to recheck under spinlock - insertion is guarded by 502 /* No need to recheck under spinlock - insertion is guarded by
445 * ip_io_mutex */ 503 * co_io_lock() */
446 spin_lock(&oi->ip_lock); 504 ocfs2_metadata_cache_lock(ci);
447 if (ocfs2_insert_can_use_array(oi, ci)) { 505 if (ocfs2_insert_can_use_array(ci)) {
448 /* Fast case - it's an array and there's a free 506 /* Fast case - it's an array and there's a free
449 * spot. */ 507 * spot. */
450 ocfs2_append_cache_array(ci, bh->b_blocknr); 508 ocfs2_append_cache_array(ci, bh->b_blocknr);
451 spin_unlock(&oi->ip_lock); 509 ocfs2_metadata_cache_unlock(ci);
452 return; 510 return;
453 } 511 }
454 512
455 expand = 0; 513 expand = 0;
456 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { 514 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
457 /* We need to bump things up to a tree. */ 515 /* We need to bump things up to a tree. */
458 expand = 1; 516 expand = 1;
459 } 517 }
460 spin_unlock(&oi->ip_lock); 518 ocfs2_metadata_cache_unlock(ci);
461 519
462 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); 520 __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
463} 521}
464 522
465/* Called against a newly allocated buffer. Most likely nobody should 523/* Called against a newly allocated buffer. Most likely nobody should
466 * be able to read this sort of metadata while it's still being 524 * be able to read this sort of metadata while it's still being
467 * allocated, but this is careful to take ip_io_mutex anyway. */ 525 * allocated, but this is careful to take co_io_lock() anyway. */
468void ocfs2_set_new_buffer_uptodate(struct inode *inode, 526void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
469 struct buffer_head *bh) 527 struct buffer_head *bh)
470{ 528{
471 struct ocfs2_inode_info *oi = OCFS2_I(inode);
472
473 /* This should definitely *not* exist in our cache */ 529 /* This should definitely *not* exist in our cache */
474 BUG_ON(ocfs2_buffer_cached(oi, bh)); 530 BUG_ON(ocfs2_buffer_cached(ci, bh));
475 531
476 set_buffer_uptodate(bh); 532 set_buffer_uptodate(bh);
477 533
478 mutex_lock(&oi->ip_io_mutex); 534 ocfs2_metadata_cache_io_lock(ci);
479 ocfs2_set_buffer_uptodate(inode, bh); 535 ocfs2_set_buffer_uptodate(ci, bh);
480 mutex_unlock(&oi->ip_io_mutex); 536 ocfs2_metadata_cache_io_unlock(ci);
481} 537}
482 538
483/* Requires ip_lock. */ 539/* Requires ip_lock. */
@@ -487,7 +543,7 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
487 sector_t *array = ci->ci_cache.ci_array; 543 sector_t *array = ci->ci_cache.ci_array;
488 int bytes; 544 int bytes;
489 545
490 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); 546 BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
491 BUG_ON(index >= ci->ci_num_cached); 547 BUG_ON(index >= ci->ci_num_cached);
492 BUG_ON(!ci->ci_num_cached); 548 BUG_ON(!ci->ci_num_cached);
493 549
@@ -515,21 +571,19 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
515 ci->ci_num_cached--; 571 ci->ci_num_cached--;
516} 572}
517 573
518static void ocfs2_remove_block_from_cache(struct inode *inode, 574static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
519 sector_t block) 575 sector_t block)
520{ 576{
521 int index; 577 int index;
522 struct ocfs2_meta_cache_item *item = NULL; 578 struct ocfs2_meta_cache_item *item = NULL;
523 struct ocfs2_inode_info *oi = OCFS2_I(inode);
524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
525 579
526 spin_lock(&oi->ip_lock); 580 ocfs2_metadata_cache_lock(ci);
527 mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n", 581 mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
528 (unsigned long long)oi->ip_blkno, 582 (unsigned long long)ocfs2_metadata_cache_owner(ci),
529 (unsigned long long) block, ci->ci_num_cached, 583 (unsigned long long) block, ci->ci_num_cached,
530 oi->ip_flags & OCFS2_INODE_CACHE_INLINE); 584 ci->ci_flags & OCFS2_CACHE_FL_INLINE);
531 585
532 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { 586 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
533 index = ocfs2_search_cache_array(ci, block); 587 index = ocfs2_search_cache_array(ci, block);
534 if (index != -1) 588 if (index != -1)
535 ocfs2_remove_metadata_array(ci, index); 589 ocfs2_remove_metadata_array(ci, index);
@@ -538,7 +592,7 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
538 if (item) 592 if (item)
539 ocfs2_remove_metadata_tree(ci, item); 593 ocfs2_remove_metadata_tree(ci, item);
540 } 594 }
541 spin_unlock(&oi->ip_lock); 595 ocfs2_metadata_cache_unlock(ci);
542 596
543 if (item) 597 if (item)
544 kmem_cache_free(ocfs2_uptodate_cachep, item); 598 kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -549,23 +603,24 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
549 * bother reverting things to an inlined array in the case of a remove 603 * bother reverting things to an inlined array in the case of a remove
550 * which moves us back under the limit. 604 * which moves us back under the limit.
551 */ 605 */
552void ocfs2_remove_from_cache(struct inode *inode, 606void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
553 struct buffer_head *bh) 607 struct buffer_head *bh)
554{ 608{
555 sector_t block = bh->b_blocknr; 609 sector_t block = bh->b_blocknr;
556 610
557 ocfs2_remove_block_from_cache(inode, block); 611 ocfs2_remove_block_from_cache(ci, block);
558} 612}
559 613
560/* Called when we remove xattr clusters from an inode. */ 614/* Called when we remove xattr clusters from an inode. */
561void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, 615void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
562 sector_t block, 616 sector_t block,
563 u32 c_len) 617 u32 c_len)
564{ 618{
565 unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; 619 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
620 unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
566 621
567 for (i = 0; i < b_len; i++, block++) 622 for (i = 0; i < b_len; i++, block++)
568 ocfs2_remove_block_from_cache(inode, block); 623 ocfs2_remove_block_from_cache(ci, block);
569} 624}
570 625
571int __init init_ocfs2_uptodate_cache(void) 626int __init init_ocfs2_uptodate_cache(void)
@@ -577,7 +632,7 @@ int __init init_ocfs2_uptodate_cache(void)
577 return -ENOMEM; 632 return -ENOMEM;
578 633
579 mlog(0, "%u inlined cache items per inode.\n", 634 mlog(0, "%u inlined cache items per inode.\n",
580 OCFS2_INODE_MAX_CACHE_ARRAY); 635 OCFS2_CACHE_INFO_MAX_ARRAY);
581 636
582 return 0; 637 return 0;
583} 638}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 531b4b3a0c47..0d826fe2da0d 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -26,24 +26,59 @@
26#ifndef OCFS2_UPTODATE_H 26#ifndef OCFS2_UPTODATE_H
27#define OCFS2_UPTODATE_H 27#define OCFS2_UPTODATE_H
28 28
29/*
30 * The caching code relies on locking provided by the user of
31 * struct ocfs2_caching_info. These operations connect that up.
32 */
33struct ocfs2_caching_operations {
34 /*
35 * A u64 representing the owning structure. Usually this
36 * is the block number (i_blkno or whatnot). This is used so
37 * that caching log messages can identify the owning structure.
38 */
39 u64 (*co_owner)(struct ocfs2_caching_info *ci);
40
41 /* The superblock is needed during I/O. */
42 struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
43 /*
44 * Lock and unlock the caching data. These will not sleep, and
45 * should probably be spinlocks.
46 */
47 void (*co_cache_lock)(struct ocfs2_caching_info *ci);
48 void (*co_cache_unlock)(struct ocfs2_caching_info *ci);
49
50 /*
51 * Lock and unlock for disk I/O. These will sleep, and should
52 * be mutexes.
53 */
54 void (*co_io_lock)(struct ocfs2_caching_info *ci);
55 void (*co_io_unlock)(struct ocfs2_caching_info *ci);
56};
57
29int __init init_ocfs2_uptodate_cache(void); 58int __init init_ocfs2_uptodate_cache(void);
30void exit_ocfs2_uptodate_cache(void); 59void exit_ocfs2_uptodate_cache(void);
31 60
32void ocfs2_metadata_cache_init(struct inode *inode); 61void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
33void ocfs2_metadata_cache_purge(struct inode *inode); 62 const struct ocfs2_caching_operations *ops);
63void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
64void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
65
66u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
67void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
68void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
34 69
35int ocfs2_buffer_uptodate(struct inode *inode, 70int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
36 struct buffer_head *bh); 71 struct buffer_head *bh);
37void ocfs2_set_buffer_uptodate(struct inode *inode, 72void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
38 struct buffer_head *bh); 73 struct buffer_head *bh);
39void ocfs2_set_new_buffer_uptodate(struct inode *inode, 74void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
40 struct buffer_head *bh); 75 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode, 76void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
42 struct buffer_head *bh); 77 struct buffer_head *bh);
43void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, 78void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
44 sector_t block, 79 sector_t block,
45 u32 c_len); 80 u32 c_len);
46int ocfs2_buffer_read_ahead(struct inode *inode, 81int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
47 struct buffer_head *bh); 82 struct buffer_head *bh);
48 83
49#endif /* OCFS2_UPTODATE_H */ 84#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1a27cda984f..fe3419068df2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -55,7 +55,8 @@
55#include "buffer_head_io.h" 55#include "buffer_head_io.h"
56#include "super.h" 56#include "super.h"
57#include "xattr.h" 57#include "xattr.h"
58 58#include "refcounttree.h"
59#include "acl.h"
59 60
60struct ocfs2_xattr_def_value_root { 61struct ocfs2_xattr_def_value_root {
61 struct ocfs2_xattr_value_root xv; 62 struct ocfs2_xattr_value_root xv;
@@ -140,7 +141,7 @@ struct ocfs2_xattr_search {
140 int not_found; 141 int not_found;
141}; 142};
142 143
143static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, 144static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
144 struct ocfs2_xattr_header *xh, 145 struct ocfs2_xattr_header *xh,
145 int index, 146 int index,
146 int *block_off, 147 int *block_off,
@@ -157,7 +158,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
157 struct ocfs2_xattr_search *xs); 158 struct ocfs2_xattr_search *xs);
158 159
159static int ocfs2_xattr_tree_list_index_block(struct inode *inode, 160static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
160 struct ocfs2_xattr_tree_root *xt, 161 struct buffer_head *blk_bh,
161 char *buffer, 162 char *buffer,
162 size_t buffer_size); 163 size_t buffer_size);
163 164
@@ -170,12 +171,42 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
170 struct ocfs2_xattr_search *xs, 171 struct ocfs2_xattr_search *xs,
171 struct ocfs2_xattr_set_ctxt *ctxt); 172 struct ocfs2_xattr_set_ctxt *ctxt);
172 173
173static int ocfs2_delete_xattr_index_block(struct inode *inode, 174typedef int (xattr_tree_rec_func)(struct inode *inode,
174 struct buffer_head *xb_bh); 175 struct buffer_head *root_bh,
176 u64 blkno, u32 cpos, u32 len, void *para);
177static int ocfs2_iterate_xattr_index_block(struct inode *inode,
178 struct buffer_head *root_bh,
179 xattr_tree_rec_func *rec_func,
180 void *para);
181static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
182 struct ocfs2_xattr_bucket *bucket,
183 void *para);
184static int ocfs2_rm_xattr_cluster(struct inode *inode,
185 struct buffer_head *root_bh,
186 u64 blkno,
187 u32 cpos,
188 u32 len,
189 void *para);
190
175static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, 191static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
176 u64 src_blk, u64 last_blk, u64 to_blk, 192 u64 src_blk, u64 last_blk, u64 to_blk,
177 unsigned int start_bucket, 193 unsigned int start_bucket,
178 u32 *first_hash); 194 u32 *first_hash);
195static int ocfs2_prepare_refcount_xattr(struct inode *inode,
196 struct ocfs2_dinode *di,
197 struct ocfs2_xattr_info *xi,
198 struct ocfs2_xattr_search *xis,
199 struct ocfs2_xattr_search *xbs,
200 struct ocfs2_refcount_tree **ref_tree,
201 int *meta_need,
202 int *credits);
203static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
204 struct ocfs2_xattr_bucket *bucket,
205 int offset,
206 struct ocfs2_xattr_value_root **xv,
207 struct buffer_head **bh);
208static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
209 const void *value, size_t size, int flags);
179 210
180static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 211static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
181{ 212{
@@ -254,9 +285,9 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
254 break; 285 break;
255 } 286 }
256 287
257 if (!ocfs2_buffer_uptodate(bucket->bu_inode, 288 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
258 bucket->bu_bhs[i])) 289 bucket->bu_bhs[i]))
259 ocfs2_set_new_buffer_uptodate(bucket->bu_inode, 290 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
260 bucket->bu_bhs[i]); 291 bucket->bu_bhs[i]);
261 } 292 }
262 293
@@ -271,7 +302,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
271{ 302{
272 int rc; 303 int rc;
273 304
274 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, 305 rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
275 bucket->bu_blocks, bucket->bu_bhs, 0, 306 bucket->bu_blocks, bucket->bu_bhs, 0,
276 NULL); 307 NULL);
277 if (!rc) { 308 if (!rc) {
@@ -297,7 +328,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
297 int i, rc = 0; 328 int i, rc = 0;
298 329
299 for (i = 0; i < bucket->bu_blocks; i++) { 330 for (i = 0; i < bucket->bu_blocks; i++) {
300 rc = ocfs2_journal_access(handle, bucket->bu_inode, 331 rc = ocfs2_journal_access(handle,
332 INODE_CACHE(bucket->bu_inode),
301 bucket->bu_bhs[i], type); 333 bucket->bu_bhs[i], type);
302 if (rc) { 334 if (rc) {
303 mlog_errno(rc); 335 mlog_errno(rc);
@@ -399,7 +431,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
399 int rc; 431 int rc;
400 struct buffer_head *tmp = *bh; 432 struct buffer_head *tmp = *bh;
401 433
402 rc = ocfs2_read_block(inode, xb_blkno, &tmp, 434 rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
403 ocfs2_validate_xattr_block); 435 ocfs2_validate_xattr_block);
404 436
405 /* If ocfs2_read_block() got us a new bh, pass it up. */ 437 /* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -596,15 +628,14 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
596 int status = 0; 628 int status = 0;
597 handle_t *handle = ctxt->handle; 629 handle_t *handle = ctxt->handle;
598 enum ocfs2_alloc_restarted why; 630 enum ocfs2_alloc_restarted why;
599 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
600 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); 631 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
601 struct ocfs2_extent_tree et; 632 struct ocfs2_extent_tree et;
602 633
603 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 634 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
604 635
605 ocfs2_init_xattr_value_extent_tree(&et, inode, vb); 636 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
606 637
607 status = vb->vb_access(handle, inode, vb->vb_bh, 638 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
608 OCFS2_JOURNAL_ACCESS_WRITE); 639 OCFS2_JOURNAL_ACCESS_WRITE);
609 if (status < 0) { 640 if (status < 0) {
610 mlog_errno(status); 641 mlog_errno(status);
@@ -612,13 +643,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
612 } 643 }
613 644
614 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); 645 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
615 status = ocfs2_add_clusters_in_btree(osb, 646 status = ocfs2_add_clusters_in_btree(handle,
616 inode, 647 &et,
617 &logical_start, 648 &logical_start,
618 clusters_to_add, 649 clusters_to_add,
619 0, 650 0,
620 &et,
621 handle,
622 ctxt->data_ac, 651 ctxt->data_ac,
623 ctxt->meta_ac, 652 ctxt->meta_ac,
624 &why); 653 &why);
@@ -649,6 +678,7 @@ leave:
649static int __ocfs2_remove_xattr_range(struct inode *inode, 678static int __ocfs2_remove_xattr_range(struct inode *inode,
650 struct ocfs2_xattr_value_buf *vb, 679 struct ocfs2_xattr_value_buf *vb,
651 u32 cpos, u32 phys_cpos, u32 len, 680 u32 cpos, u32 phys_cpos, u32 len,
681 unsigned int ext_flags,
652 struct ocfs2_xattr_set_ctxt *ctxt) 682 struct ocfs2_xattr_set_ctxt *ctxt)
653{ 683{
654 int ret; 684 int ret;
@@ -656,16 +686,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
656 handle_t *handle = ctxt->handle; 686 handle_t *handle = ctxt->handle;
657 struct ocfs2_extent_tree et; 687 struct ocfs2_extent_tree et;
658 688
659 ocfs2_init_xattr_value_extent_tree(&et, inode, vb); 689 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
660 690
661 ret = vb->vb_access(handle, inode, vb->vb_bh, 691 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
662 OCFS2_JOURNAL_ACCESS_WRITE); 692 OCFS2_JOURNAL_ACCESS_WRITE);
663 if (ret) { 693 if (ret) {
664 mlog_errno(ret); 694 mlog_errno(ret);
665 goto out; 695 goto out;
666 } 696 }
667 697
668 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, 698 ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
669 &ctxt->dealloc); 699 &ctxt->dealloc);
670 if (ret) { 700 if (ret) {
671 mlog_errno(ret); 701 mlog_errno(ret);
@@ -680,7 +710,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
680 goto out; 710 goto out;
681 } 711 }
682 712
683 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); 713 if (ext_flags & OCFS2_EXT_REFCOUNTED)
714 ret = ocfs2_decrease_refcount(inode, handle,
715 ocfs2_blocks_to_clusters(inode->i_sb,
716 phys_blkno),
717 len, ctxt->meta_ac, &ctxt->dealloc, 1);
718 else
719 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
720 phys_blkno, len);
684 if (ret) 721 if (ret)
685 mlog_errno(ret); 722 mlog_errno(ret);
686 723
@@ -695,6 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
695 struct ocfs2_xattr_set_ctxt *ctxt) 732 struct ocfs2_xattr_set_ctxt *ctxt)
696{ 733{
697 int ret = 0; 734 int ret = 0;
735 unsigned int ext_flags;
698 u32 trunc_len, cpos, phys_cpos, alloc_size; 736 u32 trunc_len, cpos, phys_cpos, alloc_size;
699 u64 block; 737 u64 block;
700 738
@@ -706,7 +744,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
706 while (trunc_len) { 744 while (trunc_len) {
707 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 745 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
708 &alloc_size, 746 &alloc_size,
709 &vb->vb_xv->xr_list); 747 &vb->vb_xv->xr_list, &ext_flags);
710 if (ret) { 748 if (ret) {
711 mlog_errno(ret); 749 mlog_errno(ret);
712 goto out; 750 goto out;
@@ -717,15 +755,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
717 755
718 ret = __ocfs2_remove_xattr_range(inode, vb, cpos, 756 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
719 phys_cpos, alloc_size, 757 phys_cpos, alloc_size,
720 ctxt); 758 ext_flags, ctxt);
721 if (ret) { 759 if (ret) {
722 mlog_errno(ret); 760 mlog_errno(ret);
723 goto out; 761 goto out;
724 } 762 }
725 763
726 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 764 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
727 ocfs2_remove_xattr_clusters_from_cache(inode, block, 765 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
728 alloc_size); 766 block, alloc_size);
729 cpos += alloc_size; 767 cpos += alloc_size;
730 trunc_len -= alloc_size; 768 trunc_len -= alloc_size;
731 } 769 }
@@ -810,6 +848,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
810 return result; 848 return result;
811} 849}
812 850
851int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
852 struct ocfs2_dinode *di)
853{
854 struct ocfs2_xattr_header *xh;
855 int i;
856
857 xh = (struct ocfs2_xattr_header *)
858 ((void *)di + inode->i_sb->s_blocksize -
859 le16_to_cpu(di->i_xattr_inline_size));
860
861 for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
862 if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
863 return 1;
864
865 return 0;
866}
867
813static int ocfs2_xattr_ibody_list(struct inode *inode, 868static int ocfs2_xattr_ibody_list(struct inode *inode,
814 struct ocfs2_dinode *di, 869 struct ocfs2_dinode *di,
815 char *buffer, 870 char *buffer,
@@ -855,11 +910,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
855 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 910 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
856 ret = ocfs2_xattr_list_entries(inode, header, 911 ret = ocfs2_xattr_list_entries(inode, header,
857 buffer, buffer_size); 912 buffer, buffer_size);
858 } else { 913 } else
859 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 914 ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
860 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
861 buffer, buffer_size); 915 buffer, buffer_size);
862 }
863 916
864 brelse(blk_bh); 917 brelse(blk_bh);
865 918
@@ -961,7 +1014,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
961 cpos = 0; 1014 cpos = 0;
962 while (cpos < clusters) { 1015 while (cpos < clusters) {
963 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1016 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
964 &num_clusters, el); 1017 &num_clusters, el, NULL);
965 if (ret) { 1018 if (ret) {
966 mlog_errno(ret); 1019 mlog_errno(ret);
967 goto out; 1020 goto out;
@@ -970,7 +1023,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
970 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1023 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
971 /* Copy ocfs2_xattr_value */ 1024 /* Copy ocfs2_xattr_value */
972 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1025 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
973 ret = ocfs2_read_block(inode, blkno, &bh, NULL); 1026 ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
1027 &bh, NULL);
974 if (ret) { 1028 if (ret) {
975 mlog_errno(ret); 1029 mlog_errno(ret);
976 goto out; 1030 goto out;
@@ -1085,7 +1139,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1085 i = xs->here - xs->header->xh_entries; 1139 i = xs->here - xs->header->xh_entries;
1086 1140
1087 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1141 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
1088 ret = ocfs2_xattr_bucket_get_name_value(inode, 1142 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
1089 bucket_xh(xs->bucket), 1143 bucket_xh(xs->bucket),
1090 i, 1144 i,
1091 &block_off, 1145 &block_off,
@@ -1183,7 +1237,7 @@ static int ocfs2_xattr_get(struct inode *inode,
1183 1237
1184static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1238static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1185 handle_t *handle, 1239 handle_t *handle,
1186 struct ocfs2_xattr_value_root *xv, 1240 struct ocfs2_xattr_value_buf *vb,
1187 const void *value, 1241 const void *value,
1188 int value_len) 1242 int value_len)
1189{ 1243{
@@ -1194,28 +1248,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1194 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1248 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
1195 u64 blkno; 1249 u64 blkno;
1196 struct buffer_head *bh = NULL; 1250 struct buffer_head *bh = NULL;
1251 unsigned int ext_flags;
1252 struct ocfs2_xattr_value_root *xv = vb->vb_xv;
1197 1253
1198 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1254 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
1199 1255
1200 while (cpos < clusters) { 1256 while (cpos < clusters) {
1201 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1257 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
1202 &num_clusters, &xv->xr_list); 1258 &num_clusters, &xv->xr_list,
1259 &ext_flags);
1203 if (ret) { 1260 if (ret) {
1204 mlog_errno(ret); 1261 mlog_errno(ret);
1205 goto out; 1262 goto out;
1206 } 1263 }
1207 1264
1265 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1266
1208 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1267 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
1209 1268
1210 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1269 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
1211 ret = ocfs2_read_block(inode, blkno, &bh, NULL); 1270 ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
1271 &bh, NULL);
1212 if (ret) { 1272 if (ret) {
1213 mlog_errno(ret); 1273 mlog_errno(ret);
1214 goto out; 1274 goto out;
1215 } 1275 }
1216 1276
1217 ret = ocfs2_journal_access(handle, 1277 ret = ocfs2_journal_access(handle,
1218 inode, 1278 INODE_CACHE(inode),
1219 bh, 1279 bh,
1220 OCFS2_JOURNAL_ACCESS_WRITE); 1280 OCFS2_JOURNAL_ACCESS_WRITE);
1221 if (ret < 0) { 1281 if (ret < 0) {
@@ -1266,7 +1326,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
1266 void *val = xs->base + offs; 1326 void *val = xs->base + offs;
1267 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1327 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1268 1328
1269 ret = vb->vb_access(handle, inode, vb->vb_bh, 1329 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1330 OCFS2_JOURNAL_ACCESS_WRITE);
1271 if (ret) { 1331 if (ret) {
1272 mlog_errno(ret); 1332 mlog_errno(ret);
@@ -1294,7 +1354,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1294{ 1354{
1295 int ret; 1355 int ret;
1296 1356
1297 ret = vb->vb_access(handle, inode, vb->vb_bh, 1357 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
1298 OCFS2_JOURNAL_ACCESS_WRITE); 1358 OCFS2_JOURNAL_ACCESS_WRITE);
1299 if (ret) { 1359 if (ret) {
1300 mlog_errno(ret); 1360 mlog_errno(ret);
@@ -1355,7 +1415,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1355 mlog_errno(ret); 1415 mlog_errno(ret);
1356 return ret; 1416 return ret;
1357 } 1417 }
1358 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, 1418 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
1359 xi->value, xi->value_len); 1419 xi->value, xi->value_len);
1360 if (ret < 0) 1420 if (ret < 0)
1361 mlog_errno(ret); 1421 mlog_errno(ret);
@@ -1594,7 +1654,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1594 1654
1595 ret = __ocfs2_xattr_set_value_outside(inode, 1655 ret = __ocfs2_xattr_set_value_outside(inode,
1596 handle, 1656 handle,
1597 vb.vb_xv, 1657 &vb,
1598 xi->value, 1658 xi->value,
1599 xi->value_len); 1659 xi->value_len);
1600 if (ret < 0) 1660 if (ret < 0)
@@ -1615,7 +1675,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1615 } 1675 }
1616 } 1676 }
1617 1677
1618 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, 1678 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
1619 OCFS2_JOURNAL_ACCESS_WRITE); 1679 OCFS2_JOURNAL_ACCESS_WRITE);
1620 if (ret) { 1680 if (ret) {
1621 mlog_errno(ret); 1681 mlog_errno(ret);
@@ -1623,7 +1683,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1623 } 1683 }
1624 1684
1625 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1685 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1626 ret = vb.vb_access(handle, inode, vb.vb_bh, 1686 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1627 OCFS2_JOURNAL_ACCESS_WRITE); 1687 OCFS2_JOURNAL_ACCESS_WRITE);
1628 if (ret) { 1688 if (ret) {
1629 mlog_errno(ret); 1689 mlog_errno(ret);
@@ -1700,51 +1760,112 @@ out:
1700 return ret; 1760 return ret;
1701} 1761}
1702 1762
1763/*
1764 * In xattr remove, if it is stored outside and refcounted, we may have
1765 * the chance to split the refcount tree. So need the allocators.
1766 */
1767static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
1768 struct ocfs2_xattr_value_root *xv,
1769 struct ocfs2_caching_info *ref_ci,
1770 struct buffer_head *ref_root_bh,
1771 struct ocfs2_alloc_context **meta_ac,
1772 int *ref_credits)
1773{
1774 int ret, meta_add = 0;
1775 u32 p_cluster, num_clusters;
1776 unsigned int ext_flags;
1777
1778 *ref_credits = 0;
1779 ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
1780 &num_clusters,
1781 &xv->xr_list,
1782 &ext_flags);
1783 if (ret) {
1784 mlog_errno(ret);
1785 goto out;
1786 }
1787
1788 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
1789 goto out;
1790
1791 ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
1792 ref_root_bh, xv,
1793 &meta_add, ref_credits);
1794 if (ret) {
1795 mlog_errno(ret);
1796 goto out;
1797 }
1798
1799 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
1800 meta_add, meta_ac);
1801 if (ret)
1802 mlog_errno(ret);
1803
1804out:
1805 return ret;
1806}
1807
1703static int ocfs2_remove_value_outside(struct inode*inode, 1808static int ocfs2_remove_value_outside(struct inode*inode,
1704 struct ocfs2_xattr_value_buf *vb, 1809 struct ocfs2_xattr_value_buf *vb,
1705 struct ocfs2_xattr_header *header) 1810 struct ocfs2_xattr_header *header,
1811 struct ocfs2_caching_info *ref_ci,
1812 struct buffer_head *ref_root_bh)
1706{ 1813{
1707 int ret = 0, i; 1814 int ret = 0, i, ref_credits;
1708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1815 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1709 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 1816 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1817 void *val;
1710 1818
1711 ocfs2_init_dealloc_ctxt(&ctxt.dealloc); 1819 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1712 1820
1713 ctxt.handle = ocfs2_start_trans(osb,
1714 ocfs2_remove_extent_credits(osb->sb));
1715 if (IS_ERR(ctxt.handle)) {
1716 ret = PTR_ERR(ctxt.handle);
1717 mlog_errno(ret);
1718 goto out;
1719 }
1720
1721 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1821 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1722 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1822 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1723 1823
1724 if (!ocfs2_xattr_is_local(entry)) { 1824 if (ocfs2_xattr_is_local(entry))
1725 void *val; 1825 continue;
1726 1826
1727 val = (void *)header + 1827 val = (void *)header +
1728 le16_to_cpu(entry->xe_name_offset); 1828 le16_to_cpu(entry->xe_name_offset);
1729 vb->vb_xv = (struct ocfs2_xattr_value_root *) 1829 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1730 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1830 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1731 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 1831
1732 if (ret < 0) { 1832 ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
1733 mlog_errno(ret); 1833 ref_ci, ref_root_bh,
1734 break; 1834 &ctxt.meta_ac,
1735 } 1835 &ref_credits);
1836
1837 ctxt.handle = ocfs2_start_trans(osb, ref_credits +
1838 ocfs2_remove_extent_credits(osb->sb));
1839 if (IS_ERR(ctxt.handle)) {
1840 ret = PTR_ERR(ctxt.handle);
1841 mlog_errno(ret);
1842 break;
1843 }
1844
1845 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1846 if (ret < 0) {
1847 mlog_errno(ret);
1848 break;
1849 }
1850
1851 ocfs2_commit_trans(osb, ctxt.handle);
1852 if (ctxt.meta_ac) {
1853 ocfs2_free_alloc_context(ctxt.meta_ac);
1854 ctxt.meta_ac = NULL;
1736 } 1855 }
1737 } 1856 }
1738 1857
1739 ocfs2_commit_trans(osb, ctxt.handle); 1858 if (ctxt.meta_ac)
1859 ocfs2_free_alloc_context(ctxt.meta_ac);
1740 ocfs2_schedule_truncate_log_flush(osb, 1); 1860 ocfs2_schedule_truncate_log_flush(osb, 1);
1741 ocfs2_run_deallocs(osb, &ctxt.dealloc); 1861 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1742out:
1743 return ret; 1862 return ret;
1744} 1863}
1745 1864
1746static int ocfs2_xattr_ibody_remove(struct inode *inode, 1865static int ocfs2_xattr_ibody_remove(struct inode *inode,
1747 struct buffer_head *di_bh) 1866 struct buffer_head *di_bh,
1867 struct ocfs2_caching_info *ref_ci,
1868 struct buffer_head *ref_root_bh)
1748{ 1869{
1749 1870
1750 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1871 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1759,13 +1880,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1759 ((void *)di + inode->i_sb->s_blocksize - 1880 ((void *)di + inode->i_sb->s_blocksize -
1760 le16_to_cpu(di->i_xattr_inline_size)); 1881 le16_to_cpu(di->i_xattr_inline_size));
1761 1882
1762 ret = ocfs2_remove_value_outside(inode, &vb, header); 1883 ret = ocfs2_remove_value_outside(inode, &vb, header,
1884 ref_ci, ref_root_bh);
1763 1885
1764 return ret; 1886 return ret;
1765} 1887}
1766 1888
1889struct ocfs2_rm_xattr_bucket_para {
1890 struct ocfs2_caching_info *ref_ci;
1891 struct buffer_head *ref_root_bh;
1892};
1893
1767static int ocfs2_xattr_block_remove(struct inode *inode, 1894static int ocfs2_xattr_block_remove(struct inode *inode,
1768 struct buffer_head *blk_bh) 1895 struct buffer_head *blk_bh,
1896 struct ocfs2_caching_info *ref_ci,
1897 struct buffer_head *ref_root_bh)
1769{ 1898{
1770 struct ocfs2_xattr_block *xb; 1899 struct ocfs2_xattr_block *xb;
1771 int ret = 0; 1900 int ret = 0;
@@ -1773,19 +1902,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1773 .vb_bh = blk_bh, 1902 .vb_bh = blk_bh,
1774 .vb_access = ocfs2_journal_access_xb, 1903 .vb_access = ocfs2_journal_access_xb,
1775 }; 1904 };
1905 struct ocfs2_rm_xattr_bucket_para args = {
1906 .ref_ci = ref_ci,
1907 .ref_root_bh = ref_root_bh,
1908 };
1776 1909
1777 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1910 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1778 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1911 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1779 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1912 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1780 ret = ocfs2_remove_value_outside(inode, &vb, header); 1913 ret = ocfs2_remove_value_outside(inode, &vb, header,
1914 ref_ci, ref_root_bh);
1781 } else 1915 } else
1782 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1916 ret = ocfs2_iterate_xattr_index_block(inode,
1917 blk_bh,
1918 ocfs2_rm_xattr_cluster,
1919 &args);
1783 1920
1784 return ret; 1921 return ret;
1785} 1922}
1786 1923
1787static int ocfs2_xattr_free_block(struct inode *inode, 1924static int ocfs2_xattr_free_block(struct inode *inode,
1788 u64 block) 1925 u64 block,
1926 struct ocfs2_caching_info *ref_ci,
1927 struct buffer_head *ref_root_bh)
1789{ 1928{
1790 struct inode *xb_alloc_inode; 1929 struct inode *xb_alloc_inode;
1791 struct buffer_head *xb_alloc_bh = NULL; 1930 struct buffer_head *xb_alloc_bh = NULL;
@@ -1803,7 +1942,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1803 goto out; 1942 goto out;
1804 } 1943 }
1805 1944
1806 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1945 ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
1807 if (ret < 0) { 1946 if (ret < 0) {
1808 mlog_errno(ret); 1947 mlog_errno(ret);
1809 goto out; 1948 goto out;
@@ -1863,6 +2002,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1863{ 2002{
1864 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2003 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1865 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2004 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2005 struct ocfs2_refcount_tree *ref_tree = NULL;
2006 struct buffer_head *ref_root_bh = NULL;
2007 struct ocfs2_caching_info *ref_ci = NULL;
1866 handle_t *handle; 2008 handle_t *handle;
1867 int ret; 2009 int ret;
1868 2010
@@ -1872,8 +2014,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1872 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 2014 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1873 return 0; 2015 return 0;
1874 2016
2017 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
2018 ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
2019 le64_to_cpu(di->i_refcount_loc),
2020 1, &ref_tree, &ref_root_bh);
2021 if (ret) {
2022 mlog_errno(ret);
2023 goto out;
2024 }
2025 ref_ci = &ref_tree->rf_ci;
2026
2027 }
2028
1875 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { 2029 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1876 ret = ocfs2_xattr_ibody_remove(inode, di_bh); 2030 ret = ocfs2_xattr_ibody_remove(inode, di_bh,
2031 ref_ci, ref_root_bh);
1877 if (ret < 0) { 2032 if (ret < 0) {
1878 mlog_errno(ret); 2033 mlog_errno(ret);
1879 goto out; 2034 goto out;
@@ -1882,7 +2037,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1882 2037
1883 if (di->i_xattr_loc) { 2038 if (di->i_xattr_loc) {
1884 ret = ocfs2_xattr_free_block(inode, 2039 ret = ocfs2_xattr_free_block(inode,
1885 le64_to_cpu(di->i_xattr_loc)); 2040 le64_to_cpu(di->i_xattr_loc),
2041 ref_ci, ref_root_bh);
1886 if (ret < 0) { 2042 if (ret < 0) {
1887 mlog_errno(ret); 2043 mlog_errno(ret);
1888 goto out; 2044 goto out;
@@ -1896,7 +2052,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1896 mlog_errno(ret); 2052 mlog_errno(ret);
1897 goto out; 2053 goto out;
1898 } 2054 }
1899 ret = ocfs2_journal_access_di(handle, inode, di_bh, 2055 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1900 OCFS2_JOURNAL_ACCESS_WRITE); 2056 OCFS2_JOURNAL_ACCESS_WRITE);
1901 if (ret) { 2057 if (ret) {
1902 mlog_errno(ret); 2058 mlog_errno(ret);
@@ -1916,6 +2072,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1916out_commit: 2072out_commit:
1917 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2073 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1918out: 2074out:
2075 if (ref_tree)
2076 ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
2077 brelse(ref_root_bh);
1919 return ret; 2078 return ret;
1920} 2079}
1921 2080
@@ -2083,6 +2242,84 @@ cleanup:
2083 return ret; 2242 return ret;
2084} 2243}
2085 2244
2245static int ocfs2_create_xattr_block(handle_t *handle,
2246 struct inode *inode,
2247 struct buffer_head *inode_bh,
2248 struct ocfs2_alloc_context *meta_ac,
2249 struct buffer_head **ret_bh,
2250 int indexed)
2251{
2252 int ret;
2253 u16 suballoc_bit_start;
2254 u32 num_got;
2255 u64 first_blkno;
2256 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2257 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2258 struct buffer_head *new_bh = NULL;
2259 struct ocfs2_xattr_block *xblk;
2260
2261 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
2262 OCFS2_JOURNAL_ACCESS_CREATE);
2263 if (ret < 0) {
2264 mlog_errno(ret);
2265 goto end;
2266 }
2267
2268 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
2269 &suballoc_bit_start, &num_got,
2270 &first_blkno);
2271 if (ret < 0) {
2272 mlog_errno(ret);
2273 goto end;
2274 }
2275
2276 new_bh = sb_getblk(inode->i_sb, first_blkno);
2277 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2278
2279 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
2280 new_bh,
2281 OCFS2_JOURNAL_ACCESS_CREATE);
2282 if (ret < 0) {
2283 mlog_errno(ret);
2284 goto end;
2285 }
2286
2287 /* Initialize ocfs2_xattr_block */
2288 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2289 memset(xblk, 0, inode->i_sb->s_blocksize);
2290 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2291 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
2292 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2293 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2294 xblk->xb_blkno = cpu_to_le64(first_blkno);
2295
2296 if (indexed) {
2297 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2298 xr->xt_clusters = cpu_to_le32(1);
2299 xr->xt_last_eb_blk = 0;
2300 xr->xt_list.l_tree_depth = 0;
2301 xr->xt_list.l_count = cpu_to_le16(
2302 ocfs2_xattr_recs_per_xb(inode->i_sb));
2303 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2304 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2305 }
2306
2307 ret = ocfs2_journal_dirty(handle, new_bh);
2308 if (ret < 0) {
2309 mlog_errno(ret);
2310 goto end;
2311 }
2312 di->i_xattr_loc = cpu_to_le64(first_blkno);
2313 ocfs2_journal_dirty(handle, inode_bh);
2314
2315 *ret_bh = new_bh;
2316 new_bh = NULL;
2317
2318end:
2319 brelse(new_bh);
2320 return ret;
2321}
2322
2086/* 2323/*
2087 * ocfs2_xattr_block_set() 2324 * ocfs2_xattr_block_set()
2088 * 2325 *
@@ -2095,63 +2332,24 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2095 struct ocfs2_xattr_set_ctxt *ctxt) 2332 struct ocfs2_xattr_set_ctxt *ctxt)
2096{ 2333{
2097 struct buffer_head *new_bh = NULL; 2334 struct buffer_head *new_bh = NULL;
2098 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2099 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2100 handle_t *handle = ctxt->handle; 2335 handle_t *handle = ctxt->handle;
2101 struct ocfs2_xattr_block *xblk = NULL; 2336 struct ocfs2_xattr_block *xblk = NULL;
2102 u16 suballoc_bit_start;
2103 u32 num_got;
2104 u64 first_blkno;
2105 int ret; 2337 int ret;
2106 2338
2107 if (!xs->xattr_bh) { 2339 if (!xs->xattr_bh) {
2108 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, 2340 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
2109 OCFS2_JOURNAL_ACCESS_CREATE); 2341 ctxt->meta_ac, &new_bh, 0);
2110 if (ret < 0) { 2342 if (ret) {
2111 mlog_errno(ret);
2112 goto end;
2113 }
2114
2115 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
2116 &suballoc_bit_start, &num_got,
2117 &first_blkno);
2118 if (ret < 0) {
2119 mlog_errno(ret);
2120 goto end;
2121 }
2122
2123 new_bh = sb_getblk(inode->i_sb, first_blkno);
2124 ocfs2_set_new_buffer_uptodate(inode, new_bh);
2125
2126 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
2127 OCFS2_JOURNAL_ACCESS_CREATE);
2128 if (ret < 0) {
2129 mlog_errno(ret); 2343 mlog_errno(ret);
2130 goto end; 2344 goto end;
2131 } 2345 }
2132 2346
2133 /* Initialize ocfs2_xattr_block */
2134 xs->xattr_bh = new_bh; 2347 xs->xattr_bh = new_bh;
2135 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2136 memset(xblk, 0, inode->i_sb->s_blocksize);
2137 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2138 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
2139 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2140 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2141 xblk->xb_blkno = cpu_to_le64(first_blkno);
2142
2143 xs->header = &xblk->xb_attrs.xb_header; 2349 xs->header = &xblk->xb_attrs.xb_header;
2144 xs->base = (void *)xs->header; 2350 xs->base = (void *)xs->header;
2145 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2351 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
2146 xs->here = xs->header->xh_entries; 2352 xs->here = xs->header->xh_entries;
2147
2148 ret = ocfs2_journal_dirty(handle, new_bh);
2149 if (ret < 0) {
2150 mlog_errno(ret);
2151 goto end;
2152 }
2153 di->i_xattr_loc = cpu_to_le64(first_blkno);
2154 ocfs2_journal_dirty(handle, xs->inode_bh);
2155 } else 2353 } else
2156 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2354 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2157 2355
@@ -2273,7 +2471,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2273 old_in_xb = 1; 2471 old_in_xb = 1;
2274 2472
2275 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 2473 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2276 ret = ocfs2_xattr_bucket_get_name_value(inode, 2474 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
2277 bucket_xh(xbs->bucket), 2475 bucket_xh(xbs->bucket),
2278 i, &block_off, 2476 i, &block_off,
2279 &name_offset); 2477 &name_offset);
@@ -2428,6 +2626,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2428 struct ocfs2_xattr_search *xis, 2626 struct ocfs2_xattr_search *xis,
2429 struct ocfs2_xattr_search *xbs, 2627 struct ocfs2_xattr_search *xbs,
2430 struct ocfs2_xattr_set_ctxt *ctxt, 2628 struct ocfs2_xattr_set_ctxt *ctxt,
2629 int extra_meta,
2431 int *credits) 2630 int *credits)
2432{ 2631{
2433 int clusters_add, meta_add, ret; 2632 int clusters_add, meta_add, ret;
@@ -2444,6 +2643,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2444 return ret; 2643 return ret;
2445 } 2644 }
2446 2645
2646 meta_add += extra_meta;
2447 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 2647 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2448 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 2648 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2449 2649
@@ -2598,7 +2798,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2598 2798
2599 if (!ret) { 2799 if (!ret) {
2600 /* Update inode ctime. */ 2800 /* Update inode ctime. */
2601 ret = ocfs2_journal_access_di(ctxt->handle, inode, 2801 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2602 xis->inode_bh, 2802 xis->inode_bh,
2603 OCFS2_JOURNAL_ACCESS_WRITE); 2803 OCFS2_JOURNAL_ACCESS_WRITE);
2604 if (ret) { 2804 if (ret) {
@@ -2711,10 +2911,11 @@ int ocfs2_xattr_set(struct inode *inode,
2711{ 2911{
2712 struct buffer_head *di_bh = NULL; 2912 struct buffer_head *di_bh = NULL;
2713 struct ocfs2_dinode *di; 2913 struct ocfs2_dinode *di;
2714 int ret, credits; 2914 int ret, credits, ref_meta = 0, ref_credits = 0;
2715 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2915 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2716 struct inode *tl_inode = osb->osb_tl_inode; 2916 struct inode *tl_inode = osb->osb_tl_inode;
2717 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 2917 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
2918 struct ocfs2_refcount_tree *ref_tree = NULL;
2718 2919
2719 struct ocfs2_xattr_info xi = { 2920 struct ocfs2_xattr_info xi = {
2720 .name_index = name_index, 2921 .name_index = name_index,
@@ -2779,6 +2980,17 @@ int ocfs2_xattr_set(struct inode *inode,
2779 goto cleanup; 2980 goto cleanup;
2780 } 2981 }
2781 2982
2983 /* Check whether the value is refcounted and do some prepartion. */
2984 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
2985 (!xis.not_found || !xbs.not_found)) {
2986 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
2987 &xis, &xbs, &ref_tree,
2988 &ref_meta, &ref_credits);
2989 if (ret) {
2990 mlog_errno(ret);
2991 goto cleanup;
2992 }
2993 }
2782 2994
2783 mutex_lock(&tl_inode->i_mutex); 2995 mutex_lock(&tl_inode->i_mutex);
2784 2996
@@ -2793,7 +3005,7 @@ int ocfs2_xattr_set(struct inode *inode,
2793 mutex_unlock(&tl_inode->i_mutex); 3005 mutex_unlock(&tl_inode->i_mutex);
2794 3006
2795 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, 3007 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2796 &xbs, &ctxt, &credits); 3008 &xbs, &ctxt, ref_meta, &credits);
2797 if (ret) { 3009 if (ret) {
2798 mlog_errno(ret); 3010 mlog_errno(ret);
2799 goto cleanup; 3011 goto cleanup;
@@ -2801,7 +3013,7 @@ int ocfs2_xattr_set(struct inode *inode,
2801 3013
2802 /* we need to update inode's ctime field, so add credit for it. */ 3014 /* we need to update inode's ctime field, so add credit for it. */
2803 credits += OCFS2_INODE_UPDATE_CREDITS; 3015 credits += OCFS2_INODE_UPDATE_CREDITS;
2804 ctxt.handle = ocfs2_start_trans(osb, credits); 3016 ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
2805 if (IS_ERR(ctxt.handle)) { 3017 if (IS_ERR(ctxt.handle)) {
2806 ret = PTR_ERR(ctxt.handle); 3018 ret = PTR_ERR(ctxt.handle);
2807 mlog_errno(ret); 3019 mlog_errno(ret);
@@ -2819,8 +3031,16 @@ int ocfs2_xattr_set(struct inode *inode,
2819 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) 3031 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2820 ocfs2_schedule_truncate_log_flush(osb, 1); 3032 ocfs2_schedule_truncate_log_flush(osb, 1);
2821 ocfs2_run_deallocs(osb, &ctxt.dealloc); 3033 ocfs2_run_deallocs(osb, &ctxt.dealloc);
3034
2822cleanup: 3035cleanup:
3036 if (ref_tree)
3037 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
2823 up_write(&OCFS2_I(inode)->ip_xattr_sem); 3038 up_write(&OCFS2_I(inode)->ip_xattr_sem);
3039 if (!value && !ret) {
3040 ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
3041 if (ret)
3042 mlog_errno(ret);
3043 }
2824 ocfs2_inode_unlock(inode, 1); 3044 ocfs2_inode_unlock(inode, 1);
2825cleanup_nolock: 3045cleanup_nolock:
2826 brelse(di_bh); 3046 brelse(di_bh);
@@ -2849,7 +3069,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
2849 u64 e_blkno = 0; 3069 u64 e_blkno = 0;
2850 3070
2851 if (el->l_tree_depth) { 3071 if (el->l_tree_depth) {
2852 ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); 3072 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
3073 &eb_bh);
2853 if (ret) { 3074 if (ret) {
2854 mlog_errno(ret); 3075 mlog_errno(ret);
2855 goto out; 3076 goto out;
@@ -2931,7 +3152,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2931 if (cmp) 3152 if (cmp)
2932 continue; 3153 continue;
2933 3154
2934 ret = ocfs2_xattr_bucket_get_name_value(inode, 3155 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
2935 xh, 3156 xh,
2936 i, 3157 i,
2937 &block_off, 3158 &block_off,
@@ -3175,7 +3396,7 @@ struct ocfs2_xattr_tree_list {
3175 size_t result; 3396 size_t result;
3176}; 3397};
3177 3398
3178static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, 3399static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
3179 struct ocfs2_xattr_header *xh, 3400 struct ocfs2_xattr_header *xh,
3180 int index, 3401 int index,
3181 int *block_off, 3402 int *block_off,
@@ -3188,8 +3409,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
3188 3409
3189 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); 3410 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
3190 3411
3191 *block_off = name_offset >> inode->i_sb->s_blocksize_bits; 3412 *block_off = name_offset >> sb->s_blocksize_bits;
3192 *new_offset = name_offset % inode->i_sb->s_blocksize; 3413 *new_offset = name_offset % sb->s_blocksize;
3193 3414
3194 return 0; 3415 return 0;
3195} 3416}
@@ -3209,7 +3430,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
3209 prefix = ocfs2_xattr_prefix(type); 3430 prefix = ocfs2_xattr_prefix(type);
3210 3431
3211 if (prefix) { 3432 if (prefix) {
3212 ret = ocfs2_xattr_bucket_get_name_value(inode, 3433 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
3213 bucket_xh(bucket), 3434 bucket_xh(bucket),
3214 i, 3435 i,
3215 &block_off, 3436 &block_off,
@@ -3232,22 +3453,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
3232 return ret; 3453 return ret;
3233} 3454}
3234 3455
3235static int ocfs2_xattr_tree_list_index_block(struct inode *inode, 3456static int ocfs2_iterate_xattr_index_block(struct inode *inode,
3236 struct ocfs2_xattr_tree_root *xt, 3457 struct buffer_head *blk_bh,
3237 char *buffer, 3458 xattr_tree_rec_func *rec_func,
3238 size_t buffer_size) 3459 void *para)
3239{ 3460{
3240 struct ocfs2_extent_list *el = &xt->xt_list; 3461 struct ocfs2_xattr_block *xb =
3462 (struct ocfs2_xattr_block *)blk_bh->b_data;
3463 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
3241 int ret = 0; 3464 int ret = 0;
3242 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; 3465 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
3243 u64 p_blkno = 0; 3466 u64 p_blkno = 0;
3244 struct ocfs2_xattr_tree_list xl = {
3245 .buffer = buffer,
3246 .buffer_size = buffer_size,
3247 .result = 0,
3248 };
3249 3467
3250 if (le16_to_cpu(el->l_next_free_rec) == 0) 3468 if (!el->l_next_free_rec || !rec_func)
3251 return 0; 3469 return 0;
3252 3470
3253 while (name_hash > 0) { 3471 while (name_hash > 0) {
@@ -3255,16 +3473,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3255 &e_cpos, &num_clusters, el); 3473 &e_cpos, &num_clusters, el);
3256 if (ret) { 3474 if (ret) {
3257 mlog_errno(ret); 3475 mlog_errno(ret);
3258 goto out; 3476 break;
3259 } 3477 }
3260 3478
3261 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, 3479 ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
3262 ocfs2_list_xattr_bucket, 3480 num_clusters, para);
3263 &xl);
3264 if (ret) { 3481 if (ret) {
3265 if (ret != -ERANGE) 3482 if (ret != -ERANGE)
3266 mlog_errno(ret); 3483 mlog_errno(ret);
3267 goto out; 3484 break;
3268 } 3485 }
3269 3486
3270 if (e_cpos == 0) 3487 if (e_cpos == 0)
@@ -3273,6 +3490,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3273 name_hash = e_cpos - 1; 3490 name_hash = e_cpos - 1;
3274 } 3491 }
3275 3492
3493 return ret;
3494
3495}
3496
3497static int ocfs2_list_xattr_tree_rec(struct inode *inode,
3498 struct buffer_head *root_bh,
3499 u64 blkno, u32 cpos, u32 len, void *para)
3500{
3501 return ocfs2_iterate_xattr_buckets(inode, blkno, len,
3502 ocfs2_list_xattr_bucket, para);
3503}
3504
3505static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3506 struct buffer_head *blk_bh,
3507 char *buffer,
3508 size_t buffer_size)
3509{
3510 int ret;
3511 struct ocfs2_xattr_tree_list xl = {
3512 .buffer = buffer,
3513 .buffer_size = buffer_size,
3514 .result = 0,
3515 };
3516
3517 ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
3518 ocfs2_list_xattr_tree_rec, &xl);
3519 if (ret) {
3520 mlog_errno(ret);
3521 goto out;
3522 }
3523
3276 ret = xl.result; 3524 ret = xl.result;
3277out: 3525out:
3278 return ret; 3526 return ret;
@@ -3426,7 +3674,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
3426 */ 3674 */
3427 down_write(&oi->ip_alloc_sem); 3675 down_write(&oi->ip_alloc_sem);
3428 3676
3429 ret = ocfs2_journal_access_xb(handle, inode, xb_bh, 3677 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
3430 OCFS2_JOURNAL_ACCESS_WRITE); 3678 OCFS2_JOURNAL_ACCESS_WRITE);
3431 if (ret) { 3679 if (ret) {
3432 mlog_errno(ret); 3680 mlog_errno(ret);
@@ -4263,9 +4511,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
4263 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4511 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4264 prev_cpos, (unsigned long long)bucket_blkno(first)); 4512 prev_cpos, (unsigned long long)bucket_blkno(first));
4265 4513
4266 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4514 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
4267 4515
4268 ret = ocfs2_journal_access_xb(handle, inode, root_bh, 4516 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
4269 OCFS2_JOURNAL_ACCESS_WRITE); 4517 OCFS2_JOURNAL_ACCESS_WRITE);
4270 if (ret < 0) { 4518 if (ret < 0) {
4271 mlog_errno(ret); 4519 mlog_errno(ret);
@@ -4319,7 +4567,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
4319 4567
4320 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4568 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
4321 num_bits, (unsigned long long)block, v_start); 4569 num_bits, (unsigned long long)block, v_start);
4322 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4570 ret = ocfs2_insert_extent(handle, &et, v_start, block,
4323 num_bits, 0, ctxt->meta_ac); 4571 num_bits, 0, ctxt->meta_ac);
4324 if (ret < 0) { 4572 if (ret < 0) {
4325 mlog_errno(ret); 4573 mlog_errno(ret);
@@ -4798,10 +5046,13 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4798 struct ocfs2_xattr_entry *xe = xs->here; 5046 struct ocfs2_xattr_entry *xe = xs->here;
4799 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); 5047 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4800 void *base; 5048 void *base;
5049 struct ocfs2_xattr_value_buf vb = {
5050 .vb_access = ocfs2_journal_access,
5051 };
4801 5052
4802 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); 5053 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4803 5054
4804 ret = ocfs2_xattr_bucket_get_name_value(inode, xh, 5055 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
4805 xe - xh->xh_entries, 5056 xe - xh->xh_entries,
4806 &block_off, 5057 &block_off,
4807 &offset); 5058 &offset);
@@ -4814,8 +5065,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4814 xv = (struct ocfs2_xattr_value_root *)(base + offset + 5065 xv = (struct ocfs2_xattr_value_root *)(base + offset +
4815 OCFS2_XATTR_SIZE(xe->xe_name_len)); 5066 OCFS2_XATTR_SIZE(xe->xe_name_len));
4816 5067
5068 vb.vb_xv = xv;
5069 vb.vb_bh = xs->bucket->bu_bhs[block_off];
4817 ret = __ocfs2_xattr_set_value_outside(inode, handle, 5070 ret = __ocfs2_xattr_set_value_outside(inode, handle,
4818 xv, val, value_len); 5071 &vb, val, value_len);
4819 if (ret) 5072 if (ret)
4820 mlog_errno(ret); 5073 mlog_errno(ret);
4821out: 5074out:
@@ -4826,7 +5079,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4826 struct buffer_head *root_bh, 5079 struct buffer_head *root_bh,
4827 u64 blkno, 5080 u64 blkno,
4828 u32 cpos, 5081 u32 cpos,
4829 u32 len) 5082 u32 len,
5083 void *para)
4830{ 5084{
4831 int ret; 5085 int ret;
4832 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5086 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4838,14 +5092,22 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4838 struct ocfs2_cached_dealloc_ctxt dealloc; 5092 struct ocfs2_cached_dealloc_ctxt dealloc;
4839 struct ocfs2_extent_tree et; 5093 struct ocfs2_extent_tree et;
4840 5094
4841 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 5095 ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
5096 ocfs2_delete_xattr_in_bucket, para);
5097 if (ret) {
5098 mlog_errno(ret);
5099 return ret;
5100 }
5101
5102 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
4842 5103
4843 ocfs2_init_dealloc_ctxt(&dealloc); 5104 ocfs2_init_dealloc_ctxt(&dealloc);
4844 5105
4845 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", 5106 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
4846 cpos, len, (unsigned long long)blkno); 5107 cpos, len, (unsigned long long)blkno);
4847 5108
4848 ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); 5109 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
5110 len);
4849 5111
4850 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 5112 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
4851 if (ret) { 5113 if (ret) {
@@ -4870,14 +5132,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4870 goto out; 5132 goto out;
4871 } 5133 }
4872 5134
4873 ret = ocfs2_journal_access_xb(handle, inode, root_bh, 5135 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
4874 OCFS2_JOURNAL_ACCESS_WRITE); 5136 OCFS2_JOURNAL_ACCESS_WRITE);
4875 if (ret) { 5137 if (ret) {
4876 mlog_errno(ret); 5138 mlog_errno(ret);
4877 goto out_commit; 5139 goto out_commit;
4878 } 5140 }
4879 5141
4880 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, 5142 ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
4881 &dealloc); 5143 &dealloc);
4882 if (ret) { 5144 if (ret) {
4883 mlog_errno(ret); 5145 mlog_errno(ret);
@@ -5220,7 +5482,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5220 struct ocfs2_xattr_bucket *bucket, 5482 struct ocfs2_xattr_bucket *bucket,
5221 void *para) 5483 void *para)
5222{ 5484{
5223 int ret = 0; 5485 int ret = 0, ref_credits;
5224 struct ocfs2_xattr_header *xh = bucket_xh(bucket); 5486 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
5225 u16 i; 5487 u16 i;
5226 struct ocfs2_xattr_entry *xe; 5488 struct ocfs2_xattr_entry *xe;
@@ -5228,7 +5490,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5228 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; 5490 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5229 int credits = ocfs2_remove_extent_credits(osb->sb) + 5491 int credits = ocfs2_remove_extent_credits(osb->sb) +
5230 ocfs2_blocks_per_xattr_bucket(inode->i_sb); 5492 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5231 5493 struct ocfs2_xattr_value_root *xv;
5494 struct ocfs2_rm_xattr_bucket_para *args =
5495 (struct ocfs2_rm_xattr_bucket_para *)para;
5232 5496
5233 ocfs2_init_dealloc_ctxt(&ctxt.dealloc); 5497 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
5234 5498
@@ -5237,7 +5501,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5237 if (ocfs2_xattr_is_local(xe)) 5501 if (ocfs2_xattr_is_local(xe))
5238 continue; 5502 continue;
5239 5503
5240 ctxt.handle = ocfs2_start_trans(osb, credits); 5504 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
5505 i, &xv, NULL);
5506
5507 ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
5508 args->ref_ci,
5509 args->ref_root_bh,
5510 &ctxt.meta_ac,
5511 &ref_credits);
5512
5513 ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
5241 if (IS_ERR(ctxt.handle)) { 5514 if (IS_ERR(ctxt.handle)) {
5242 ret = PTR_ERR(ctxt.handle); 5515 ret = PTR_ERR(ctxt.handle);
5243 mlog_errno(ret); 5516 mlog_errno(ret);
@@ -5248,57 +5521,1439 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5248 i, 0, &ctxt); 5521 i, 0, &ctxt);
5249 5522
5250 ocfs2_commit_trans(osb, ctxt.handle); 5523 ocfs2_commit_trans(osb, ctxt.handle);
5524 if (ctxt.meta_ac) {
5525 ocfs2_free_alloc_context(ctxt.meta_ac);
5526 ctxt.meta_ac = NULL;
5527 }
5251 if (ret) { 5528 if (ret) {
5252 mlog_errno(ret); 5529 mlog_errno(ret);
5253 break; 5530 break;
5254 } 5531 }
5255 } 5532 }
5256 5533
5534 if (ctxt.meta_ac)
5535 ocfs2_free_alloc_context(ctxt.meta_ac);
5257 ocfs2_schedule_truncate_log_flush(osb, 1); 5536 ocfs2_schedule_truncate_log_flush(osb, 1);
5258 ocfs2_run_deallocs(osb, &ctxt.dealloc); 5537 ocfs2_run_deallocs(osb, &ctxt.dealloc);
5259 return ret; 5538 return ret;
5260} 5539}
5261 5540
5262static int ocfs2_delete_xattr_index_block(struct inode *inode, 5541/*
5263 struct buffer_head *xb_bh) 5542 * Whenever we modify a xattr value root in the bucket(e.g, CoW
5543 * or change the extent record flag), we need to recalculate
5544 * the metaecc for the whole bucket. So it is done here.
5545 *
5546 * Note:
5547 * We have to give the extra credits for the caller.
5548 */
5549static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
5550 handle_t *handle,
5551 void *para)
5552{
5553 int ret;
5554 struct ocfs2_xattr_bucket *bucket =
5555 (struct ocfs2_xattr_bucket *)para;
5556
5557 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
5558 OCFS2_JOURNAL_ACCESS_WRITE);
5559 if (ret) {
5560 mlog_errno(ret);
5561 return ret;
5562 }
5563
5564 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
5565
5566 return 0;
5567}
5568
5569/*
5570 * Special action we need if the xattr value is refcounted.
5571 *
5572 * 1. If the xattr is refcounted, lock the tree.
5573 * 2. CoW the xattr if we are setting the new value and the value
5574 * will be stored outside.
5575 * 3. In other case, decrease_refcount will work for us, so just
5576 * lock the refcount tree, calculate the meta and credits is OK.
5577 *
5578 * We have to do CoW before ocfs2_init_xattr_set_ctxt since
5579 * currently CoW is a completed transaction, while this function
5580 * will also lock the allocators and let us deadlock. So we will
5581 * CoW the whole xattr value.
5582 */
5583static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5584 struct ocfs2_dinode *di,
5585 struct ocfs2_xattr_info *xi,
5586 struct ocfs2_xattr_search *xis,
5587 struct ocfs2_xattr_search *xbs,
5588 struct ocfs2_refcount_tree **ref_tree,
5589 int *meta_add,
5590 int *credits)
5264{ 5591{
5265 struct ocfs2_xattr_block *xb =
5266 (struct ocfs2_xattr_block *)xb_bh->b_data;
5267 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
5268 int ret = 0; 5592 int ret = 0;
5269 u32 name_hash = UINT_MAX, e_cpos, num_clusters; 5593 struct ocfs2_xattr_block *xb;
5270 u64 p_blkno; 5594 struct ocfs2_xattr_entry *xe;
5595 char *base;
5596 u32 p_cluster, num_clusters;
5597 unsigned int ext_flags;
5598 int name_offset, name_len;
5599 struct ocfs2_xattr_value_buf vb;
5600 struct ocfs2_xattr_bucket *bucket = NULL;
5601 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5602 struct ocfs2_post_refcount refcount;
5603 struct ocfs2_post_refcount *p = NULL;
5604 struct buffer_head *ref_root_bh = NULL;
5271 5605
5272 if (le16_to_cpu(el->l_next_free_rec) == 0) 5606 if (!xis->not_found) {
5273 return 0; 5607 xe = xis->here;
5608 name_offset = le16_to_cpu(xe->xe_name_offset);
5609 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
5610 base = xis->base;
5611 vb.vb_bh = xis->inode_bh;
5612 vb.vb_access = ocfs2_journal_access_di;
5613 } else {
5614 int i, block_off = 0;
5615 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
5616 xe = xbs->here;
5617 name_offset = le16_to_cpu(xe->xe_name_offset);
5618 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
5619 i = xbs->here - xbs->header->xh_entries;
5274 5620
5275 while (name_hash > 0) { 5621 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
5276 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, 5622 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
5277 &e_cpos, &num_clusters, el); 5623 bucket_xh(xbs->bucket),
5624 i, &block_off,
5625 &name_offset);
5626 if (ret) {
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 base = bucket_block(xbs->bucket, block_off);
5631 vb.vb_bh = xbs->bucket->bu_bhs[block_off];
5632 vb.vb_access = ocfs2_journal_access;
5633
5634 if (ocfs2_meta_ecc(osb)) {
5635 /*create parameters for ocfs2_post_refcount. */
5636 bucket = xbs->bucket;
5637 refcount.credits = bucket->bu_blocks;
5638 refcount.para = bucket;
5639 refcount.func =
5640 ocfs2_xattr_bucket_post_refcount;
5641 p = &refcount;
5642 }
5643 } else {
5644 base = xbs->base;
5645 vb.vb_bh = xbs->xattr_bh;
5646 vb.vb_access = ocfs2_journal_access_xb;
5647 }
5648 }
5649
5650 if (ocfs2_xattr_is_local(xe))
5651 goto out;
5652
5653 vb.vb_xv = (struct ocfs2_xattr_value_root *)
5654 (base + name_offset + name_len);
5655
5656 ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
5657 &num_clusters, &vb.vb_xv->xr_list,
5658 &ext_flags);
5659 if (ret) {
5660 mlog_errno(ret);
5661 goto out;
5662 }
5663
5664 /*
5665 * We just need to check the 1st extent record, since we always
5666 * CoW the whole xattr. So there shouldn't be a xattr with
5667 * some REFCOUNT extent recs after the 1st one.
5668 */
5669 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
5670 goto out;
5671
5672 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
5673 1, ref_tree, &ref_root_bh);
5674 if (ret) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678
5679 /*
5680 * If we are deleting the xattr or the new size will be stored inside,
5681 * cool, leave it there, the xattr truncate process will remove them
5682 * for us(it still needs the refcount tree lock and the meta, credits).
5683 * And the worse case is that every cluster truncate will split the
5684 * refcount tree, and make the original extent become 3. So we will need
5685 * 2 * cluster more extent recs at most.
5686 */
5687 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
5688
5689 ret = ocfs2_refcounted_xattr_delete_need(inode,
5690 &(*ref_tree)->rf_ci,
5691 ref_root_bh, vb.vb_xv,
5692 meta_add, credits);
5693 if (ret)
5694 mlog_errno(ret);
5695 goto out;
5696 }
5697
5698 ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
5699 *ref_tree, ref_root_bh, 0,
5700 le32_to_cpu(vb.vb_xv->xr_clusters), p);
5701 if (ret)
5702 mlog_errno(ret);
5703
5704out:
5705 brelse(ref_root_bh);
5706 return ret;
5707}
5708
5709/*
5710 * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
5711 * The physical clusters will be added to refcount tree.
5712 */
5713static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
5714 struct ocfs2_xattr_value_root *xv,
5715 struct ocfs2_extent_tree *value_et,
5716 struct ocfs2_caching_info *ref_ci,
5717 struct buffer_head *ref_root_bh,
5718 struct ocfs2_cached_dealloc_ctxt *dealloc,
5719 struct ocfs2_post_refcount *refcount)
5720{
5721 int ret = 0;
5722 u32 clusters = le32_to_cpu(xv->xr_clusters);
5723 u32 cpos, p_cluster, num_clusters;
5724 struct ocfs2_extent_list *el = &xv->xr_list;
5725 unsigned int ext_flags;
5726
5727 cpos = 0;
5728 while (cpos < clusters) {
5729 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
5730 &num_clusters, el, &ext_flags);
5731
5732 cpos += num_clusters;
5733 if ((ext_flags & OCFS2_EXT_REFCOUNTED))
5734 continue;
5735
5736 BUG_ON(!p_cluster);
5737
5738 ret = ocfs2_add_refcount_flag(inode, value_et,
5739 ref_ci, ref_root_bh,
5740 cpos - num_clusters,
5741 p_cluster, num_clusters,
5742 dealloc, refcount);
5743 if (ret) {
5744 mlog_errno(ret);
5745 break;
5746 }
5747 }
5748
5749 return ret;
5750}
5751
5752/*
5753 * Given a normal ocfs2_xattr_header, refcount all the entries which
5754 * have value stored outside.
5755 * Used for xattrs stored in inode and ocfs2_xattr_block.
5756 */
5757static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
5758 struct ocfs2_xattr_value_buf *vb,
5759 struct ocfs2_xattr_header *header,
5760 struct ocfs2_caching_info *ref_ci,
5761 struct buffer_head *ref_root_bh,
5762 struct ocfs2_cached_dealloc_ctxt *dealloc)
5763{
5764
5765 struct ocfs2_xattr_entry *xe;
5766 struct ocfs2_xattr_value_root *xv;
5767 struct ocfs2_extent_tree et;
5768 int i, ret = 0;
5769
5770 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
5771 xe = &header->xh_entries[i];
5772
5773 if (ocfs2_xattr_is_local(xe))
5774 continue;
5775
5776 xv = (struct ocfs2_xattr_value_root *)((void *)header +
5777 le16_to_cpu(xe->xe_name_offset) +
5778 OCFS2_XATTR_SIZE(xe->xe_name_len));
5779
5780 vb->vb_xv = xv;
5781 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
5782
5783 ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
5784 ref_ci, ref_root_bh,
5785 dealloc, NULL);
5786 if (ret) {
5787 mlog_errno(ret);
5788 break;
5789 }
5790 }
5791
5792 return ret;
5793}
5794
5795static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
5796 struct buffer_head *fe_bh,
5797 struct ocfs2_caching_info *ref_ci,
5798 struct buffer_head *ref_root_bh,
5799 struct ocfs2_cached_dealloc_ctxt *dealloc)
5800{
5801 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
5802 struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
5803 (fe_bh->b_data + inode->i_sb->s_blocksize -
5804 le16_to_cpu(di->i_xattr_inline_size));
5805 struct ocfs2_xattr_value_buf vb = {
5806 .vb_bh = fe_bh,
5807 .vb_access = ocfs2_journal_access_di,
5808 };
5809
5810 return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
5811 ref_ci, ref_root_bh, dealloc);
5812}
5813
5814struct ocfs2_xattr_tree_value_refcount_para {
5815 struct ocfs2_caching_info *ref_ci;
5816 struct buffer_head *ref_root_bh;
5817 struct ocfs2_cached_dealloc_ctxt *dealloc;
5818};
5819
5820static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
5821 struct ocfs2_xattr_bucket *bucket,
5822 int offset,
5823 struct ocfs2_xattr_value_root **xv,
5824 struct buffer_head **bh)
5825{
5826 int ret, block_off, name_offset;
5827 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
5828 struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
5829 void *base;
5830
5831 ret = ocfs2_xattr_bucket_get_name_value(sb,
5832 bucket_xh(bucket),
5833 offset,
5834 &block_off,
5835 &name_offset);
5836 if (ret) {
5837 mlog_errno(ret);
5838 goto out;
5839 }
5840
5841 base = bucket_block(bucket, block_off);
5842
5843 *xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
5844 OCFS2_XATTR_SIZE(xe->xe_name_len));
5845
5846 if (bh)
5847 *bh = bucket->bu_bhs[block_off];
5848out:
5849 return ret;
5850}
5851
5852/*
5853 * For a given xattr bucket, refcount all the entries which
5854 * have value stored outside.
5855 */
5856static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
5857 struct ocfs2_xattr_bucket *bucket,
5858 void *para)
5859{
5860 int i, ret = 0;
5861 struct ocfs2_extent_tree et;
5862 struct ocfs2_xattr_tree_value_refcount_para *ref =
5863 (struct ocfs2_xattr_tree_value_refcount_para *)para;
5864 struct ocfs2_xattr_header *xh =
5865 (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
5866 struct ocfs2_xattr_entry *xe;
5867 struct ocfs2_xattr_value_buf vb = {
5868 .vb_access = ocfs2_journal_access,
5869 };
5870 struct ocfs2_post_refcount refcount = {
5871 .credits = bucket->bu_blocks,
5872 .para = bucket,
5873 .func = ocfs2_xattr_bucket_post_refcount,
5874 };
5875 struct ocfs2_post_refcount *p = NULL;
5876
5877 /* We only need post_refcount if we support metaecc. */
5878 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
5879 p = &refcount;
5880
5881 mlog(0, "refcount bucket %llu, count = %u\n",
5882 (unsigned long long)bucket_blkno(bucket),
5883 le16_to_cpu(xh->xh_count));
5884 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
5885 xe = &xh->xh_entries[i];
5886
5887 if (ocfs2_xattr_is_local(xe))
5888 continue;
5889
5890 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
5891 &vb.vb_xv, &vb.vb_bh);
5892 if (ret) {
5893 mlog_errno(ret);
5894 break;
5895 }
5896
5897 ocfs2_init_xattr_value_extent_tree(&et,
5898 INODE_CACHE(inode), &vb);
5899
5900 ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
5901 &et, ref->ref_ci,
5902 ref->ref_root_bh,
5903 ref->dealloc, p);
5904 if (ret) {
5905 mlog_errno(ret);
5906 break;
5907 }
5908 }
5909
5910 return ret;
5911
5912}
5913
5914static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
5915 struct buffer_head *root_bh,
5916 u64 blkno, u32 cpos, u32 len, void *para)
5917{
5918 return ocfs2_iterate_xattr_buckets(inode, blkno, len,
5919 ocfs2_xattr_bucket_value_refcount,
5920 para);
5921}
5922
5923static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
5924 struct buffer_head *blk_bh,
5925 struct ocfs2_caching_info *ref_ci,
5926 struct buffer_head *ref_root_bh,
5927 struct ocfs2_cached_dealloc_ctxt *dealloc)
5928{
5929 int ret = 0;
5930 struct ocfs2_xattr_block *xb =
5931 (struct ocfs2_xattr_block *)blk_bh->b_data;
5932
5933 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
5934 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
5935 struct ocfs2_xattr_value_buf vb = {
5936 .vb_bh = blk_bh,
5937 .vb_access = ocfs2_journal_access_xb,
5938 };
5939
5940 ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
5941 ref_ci, ref_root_bh,
5942 dealloc);
5943 } else {
5944 struct ocfs2_xattr_tree_value_refcount_para para = {
5945 .ref_ci = ref_ci,
5946 .ref_root_bh = ref_root_bh,
5947 .dealloc = dealloc,
5948 };
5949
5950 ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
5951 ocfs2_refcount_xattr_tree_rec,
5952 &para);
5953 }
5954
5955 return ret;
5956}
5957
5958int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
5959 struct buffer_head *fe_bh,
5960 struct ocfs2_caching_info *ref_ci,
5961 struct buffer_head *ref_root_bh,
5962 struct ocfs2_cached_dealloc_ctxt *dealloc)
5963{
5964 int ret = 0;
5965 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5966 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
5967 struct buffer_head *blk_bh = NULL;
5968
5969 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
5970 ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
5971 ref_ci, ref_root_bh,
5972 dealloc);
5278 if (ret) { 5973 if (ret) {
5279 mlog_errno(ret); 5974 mlog_errno(ret);
5280 goto out; 5975 goto out;
5281 } 5976 }
5977 }
5978
5979 if (!di->i_xattr_loc)
5980 goto out;
5981
5982 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
5983 &blk_bh);
5984 if (ret < 0) {
5985 mlog_errno(ret);
5986 goto out;
5987 }
5988
5989 ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
5990 ref_root_bh, dealloc);
5991 if (ret)
5992 mlog_errno(ret);
5993
5994 brelse(blk_bh);
5995out:
5996
5997 return ret;
5998}
5999
6000typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
6001/*
6002 * Store the information we need in xattr reflink.
6003 * old_bh and new_bh are inode bh for the old and new inode.
6004 */
6005struct ocfs2_xattr_reflink {
6006 struct inode *old_inode;
6007 struct inode *new_inode;
6008 struct buffer_head *old_bh;
6009 struct buffer_head *new_bh;
6010 struct ocfs2_caching_info *ref_ci;
6011 struct buffer_head *ref_root_bh;
6012 struct ocfs2_cached_dealloc_ctxt *dealloc;
6013 should_xattr_reflinked *xattr_reflinked;
6014};
6015
6016/*
6017 * Given a xattr header and xe offset,
6018 * return the proper xv and the corresponding bh.
6019 * xattr in inode, block and xattr tree have different implementaions.
6020 */
6021typedef int (get_xattr_value_root)(struct super_block *sb,
6022 struct buffer_head *bh,
6023 struct ocfs2_xattr_header *xh,
6024 int offset,
6025 struct ocfs2_xattr_value_root **xv,
6026 struct buffer_head **ret_bh,
6027 void *para);
6028
6029/*
6030 * Calculate all the xattr value root metadata stored in this xattr header and
6031 * credits we need if we create them from the scratch.
6032 * We use get_xattr_value_root so that all types of xattr container can use it.
6033 */
6034static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
6035 struct buffer_head *bh,
6036 struct ocfs2_xattr_header *xh,
6037 int *metas, int *credits,
6038 int *num_recs,
6039 get_xattr_value_root *func,
6040 void *para)
6041{
6042 int i, ret = 0;
6043 struct ocfs2_xattr_value_root *xv;
6044 struct ocfs2_xattr_entry *xe;
6045
6046 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
6047 xe = &xh->xh_entries[i];
6048 if (ocfs2_xattr_is_local(xe))
6049 continue;
6050
6051 ret = func(sb, bh, xh, i, &xv, NULL, para);
6052 if (ret) {
6053 mlog_errno(ret);
6054 break;
6055 }
6056
6057 *metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
6058 le16_to_cpu(xv->xr_list.l_next_free_rec);
6059
6060 *credits += ocfs2_calc_extend_credits(sb,
6061 &def_xv.xv.xr_list,
6062 le32_to_cpu(xv->xr_clusters));
6063
6064 /*
6065 * If the value is a tree with depth > 1, We don't go deep
6066 * to the extent block, so just calculate a maximum record num.
6067 */
6068 if (!xv->xr_list.l_tree_depth)
6069 *num_recs += xv->xr_list.l_next_free_rec;
6070 else
6071 *num_recs += ocfs2_clusters_for_bytes(sb,
6072 XATTR_SIZE_MAX);
6073 }
6074
6075 return ret;
6076}
6077
6078/* Used by xattr inode and block to return the right xv and buffer_head. */
6079static int ocfs2_get_xattr_value_root(struct super_block *sb,
6080 struct buffer_head *bh,
6081 struct ocfs2_xattr_header *xh,
6082 int offset,
6083 struct ocfs2_xattr_value_root **xv,
6084 struct buffer_head **ret_bh,
6085 void *para)
6086{
6087 struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
6088
6089 *xv = (struct ocfs2_xattr_value_root *)((void *)xh +
6090 le16_to_cpu(xe->xe_name_offset) +
6091 OCFS2_XATTR_SIZE(xe->xe_name_len));
6092
6093 if (ret_bh)
6094 *ret_bh = bh;
6095
6096 return 0;
6097}
6098
6099/*
6100 * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
6101 * It is only used for inline xattr and xattr block.
6102 */
6103static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
6104 struct ocfs2_xattr_header *xh,
6105 struct buffer_head *ref_root_bh,
6106 int *credits,
6107 struct ocfs2_alloc_context **meta_ac)
6108{
6109 int ret, meta_add = 0, num_recs = 0;
6110 struct ocfs2_refcount_block *rb =
6111 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
6112
6113 *credits = 0;
6114
6115 ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
6116 &meta_add, credits, &num_recs,
6117 ocfs2_get_xattr_value_root,
6118 NULL);
6119 if (ret) {
6120 mlog_errno(ret);
6121 goto out;
6122 }
6123
6124 /*
6125 * We need to add/modify num_recs in refcount tree, so just calculate
6126 * an approximate number we need for refcount tree change.
6127 * Sometimes we need to split the tree, and after split, half recs
6128 * will be moved to the new block, and a new block can only provide
6129 * half number of recs. So we multiple new blocks by 2.
6130 */
6131 num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
6132 meta_add += num_recs;
6133 *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
6134 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
6135 *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
6136 le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
6137 else
6138 *credits += 1;
6139
6140 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
6141 if (ret)
6142 mlog_errno(ret);
6143
6144out:
6145 return ret;
6146}
5282 6147
5283 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, 6148/*
5284 ocfs2_delete_xattr_in_bucket, 6149 * Given a xattr header, reflink all the xattrs in this container.
5285 NULL); 6150 * It can be used for inode, block and bucket.
6151 *
6152 * NOTE:
6153 * Before we call this function, the caller has memcpy the xattr in
6154 * old_xh to the new_xh.
6155 *
6156 * If args.xattr_reflinked is set, call it to decide whether the xe should
6157 * be reflinked or not. If not, remove it from the new xattr header.
6158 */
6159static int ocfs2_reflink_xattr_header(handle_t *handle,
6160 struct ocfs2_xattr_reflink *args,
6161 struct buffer_head *old_bh,
6162 struct ocfs2_xattr_header *xh,
6163 struct buffer_head *new_bh,
6164 struct ocfs2_xattr_header *new_xh,
6165 struct ocfs2_xattr_value_buf *vb,
6166 struct ocfs2_alloc_context *meta_ac,
6167 get_xattr_value_root *func,
6168 void *para)
6169{
6170 int ret = 0, i, j;
6171 struct super_block *sb = args->old_inode->i_sb;
6172 struct buffer_head *value_bh;
6173 struct ocfs2_xattr_entry *xe, *last;
6174 struct ocfs2_xattr_value_root *xv, *new_xv;
6175 struct ocfs2_extent_tree data_et;
6176 u32 clusters, cpos, p_cluster, num_clusters;
6177 unsigned int ext_flags = 0;
6178
6179 mlog(0, "reflink xattr in container %llu, count = %u\n",
6180 (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
6181
6182 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
6183 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
6184 xe = &xh->xh_entries[i];
6185
6186 if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
6187 xe = &new_xh->xh_entries[j];
6188
6189 le16_add_cpu(&new_xh->xh_count, -1);
6190 if (new_xh->xh_count) {
6191 memmove(xe, xe + 1,
6192 (void *)last - (void *)xe);
6193 memset(last, 0,
6194 sizeof(struct ocfs2_xattr_entry));
6195 }
6196
6197 /*
6198 * We don't want j to increase in the next round since
6199 * it is already moved ahead.
6200 */
6201 j--;
6202 continue;
6203 }
6204
6205 if (ocfs2_xattr_is_local(xe))
6206 continue;
6207
6208 ret = func(sb, old_bh, xh, i, &xv, NULL, para);
6209 if (ret) {
6210 mlog_errno(ret);
6211 break;
6212 }
6213
6214 ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
6215 if (ret) {
6216 mlog_errno(ret);
6217 break;
6218 }
6219
6220 /*
6221 * For the xattr which has l_tree_depth = 0, all the extent
6222 * recs have already be copied to the new xh with the
6223 * propriate OCFS2_EXT_REFCOUNTED flag we just need to
6224 * increase the refount count int the refcount tree.
6225 *
6226 * For the xattr which has l_tree_depth > 0, we need
6227 * to initialize it to the empty default value root,
6228 * and then insert the extents one by one.
6229 */
6230 if (xv->xr_list.l_tree_depth) {
6231 memcpy(new_xv, &def_xv, sizeof(def_xv));
6232 vb->vb_xv = new_xv;
6233 vb->vb_bh = value_bh;
6234 ocfs2_init_xattr_value_extent_tree(&data_et,
6235 INODE_CACHE(args->new_inode), vb);
6236 }
6237
6238 clusters = le32_to_cpu(xv->xr_clusters);
6239 cpos = 0;
6240 while (cpos < clusters) {
6241 ret = ocfs2_xattr_get_clusters(args->old_inode,
6242 cpos,
6243 &p_cluster,
6244 &num_clusters,
6245 &xv->xr_list,
6246 &ext_flags);
6247 if (ret) {
6248 mlog_errno(ret);
6249 goto out;
6250 }
6251
6252 BUG_ON(!p_cluster);
6253
6254 if (xv->xr_list.l_tree_depth) {
6255 ret = ocfs2_insert_extent(handle,
6256 &data_et, cpos,
6257 ocfs2_clusters_to_blocks(
6258 args->old_inode->i_sb,
6259 p_cluster),
6260 num_clusters, ext_flags,
6261 meta_ac);
6262 if (ret) {
6263 mlog_errno(ret);
6264 goto out;
6265 }
6266 }
6267
6268 ret = ocfs2_increase_refcount(handle, args->ref_ci,
6269 args->ref_root_bh,
6270 p_cluster, num_clusters,
6271 meta_ac, args->dealloc);
6272 if (ret) {
6273 mlog_errno(ret);
6274 goto out;
6275 }
6276
6277 cpos += num_clusters;
6278 }
6279 }
6280
6281out:
6282 return ret;
6283}
6284
6285static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
6286{
6287 int ret = 0, credits = 0;
6288 handle_t *handle;
6289 struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
6290 struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
6291 int inline_size = le16_to_cpu(di->i_xattr_inline_size);
6292 int header_off = osb->sb->s_blocksize - inline_size;
6293 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
6294 (args->old_bh->b_data + header_off);
6295 struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
6296 (args->new_bh->b_data + header_off);
6297 struct ocfs2_alloc_context *meta_ac = NULL;
6298 struct ocfs2_inode_info *new_oi;
6299 struct ocfs2_dinode *new_di;
6300 struct ocfs2_xattr_value_buf vb = {
6301 .vb_bh = args->new_bh,
6302 .vb_access = ocfs2_journal_access_di,
6303 };
6304
6305 ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
6306 &credits, &meta_ac);
6307 if (ret) {
6308 mlog_errno(ret);
6309 goto out;
6310 }
6311
6312 handle = ocfs2_start_trans(osb, credits);
6313 if (IS_ERR(handle)) {
6314 ret = PTR_ERR(handle);
6315 mlog_errno(ret);
6316 goto out;
6317 }
6318
6319 ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
6320 args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
6321 if (ret) {
6322 mlog_errno(ret);
6323 goto out_commit;
6324 }
6325
6326 memcpy(args->new_bh->b_data + header_off,
6327 args->old_bh->b_data + header_off, inline_size);
6328
6329 new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
6330 new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
6331
6332 ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
6333 args->new_bh, new_xh, &vb, meta_ac,
6334 ocfs2_get_xattr_value_root, NULL);
6335 if (ret) {
6336 mlog_errno(ret);
6337 goto out_commit;
6338 }
6339
6340 new_oi = OCFS2_I(args->new_inode);
6341 spin_lock(&new_oi->ip_lock);
6342 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
6343 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
6344 spin_unlock(&new_oi->ip_lock);
6345
6346 ocfs2_journal_dirty(handle, args->new_bh);
6347
6348out_commit:
6349 ocfs2_commit_trans(osb, handle);
6350
6351out:
6352 if (meta_ac)
6353 ocfs2_free_alloc_context(meta_ac);
6354 return ret;
6355}
6356
6357static int ocfs2_create_empty_xattr_block(struct inode *inode,
6358 struct buffer_head *fe_bh,
6359 struct buffer_head **ret_bh,
6360 int indexed)
6361{
6362 int ret;
6363 handle_t *handle;
6364 struct ocfs2_alloc_context *meta_ac;
6365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6366
6367 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
6368 if (ret < 0) {
6369 mlog_errno(ret);
6370 return ret;
6371 }
6372
6373 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6374 if (IS_ERR(handle)) {
6375 ret = PTR_ERR(handle);
6376 mlog_errno(ret);
6377 goto out;
6378 }
6379
6380 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6381 (unsigned long long)fe_bh->b_blocknr, indexed);
6382 ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
6383 meta_ac, ret_bh, indexed);
6384 if (ret)
6385 mlog_errno(ret);
6386
6387 ocfs2_commit_trans(osb, handle);
6388out:
6389 ocfs2_free_alloc_context(meta_ac);
6390 return ret;
6391}
6392
6393static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
6394 struct buffer_head *blk_bh,
6395 struct buffer_head *new_blk_bh)
6396{
6397 int ret = 0, credits = 0;
6398 handle_t *handle;
6399 struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
6400 struct ocfs2_dinode *new_di;
6401 struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
6402 int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
6403 struct ocfs2_xattr_block *xb =
6404 (struct ocfs2_xattr_block *)blk_bh->b_data;
6405 struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
6406 struct ocfs2_xattr_block *new_xb =
6407 (struct ocfs2_xattr_block *)new_blk_bh->b_data;
6408 struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
6409 struct ocfs2_alloc_context *meta_ac;
6410 struct ocfs2_xattr_value_buf vb = {
6411 .vb_bh = new_blk_bh,
6412 .vb_access = ocfs2_journal_access_xb,
6413 };
6414
6415 ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
6416 &credits, &meta_ac);
6417 if (ret) {
6418 mlog_errno(ret);
6419 return ret;
6420 }
6421
6422 /* One more credits in case we need to add xattr flags in new inode. */
6423 handle = ocfs2_start_trans(osb, credits + 1);
6424 if (IS_ERR(handle)) {
6425 ret = PTR_ERR(handle);
6426 mlog_errno(ret);
6427 goto out;
6428 }
6429
6430 if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
6431 ret = ocfs2_journal_access_di(handle,
6432 INODE_CACHE(args->new_inode),
6433 args->new_bh,
6434 OCFS2_JOURNAL_ACCESS_WRITE);
6435 if (ret) {
6436 mlog_errno(ret);
6437 goto out_commit;
6438 }
6439 }
6440
6441 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
6442 new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
6443 if (ret) {
6444 mlog_errno(ret);
6445 goto out_commit;
6446 }
6447
6448 memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
6449 osb->sb->s_blocksize - header_off);
6450
6451 ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
6452 new_blk_bh, new_xh, &vb, meta_ac,
6453 ocfs2_get_xattr_value_root, NULL);
6454 if (ret) {
6455 mlog_errno(ret);
6456 goto out_commit;
6457 }
6458
6459 ocfs2_journal_dirty(handle, new_blk_bh);
6460
6461 if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
6462 new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
6463 spin_lock(&new_oi->ip_lock);
6464 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
6465 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
6466 spin_unlock(&new_oi->ip_lock);
6467
6468 ocfs2_journal_dirty(handle, args->new_bh);
6469 }
6470
6471out_commit:
6472 ocfs2_commit_trans(osb, handle);
6473
6474out:
6475 ocfs2_free_alloc_context(meta_ac);
6476 return ret;
6477}
6478
6479struct ocfs2_reflink_xattr_tree_args {
6480 struct ocfs2_xattr_reflink *reflink;
6481 struct buffer_head *old_blk_bh;
6482 struct buffer_head *new_blk_bh;
6483 struct ocfs2_xattr_bucket *old_bucket;
6484 struct ocfs2_xattr_bucket *new_bucket;
6485};
6486
6487/*
6488 * NOTE:
6489 * We have to handle the case that both old bucket and new bucket
6490 * will call this function to get the right ret_bh.
6491 * So The caller must give us the right bh.
6492 */
6493static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
6494 struct buffer_head *bh,
6495 struct ocfs2_xattr_header *xh,
6496 int offset,
6497 struct ocfs2_xattr_value_root **xv,
6498 struct buffer_head **ret_bh,
6499 void *para)
6500{
6501 struct ocfs2_reflink_xattr_tree_args *args =
6502 (struct ocfs2_reflink_xattr_tree_args *)para;
6503 struct ocfs2_xattr_bucket *bucket;
6504
6505 if (bh == args->old_bucket->bu_bhs[0])
6506 bucket = args->old_bucket;
6507 else
6508 bucket = args->new_bucket;
6509
6510 return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
6511 xv, ret_bh);
6512}
6513
6514struct ocfs2_value_tree_metas {
6515 int num_metas;
6516 int credits;
6517 int num_recs;
6518};
6519
6520static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
6521 struct buffer_head *bh,
6522 struct ocfs2_xattr_header *xh,
6523 int offset,
6524 struct ocfs2_xattr_value_root **xv,
6525 struct buffer_head **ret_bh,
6526 void *para)
6527{
6528 struct ocfs2_xattr_bucket *bucket =
6529 (struct ocfs2_xattr_bucket *)para;
6530
6531 return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
6532 xv, ret_bh);
6533}
6534
6535static int ocfs2_calc_value_tree_metas(struct inode *inode,
6536 struct ocfs2_xattr_bucket *bucket,
6537 void *para)
6538{
6539 struct ocfs2_value_tree_metas *metas =
6540 (struct ocfs2_value_tree_metas *)para;
6541 struct ocfs2_xattr_header *xh =
6542 (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
6543
6544 /* Add the credits for this bucket first. */
6545 metas->credits += bucket->bu_blocks;
6546 return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
6547 xh, &metas->num_metas,
6548 &metas->credits, &metas->num_recs,
6549 ocfs2_value_tree_metas_in_bucket,
6550 bucket);
6551}
6552
6553/*
6554 * Given a xattr extent rec starting from blkno and having len clusters,
6555 * iterate all the buckets calculate how much metadata we need for reflinking
6556 * all the ocfs2_xattr_value_root and lock the allocators accordingly.
6557 */
6558static int ocfs2_lock_reflink_xattr_rec_allocators(
6559 struct ocfs2_reflink_xattr_tree_args *args,
6560 struct ocfs2_extent_tree *xt_et,
6561 u64 blkno, u32 len, int *credits,
6562 struct ocfs2_alloc_context **meta_ac,
6563 struct ocfs2_alloc_context **data_ac)
6564{
6565 int ret, num_free_extents;
6566 struct ocfs2_value_tree_metas metas;
6567 struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
6568 struct ocfs2_refcount_block *rb;
6569
6570 memset(&metas, 0, sizeof(metas));
6571
6572 ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
6573 ocfs2_calc_value_tree_metas, &metas);
6574 if (ret) {
6575 mlog_errno(ret);
6576 goto out;
6577 }
6578
6579 *credits = metas.credits;
6580
6581 /*
6582 * Calculate we need for refcount tree change.
6583 *
6584 * We need to add/modify num_recs in refcount tree, so just calculate
6585 * an approximate number we need for refcount tree change.
6586 * Sometimes we need to split the tree, and after split, half recs
6587 * will be moved to the new block, and a new block can only provide
6588 * half number of recs. So we multiple new blocks by 2.
6589 * In the end, we have to add credits for modifying the already
6590 * existed refcount block.
6591 */
6592 rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
6593 metas.num_recs =
6594 (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
6595 ocfs2_refcount_recs_per_rb(osb->sb) * 2;
6596 metas.num_metas += metas.num_recs;
6597 *credits += metas.num_recs +
6598 metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
6599 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
6600 *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
6601 le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
6602 else
6603 *credits += 1;
6604
6605 /* count in the xattr tree change. */
6606 num_free_extents = ocfs2_num_free_extents(osb, xt_et);
6607 if (num_free_extents < 0) {
6608 ret = num_free_extents;
6609 mlog_errno(ret);
6610 goto out;
6611 }
6612
6613 if (num_free_extents < len)
6614 metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
6615
6616 *credits += ocfs2_calc_extend_credits(osb->sb,
6617 xt_et->et_root_el, len);
6618
6619 if (metas.num_metas) {
6620 ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
6621 meta_ac);
5286 if (ret) { 6622 if (ret) {
5287 mlog_errno(ret); 6623 mlog_errno(ret);
5288 goto out; 6624 goto out;
5289 } 6625 }
6626 }
5290 6627
5291 ret = ocfs2_rm_xattr_cluster(inode, xb_bh, 6628 if (len) {
5292 p_blkno, e_cpos, num_clusters); 6629 ret = ocfs2_reserve_clusters(osb, len, data_ac);
6630 if (ret)
6631 mlog_errno(ret);
6632 }
6633out:
6634 if (ret) {
6635 if (*meta_ac) {
6636 ocfs2_free_alloc_context(*meta_ac);
6637 meta_ac = NULL;
6638 }
6639 }
6640
6641 return ret;
6642}
6643
6644static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6645 u64 blkno, u64 new_blkno, u32 clusters,
6646 struct ocfs2_alloc_context *meta_ac,
6647 struct ocfs2_alloc_context *data_ac,
6648 struct ocfs2_reflink_xattr_tree_args *args)
6649{
6650 int i, j, ret = 0;
6651 struct super_block *sb = args->reflink->old_inode->i_sb;
6652 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
6653 u32 num_buckets = clusters * bpc;
6654 int bpb = args->old_bucket->bu_blocks;
6655 struct ocfs2_xattr_value_buf vb = {
6656 .vb_access = ocfs2_journal_access,
6657 };
6658
6659 for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
6660 ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
5293 if (ret) { 6661 if (ret) {
5294 mlog_errno(ret); 6662 mlog_errno(ret);
5295 break; 6663 break;
5296 } 6664 }
5297 6665
5298 if (e_cpos == 0) 6666 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
6667 if (ret) {
6668 mlog_errno(ret);
5299 break; 6669 break;
6670 }
5300 6671
5301 name_hash = e_cpos - 1; 6672 /*
6673 * The real bucket num in this series of blocks is stored
6674 * in the 1st bucket.
6675 */
6676 if (i == 0)
6677 num_buckets = le16_to_cpu(
6678 bucket_xh(args->old_bucket)->xh_num_buckets);
6679
6680 ret = ocfs2_xattr_bucket_journal_access(handle,
6681 args->new_bucket,
6682 OCFS2_JOURNAL_ACCESS_CREATE);
6683 if (ret) {
6684 mlog_errno(ret);
6685 break;
6686 }
6687
6688 for (j = 0; j < bpb; j++)
6689 memcpy(bucket_block(args->new_bucket, j),
6690 bucket_block(args->old_bucket, j),
6691 sb->s_blocksize);
6692
6693 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6694
6695 ret = ocfs2_reflink_xattr_header(handle, args->reflink,
6696 args->old_bucket->bu_bhs[0],
6697 bucket_xh(args->old_bucket),
6698 args->new_bucket->bu_bhs[0],
6699 bucket_xh(args->new_bucket),
6700 &vb, meta_ac,
6701 ocfs2_get_reflink_xattr_value_root,
6702 args);
6703 if (ret) {
6704 mlog_errno(ret);
6705 break;
6706 }
6707
6708 /*
6709 * Re-access and dirty the bucket to calculate metaecc.
6710 * Because we may extend the transaction in reflink_xattr_header
6711 * which will let the already accessed block gone.
6712 */
6713 ret = ocfs2_xattr_bucket_journal_access(handle,
6714 args->new_bucket,
6715 OCFS2_JOURNAL_ACCESS_WRITE);
6716 if (ret) {
6717 mlog_errno(ret);
6718 break;
6719 }
6720
6721 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6722 ocfs2_xattr_bucket_relse(args->old_bucket);
6723 ocfs2_xattr_bucket_relse(args->new_bucket);
6724 }
6725
6726 ocfs2_xattr_bucket_relse(args->old_bucket);
6727 ocfs2_xattr_bucket_relse(args->new_bucket);
6728 return ret;
6729}
6730/*
6731 * Create the same xattr extent record in the new inode's xattr tree.
6732 */
6733static int ocfs2_reflink_xattr_rec(struct inode *inode,
6734 struct buffer_head *root_bh,
6735 u64 blkno,
6736 u32 cpos,
6737 u32 len,
6738 void *para)
6739{
6740 int ret, credits = 0;
6741 u32 p_cluster, num_clusters;
6742 u64 new_blkno;
6743 handle_t *handle;
6744 struct ocfs2_reflink_xattr_tree_args *args =
6745 (struct ocfs2_reflink_xattr_tree_args *)para;
6746 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6747 struct ocfs2_alloc_context *meta_ac = NULL;
6748 struct ocfs2_alloc_context *data_ac = NULL;
6749 struct ocfs2_extent_tree et;
6750
6751 ocfs2_init_xattr_tree_extent_tree(&et,
6752 INODE_CACHE(args->reflink->new_inode),
6753 args->new_blk_bh);
6754
6755 ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
6756 len, &credits,
6757 &meta_ac, &data_ac);
6758 if (ret) {
6759 mlog_errno(ret);
6760 goto out;
6761 }
6762
6763 handle = ocfs2_start_trans(osb, credits);
6764 if (IS_ERR(handle)) {
6765 ret = PTR_ERR(handle);
6766 mlog_errno(ret);
6767 goto out;
6768 }
6769
6770 ret = ocfs2_claim_clusters(osb, handle, data_ac,
6771 len, &p_cluster, &num_clusters);
6772 if (ret) {
6773 mlog_errno(ret);
6774 goto out_commit;
6775 }
6776
6777 new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
6778
6779 mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
6780 (unsigned long long)blkno, (unsigned long long)new_blkno, len);
6781 ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
6782 meta_ac, data_ac, args);
6783 if (ret) {
6784 mlog_errno(ret);
6785 goto out_commit;
6786 }
6787
6788 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
6789 (unsigned long long)new_blkno, len, cpos);
6790 ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
6791 len, 0, meta_ac);
6792 if (ret)
6793 mlog_errno(ret);
6794
6795out_commit:
6796 ocfs2_commit_trans(osb, handle);
6797
6798out:
6799 if (meta_ac)
6800 ocfs2_free_alloc_context(meta_ac);
6801 if (data_ac)
6802 ocfs2_free_alloc_context(data_ac);
6803 return ret;
6804}
6805
6806/*
6807 * Create reflinked xattr buckets.
6808 * We will add bucket one by one, and refcount all the xattrs in the bucket
6809 * if they are stored outside.
6810 */
6811static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
6812 struct buffer_head *blk_bh,
6813 struct buffer_head *new_blk_bh)
6814{
6815 int ret;
6816 struct ocfs2_reflink_xattr_tree_args para;
6817
6818 memset(&para, 0, sizeof(para));
6819 para.reflink = args;
6820 para.old_blk_bh = blk_bh;
6821 para.new_blk_bh = new_blk_bh;
6822
6823 para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
6824 if (!para.old_bucket) {
6825 mlog_errno(-ENOMEM);
6826 return -ENOMEM;
6827 }
6828
6829 para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
6830 if (!para.new_bucket) {
6831 ret = -ENOMEM;
6832 mlog_errno(ret);
6833 goto out;
6834 }
6835
6836 ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
6837 ocfs2_reflink_xattr_rec,
6838 &para);
6839 if (ret)
6840 mlog_errno(ret);
6841
6842out:
6843 ocfs2_xattr_bucket_free(para.old_bucket);
6844 ocfs2_xattr_bucket_free(para.new_bucket);
6845 return ret;
6846}
6847
6848static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
6849 struct buffer_head *blk_bh)
6850{
6851 int ret, indexed = 0;
6852 struct buffer_head *new_blk_bh = NULL;
6853 struct ocfs2_xattr_block *xb =
6854 (struct ocfs2_xattr_block *)blk_bh->b_data;
6855
6856
6857 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
6858 indexed = 1;
6859
6860 ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
6861 &new_blk_bh, indexed);
6862 if (ret) {
6863 mlog_errno(ret);
6864 goto out;
6865 }
6866
6867 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
6868 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
6869 else
6870 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
6871 if (ret)
6872 mlog_errno(ret);
6873
6874out:
6875 brelse(new_blk_bh);
6876 return ret;
6877}
6878
6879static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
6880{
6881 int type = ocfs2_xattr_get_type(xe);
6882
6883 return type != OCFS2_XATTR_INDEX_SECURITY &&
6884 type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
6885 type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
6886}
6887
6888int ocfs2_reflink_xattrs(struct inode *old_inode,
6889 struct buffer_head *old_bh,
6890 struct inode *new_inode,
6891 struct buffer_head *new_bh,
6892 bool preserve_security)
6893{
6894 int ret;
6895 struct ocfs2_xattr_reflink args;
6896 struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
6897 struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
6898 struct buffer_head *blk_bh = NULL;
6899 struct ocfs2_cached_dealloc_ctxt dealloc;
6900 struct ocfs2_refcount_tree *ref_tree;
6901 struct buffer_head *ref_root_bh = NULL;
6902
6903 ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
6904 le64_to_cpu(di->i_refcount_loc),
6905 1, &ref_tree, &ref_root_bh);
6906 if (ret) {
6907 mlog_errno(ret);
6908 goto out;
6909 }
6910
6911 ocfs2_init_dealloc_ctxt(&dealloc);
6912
6913 args.old_inode = old_inode;
6914 args.new_inode = new_inode;
6915 args.old_bh = old_bh;
6916 args.new_bh = new_bh;
6917 args.ref_ci = &ref_tree->rf_ci;
6918 args.ref_root_bh = ref_root_bh;
6919 args.dealloc = &dealloc;
6920 if (preserve_security)
6921 args.xattr_reflinked = NULL;
6922 else
6923 args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
6924
6925 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
6926 ret = ocfs2_reflink_xattr_inline(&args);
6927 if (ret) {
6928 mlog_errno(ret);
6929 goto out_unlock;
6930 }
6931 }
6932
6933 if (!di->i_xattr_loc)
6934 goto out_unlock;
6935
6936 ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
6937 &blk_bh);
6938 if (ret < 0) {
6939 mlog_errno(ret);
6940 goto out_unlock;
6941 }
6942
6943 ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
6944 if (ret)
6945 mlog_errno(ret);
6946
6947 brelse(blk_bh);
6948
6949out_unlock:
6950 ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
6951 ref_tree, 1);
6952 brelse(ref_root_bh);
6953
6954 if (ocfs2_dealloc_has_cluster(&dealloc)) {
6955 ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
6956 ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
5302 } 6957 }
5303 6958
5304out: 6959out:
@@ -5306,6 +6961,51 @@ out:
5306} 6961}
5307 6962
5308/* 6963/*
6964 * Initialize security and acl for a already created inode.
6965 * Used for reflink a non-preserve-security file.
6966 *
6967 * It uses common api like ocfs2_xattr_set, so the caller
6968 * must not hold any lock expect i_mutex.
6969 */
6970int ocfs2_init_security_and_acl(struct inode *dir,
6971 struct inode *inode)
6972{
6973 int ret = 0;
6974 struct buffer_head *dir_bh = NULL;
6975 struct ocfs2_security_xattr_info si = {
6976 .enable = 1,
6977 };
6978
6979 ret = ocfs2_init_security_get(inode, dir, &si);
6980 if (!ret) {
6981 ret = ocfs2_xattr_security_set(inode, si.name,
6982 si.value, si.value_len,
6983 XATTR_CREATE);
6984 if (ret) {
6985 mlog_errno(ret);
6986 goto leave;
6987 }
6988 } else if (ret != -EOPNOTSUPP) {
6989 mlog_errno(ret);
6990 goto leave;
6991 }
6992
6993 ret = ocfs2_inode_lock(dir, &dir_bh, 0);
6994 if (ret) {
6995 mlog_errno(ret);
6996 goto leave;
6997 }
6998
6999 ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
7000 if (ret)
7001 mlog_errno(ret);
7002
7003 ocfs2_inode_unlock(dir, 0);
7004 brelse(dir_bh);
7005leave:
7006 return ret;
7007}
7008/*
5309 * 'security' attributes support 7009 * 'security' attributes support
5310 */ 7010 */
5311static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, 7011static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a1b7bc..08e36389f56d 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55 int, const char *, const void *, size_t, int, 55 int, const char *, const void *, size_t, int,
56 struct ocfs2_alloc_context *, 56 struct ocfs2_alloc_context *,
57 struct ocfs2_alloc_context *); 57 struct ocfs2_alloc_context *);
58int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
59 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 60int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 61int ocfs2_init_security_get(struct inode *, struct inode *,
60 struct ocfs2_security_xattr_info *); 62 struct ocfs2_security_xattr_info *);
@@ -83,5 +85,16 @@ struct ocfs2_xattr_value_buf {
83 struct ocfs2_xattr_value_root *vb_xv; 85 struct ocfs2_xattr_value_root *vb_xv;
84}; 86};
85 87
86 88int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
89 struct buffer_head *fe_bh,
90 struct ocfs2_caching_info *ref_ci,
91 struct buffer_head *ref_root_bh,
92 struct ocfs2_cached_dealloc_ctxt *dealloc);
93int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *old_bh,
95 struct inode *new_inode,
96 struct buffer_head *new_bh,
97 bool preserve_security);
98int ocfs2_init_security_and_acl(struct inode *dir,
99 struct inode *inode);
87#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c7275cfbdcfb..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -489,7 +489,7 @@ out:
489 return ret; 489 return ret;
490} 490}
491 491
492struct inode_operations omfs_dir_inops = { 492const struct inode_operations omfs_dir_inops = {
493 .lookup = omfs_lookup, 493 .lookup = omfs_lookup,
494 .mkdir = omfs_mkdir, 494 .mkdir = omfs_mkdir,
495 .rename = omfs_rename, 495 .rename = omfs_rename,
@@ -498,7 +498,7 @@ struct inode_operations omfs_dir_inops = {
498 .rmdir = omfs_rmdir, 498 .rmdir = omfs_rmdir,
499}; 499};
500 500
501struct file_operations omfs_dir_operations = { 501const struct file_operations omfs_dir_operations = {
502 .read = generic_read_dir, 502 .read = generic_read_dir,
503 .readdir = omfs_readdir, 503 .readdir = omfs_readdir,
504 .llseek = generic_file_llseek, 504 .llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d17e774eaf45..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
322 return generic_block_bmap(mapping, block, omfs_get_block); 322 return generic_block_bmap(mapping, block, omfs_get_block);
323} 323}
324 324
325struct file_operations omfs_file_operations = { 325const struct file_operations omfs_file_operations = {
326 .llseek = generic_file_llseek, 326 .llseek = generic_file_llseek,
327 .read = do_sync_read, 327 .read = do_sync_read,
328 .write = do_sync_write, 328 .write = do_sync_write,
@@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = {
333 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
334}; 334};
335 335
336struct inode_operations omfs_file_inops = { 336const struct inode_operations omfs_file_inops = {
337 .truncate = omfs_truncate 337 .truncate = omfs_truncate
338}; 338};
339 339
340struct address_space_operations omfs_aops = { 340const struct address_space_operations omfs_aops = {
341 .readpage = omfs_readpage, 341 .readpage = omfs_readpage,
342 .readpages = omfs_readpages, 342 .readpages = omfs_readpages,
343 .writepage = omfs_writepage, 343 .writepage = omfs_writepage,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 379ae5fb4411..f3b7c1541f3a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
278 return 0; 278 return 0;
279} 279}
280 280
281static struct super_operations omfs_sops = { 281static const struct super_operations omfs_sops = {
282 .write_inode = omfs_write_inode, 282 .write_inode = omfs_write_inode,
283 .delete_inode = omfs_delete_inode, 283 .delete_inode = omfs_delete_inode,
284 .put_super = omfs_put_super, 284 .put_super = omfs_put_super,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 2bc0f0670406..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,16 +44,16 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
44extern int omfs_clear_range(struct super_block *sb, u64 block, int count); 44extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
45 45
46/* dir.c */ 46/* dir.c */
47extern struct file_operations omfs_dir_operations; 47extern const struct file_operations omfs_dir_operations;
48extern struct inode_operations omfs_dir_inops; 48extern const struct inode_operations omfs_dir_inops;
49extern int omfs_make_empty(struct inode *inode, struct super_block *sb); 49extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, 50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
51 u64 fsblock); 51 u64 fsblock);
52 52
53/* file.c */ 53/* file.c */
54extern struct file_operations omfs_file_operations; 54extern const struct file_operations omfs_file_operations;
55extern struct inode_operations omfs_file_inops; 55extern const struct inode_operations omfs_file_inops;
56extern struct address_space_operations omfs_aops; 56extern const struct address_space_operations omfs_aops;
57extern void omfs_make_empty_table(struct buffer_head *bh, int offset); 57extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
58extern int omfs_shrink_inode(struct inode *inode); 58extern int omfs_shrink_inode(struct inode *inode);
59 59
diff --git a/fs/open.c b/fs/open.c
index 31191bf513e4..4f01e06227c6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -290,10 +290,9 @@ out:
290 return error; 290 return error;
291} 291}
292 292
293SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length) 293SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
294{ 294{
295 /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */ 295 return do_sys_truncate(path, length);
296 return do_sys_truncate(path, (long)length);
297} 296}
298 297
299static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) 298static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index fbeaddf595d3..f38fee0311a7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,19 +248,11 @@ ssize_t part_stat_show(struct device *dev,
248 part_stat_read(p, merges[WRITE]), 248 part_stat_read(p, merges[WRITE]),
249 (unsigned long long)part_stat_read(p, sectors[WRITE]), 249 (unsigned long long)part_stat_read(p, sectors[WRITE]),
250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), 250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
251 part_in_flight(p), 251 p->in_flight,
252 jiffies_to_msecs(part_stat_read(p, io_ticks)), 252 jiffies_to_msecs(part_stat_read(p, io_ticks)),
253 jiffies_to_msecs(part_stat_read(p, time_in_queue))); 253 jiffies_to_msecs(part_stat_read(p, time_in_queue)));
254} 254}
255 255
256ssize_t part_inflight_show(struct device *dev,
257 struct device_attribute *attr, char *buf)
258{
259 struct hd_struct *p = dev_to_part(dev);
260
261 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
262}
263
264#ifdef CONFIG_FAIL_MAKE_REQUEST 256#ifdef CONFIG_FAIL_MAKE_REQUEST
265ssize_t part_fail_show(struct device *dev, 257ssize_t part_fail_show(struct device *dev,
266 struct device_attribute *attr, char *buf) 258 struct device_attribute *attr, char *buf)
@@ -289,7 +281,6 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
293#ifdef CONFIG_FAIL_MAKE_REQUEST 284#ifdef CONFIG_FAIL_MAKE_REQUEST
294static struct device_attribute dev_attr_fail = 285static struct device_attribute dev_attr_fail =
295 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); 286 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -301,7 +292,6 @@ static struct attribute *part_attrs[] = {
301 &dev_attr_size.attr, 292 &dev_attr_size.attr,
302 &dev_attr_alignment_offset.attr, 293 &dev_attr_alignment_offset.attr,
303 &dev_attr_stat.attr, 294 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr,
305#ifdef CONFIG_FAIL_MAKE_REQUEST 295#ifdef CONFIG_FAIL_MAKE_REQUEST
306 &dev_attr_fail.attr, 296 &dev_attr_fail.attr,
307#endif 297#endif
@@ -581,7 +571,7 @@ try_scan:
581 } 571 }
582 572
583 if (from + size > get_capacity(disk)) { 573 if (from + size > get_capacity(disk)) {
584 struct block_device_operations *bdops = disk->fops; 574 const struct block_device_operations *bdops = disk->fops;
585 unsigned long long capacity; 575 unsigned long long capacity;
586 576
587 printk(KERN_WARNING 577 printk(KERN_WARNING
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..07f77a7945c3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,6 +82,7 @@
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/ptrace.h> 83#include <linux/ptrace.h>
84#include <linux/tracehook.h> 84#include <linux/tracehook.h>
85#include <linux/swapops.h>
85 86
86#include <asm/pgtable.h> 87#include <asm/pgtable.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -321,6 +322,94 @@ static inline void task_context_switch_counts(struct seq_file *m,
321 p->nivcsw); 322 p->nivcsw);
322} 323}
323 324
325#ifdef CONFIG_MMU
326
327struct stack_stats {
328 struct vm_area_struct *vma;
329 unsigned long startpage;
330 unsigned long usage;
331};
332
333static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
334 unsigned long end, struct mm_walk *walk)
335{
336 struct stack_stats *ss = walk->private;
337 struct vm_area_struct *vma = ss->vma;
338 pte_t *pte, ptent;
339 spinlock_t *ptl;
340 int ret = 0;
341
342 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
343 for (; addr != end; pte++, addr += PAGE_SIZE) {
344 ptent = *pte;
345
346#ifdef CONFIG_STACK_GROWSUP
347 if (pte_present(ptent) || is_swap_pte(ptent))
348 ss->usage = addr - ss->startpage + PAGE_SIZE;
349#else
350 if (pte_present(ptent) || is_swap_pte(ptent)) {
351 ss->usage = ss->startpage - addr + PAGE_SIZE;
352 pte++;
353 ret = 1;
354 break;
355 }
356#endif
357 }
358 pte_unmap_unlock(pte - 1, ptl);
359 cond_resched();
360 return ret;
361}
362
363static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
364 struct task_struct *task)
365{
366 struct stack_stats ss;
367 struct mm_walk stack_walk = {
368 .pmd_entry = stack_usage_pte_range,
369 .mm = vma->vm_mm,
370 .private = &ss,
371 };
372
373 if (!vma->vm_mm || is_vm_hugetlb_page(vma))
374 return 0;
375
376 ss.vma = vma;
377 ss.startpage = task->stack_start & PAGE_MASK;
378 ss.usage = 0;
379
380#ifdef CONFIG_STACK_GROWSUP
381 walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
382 &stack_walk);
383#else
384 walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
385 &stack_walk);
386#endif
387 return ss.usage;
388}
389
390static inline void task_show_stack_usage(struct seq_file *m,
391 struct task_struct *task)
392{
393 struct vm_area_struct *vma;
394 struct mm_struct *mm = get_task_mm(task);
395
396 if (mm) {
397 down_read(&mm->mmap_sem);
398 vma = find_vma(mm, task->stack_start);
399 if (vma)
400 seq_printf(m, "Stack usage:\t%lu kB\n",
401 get_stack_usage_in_bytes(vma, task) >> 10);
402
403 up_read(&mm->mmap_sem);
404 mmput(mm);
405 }
406}
407#else
408static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
409{
410}
411#endif /* CONFIG_MMU */
412
324int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 413int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
325 struct pid *pid, struct task_struct *task) 414 struct pid *pid, struct task_struct *task)
326{ 415{
@@ -340,6 +429,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
340 task_show_regs(m, task); 429 task_show_regs(m, task);
341#endif 430#endif
342 task_context_switch_counts(m, task); 431 task_context_switch_counts(m, task);
432 task_show_stack_usage(m, task);
343 return 0; 433 return 0;
344} 434}
345 435
@@ -481,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
481 rsslim, 571 rsslim,
482 mm ? mm->start_code : 0, 572 mm ? mm->start_code : 0,
483 mm ? mm->end_code : 0, 573 mm ? mm->end_code : 0,
484 (permitted && mm) ? mm->start_stack : 0, 574 (permitted) ? task->stack_start : 0,
485 esp, 575 esp,
486 eip, 576 eip,
487 /* The signal information here is obsolete. 577 /* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..837469a96598 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
447 447
448 do_posix_clock_monotonic_gettime(&uptime); 448 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 449 read_lock(&tasklist_lock);
450 points = badness(task, uptime.tv_sec); 450 points = badness(task->group_leader, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 451 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 452 return sprintf(buffer, "%lu\n", points);
453} 453}
@@ -458,7 +458,7 @@ struct limit_names {
458}; 458};
459 459
460static const struct limit_names lnames[RLIM_NLIMITS] = { 460static const struct limit_names lnames[RLIM_NLIMITS] = {
461 [RLIMIT_CPU] = {"Max cpu time", "ms"}, 461 [RLIMIT_CPU] = {"Max cpu time", "seconds"},
462 [RLIMIT_FSIZE] = {"Max file size", "bytes"}, 462 [RLIMIT_FSIZE] = {"Max file size", "bytes"},
463 [RLIMIT_DATA] = {"Max data size", "bytes"}, 463 [RLIMIT_DATA] = {"Max data size", "bytes"},
464 [RLIMIT_STACK] = {"Max stack size", "bytes"}, 464 [RLIMIT_STACK] = {"Max stack size", "bytes"},
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
999 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 999 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1000 char buffer[PROC_NUMBUF]; 1000 char buffer[PROC_NUMBUF];
1001 size_t len; 1001 size_t len;
1002 int oom_adjust; 1002 int oom_adjust = OOM_DISABLE;
1003 unsigned long flags;
1003 1004
1004 if (!task) 1005 if (!task)
1005 return -ESRCH; 1006 return -ESRCH;
1006 oom_adjust = task->oomkilladj; 1007
1008 if (lock_task_sighand(task, &flags)) {
1009 oom_adjust = task->signal->oom_adj;
1010 unlock_task_sighand(task, &flags);
1011 }
1012
1007 put_task_struct(task); 1013 put_task_struct(task);
1008 1014
1009 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); 1015 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1015 size_t count, loff_t *ppos) 1021 size_t count, loff_t *ppos)
1016{ 1022{
1017 struct task_struct *task; 1023 struct task_struct *task;
1018 char buffer[PROC_NUMBUF], *end; 1024 char buffer[PROC_NUMBUF];
1019 int oom_adjust; 1025 long oom_adjust;
1026 unsigned long flags;
1027 int err;
1020 1028
1021 memset(buffer, 0, sizeof(buffer)); 1029 memset(buffer, 0, sizeof(buffer));
1022 if (count > sizeof(buffer) - 1) 1030 if (count > sizeof(buffer) - 1)
1023 count = sizeof(buffer) - 1; 1031 count = sizeof(buffer) - 1;
1024 if (copy_from_user(buffer, buf, count)) 1032 if (copy_from_user(buffer, buf, count))
1025 return -EFAULT; 1033 return -EFAULT;
1026 oom_adjust = simple_strtol(buffer, &end, 0); 1034
1035 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1036 if (err)
1037 return -EINVAL;
1027 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1038 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1028 oom_adjust != OOM_DISABLE) 1039 oom_adjust != OOM_DISABLE)
1029 return -EINVAL; 1040 return -EINVAL;
1030 if (*end == '\n') 1041
1031 end++;
1032 task = get_proc_task(file->f_path.dentry->d_inode); 1042 task = get_proc_task(file->f_path.dentry->d_inode);
1033 if (!task) 1043 if (!task)
1034 return -ESRCH; 1044 return -ESRCH;
1035 if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { 1045 if (!lock_task_sighand(task, &flags)) {
1046 put_task_struct(task);
1047 return -ESRCH;
1048 }
1049
1050 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1051 unlock_task_sighand(task, &flags);
1036 put_task_struct(task); 1052 put_task_struct(task);
1037 return -EACCES; 1053 return -EACCES;
1038 } 1054 }
1039 task->oomkilladj = oom_adjust; 1055
1056 task->signal->oom_adj = oom_adjust;
1057
1058 unlock_task_sighand(task, &flags);
1040 put_task_struct(task); 1059 put_task_struct(task);
1041 if (end - buffer == 0) 1060
1042 return -EIO; 1061 return count;
1043 return end - buffer;
1044} 1062}
1045 1063
1046static const struct file_operations proc_oom_adjust_operations = { 1064static const struct file_operations proc_oom_adjust_operations = {
@@ -1169,17 +1187,16 @@ static ssize_t proc_fault_inject_write(struct file * file,
1169 count = sizeof(buffer) - 1; 1187 count = sizeof(buffer) - 1;
1170 if (copy_from_user(buffer, buf, count)) 1188 if (copy_from_user(buffer, buf, count))
1171 return -EFAULT; 1189 return -EFAULT;
1172 make_it_fail = simple_strtol(buffer, &end, 0); 1190 make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
1173 if (*end == '\n') 1191 if (*end)
1174 end++; 1192 return -EINVAL;
1175 task = get_proc_task(file->f_dentry->d_inode); 1193 task = get_proc_task(file->f_dentry->d_inode);
1176 if (!task) 1194 if (!task)
1177 return -ESRCH; 1195 return -ESRCH;
1178 task->make_it_fail = make_it_fail; 1196 task->make_it_fail = make_it_fail;
1179 put_task_struct(task); 1197 put_task_struct(task);
1180 if (end - buffer == 0) 1198
1181 return -EIO; 1199 return count;
1182 return end - buffer;
1183} 1200}
1184 1201
1185static const struct file_operations proc_fault_inject_operations = { 1202static const struct file_operations proc_fault_inject_operations = {
@@ -2586,9 +2603,6 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2586 dput(dentry); 2603 dput(dentry);
2587 } 2604 }
2588 2605
2589 if (tgid == 0)
2590 goto out;
2591
2592 name.name = buf; 2606 name.name = buf;
2593 name.len = snprintf(buf, sizeof(buf), "%d", tgid); 2607 name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2594 leader = d_hash_and_lookup(mnt->mnt_root, &name); 2608 leader = d_hash_and_lookup(mnt->mnt_root, &name);
@@ -2645,17 +2659,16 @@ out:
2645void proc_flush_task(struct task_struct *task) 2659void proc_flush_task(struct task_struct *task)
2646{ 2660{
2647 int i; 2661 int i;
2648 struct pid *pid, *tgid = NULL; 2662 struct pid *pid, *tgid;
2649 struct upid *upid; 2663 struct upid *upid;
2650 2664
2651 pid = task_pid(task); 2665 pid = task_pid(task);
2652 if (thread_group_leader(task)) 2666 tgid = task_tgid(task);
2653 tgid = task_tgid(task);
2654 2667
2655 for (i = 0; i <= pid->level; i++) { 2668 for (i = 0; i <= pid->level; i++) {
2656 upid = &pid->numbers[i]; 2669 upid = &pid->numbers[i];
2657 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2670 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2658 tgid ? tgid->numbers[i].nr : 0); 2671 tgid->numbers[i].nr);
2659 } 2672 }
2660 2673
2661 upid = &pid->numbers[pid->level]; 2674 upid = &pid->numbers[pid->level];
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 59b43a068872..56013371f9f3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,9 +17,15 @@
17#include <linux/elfcore.h> 17#include <linux/elfcore.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h>
20#include <linux/init.h> 21#include <linux/init.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include <asm/io.h> 23#include <asm/io.h>
24#include <linux/list.h>
25#include <linux/ioport.h>
26#include <linux/mm.h>
27#include <linux/memory.h>
28#include <asm/sections.h>
23 29
24#define CORE_STR "CORE" 30#define CORE_STR "CORE"
25 31
@@ -29,17 +35,6 @@
29 35
30static struct proc_dir_entry *proc_root_kcore; 36static struct proc_dir_entry *proc_root_kcore;
31 37
32static int open_kcore(struct inode * inode, struct file * filp)
33{
34 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
35}
36
37static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
38
39static const struct file_operations proc_kcore_operations = {
40 .read = read_kcore,
41 .open = open_kcore,
42};
43 38
44#ifndef kc_vaddr_to_offset 39#ifndef kc_vaddr_to_offset
45#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) 40#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
@@ -57,18 +52,19 @@ struct memelfnote
57 void *data; 52 void *data;
58}; 53};
59 54
60static struct kcore_list *kclist; 55static LIST_HEAD(kclist_head);
61static DEFINE_RWLOCK(kclist_lock); 56static DEFINE_RWLOCK(kclist_lock);
57static int kcore_need_update = 1;
62 58
63void 59void
64kclist_add(struct kcore_list *new, void *addr, size_t size) 60kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
65{ 61{
66 new->addr = (unsigned long)addr; 62 new->addr = (unsigned long)addr;
67 new->size = size; 63 new->size = size;
64 new->type = type;
68 65
69 write_lock(&kclist_lock); 66 write_lock(&kclist_lock);
70 new->next = kclist; 67 list_add_tail(&new->list, &kclist_head);
71 kclist = new;
72 write_unlock(&kclist_lock); 68 write_unlock(&kclist_lock);
73} 69}
74 70
@@ -80,7 +76,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
80 *nphdr = 1; /* PT_NOTE */ 76 *nphdr = 1; /* PT_NOTE */
81 size = 0; 77 size = 0;
82 78
83 for (m=kclist; m; m=m->next) { 79 list_for_each_entry(m, &kclist_head, list) {
84 try = kc_vaddr_to_offset((size_t)m->addr + m->size); 80 try = kc_vaddr_to_offset((size_t)m->addr + m->size);
85 if (try > size) 81 if (try > size)
86 size = try; 82 size = try;
@@ -97,6 +93,177 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
97 return size + *elf_buflen; 93 return size + *elf_buflen;
98} 94}
99 95
96static void free_kclist_ents(struct list_head *head)
97{
98 struct kcore_list *tmp, *pos;
99
100 list_for_each_entry_safe(pos, tmp, head, list) {
101 list_del(&pos->list);
102 kfree(pos);
103 }
104}
105/*
106 * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
107 */
108static void __kcore_update_ram(struct list_head *list)
109{
110 int nphdr;
111 size_t size;
112 struct kcore_list *tmp, *pos;
113 LIST_HEAD(garbage);
114
115 write_lock(&kclist_lock);
116 if (kcore_need_update) {
117 list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
118 if (pos->type == KCORE_RAM
119 || pos->type == KCORE_VMEMMAP)
120 list_move(&pos->list, &garbage);
121 }
122 list_splice_tail(list, &kclist_head);
123 } else
124 list_splice(list, &garbage);
125 kcore_need_update = 0;
126 proc_root_kcore->size = get_kcore_size(&nphdr, &size);
127 write_unlock(&kclist_lock);
128
129 free_kclist_ents(&garbage);
130}
131
132
133#ifdef CONFIG_HIGHMEM
134/*
135 * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
136 * because memory hole is not as big as !HIGHMEM case.
137 * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
138 */
139static int kcore_update_ram(void)
140{
141 LIST_HEAD(head);
142 struct kcore_list *ent;
143 int ret = 0;
144
145 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
146 if (!ent)
147 return -ENOMEM;
148 ent->addr = (unsigned long)__va(0);
149 ent->size = max_low_pfn << PAGE_SHIFT;
150 ent->type = KCORE_RAM;
151 list_add(&ent->list, &head);
152 __kcore_update_ram(&head);
153 return ret;
154}
155
156#else /* !CONFIG_HIGHMEM */
157
158#ifdef CONFIG_SPARSEMEM_VMEMMAP
159/* calculate vmemmap's address from given system ram pfn and register it */
160int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
161{
162 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
163 unsigned long nr_pages = ent->size >> PAGE_SHIFT;
164 unsigned long start, end;
165 struct kcore_list *vmm, *tmp;
166
167
168 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
169 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
170 end = ALIGN(end, PAGE_SIZE);
171 /* overlap check (because we have to align page */
172 list_for_each_entry(tmp, head, list) {
173 if (tmp->type != KCORE_VMEMMAP)
174 continue;
175 if (start < tmp->addr + tmp->size)
176 if (end > tmp->addr)
177 end = tmp->addr;
178 }
179 if (start < end) {
180 vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
181 if (!vmm)
182 return 0;
183 vmm->addr = start;
184 vmm->size = end - start;
185 vmm->type = KCORE_VMEMMAP;
186 list_add_tail(&vmm->list, head);
187 }
188 return 1;
189
190}
191#else
192int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
193{
194 return 1;
195}
196
197#endif
198
199static int
200kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
201{
202 struct list_head *head = (struct list_head *)arg;
203 struct kcore_list *ent;
204
205 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
206 if (!ent)
207 return -ENOMEM;
208 ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
209 ent->size = nr_pages << PAGE_SHIFT;
210
211 /* Sanity check: Can happen in 32bit arch...maybe */
212 if (ent->addr < (unsigned long) __va(0))
213 goto free_out;
214
215 /* cut not-mapped area. ....from ppc-32 code. */
216 if (ULONG_MAX - ent->addr < ent->size)
217 ent->size = ULONG_MAX - ent->addr;
218
219 /* cut when vmalloc() area is higher than direct-map area */
220 if (VMALLOC_START > (unsigned long)__va(0)) {
221 if (ent->addr > VMALLOC_START)
222 goto free_out;
223 if (VMALLOC_START - ent->addr < ent->size)
224 ent->size = VMALLOC_START - ent->addr;
225 }
226
227 ent->type = KCORE_RAM;
228 list_add_tail(&ent->list, head);
229
230 if (!get_sparsemem_vmemmap_info(ent, head)) {
231 list_del(&ent->list);
232 goto free_out;
233 }
234
235 return 0;
236free_out:
237 kfree(ent);
238 return 1;
239}
240
241static int kcore_update_ram(void)
242{
243 int nid, ret;
244 unsigned long end_pfn;
245 LIST_HEAD(head);
246
247 /* Not inialized....update now */
248 /* find out "max pfn" */
249 end_pfn = 0;
250 for_each_node_state(nid, N_HIGH_MEMORY) {
251 unsigned long node_end;
252 node_end = NODE_DATA(nid)->node_start_pfn +
253 NODE_DATA(nid)->node_spanned_pages;
254 if (end_pfn < node_end)
255 end_pfn = node_end;
256 }
257 /* scan 0 to max_pfn */
258 ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
259 if (ret) {
260 free_kclist_ents(&head);
261 return -ENOMEM;
262 }
263 __kcore_update_ram(&head);
264 return ret;
265}
266#endif /* CONFIG_HIGHMEM */
100 267
101/*****************************************************************************/ 268/*****************************************************************************/
102/* 269/*
@@ -192,7 +359,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
192 nhdr->p_align = 0; 359 nhdr->p_align = 0;
193 360
194 /* setup ELF PT_LOAD program header for every area */ 361 /* setup ELF PT_LOAD program header for every area */
195 for (m=kclist; m; m=m->next) { 362 list_for_each_entry(m, &kclist_head, list) {
196 phdr = (struct elf_phdr *) bufp; 363 phdr = (struct elf_phdr *) bufp;
197 bufp += sizeof(struct elf_phdr); 364 bufp += sizeof(struct elf_phdr);
198 offset += sizeof(struct elf_phdr); 365 offset += sizeof(struct elf_phdr);
@@ -265,7 +432,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
265 unsigned long start; 432 unsigned long start;
266 433
267 read_lock(&kclist_lock); 434 read_lock(&kclist_lock);
268 proc_root_kcore->size = size = get_kcore_size(&nphdr, &elf_buflen); 435 size = get_kcore_size(&nphdr, &elf_buflen);
436
269 if (buflen == 0 || *fpos >= size) { 437 if (buflen == 0 || *fpos >= size) {
270 read_unlock(&kclist_lock); 438 read_unlock(&kclist_lock);
271 return 0; 439 return 0;
@@ -317,7 +485,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
317 struct kcore_list *m; 485 struct kcore_list *m;
318 486
319 read_lock(&kclist_lock); 487 read_lock(&kclist_lock);
320 for (m=kclist; m; m=m->next) { 488 list_for_each_entry(m, &kclist_head, list) {
321 if (start >= m->addr && start < (m->addr+m->size)) 489 if (start >= m->addr && start < (m->addr+m->size))
322 break; 490 break;
323 } 491 }
@@ -326,45 +494,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
326 if (m == NULL) { 494 if (m == NULL) {
327 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
328 return -EFAULT; 496 return -EFAULT;
329 } else if (is_vmalloc_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
330 char * elf_buf; 498 char * elf_buf;
331 struct vm_struct *m;
332 unsigned long curstart = start;
333 unsigned long cursize = tsz;
334 499
335 elf_buf = kzalloc(tsz, GFP_KERNEL); 500 elf_buf = kzalloc(tsz, GFP_KERNEL);
336 if (!elf_buf) 501 if (!elf_buf)
337 return -ENOMEM; 502 return -ENOMEM;
338 503 vread(elf_buf, (char *)start, tsz);
339 read_lock(&vmlist_lock); 504 /* we have to zero-fill user buffer even if no read */
340 for (m=vmlist; m && cursize; m=m->next) {
341 unsigned long vmstart;
342 unsigned long vmsize;
343 unsigned long msize = m->size - PAGE_SIZE;
344
345 if (((unsigned long)m->addr + msize) <
346 curstart)
347 continue;
348 if ((unsigned long)m->addr > (curstart +
349 cursize))
350 break;
351 vmstart = (curstart < (unsigned long)m->addr ?
352 (unsigned long)m->addr : curstart);
353 if (((unsigned long)m->addr + msize) >
354 (curstart + cursize))
355 vmsize = curstart + cursize - vmstart;
356 else
357 vmsize = (unsigned long)m->addr +
358 msize - vmstart;
359 curstart = vmstart + vmsize;
360 cursize -= vmsize;
361 /* don't dump ioremap'd stuff! (TA) */
362 if (m->flags & VM_IOREMAP)
363 continue;
364 memcpy(elf_buf + (vmstart - start),
365 (char *)vmstart, vmsize);
366 }
367 read_unlock(&vmlist_lock);
368 if (copy_to_user(buffer, elf_buf, tsz)) { 505 if (copy_to_user(buffer, elf_buf, tsz)) {
369 kfree(elf_buf); 506 kfree(elf_buf);
370 return -EFAULT; 507 return -EFAULT;
@@ -402,12 +539,96 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
402 return acc; 539 return acc;
403} 540}
404 541
542
543static int open_kcore(struct inode *inode, struct file *filp)
544{
545 if (!capable(CAP_SYS_RAWIO))
546 return -EPERM;
547 if (kcore_need_update)
548 kcore_update_ram();
549 if (i_size_read(inode) != proc_root_kcore->size) {
550 mutex_lock(&inode->i_mutex);
551 i_size_write(inode, proc_root_kcore->size);
552 mutex_unlock(&inode->i_mutex);
553 }
554 return 0;
555}
556
557
558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore,
560 .open = open_kcore,
561};
562
563#ifdef CONFIG_MEMORY_HOTPLUG
564/* just remember that we have to update kcore */
565static int __meminit kcore_callback(struct notifier_block *self,
566 unsigned long action, void *arg)
567{
568 switch (action) {
569 case MEM_ONLINE:
570 case MEM_OFFLINE:
571 write_lock(&kclist_lock);
572 kcore_need_update = 1;
573 write_unlock(&kclist_lock);
574 }
575 return NOTIFY_OK;
576}
577#endif
578
579
580static struct kcore_list kcore_vmalloc;
581
582#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
583static struct kcore_list kcore_text;
584/*
585 * If defined, special segment is used for mapping kernel text instead of
586 * direct-map area. We need to create special TEXT section.
587 */
588static void __init proc_kcore_text_init(void)
589{
590 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
591}
592#else
593static void __init proc_kcore_text_init(void)
594{
595}
596#endif
597
598#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
599/*
600 * MODULES_VADDR has no intersection with VMALLOC_ADDR.
601 */
602struct kcore_list kcore_modules;
603static void __init add_modules_range(void)
604{
605 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
606 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
607}
608#else
609static void __init add_modules_range(void)
610{
611}
612#endif
613
405static int __init proc_kcore_init(void) 614static int __init proc_kcore_init(void)
406{ 615{
407 proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); 616 proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
408 if (proc_root_kcore) 617 &proc_kcore_operations);
409 proc_root_kcore->size = 618 if (!proc_root_kcore) {
410 (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; 619 printk(KERN_ERR "couldn't create /proc/kcore\n");
620 return 0; /* Always returns 0. */
621 }
622 /* Store text area if it's special */
623 proc_kcore_text_init();
624 /* Store vmalloc area */
625 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
626 VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
627 add_modules_range();
628 /* Store direct-map area from physical memory map */
629 kcore_update_ram();
630 hotplug_memory_notifier(kcore_callback, 0);
631
411 return 0; 632 return 0;
412} 633}
413module_init(proc_kcore_init); 634module_init(proc_kcore_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..c7bff4f603ff 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
81 "Writeback: %8lu kB\n" 81 "Writeback: %8lu kB\n"
82 "AnonPages: %8lu kB\n" 82 "AnonPages: %8lu kB\n"
83 "Mapped: %8lu kB\n" 83 "Mapped: %8lu kB\n"
84 "Shmem: %8lu kB\n"
84 "Slab: %8lu kB\n" 85 "Slab: %8lu kB\n"
85 "SReclaimable: %8lu kB\n" 86 "SReclaimable: %8lu kB\n"
86 "SUnreclaim: %8lu kB\n" 87 "SUnreclaim: %8lu kB\n"
88 "KernelStack: %8lu kB\n"
87 "PageTables: %8lu kB\n" 89 "PageTables: %8lu kB\n"
88#ifdef CONFIG_QUICKLIST 90#ifdef CONFIG_QUICKLIST
89 "Quicklists: %8lu kB\n" 91 "Quicklists: %8lu kB\n"
@@ -95,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
95 "Committed_AS: %8lu kB\n" 97 "Committed_AS: %8lu kB\n"
96 "VmallocTotal: %8lu kB\n" 98 "VmallocTotal: %8lu kB\n"
97 "VmallocUsed: %8lu kB\n" 99 "VmallocUsed: %8lu kB\n"
98 "VmallocChunk: %8lu kB\n", 100 "VmallocChunk: %8lu kB\n"
101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %8lu kB\n"
103#endif
104 ,
99 K(i.totalram), 105 K(i.totalram),
100 K(i.freeram), 106 K(i.freeram),
101 K(i.bufferram), 107 K(i.bufferram),
@@ -124,10 +130,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
124 K(global_page_state(NR_WRITEBACK)), 130 K(global_page_state(NR_WRITEBACK)),
125 K(global_page_state(NR_ANON_PAGES)), 131 K(global_page_state(NR_ANON_PAGES)),
126 K(global_page_state(NR_FILE_MAPPED)), 132 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)),
127 K(global_page_state(NR_SLAB_RECLAIMABLE) + 134 K(global_page_state(NR_SLAB_RECLAIMABLE) +
128 global_page_state(NR_SLAB_UNRECLAIMABLE)), 135 global_page_state(NR_SLAB_UNRECLAIMABLE)),
129 K(global_page_state(NR_SLAB_RECLAIMABLE)), 136 K(global_page_state(NR_SLAB_RECLAIMABLE)),
130 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 137 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
138 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
131 K(global_page_state(NR_PAGETABLE)), 139 K(global_page_state(NR_PAGETABLE)),
132#ifdef CONFIG_QUICKLIST 140#ifdef CONFIG_QUICKLIST
133 K(quicklist_total_size()), 141 K(quicklist_total_size()),
@@ -140,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
140 (unsigned long)VMALLOC_TOTAL >> 10, 148 (unsigned long)VMALLOC_TOTAL >> 10,
141 vmi.used >> 10, 149 vmi.used >> 10,
142 vmi.largest_chunk >> 10 150 vmi.largest_chunk >> 10
151#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif
143 ); 154 );
144 155
145 hugetlb_report_meminfo(m); 156 hugetlb_report_meminfo(m);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 7e14d1a04001..9fe7d7ebe115 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -109,7 +109,7 @@ static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
109 return rb_next((struct rb_node *) v); 109 return rb_next((struct rb_node *) v);
110} 110}
111 111
112static struct seq_operations proc_nommu_region_list_seqop = { 112static const struct seq_operations proc_nommu_region_list_seqop = {
113 .start = nommu_region_list_start, 113 .start = nommu_region_list_start,
114 .next = nommu_region_list_next, 114 .next = nommu_region_list_next,
115 .stop = nommu_region_list_stop, 115 .stop = nommu_region_list_stop,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..2281c2cbfe2b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
2#include <linux/compiler.h> 2#include <linux/compiler.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/ksm.h>
5#include <linux/mm.h> 6#include <linux/mm.h>
6#include <linux/mmzone.h> 7#include <linux/mmzone.h>
7#include <linux/proc_fs.h> 8#include <linux/proc_fs.h>
@@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = {
95#define KPF_UNEVICTABLE 18 96#define KPF_UNEVICTABLE 18
96#define KPF_NOPAGE 20 97#define KPF_NOPAGE 20
97 98
99#define KPF_KSM 21
100
98/* kernel hacking assistances 101/* kernel hacking assistances
99 * WARNING: subject to change, never rely on them! 102 * WARNING: subject to change, never rely on them!
100 */ 103 */
@@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page)
137 u |= 1 << KPF_MMAP; 140 u |= 1 << KPF_MMAP;
138 if (PageAnon(page)) 141 if (PageAnon(page))
139 u |= 1 << KPF_ANON; 142 u |= 1 << KPF_ANON;
143 if (PageKsm(page))
144 u |= 1 << KPF_KSM;
140 145
141 /* 146 /*
142 * compound pages: export both head/tail info 147 * compound pages: export both head/tail info
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
153 153
154 /* careful: calling conventions are nasty here */ 154 /* careful: calling conventions are nasty here */
155 res = count; 155 res = count;
156 error = table->proc_handler(table, write, filp, buf, &res, ppos); 156 error = table->proc_handler(table, write, buf, &res, ppos);
157 if (!error) 157 if (!error)
158 error = res; 158 error = res;
159out: 159out:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd8be1d235c..2a1bef9203c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -243,6 +243,25 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
243 } else if (vma->vm_start <= mm->start_stack && 243 } else if (vma->vm_start <= mm->start_stack &&
244 vma->vm_end >= mm->start_stack) { 244 vma->vm_end >= mm->start_stack) {
245 name = "[stack]"; 245 name = "[stack]";
246 } else {
247 unsigned long stack_start;
248 struct proc_maps_private *pmp;
249
250 pmp = m->private;
251 stack_start = pmp->task->stack_start;
252
253 if (vma->vm_start <= stack_start &&
254 vma->vm_end >= stack_start) {
255 pad_len_spaces(m, len);
256 seq_printf(m,
257 "[threadstack:%08lx]",
258#ifdef CONFIG_STACK_GROWSUP
259 vma->vm_end - stack_start
260#else
261 stack_start - vma->vm_start
262#endif
263 );
264 }
246 } 265 }
247 } else { 266 } else {
248 name = "[vdso]"; 267 name = "[vdso]";
@@ -465,23 +484,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
465 return 0; 484 return 0;
466} 485}
467 486
487#define CLEAR_REFS_ALL 1
488#define CLEAR_REFS_ANON 2
489#define CLEAR_REFS_MAPPED 3
490
468static ssize_t clear_refs_write(struct file *file, const char __user *buf, 491static ssize_t clear_refs_write(struct file *file, const char __user *buf,
469 size_t count, loff_t *ppos) 492 size_t count, loff_t *ppos)
470{ 493{
471 struct task_struct *task; 494 struct task_struct *task;
472 char buffer[PROC_NUMBUF], *end; 495 char buffer[PROC_NUMBUF];
473 struct mm_struct *mm; 496 struct mm_struct *mm;
474 struct vm_area_struct *vma; 497 struct vm_area_struct *vma;
498 long type;
475 499
476 memset(buffer, 0, sizeof(buffer)); 500 memset(buffer, 0, sizeof(buffer));
477 if (count > sizeof(buffer) - 1) 501 if (count > sizeof(buffer) - 1)
478 count = sizeof(buffer) - 1; 502 count = sizeof(buffer) - 1;
479 if (copy_from_user(buffer, buf, count)) 503 if (copy_from_user(buffer, buf, count))
480 return -EFAULT; 504 return -EFAULT;
481 if (!simple_strtol(buffer, &end, 0)) 505 if (strict_strtol(strstrip(buffer), 10, &type))
506 return -EINVAL;
507 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
482 return -EINVAL; 508 return -EINVAL;
483 if (*end == '\n')
484 end++;
485 task = get_proc_task(file->f_path.dentry->d_inode); 509 task = get_proc_task(file->f_path.dentry->d_inode);
486 if (!task) 510 if (!task)
487 return -ESRCH; 511 return -ESRCH;
@@ -494,18 +518,31 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
494 down_read(&mm->mmap_sem); 518 down_read(&mm->mmap_sem);
495 for (vma = mm->mmap; vma; vma = vma->vm_next) { 519 for (vma = mm->mmap; vma; vma = vma->vm_next) {
496 clear_refs_walk.private = vma; 520 clear_refs_walk.private = vma;
497 if (!is_vm_hugetlb_page(vma)) 521 if (is_vm_hugetlb_page(vma))
498 walk_page_range(vma->vm_start, vma->vm_end, 522 continue;
499 &clear_refs_walk); 523 /*
524 * Writing 1 to /proc/pid/clear_refs affects all pages.
525 *
526 * Writing 2 to /proc/pid/clear_refs only affects
527 * Anonymous pages.
528 *
529 * Writing 3 to /proc/pid/clear_refs only affects file
530 * mapped pages.
531 */
532 if (type == CLEAR_REFS_ANON && vma->vm_file)
533 continue;
534 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
535 continue;
536 walk_page_range(vma->vm_start, vma->vm_end,
537 &clear_refs_walk);
500 } 538 }
501 flush_tlb_mm(mm); 539 flush_tlb_mm(mm);
502 up_read(&mm->mmap_sem); 540 up_read(&mm->mmap_sem);
503 mmput(mm); 541 mmput(mm);
504 } 542 }
505 put_task_struct(task); 543 put_task_struct(task);
506 if (end - buffer == 0) 544
507 return -EIO; 545 return count;
508 return end - buffer;
509} 546}
510 547
511const struct file_operations proc_clear_refs_operations = { 548const struct file_operations proc_clear_refs_operations = {
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..766b1d456050 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -4,13 +4,18 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/seq_file.h> 5#include <linux/seq_file.h>
6#include <linux/time.h> 6#include <linux/time.h>
7#include <linux/kernel_stat.h>
7#include <asm/cputime.h> 8#include <asm/cputime.h>
8 9
9static int uptime_proc_show(struct seq_file *m, void *v) 10static int uptime_proc_show(struct seq_file *m, void *v)
10{ 11{
11 struct timespec uptime; 12 struct timespec uptime;
12 struct timespec idle; 13 struct timespec idle;
13 cputime_t idletime = cputime_add(init_task.utime, init_task.stime); 14 int i;
15 cputime_t idletime = cputime_zero;
16
17 for_each_possible_cpu(i)
18 idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
14 19
15 do_posix_clock_monotonic_gettime(&uptime); 20 do_posix_clock_monotonic_gettime(&uptime);
16 monotonic_to_bootbased(&uptime); 21 monotonic_to_bootbased(&uptime);
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index be8e0e1445b6..5f6089994042 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -6,20 +6,9 @@ config QNX4FS_FS
6 QNX 4 and QNX 6 (the latter is also called QNX RTP). 6 QNX 4 and QNX 6 (the latter is also called QNX RTP).
7 Further information is available at <http://www.qnx.com/>. 7 Further information is available at <http://www.qnx.com/>.
8 Say Y if you intend to mount QNX hard disks or floppies. 8 Say Y if you intend to mount QNX hard disks or floppies.
9 Unless you say Y to "QNX4FS read-write support" below, you will
10 only be able to read these file systems.
11 9
12 To compile this file system support as a module, choose M here: the 10 To compile this file system support as a module, choose M here: the
13 module will be called qnx4. 11 module will be called qnx4.
14 12
15 If you don't know whether you need it, then you don't need it: 13 If you don't know whether you need it, then you don't need it:
16 answer N. 14 answer N.
17
18config QNX4FS_RW
19 bool "QNX4FS write support (DANGEROUS)"
20 depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
21 help
22 Say Y if you want to test write support for QNX4 file systems.
23
24 It's currently broken, so for now:
25 answer N.
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index e4d408cc5473..4a283b3f87f8 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_QNX4FS_FS) += qnx4.o 5obj-$(CONFIG_QNX4FS_FS) += qnx4.o
6 6
7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o 7qnx4-objs := inode.o dir.o namei.o bitmap.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index e1cd061a25f7..0afba069d567 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -78,84 +78,3 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
78 78
79 return total_free; 79 return total_free;
80} 80}
81
82#ifdef CONFIG_QNX4FS_RW
83
84int qnx4_is_free(struct super_block *sb, long block)
85{
86 int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
87 int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
88 struct buffer_head *bh;
89 const char *g;
90 int ret = -EIO;
91
92 start += block / (QNX4_BLOCK_SIZE * 8);
93 QNX4DEBUG(("qnx4: is_free requesting block [%lu], bitmap in block [%lu]\n",
94 (unsigned long) block, (unsigned long) start));
95 (void) size; /* CHECKME */
96 bh = sb_bread(sb, start);
97 if (bh == NULL) {
98 return -EIO;
99 }
100 g = bh->b_data + (block % QNX4_BLOCK_SIZE);
101 if (((*g) & (1 << (block % 8))) == 0) {
102 QNX4DEBUG(("qnx4: is_free -> block is free\n"));
103 ret = 1;
104 } else {
105 QNX4DEBUG(("qnx4: is_free -> block is busy\n"));
106 ret = 0;
107 }
108 brelse(bh);
109
110 return ret;
111}
112
113int qnx4_set_bitmap(struct super_block *sb, long block, int busy)
114{
115 int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
116 int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
117 struct buffer_head *bh;
118 char *g;
119
120 start += block / (QNX4_BLOCK_SIZE * 8);
121 QNX4DEBUG(("qnx4: set_bitmap requesting block [%lu], bitmap in block [%lu]\n",
122 (unsigned long) block, (unsigned long) start));
123 (void) size; /* CHECKME */
124 bh = sb_bread(sb, start);
125 if (bh == NULL) {
126 return -EIO;
127 }
128 g = bh->b_data + (block % QNX4_BLOCK_SIZE);
129 if (busy == 0) {
130 (*g) &= ~(1 << (block % 8));
131 } else {
132 (*g) |= (1 << (block % 8));
133 }
134 mark_buffer_dirty(bh);
135 brelse(bh);
136
137 return 0;
138}
139
140static void qnx4_clear_inode(struct inode *inode)
141{
142 struct qnx4_inode_entry *qnx4_ino = qnx4_raw_inode(inode);
143 /* What for? */
144 memset(qnx4_ino->di_fname, 0, sizeof qnx4_ino->di_fname);
145 qnx4_ino->di_size = 0;
146 qnx4_ino->di_num_xtnts = 0;
147 qnx4_ino->di_mode = 0;
148 qnx4_ino->di_status = 0;
149}
150
151void qnx4_free_inode(struct inode *inode)
152{
153 if (inode->i_ino < 1) {
154 printk("free_inode: inode 0 or nonexistent inode\n");
155 return;
156 }
157 qnx4_clear_inode(inode);
158 clear_inode(inode);
159}
160
161#endif
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 003c68f3238b..86cc39cb1398 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -85,9 +85,4 @@ const struct file_operations qnx4_dir_operations =
85const struct inode_operations qnx4_dir_inode_operations = 85const struct inode_operations qnx4_dir_inode_operations =
86{ 86{
87 .lookup = qnx4_lookup, 87 .lookup = qnx4_lookup,
88#ifdef CONFIG_QNX4FS_RW
89 .create = qnx4_create,
90 .unlink = qnx4_unlink,
91 .rmdir = qnx4_rmdir,
92#endif
93}; 88};
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
deleted file mode 100644
index 09b170ac936c..000000000000
--- a/fs/qnx4/file.c
+++ /dev/null
@@ -1,40 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.2.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 25-05-1998 by Richard Frowijn : first release.
11 * 21-06-1998 by Frank Denis : wrote qnx4_readpage to use generic_file_read.
12 * 27-06-1998 by Frank Denis : file overwriting.
13 */
14
15#include "qnx4.h"
16
17/*
18 * We have mostly NULL's here: the current defaults are ok for
19 * the qnx4 filesystem.
20 */
21const struct file_operations qnx4_file_operations =
22{
23 .llseek = generic_file_llseek,
24 .read = do_sync_read,
25 .aio_read = generic_file_aio_read,
26 .mmap = generic_file_mmap,
27 .splice_read = generic_file_splice_read,
28#ifdef CONFIG_QNX4FS_RW
29 .write = do_sync_write,
30 .aio_write = generic_file_aio_write,
31 .fsync = simple_fsync,
32#endif
33};
34
35const struct inode_operations qnx4_file_inode_operations =
36{
37#ifdef CONFIG_QNX4FS_RW
38 .truncate = qnx4_truncate,
39#endif
40};
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 681df5fcd161..d2cd1798d8c4 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -28,73 +28,6 @@
28 28
29static const struct super_operations qnx4_sops; 29static const struct super_operations qnx4_sops;
30 30
31#ifdef CONFIG_QNX4FS_RW
32
33static void qnx4_delete_inode(struct inode *inode)
34{
35 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
36 truncate_inode_pages(&inode->i_data, 0);
37 inode->i_size = 0;
38 qnx4_truncate(inode);
39 lock_kernel();
40 qnx4_free_inode(inode);
41 unlock_kernel();
42}
43
44static int qnx4_write_inode(struct inode *inode, int do_sync)
45{
46 struct qnx4_inode_entry *raw_inode;
47 int block, ino;
48 struct buffer_head *bh;
49 ino = inode->i_ino;
50
51 QNX4DEBUG(("qnx4: write inode 1.\n"));
52 if (inode->i_nlink == 0) {
53 return 0;
54 }
55 if (!ino) {
56 printk("qnx4: bad inode number on dev %s: %d is out of range\n",
57 inode->i_sb->s_id, ino);
58 return -EIO;
59 }
60 QNX4DEBUG(("qnx4: write inode 2.\n"));
61 block = ino / QNX4_INODES_PER_BLOCK;
62 lock_kernel();
63 if (!(bh = sb_bread(inode->i_sb, block))) {
64 printk("qnx4: major problem: unable to read inode from dev "
65 "%s\n", inode->i_sb->s_id);
66 unlock_kernel();
67 return -EIO;
68 }
69 raw_inode = ((struct qnx4_inode_entry *) bh->b_data) +
70 (ino % QNX4_INODES_PER_BLOCK);
71 raw_inode->di_mode = cpu_to_le16(inode->i_mode);
72 raw_inode->di_uid = cpu_to_le16(fs_high2lowuid(inode->i_uid));
73 raw_inode->di_gid = cpu_to_le16(fs_high2lowgid(inode->i_gid));
74 raw_inode->di_nlink = cpu_to_le16(inode->i_nlink);
75 raw_inode->di_size = cpu_to_le32(inode->i_size);
76 raw_inode->di_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
77 raw_inode->di_atime = cpu_to_le32(inode->i_atime.tv_sec);
78 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
79 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
80 mark_buffer_dirty(bh);
81 if (do_sync) {
82 sync_dirty_buffer(bh);
83 if (buffer_req(bh) && !buffer_uptodate(bh)) {
84 printk("qnx4: IO error syncing inode [%s:%08x]\n",
85 inode->i_sb->s_id, ino);
86 brelse(bh);
87 unlock_kernel();
88 return -EIO;
89 }
90 }
91 brelse(bh);
92 unlock_kernel();
93 return 0;
94}
95
96#endif
97
98static void qnx4_put_super(struct super_block *sb); 31static void qnx4_put_super(struct super_block *sb);
99static struct inode *qnx4_alloc_inode(struct super_block *sb); 32static struct inode *qnx4_alloc_inode(struct super_block *sb);
100static void qnx4_destroy_inode(struct inode *inode); 33static void qnx4_destroy_inode(struct inode *inode);
@@ -108,10 +41,6 @@ static const struct super_operations qnx4_sops =
108 .put_super = qnx4_put_super, 41 .put_super = qnx4_put_super,
109 .statfs = qnx4_statfs, 42 .statfs = qnx4_statfs,
110 .remount_fs = qnx4_remount, 43 .remount_fs = qnx4_remount,
111#ifdef CONFIG_QNX4FS_RW
112 .write_inode = qnx4_write_inode,
113 .delete_inode = qnx4_delete_inode,
114#endif
115}; 44};
116 45
117static int qnx4_remount(struct super_block *sb, int *flags, char *data) 46static int qnx4_remount(struct super_block *sb, int *flags, char *data)
@@ -120,15 +49,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
120 49
121 qs = qnx4_sb(sb); 50 qs = qnx4_sb(sb);
122 qs->Version = QNX4_VERSION; 51 qs->Version = QNX4_VERSION;
123#ifndef CONFIG_QNX4FS_RW
124 *flags |= MS_RDONLY; 52 *flags |= MS_RDONLY;
125#endif
126 if (*flags & MS_RDONLY) {
127 return 0;
128 }
129
130 mark_buffer_dirty(qs->sb_buf);
131
132 return 0; 53 return 0;
133} 54}
134 55
@@ -354,9 +275,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
354 } 275 }
355 s->s_op = &qnx4_sops; 276 s->s_op = &qnx4_sops;
356 s->s_magic = QNX4_SUPER_MAGIC; 277 s->s_magic = QNX4_SUPER_MAGIC;
357#ifndef CONFIG_QNX4FS_RW
358 s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ 278 s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
359#endif
360 qnx4_sb(s)->sb_buf = bh; 279 qnx4_sb(s)->sb_buf = bh;
361 qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; 280 qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
362 281
@@ -489,8 +408,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
489 408
490 memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); 409 memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
491 if (S_ISREG(inode->i_mode)) { 410 if (S_ISREG(inode->i_mode)) {
492 inode->i_op = &qnx4_file_inode_operations; 411 inode->i_fop = &generic_ro_fops;
493 inode->i_fop = &qnx4_file_operations;
494 inode->i_mapping->a_ops = &qnx4_aops; 412 inode->i_mapping->a_ops = &qnx4_aops;
495 qnx4_i(inode)->mmu_private = inode->i_size; 413 qnx4_i(inode)->mmu_private = inode->i_size;
496 } else if (S_ISDIR(inode->i_mode)) { 414 } else if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 5972ed214937..ae1e7edbacd6 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -134,108 +134,3 @@ out:
134 134
135 return NULL; 135 return NULL;
136} 136}
137
138#ifdef CONFIG_QNX4FS_RW
139int qnx4_create(struct inode *dir, struct dentry *dentry, int mode,
140 struct nameidata *nd)
141{
142 QNX4DEBUG(("qnx4: qnx4_create\n"));
143 if (dir == NULL) {
144 return -ENOENT;
145 }
146 return -ENOSPC;
147}
148
149int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
150{
151 struct buffer_head *bh;
152 struct qnx4_inode_entry *de;
153 struct inode *inode;
154 int retval;
155 int ino;
156
157 QNX4DEBUG(("qnx4: qnx4_rmdir [%s]\n", dentry->d_name.name));
158 lock_kernel();
159 bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
160 &de, &ino);
161 if (bh == NULL) {
162 unlock_kernel();
163 return -ENOENT;
164 }
165 inode = dentry->d_inode;
166 if (inode->i_ino != ino) {
167 retval = -EIO;
168 goto end_rmdir;
169 }
170#if 0
171 if (!empty_dir(inode)) {
172 retval = -ENOTEMPTY;
173 goto end_rmdir;
174 }
175#endif
176 if (inode->i_nlink != 2) {
177 QNX4DEBUG(("empty directory has nlink!=2 (%d)\n", inode->i_nlink));
178 }
179 QNX4DEBUG(("qnx4: deleting directory\n"));
180 de->di_status = 0;
181 memset(de->di_fname, 0, sizeof de->di_fname);
182 de->di_mode = 0;
183 mark_buffer_dirty_inode(bh, dir);
184 clear_nlink(inode);
185 mark_inode_dirty(inode);
186 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
187 inode_dec_link_count(dir);
188 retval = 0;
189
190 end_rmdir:
191 brelse(bh);
192
193 unlock_kernel();
194 return retval;
195}
196
197int qnx4_unlink(struct inode *dir, struct dentry *dentry)
198{
199 struct buffer_head *bh;
200 struct qnx4_inode_entry *de;
201 struct inode *inode;
202 int retval;
203 int ino;
204
205 QNX4DEBUG(("qnx4: qnx4_unlink [%s]\n", dentry->d_name.name));
206 lock_kernel();
207 bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
208 &de, &ino);
209 if (bh == NULL) {
210 unlock_kernel();
211 return -ENOENT;
212 }
213 inode = dentry->d_inode;
214 if (inode->i_ino != ino) {
215 retval = -EIO;
216 goto end_unlink;
217 }
218 retval = -EPERM;
219 if (!inode->i_nlink) {
220 QNX4DEBUG(("Deleting nonexistent file (%s:%lu), %d\n",
221 inode->i_sb->s_id,
222 inode->i_ino, inode->i_nlink));
223 inode->i_nlink = 1;
224 }
225 de->di_status = 0;
226 memset(de->di_fname, 0, sizeof de->di_fname);
227 de->di_mode = 0;
228 mark_buffer_dirty_inode(bh, dir);
229 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
230 mark_inode_dirty(dir);
231 inode->i_ctime = dir->i_ctime;
232 inode_dec_link_count(inode);
233 retval = 0;
234
235end_unlink:
236 unlock_kernel();
237 brelse(bh);
238
239 return retval;
240}
241#endif
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 9efc089454f6..33a60858203b 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -29,17 +29,9 @@ extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
29 29
30extern struct buffer_head *qnx4_bread(struct inode *, int, int); 30extern struct buffer_head *qnx4_bread(struct inode *, int, int);
31 31
32extern const struct inode_operations qnx4_file_inode_operations;
33extern const struct inode_operations qnx4_dir_inode_operations; 32extern const struct inode_operations qnx4_dir_inode_operations;
34extern const struct file_operations qnx4_file_operations;
35extern const struct file_operations qnx4_dir_operations; 33extern const struct file_operations qnx4_dir_operations;
36extern int qnx4_is_free(struct super_block *sb, long block); 34extern int qnx4_is_free(struct super_block *sb, long block);
37extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
38extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
39extern void qnx4_truncate(struct inode *inode);
40extern void qnx4_free_inode(struct inode *inode);
41extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
42extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
43 35
44static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) 36static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
45{ 37{
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
deleted file mode 100644
index d94d9ee241fe..000000000000
--- a/fs/qnx4/truncate.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 30-06-1998 by Frank DENIS : ugly filler.
11 */
12
13#include <linux/smp_lock.h>
14#include "qnx4.h"
15
16#ifdef CONFIG_QNX4FS_RW
17
18void qnx4_truncate(struct inode *inode)
19{
20 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
21 S_ISLNK(inode->i_mode))) {
22 return;
23 }
24 lock_kernel();
25 if (!(S_ISDIR(inode->i_mode))) {
26 /* TODO */
27 }
28 QNX4DEBUG(("qnx4: qnx4_truncate called\n"));
29 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
30 mark_inode_dirty(inode);
31 unlock_kernel();
32}
33
34#endif
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..39b49c42a7ed 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info);
1839/* 1839/*
1840 * Definitions of diskquota operations. 1840 * Definitions of diskquota operations.
1841 */ 1841 */
1842struct dquot_operations dquot_operations = { 1842const struct dquot_operations dquot_operations = {
1843 .initialize = dquot_initialize, 1843 .initialize = dquot_initialize,
1844 .drop = dquot_drop, 1844 .drop = dquot_drop,
1845 .alloc_space = dquot_alloc_space, 1845 .alloc_space = dquot_alloc_space,
@@ -2461,7 +2461,7 @@ out:
2461} 2461}
2462EXPORT_SYMBOL(vfs_set_dqinfo); 2462EXPORT_SYMBOL(vfs_set_dqinfo);
2463 2463
2464struct quotactl_ops vfs_quotactl_ops = { 2464const struct quotactl_ops vfs_quotactl_ops = {
2465 .quota_on = vfs_quota_on, 2465 .quota_on = vfs_quota_on,
2466 .quota_off = vfs_quota_off, 2466 .quota_off = vfs_quota_off,
2467 .quota_sync = vfs_quota_sync, 2467 .quota_sync = vfs_quota_sync,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 11f0c06316de..32fae4040ebf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
69 /* make various checks */ 69 /* make various checks */
70 order = get_order(newsize); 70 order = get_order(newsize);
71 if (unlikely(order >= MAX_ORDER)) 71 if (unlikely(order >= MAX_ORDER))
72 goto too_big; 72 return -EFBIG;
73 73
74 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 74 ret = inode_newsize_ok(inode, newsize);
75 if (limit != RLIM_INFINITY && newsize > limit) 75 if (ret)
76 goto fsize_exceeded; 76 return ret;
77
78 if (newsize > inode->i_sb->s_maxbytes)
79 goto too_big;
80 77
81 i_size_write(inode, newsize); 78 i_size_write(inode, newsize);
82 79
@@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
118 115
119 return 0; 116 return 0;
120 117
121 fsize_exceeded: 118add_error:
122 send_sig(SIGXFSZ, current, 0);
123 too_big:
124 return -EFBIG;
125
126 add_error:
127 while (loop < npages) 119 while (loop < npages)
128 __free_page(pages + loop++); 120 __free_page(pages + loop++);
129 return ret; 121 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a7f0110fca4c..a6090aa1a7c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,12 +34,10 @@
34#include <linux/ramfs.h> 34#include <linux/ramfs.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38#include "internal.h" 39#include "internal.h"
39 40
40/* some random number */
41#define RAMFS_MAGIC 0x858458f6
42
43#define RAMFS_DEFAULT_MODE 0755 41#define RAMFS_DEFAULT_MODE 0755
44 42
45static const struct super_operations ramfs_ops; 43static const struct super_operations ramfs_ops;
diff --git a/fs/read_write.c b/fs/read_write.c
index 6c8c55dec2bc..3ac28987f22a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
839 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 839 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
840 840
841 pos = *ppos; 841 pos = *ppos;
842 retval = -EINVAL;
843 if (unlikely(pos < 0))
844 goto fput_out;
845 if (unlikely(pos + count > max)) { 842 if (unlikely(pos + count > max)) {
846 retval = -EOVERFLOW; 843 retval = -EOVERFLOW;
847 if (pos >= max) 844 if (pos >= max)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..f0ad05f38022 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
612static int reiserfs_write_info(struct super_block *, int); 612static int reiserfs_write_info(struct super_block *, int);
613static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 613static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
614 614
615static struct dquot_operations reiserfs_quota_operations = { 615static const struct dquot_operations reiserfs_quota_operations = {
616 .initialize = dquot_initialize, 616 .initialize = dquot_initialize,
617 .drop = dquot_drop, 617 .drop = dquot_drop,
618 .alloc_space = dquot_alloc_space, 618 .alloc_space = dquot_alloc_space,
@@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = {
629 .destroy_dquot = dquot_destroy, 629 .destroy_dquot = dquot_destroy,
630}; 630};
631 631
632static struct quotactl_ops reiserfs_qctl_operations = { 632static const struct quotactl_ops reiserfs_qctl_operations = {
633 .quota_on = reiserfs_quota_on, 633 .quota_on = reiserfs_quota_on,
634 .quota_off = vfs_quota_off, 634 .quota_off = vfs_quota_off,
635 .quota_sync = vfs_quota_sync, 635 .quota_sync = vfs_quota_sync,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 4ab3c03d8f95..c117fa80d1e9 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = {
284 .readdir = romfs_readdir, 284 .readdir = romfs_readdir,
285}; 285};
286 286
287static struct inode_operations romfs_dir_inode_operations = { 287static const struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup, 288 .lookup = romfs_lookup,
289}; 289};
290 290
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; 528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
529 529
530 root = romfs_iget(sb, pos); 530 root = romfs_iget(sb, pos);
531 if (!root) 531 if (IS_ERR(root))
532 goto error; 532 goto error;
533 533
534 sb->s_root = d_alloc_root(root); 534 sb->s_root = d_alloc_root(root);
diff --git a/fs/select.c b/fs/select.c
index 8084834e123e..fd38ce2e32e3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
@@ -41,22 +42,28 @@
41 * better solutions.. 42 * better solutions..
42 */ 43 */
43 44
45#define MAX_SLACK (100 * NSEC_PER_MSEC)
46
44static long __estimate_accuracy(struct timespec *tv) 47static long __estimate_accuracy(struct timespec *tv)
45{ 48{
46 long slack; 49 long slack;
47 int divfactor = 1000; 50 int divfactor = 1000;
48 51
52 if (tv->tv_sec < 0)
53 return 0;
54
49 if (task_nice(current) > 0) 55 if (task_nice(current) > 0)
50 divfactor = divfactor / 5; 56 divfactor = divfactor / 5;
51 57
58 if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
59 return MAX_SLACK;
60
52 slack = tv->tv_nsec / divfactor; 61 slack = tv->tv_nsec / divfactor;
53 slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); 62 slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
54 63
55 if (slack > 100 * NSEC_PER_MSEC) 64 if (slack > MAX_SLACK)
56 slack = 100 * NSEC_PER_MSEC; 65 return MAX_SLACK;
57 66
58 if (slack < 0)
59 slack = 0;
60 return slack; 67 return slack;
61} 68}
62 69
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6c959275f2d0..eae7d9dbf3ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
429 */ 429 */
430int seq_path(struct seq_file *m, struct path *path, char *esc) 430int seq_path(struct seq_file *m, struct path *path, char *esc)
431{ 431{
432 if (m->count < m->size) { 432 char *buf;
433 char *s = m->buf + m->count; 433 size_t size = seq_get_buf(m, &buf);
434 char *p = d_path(path, s, m->size - m->count); 434 int res = -1;
435
436 if (size) {
437 char *p = d_path(path, buf, size);
435 if (!IS_ERR(p)) { 438 if (!IS_ERR(p)) {
436 s = mangle_path(s, p, esc); 439 char *end = mangle_path(buf, p, esc);
437 if (s) { 440 if (end)
438 p = m->buf + m->count; 441 res = end - buf;
439 m->count = s - m->buf;
440 return s - p;
441 }
442 } 442 }
443 } 443 }
444 m->count = m->size; 444 seq_commit(m, res);
445 return -1; 445
446 return res;
446} 447}
447EXPORT_SYMBOL(seq_path); 448EXPORT_SYMBOL(seq_path);
448 449
@@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path);
454int seq_path_root(struct seq_file *m, struct path *path, struct path *root, 455int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
455 char *esc) 456 char *esc)
456{ 457{
457 int err = -ENAMETOOLONG; 458 char *buf;
458 if (m->count < m->size) { 459 size_t size = seq_get_buf(m, &buf);
459 char *s = m->buf + m->count; 460 int res = -ENAMETOOLONG;
461
462 if (size) {
460 char *p; 463 char *p;
461 464
462 spin_lock(&dcache_lock); 465 spin_lock(&dcache_lock);
463 p = __d_path(path, root, s, m->size - m->count); 466 p = __d_path(path, root, buf, size);
464 spin_unlock(&dcache_lock); 467 spin_unlock(&dcache_lock);
465 err = PTR_ERR(p); 468 res = PTR_ERR(p);
466 if (!IS_ERR(p)) { 469 if (!IS_ERR(p)) {
467 s = mangle_path(s, p, esc); 470 char *end = mangle_path(buf, p, esc);
468 if (s) { 471 if (end)
469 p = m->buf + m->count; 472 res = end - buf;
470 m->count = s - m->buf; 473 else
471 return 0; 474 res = -ENAMETOOLONG;
472 }
473 } 475 }
474 } 476 }
475 m->count = m->size; 477 seq_commit(m, res);
476 return err; 478
479 return res < 0 ? res : 0;
477} 480}
478 481
479/* 482/*
@@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
481 */ 484 */
482int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) 485int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
483{ 486{
484 if (m->count < m->size) { 487 char *buf;
485 char *s = m->buf + m->count; 488 size_t size = seq_get_buf(m, &buf);
486 char *p = dentry_path(dentry, s, m->size - m->count); 489 int res = -1;
490
491 if (size) {
492 char *p = dentry_path(dentry, buf, size);
487 if (!IS_ERR(p)) { 493 if (!IS_ERR(p)) {
488 s = mangle_path(s, p, esc); 494 char *end = mangle_path(buf, p, esc);
489 if (s) { 495 if (end)
490 p = m->buf + m->count; 496 res = end - buf;
491 m->count = s - m->buf;
492 return s - p;
493 }
494 } 497 }
495 } 498 }
496 m->count = m->size; 499 seq_commit(m, res);
497 return -1; 500
501 return res;
498} 502}
499 503
500int seq_bitmap(struct seq_file *m, const unsigned long *bits, 504int seq_bitmap(struct seq_file *m, const unsigned long *bits,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1402d2d54f52..1c4c8f089970 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
459static void 459static void
460smb_unload_nls(struct smb_sb_info *server) 460smb_unload_nls(struct smb_sb_info *server)
461{ 461{
462 if (server->remote_nls) { 462 unload_nls(server->remote_nls);
463 unload_nls(server->remote_nls); 463 unload_nls(server->local_nls);
464 server->remote_nls = NULL;
465 }
466 if (server->local_nls) {
467 unload_nls(server->local_nls);
468 server->local_nls = NULL;
469 }
470} 464}
471 465
472static void 466static void
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 9468168b9af5..71c29b6670b4 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -509,7 +509,7 @@ date_unix2dos(struct smb_sb_info *server,
509 month = 2; 509 month = 2;
510 } else { 510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1; 511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 0; month < 12; month++) 512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day) 513 if (day_n[month] > nl_day)
514 break; 514 break;
515 } 515 }
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cb5fc57e370b..6c197ef53add 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -44,7 +44,7 @@
44#include "squashfs.h" 44#include "squashfs.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static int supported_squashfs_filesystem(short major, short minor, short comp)
50{ 50{
@@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = {
444 .fs_flags = FS_REQUIRES_DEV 444 .fs_flags = FS_REQUIRES_DEV
445}; 445};
446 446
447static struct super_operations squashfs_super_ops = { 447static const struct super_operations squashfs_super_ops = {
448 .alloc_inode = squashfs_alloc_inode, 448 .alloc_inode = squashfs_alloc_inode,
449 .destroy_inode = squashfs_destroy_inode, 449 .destroy_inode = squashfs_destroy_inode,
450 .statfs = squashfs_statfs, 450 .statfs = squashfs_statfs,
diff --git a/fs/super.c b/fs/super.c
index b03fea8fbfb6..19eb70b374bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock);
54static struct super_block *alloc_super(struct file_system_type *type) 54static struct super_block *alloc_super(struct file_system_type *type)
55{ 55{
56 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); 56 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
57 static struct super_operations default_op; 57 static const struct super_operations default_op;
58 58
59 if (s) { 59 if (s) {
60 if (security_sb_alloc(s)) { 60 if (security_sb_alloc(s)) {
@@ -465,6 +465,48 @@ rescan:
465} 465}
466 466
467EXPORT_SYMBOL(get_super); 467EXPORT_SYMBOL(get_super);
468
469/**
470 * get_active_super - get an active reference to the superblock of a device
471 * @bdev: device to get the superblock for
472 *
473 * Scans the superblock list and finds the superblock of the file system
474 * mounted on the device given. Returns the superblock with an active
475 * reference and s_umount held exclusively or %NULL if none was found.
476 */
477struct super_block *get_active_super(struct block_device *bdev)
478{
479 struct super_block *sb;
480
481 if (!bdev)
482 return NULL;
483
484 spin_lock(&sb_lock);
485 list_for_each_entry(sb, &super_blocks, s_list) {
486 if (sb->s_bdev != bdev)
487 continue;
488
489 sb->s_count++;
490 spin_unlock(&sb_lock);
491 down_write(&sb->s_umount);
492 if (sb->s_root) {
493 spin_lock(&sb_lock);
494 if (sb->s_count > S_BIAS) {
495 atomic_inc(&sb->s_active);
496 sb->s_count--;
497 spin_unlock(&sb_lock);
498 return sb;
499 }
500 spin_unlock(&sb_lock);
501 }
502 up_write(&sb->s_umount);
503 put_super(sb);
504 yield();
505 spin_lock(&sb_lock);
506 }
507 spin_unlock(&sb_lock);
508 return NULL;
509}
468 510
469struct super_block * user_get_super(dev_t dev) 511struct super_block * user_get_super(dev_t dev)
470{ 512{
@@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
527{ 569{
528 int retval; 570 int retval;
529 int remount_rw; 571 int remount_rw;
530 572
573 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY;
575
531#ifdef CONFIG_BLOCK 576#ifdef CONFIG_BLOCK
532 if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) 577 if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
533 return -EACCES; 578 return -EACCES;
534#endif 579#endif
580
535 if (flags & MS_RDONLY) 581 if (flags & MS_RDONLY)
536 acct_auto_close(sb); 582 acct_auto_close(sb);
537 shrink_dcache_sb(sb); 583 shrink_dcache_sb(sb);
@@ -743,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
743 * will protect the lockfs code from trying to start a snapshot 789 * will protect the lockfs code from trying to start a snapshot
744 * while we are mounting 790 * while we are mounting
745 */ 791 */
746 down(&bdev->bd_mount_sem); 792 mutex_lock(&bdev->bd_fsfreeze_mutex);
793 if (bdev->bd_fsfreeze_count > 0) {
794 mutex_unlock(&bdev->bd_fsfreeze_mutex);
795 error = -EBUSY;
796 goto error_bdev;
797 }
747 s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); 798 s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
748 up(&bdev->bd_mount_sem); 799 mutex_unlock(&bdev->bd_fsfreeze_mutex);
749 if (IS_ERR(s)) 800 if (IS_ERR(s))
750 goto error_s; 801 goto error_s;
751 802
@@ -892,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
892 if (error) 943 if (error)
893 goto out_sb; 944 goto out_sb;
894 945
946 /*
947 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
948 * but s_maxbytes was an unsigned long long for many releases. Throw
949 * this warning for a little while to try and catch filesystems that
950 * violate this rule. This warning should be either removed or
951 * converted to a BUG() in 2.6.34.
952 */
953 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
954 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
955
895 mnt->mnt_mountpoint = mnt->mnt_root; 956 mnt->mnt_mountpoint = mnt->mnt_root;
896 mnt->mnt_parent = mnt; 957 mnt->mnt_parent = mnt;
897 up_write(&mnt->mnt_sb->s_umount); 958 up_write(&mnt->mnt_sb->s_umount);
diff --git a/fs/sync.c b/fs/sync.c
index c08467a5d7cb..d104591b066b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -183,6 +183,7 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
183 ret = err; 183 ret = err;
184 return ret; 184 return ret;
185} 185}
186EXPORT_SYMBOL(file_fsync);
186 187
187/** 188/**
188 * vfs_fsync_range - helper to sync a range of data & metadata to disk 189 * vfs_fsync_range - helper to sync a range of data & metadata to disk
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 2524714bece1..60c702bc10ae 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -40,7 +40,7 @@ struct bin_buffer {
40 struct mutex mutex; 40 struct mutex mutex;
41 void *buffer; 41 void *buffer;
42 int mmapped; 42 int mmapped;
43 struct vm_operations_struct *vm_ops; 43 const struct vm_operations_struct *vm_ops;
44 struct file *file; 44 struct file *file;
45 struct hlist_node list; 45 struct hlist_node list;
46}; 46};
@@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331} 331}
332#endif 332#endif
333 333
334static struct vm_operations_struct bin_vm_ops = { 334static const struct vm_operations_struct bin_vm_ops = {
335 .open = bin_vma_open, 335 .open = bin_vma_open,
336 .close = bin_vma_close, 336 .close = bin_vma_close,
337 .fault = bin_fault, 337 .fault = bin_fault,
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index ee1ce68fd98b..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -715,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
715 * ubifs_get_free_space - return amount of free space. 715 * ubifs_get_free_space - return amount of free space.
716 * @c: UBIFS file-system description object 716 * @c: UBIFS file-system description object
717 * 717 *
718 * This function calculates and retuns amount of free space to report to 718 * This function calculates and returns amount of free space to report to
719 * user-space. 719 * user-space.
720 */ 720 */
721long long ubifs_get_free_space(struct ubifs_info *c) 721long long ubifs_get_free_space(struct ubifs_info *c)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index f3a7945527fb..4775af401167 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg; 512 struct ubifs_debug_info *d = c->dbg;
513 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
514 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
515 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
516 struct list_head list; 516 struct list_head list;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ce2cd8343618..dbc093afd946 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state)
210 } 210 }
211} 211}
212 212
213const char *dbg_jhead(int jhead)
214{
215 switch (jhead) {
216 case GCHD:
217 return "0 (GC)";
218 case BASEHD:
219 return "1 (base)";
220 case DATAHD:
221 return "2 (data)";
222 default:
223 return "unknown journal head";
224 }
225}
226
213static void dump_ch(const struct ubifs_ch *ch) 227static void dump_ch(const struct ubifs_ch *ch)
214{ 228{
215 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); 229 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
@@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c)
623 /* If we are in R/O mode, journal heads do not exist */ 637 /* If we are in R/O mode, journal heads do not exist */
624 if (c->jheads) 638 if (c->jheads)
625 for (i = 0; i < c->jhead_cnt; i++) 639 for (i = 0; i < c->jhead_cnt; i++)
626 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", 640 printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
627 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); 641 dbg_jhead(c->jheads[i].wbuf.jhead),
642 c->jheads[i].wbuf.lnum);
628 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 643 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
629 bud = rb_entry(rb, struct ubifs_bud, rb); 644 bud = rb_entry(rb, struct ubifs_bud, rb);
630 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 645 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c)
648 663
649void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) 664void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
650{ 665{
651 printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " 666 int i, spc, dark = 0, dead = 0;
652 "flags %#x\n", lp->lnum, lp->free, lp->dirty, 667 struct rb_node *rb;
653 c->leb_size - lp->free - lp->dirty, lp->flags); 668 struct ubifs_bud *bud;
669
670 spc = lp->free + lp->dirty;
671 if (spc < c->dead_wm)
672 dead = spc;
673 else
674 dark = ubifs_calc_dark(c, spc);
675
676 if (lp->flags & LPROPS_INDEX)
677 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
678 "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
679 lp->dirty, c->leb_size - spc, spc, lp->flags);
680 else
681 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
682 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
683 "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
684 c->leb_size - spc, spc, dark, dead,
685 (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
686
687 if (lp->flags & LPROPS_TAKEN) {
688 if (lp->flags & LPROPS_INDEX)
689 printk(KERN_CONT "index, taken");
690 else
691 printk(KERN_CONT "taken");
692 } else {
693 const char *s;
694
695 if (lp->flags & LPROPS_INDEX) {
696 switch (lp->flags & LPROPS_CAT_MASK) {
697 case LPROPS_DIRTY_IDX:
698 s = "dirty index";
699 break;
700 case LPROPS_FRDI_IDX:
701 s = "freeable index";
702 break;
703 default:
704 s = "index";
705 }
706 } else {
707 switch (lp->flags & LPROPS_CAT_MASK) {
708 case LPROPS_UNCAT:
709 s = "not categorized";
710 break;
711 case LPROPS_DIRTY:
712 s = "dirty";
713 break;
714 case LPROPS_FREE:
715 s = "free";
716 break;
717 case LPROPS_EMPTY:
718 s = "empty";
719 break;
720 case LPROPS_FREEABLE:
721 s = "freeable";
722 break;
723 default:
724 s = NULL;
725 break;
726 }
727 }
728 printk(KERN_CONT "%s", s);
729 }
730
731 for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
732 bud = rb_entry(rb, struct ubifs_bud, rb);
733 if (bud->lnum == lp->lnum) {
734 int head = 0;
735 for (i = 0; i < c->jhead_cnt; i++) {
736 if (lp->lnum == c->jheads[i].wbuf.lnum) {
737 printk(KERN_CONT ", jhead %s",
738 dbg_jhead(i));
739 head = 1;
740 }
741 }
742 if (!head)
743 printk(KERN_CONT ", bud of jhead %s",
744 dbg_jhead(bud->jhead));
745 }
746 }
747 if (lp->lnum == c->gc_lnum)
748 printk(KERN_CONT ", GC LEB");
749 printk(KERN_CONT ")\n");
654} 750}
655 751
656void dbg_dump_lprops(struct ubifs_info *c) 752void dbg_dump_lprops(struct ubifs_info *c)
@@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
724 820
725 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 821 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
726 current->pid, lnum); 822 current->pid, lnum);
727 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 823 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
728 if (IS_ERR(sleb)) { 824 if (IS_ERR(sleb)) {
729 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 825 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
730 return; 826 return;
@@ -909,8 +1005,10 @@ out:
909 ubifs_msg("saved lprops statistics dump"); 1005 ubifs_msg("saved lprops statistics dump");
910 dbg_dump_lstats(&d->saved_lst); 1006 dbg_dump_lstats(&d->saved_lst);
911 ubifs_get_lp_stats(c, &lst); 1007 ubifs_get_lp_stats(c, &lst);
1008
912 ubifs_msg("current lprops statistics dump"); 1009 ubifs_msg("current lprops statistics dump");
913 dbg_dump_lstats(&d->saved_lst); 1010 dbg_dump_lstats(&lst);
1011
914 spin_lock(&c->space_lock); 1012 spin_lock(&c->space_lock);
915 dbg_dump_budg(c); 1013 dbg_dump_budg(c);
916 spin_unlock(&c->space_lock); 1014 spin_unlock(&c->space_lock);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index c1cd73b2e06e..29d960101ea6 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c);
271/* Dump functions */ 271/* Dump functions */
272const char *dbg_ntype(int type); 272const char *dbg_ntype(int type);
273const char *dbg_cstate(int cmt_state); 273const char *dbg_cstate(int cmt_state);
274const char *dbg_jhead(int jhead);
274const char *dbg_get_key_dump(const struct ubifs_info *c, 275const char *dbg_get_key_dump(const struct ubifs_info *c,
275 const union ubifs_key *key); 276 const union ubifs_key *key);
276void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); 277void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
@@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
321int dbg_check_lprops(struct ubifs_info *c); 322int dbg_check_lprops(struct ubifs_info *c);
322int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, 323int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
323 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size);
324 327
325/* Force the use of in-the-gaps method for testing */ 328/* Force the use of in-the-gaps method for testing */
326 329
@@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
425 428
426#define dbg_ntype(type) "" 429#define dbg_ntype(type) ""
427#define dbg_cstate(cmt_state) "" 430#define dbg_cstate(cmt_state) ""
431#define dbg_jhead(jhead) ""
428#define dbg_get_key_dump(c, key) ({}) 432#define dbg_get_key_dump(c, key) ({})
429#define dbg_dump_inode(c, inode) ({}) 433#define dbg_dump_inode(c, inode) ({})
430#define dbg_dump_node(c, node) ({}) 434#define dbg_dump_node(c, node) ({})
@@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
460#define dbg_check_heap(c, heap, cat, add_pos) ({}) 464#define dbg_check_heap(c, heap, cat, add_pos) ({})
461#define dbg_check_lprops(c) 0 465#define dbg_check_lprops(c) 0
462#define dbg_check_lpt_nodes(c, cnode, row, col) 0 466#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0
463#define dbg_force_in_the_gaps_enabled 0 468#define dbg_force_in_the_gaps_enabled 0
464#define dbg_force_in_the_gaps() 0 469#define dbg_force_in_the_gaps() 0
465#define dbg_failure_mode 0 470#define dbg_failure_mode 0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6d34dc7e33e1..1009adc8d602 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -21,34 +21,32 @@
21 */ 21 */
22 22
23/* 23/*
24 * This file implements VFS file and inode operations of regular files, device 24 * This file implements VFS file and inode operations for regular files, device
25 * nodes and symlinks as well as address space operations. 25 * nodes and symlinks as well as address space operations.
26 * 26 *
27 * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the 27 * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
28 * page is dirty and is used for budgeting purposes - dirty pages should not be 28 * the page is dirty and is used for optimization purposes - dirty pages are
29 * budgeted. The PG_checked flag is set if full budgeting is required for the 29 * not budgeted so the flag shows that 'ubifs_write_end()' should not release
30 * page e.g., when it corresponds to a file hole or it is just beyond the file 30 * the budget for this page. The @PG_checked flag is set if full budgeting is
31 * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to 31 * required for the page e.g., when it corresponds to a file hole or it is
32 * fail in this function, and the budget is released in 'ubifs_write_end()'. So 32 * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
33 * the PG_private and PG_checked flags carry the information about how the page 33 * it is OK to fail in this function, and the budget is released in
34 * was budgeted, to make it possible to release the budget properly. 34 * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
35 * information about how the page was budgeted, to make it possible to release
36 * the budget properly.
35 * 37 *
36 * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations 38 * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
37 * we implement. However, this is not true for '->writepage()', which might be 39 * implement. However, this is not true for 'ubifs_writepage()', which may be
38 * called with 'i_mutex' unlocked. For example, when pdflush is performing 40 * called with @i_mutex unlocked. For example, when pdflush is doing background
39 * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the 41 * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
40 * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is 42 * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
41 * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim 43 * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
42 * path'. So, in '->writepage()' we are only guaranteed that the page is 44 * we are only guaranteed that the page is locked.
43 * locked.
44 * 45 *
45 * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., 46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
46 * readahead path does not have it locked ("sys_read -> generic_file_aio_read 47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
47 * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is 48 * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
48 * not set as well. However, UBIFS disables readahead. 49 * set as well. However, UBIFS disables readahead.
49 *
50 * This, for example means that there might be 2 concurrent '->writepage()'
51 * calls for the same inode, but different inode dirty pages.
52 */ 50 */
53 51
54#include "ubifs.h" 52#include "ubifs.h"
@@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
449 /* 447 /*
450 * We change whole page so no need to load it. But we 448 * We change whole page so no need to load it. But we
451 * have to set the @PG_checked flag to make the further 449 * have to set the @PG_checked flag to make the further
452 * code the page is new. This might be not true, but it 450 * code know that the page is new. This might be not
453 * is better to budget more that to read the page from 451 * true, but it is better to budget more than to read
454 * the media. 452 * the page from the media.
455 */ 453 */
456 SetPageChecked(page); 454 SetPageChecked(page);
457 skipped_read = 1; 455 skipped_read = 1;
@@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
497 } 495 }
498 496
499 /* 497 /*
500 * Whee, we aquired budgeting quickly - without involving 498 * Whee, we acquired budgeting quickly - without involving
501 * garbage-collection, committing or forceing write-back. We return 499 * garbage-collection, committing or forcing write-back. We return
502 * with @ui->ui_mutex locked if we are appending pages, and unlocked 500 * with @ui->ui_mutex locked if we are appending pages, and unlocked
503 * otherwise. This is an optimization (slightly hacky though). 501 * otherwise. This is an optimization (slightly hacky though).
504 */ 502 */
@@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
562 560
563 /* 561 /*
564 * Return 0 to force VFS to repeat the whole operation, or the 562 * Return 0 to force VFS to repeat the whole operation, or the
565 * error code if 'do_readpage()' failes. 563 * error code if 'do_readpage()' fails.
566 */ 564 */
567 copied = do_readpage(page); 565 copied = do_readpage(page);
568 goto out; 566 goto out;
@@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1175 ui->ui_size = inode->i_size; 1173 ui->ui_size = inode->i_size;
1176 /* Truncation changes inode [mc]time */ 1174 /* Truncation changes inode [mc]time */
1177 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1175 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1178 /* The other attributes may be changed at the same time as well */ 1176 /* Other attributes may be changed at the same time as well */
1179 do_attr_changes(inode, attr); 1177 do_attr_changes(inode, attr);
1180
1181 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 1178 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
1182 mutex_unlock(&ui->ui_mutex); 1179 mutex_unlock(&ui->ui_mutex);
1180
1183out_budg: 1181out_budg:
1184 if (budgeted) 1182 if (budgeted)
1185 ubifs_release_budget(c, &req); 1183 ubifs_release_budget(c, &req);
@@ -1536,7 +1534,7 @@ out_unlock:
1536 return err; 1534 return err;
1537} 1535}
1538 1536
1539static struct vm_operations_struct ubifs_file_vm_ops = { 1537static const struct vm_operations_struct ubifs_file_vm_ops = {
1540 .fault = filemap_fault, 1538 .fault = filemap_fault,
1541 .page_mkwrite = ubifs_vm_page_mkwrite, 1539 .page_mkwrite = ubifs_vm_page_mkwrite,
1542}; 1540};
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f0f5f15d384e..618c2701d3a7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
529 * We scan the entire LEB even though we only really need to scan up to 529 * We scan the entire LEB even though we only really need to scan up to
530 * (c->leb_size - lp->free). 530 * (c->leb_size - lp->free).
531 */ 531 */
532 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 532 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
533 if (IS_ERR(sleb)) 533 if (IS_ERR(sleb))
534 return PTR_ERR(sleb); 534 return PTR_ERR(sleb);
535 535
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 762a7d6cec73..e589fedaf1ef 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
297{ 297{
298 struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); 298 struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
299 299
300 dbg_io("jhead %d", wbuf->jhead); 300 dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
301 wbuf->need_sync = 1; 301 wbuf->need_sync = 1;
302 wbuf->c->need_wbuf_sync = 1; 302 wbuf->c->need_wbuf_sync = 1;
303 ubifs_wake_up_bgt(wbuf->c); 303 ubifs_wake_up_bgt(wbuf->c);
@@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
314 314
315 if (wbuf->no_timer) 315 if (wbuf->no_timer)
316 return; 316 return;
317 dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead, 317 dbg_io("set timer for jhead %s, %llu-%llu millisecs",
318 dbg_jhead(wbuf->jhead),
318 div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), 319 div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
319 div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, 320 div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
320 USEC_PER_SEC)); 321 USEC_PER_SEC));
@@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
351 /* Write-buffer is empty or not seeked */ 352 /* Write-buffer is empty or not seeked */
352 return 0; 353 return 0;
353 354
354 dbg_io("LEB %d:%d, %d bytes, jhead %d", 355 dbg_io("LEB %d:%d, %d bytes, jhead %s",
355 wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead); 356 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
356 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 357 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
357 ubifs_assert(!(wbuf->avail & 7)); 358 ubifs_assert(!(wbuf->avail & 7));
358 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 359 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
401{ 402{
402 const struct ubifs_info *c = wbuf->c; 403 const struct ubifs_info *c = wbuf->c;
403 404
404 dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead); 405 dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
405 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); 406 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
406 ubifs_assert(offs >= 0 && offs <= c->leb_size); 407 ubifs_assert(offs >= 0 && offs <= c->leb_size);
407 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); 408 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
508 struct ubifs_info *c = wbuf->c; 509 struct ubifs_info *c = wbuf->c;
509 int err, written, n, aligned_len = ALIGN(len, 8), offs; 510 int err, written, n, aligned_len = ALIGN(len, 8), offs;
510 511
511 dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len, 512 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
512 dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead, 513 dbg_ntype(((struct ubifs_ch *)buf)->node_type),
513 wbuf->lnum, wbuf->offs + wbuf->used); 514 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
514 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); 515 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
515 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); 516 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
516 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 517 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
535 memcpy(wbuf->buf + wbuf->used, buf, len); 536 memcpy(wbuf->buf + wbuf->used, buf, len);
536 537
537 if (aligned_len == wbuf->avail) { 538 if (aligned_len == wbuf->avail) {
538 dbg_io("flush jhead %d wbuf to LEB %d:%d", 539 dbg_io("flush jhead %s wbuf to LEB %d:%d",
539 wbuf->jhead, wbuf->lnum, wbuf->offs); 540 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
540 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 541 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
541 wbuf->offs, c->min_io_size, 542 wbuf->offs, c->min_io_size,
542 wbuf->dtype); 543 wbuf->dtype);
@@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
564 * minimal I/O unit. We have to fill and flush write-buffer and switch 565 * minimal I/O unit. We have to fill and flush write-buffer and switch
565 * to the next min. I/O unit. 566 * to the next min. I/O unit.
566 */ 567 */
567 dbg_io("flush jhead %d wbuf to LEB %d:%d", 568 dbg_io("flush jhead %s wbuf to LEB %d:%d",
568 wbuf->jhead, wbuf->lnum, wbuf->offs); 569 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
569 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); 570 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
570 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 571 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
571 c->min_io_size, wbuf->dtype); 572 c->min_io_size, wbuf->dtype);
@@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
698 int err, rlen, overlap; 699 int err, rlen, overlap;
699 struct ubifs_ch *ch = buf; 700 struct ubifs_ch *ch = buf;
700 701
701 dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs, 702 dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
702 dbg_ntype(type), len, wbuf->jhead); 703 dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
703 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 704 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
704 ubifs_assert(!(offs & 7) && offs < c->leb_size); 705 ubifs_assert(!(offs & 7) && offs < c->leb_size);
705 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); 706 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 64b5f3a309f5..d321baeca68d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -158,7 +158,7 @@ again:
158 * some. But the write-buffer mutex has to be unlocked because 158 * some. But the write-buffer mutex has to be unlocked because
159 * GC also takes it. 159 * GC also takes it.
160 */ 160 */
161 dbg_jnl("no free space jhead %d, run GC", jhead); 161 dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
162 mutex_unlock(&wbuf->io_mutex); 162 mutex_unlock(&wbuf->io_mutex);
163 163
164 lnum = ubifs_garbage_collect(c, 0); 164 lnum = ubifs_garbage_collect(c, 0);
@@ -173,7 +173,8 @@ again:
173 * because we dropped @wbuf->io_mutex, so try once 173 * because we dropped @wbuf->io_mutex, so try once
174 * again. 174 * again.
175 */ 175 */
176 dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); 176 dbg_jnl("GC couldn't make a free LEB for jhead %s",
177 dbg_jhead(jhead));
177 if (retries++ < 2) { 178 if (retries++ < 2) {
178 dbg_jnl("retry (%d)", retries); 179 dbg_jnl("retry (%d)", retries);
179 goto again; 180 goto again;
@@ -184,7 +185,7 @@ again:
184 } 185 }
185 186
186 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 187 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
187 dbg_jnl("got LEB %d for jhead %d", lnum, jhead); 188 dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
188 avail = c->leb_size - wbuf->offs - wbuf->used; 189 avail = c->leb_size - wbuf->offs - wbuf->used;
189 190
190 if (wbuf->lnum != -1 && avail >= len) { 191 if (wbuf->lnum != -1 && avail >= len) {
@@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
255 *lnum = c->jheads[jhead].wbuf.lnum; 256 *lnum = c->jheads[jhead].wbuf.lnum;
256 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; 257 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
257 258
258 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); 259 dbg_jnl("jhead %s, LEB %d:%d, len %d",
260 dbg_jhead(jhead), *lnum, *offs, len);
259 ubifs_prepare_node(c, node, len, 0); 261 ubifs_prepare_node(c, node, len, 0);
260 262
261 return ubifs_wbuf_write_nolock(wbuf, node, len); 263 return ubifs_wbuf_write_nolock(wbuf, node, len);
@@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
285 287
286 *lnum = c->jheads[jhead].wbuf.lnum; 288 *lnum = c->jheads[jhead].wbuf.lnum;
287 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; 289 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
288 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); 290 dbg_jnl("jhead %s, LEB %d:%d, len %d",
291 dbg_jhead(jhead), *lnum, *offs, len);
289 292
290 err = ubifs_wbuf_write_nolock(wbuf, buf, len); 293 err = ubifs_wbuf_write_nolock(wbuf, buf, len);
291 if (err) 294 if (err)
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 5fa27ea031ba..0f530c684f0b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c,
229} 229}
230 230
231/** 231/**
232 * xent_key_init_hash - initialize extended attribute entry key without
233 * re-calculating hash function.
234 * @c: UBIFS file-system description object
235 * @key: key to initialize
236 * @inum: host inode number
237 * @hash: extended attribute entry name hash
238 */
239static inline void xent_key_init_hash(const struct ubifs_info *c,
240 union ubifs_key *key, ino_t inum,
241 uint32_t hash)
242{
243 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
244 key->u32[0] = inum;
245 key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
246}
247
248/**
249 * xent_key_init_flash - initialize on-flash extended attribute entry key. 232 * xent_key_init_flash - initialize on-flash extended attribute entry key.
250 * @c: UBIFS file-system description object 233 * @c: UBIFS file-system description object
251 * @k: key to initialize 234 * @k: key to initialize
@@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c,
295} 278}
296 279
297/** 280/**
298 * data_key_init_flash - initialize on-flash data key. 281 * highest_data_key - get the highest possible data key for an inode.
299 * @c: UBIFS file-system description object 282 * @c: UBIFS file-system description object
300 * @k: key to initialize 283 * @key: key to initialize
301 * @inum: inode number 284 * @inum: inode number
302 * @block: block number
303 */ 285 */
304static inline void data_key_init_flash(const struct ubifs_info *c, void *k, 286static inline void highest_data_key(const struct ubifs_info *c,
305 ino_t inum, unsigned int block) 287 union ubifs_key *key, ino_t inum)
306{ 288{
307 union ubifs_key *key = k; 289 data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
308
309 ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
310 key->j32[0] = cpu_to_le32(inum);
311 key->j32[1] = cpu_to_le32(block |
312 (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
313 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
314} 290}
315 291
316/** 292/**
@@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
554 return 0; 530 return 0;
555 } 531 }
556} 532}
533
557#endif /* !__UBIFS_KEY_H__ */ 534#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 56e33772a1ee..c345e125f42c 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
169 */ 169 */
170 c->bud_bytes += c->leb_size - bud->start; 170 c->bud_bytes += c->leb_size - bud->start;
171 171
172 dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum, 172 dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
173 bud->start, bud->jhead, c->bud_bytes); 173 bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
174 spin_unlock(&c->buds_lock); 174 spin_unlock(&c->buds_lock);
175} 175}
176 176
@@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c)
355 * heads (non-closed buds). 355 * heads (non-closed buds).
356 */ 356 */
357 c->cmt_bud_bytes += wbuf->offs - bud->start; 357 c->cmt_bud_bytes += wbuf->offs - bud->start;
358 dbg_log("preserve %d:%d, jhead %d, bud bytes %d, " 358 dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
359 "cmt_bud_bytes %lld", bud->lnum, bud->start, 359 "cmt_bud_bytes %lld", bud->lnum, bud->start,
360 bud->jhead, wbuf->offs - bud->start, 360 dbg_jhead(bud->jhead), wbuf->offs - bud->start,
361 c->cmt_bud_bytes); 361 c->cmt_bud_bytes);
362 bud->start = wbuf->offs; 362 bud->start = wbuf->offs;
363 } else { 363 } else {
364 c->cmt_bud_bytes += c->leb_size - bud->start; 364 c->cmt_bud_bytes += c->leb_size - bud->start;
365 dbg_log("remove %d:%d, jhead %d, bud bytes %d, " 365 dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
366 "cmt_bud_bytes %lld", bud->lnum, bud->start, 366 "cmt_bud_bytes %lld", bud->lnum, bud->start,
367 bud->jhead, c->leb_size - bud->start, 367 dbg_jhead(bud->jhead), c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 /* 370 /*
@@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
429 if (lnum == -1 || offs == c->leb_size) 429 if (lnum == -1 || offs == c->leb_size)
430 continue; 430 continue;
431 431
432 dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i); 432 dbg_log("add ref to LEB %d:%d for jhead %s",
433 lnum, offs, dbg_jhead(i));
433 ref = buf + len; 434 ref = buf + len;
434 ref->ch.node_type = UBIFS_REF_NODE; 435 ref->ch.node_type = UBIFS_REF_NODE;
435 ref->lnum = cpu_to_le32(lnum); 436 ref->lnum = cpu_to_le32(lnum);
@@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
695 lnum = c->ltail_lnum; 696 lnum = c->ltail_lnum;
696 write_lnum = lnum; 697 write_lnum = lnum;
697 while (1) { 698 while (1) {
698 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 699 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
699 if (IS_ERR(sleb)) { 700 if (IS_ERR(sleb)) {
700 err = PTR_ERR(sleb); 701 err = PTR_ERR(sleb);
701 goto out_free; 702 goto out_free;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4cdd284dea56..4d4ca388889b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
281 case LPROPS_FREE: 281 case LPROPS_FREE:
282 if (add_to_lpt_heap(c, lprops, cat)) 282 if (add_to_lpt_heap(c, lprops, cat))
283 break; 283 break;
284 /* No more room on heap so make it uncategorized */ 284 /* No more room on heap so make it un-categorized */
285 cat = LPROPS_UNCAT; 285 cat = LPROPS_UNCAT;
286 /* Fall through */ 286 /* Fall through */
287 case LPROPS_UNCAT: 287 case LPROPS_UNCAT:
@@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
375 * @lprops: LEB properties 375 * @lprops: LEB properties
376 * 376 *
377 * A LEB may have fallen off of the bottom of a heap, and ended up as 377 * A LEB may have fallen off of the bottom of a heap, and ended up as
378 * uncategorized even though it has enough space for us now. If that is the case 378 * un-categorized even though it has enough space for us now. If that is the
379 * this function will put the LEB back onto a heap. 379 * case this function will put the LEB back onto a heap.
380 */ 380 */
381void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) 381void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
382{ 382{
@@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c,
436/** 436/**
437 * change_category - change LEB properties category. 437 * change_category - change LEB properties category.
438 * @c: UBIFS file-system description object 438 * @c: UBIFS file-system description object
439 * @lprops: LEB properties to recategorize 439 * @lprops: LEB properties to re-categorize
440 * 440 *
441 * LEB properties are categorized to enable fast find operations. When the LEB 441 * LEB properties are categorized to enable fast find operations. When the LEB
442 * properties change they must be recategorized. 442 * properties change they must be re-categorized.
443 */ 443 */
444static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) 444static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
445{ 445{
@@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
461} 461}
462 462
463/** 463/**
464 * calc_dark - calculate LEB dark space size. 464 * ubifs_calc_dark - calculate LEB dark space size.
465 * @c: the UBIFS file-system description object 465 * @c: the UBIFS file-system description object
466 * @spc: amount of free and dirty space in the LEB 466 * @spc: amount of free and dirty space in the LEB
467 * 467 *
468 * This function calculates amount of dark space in an LEB which has @spc bytes 468 * This function calculates and returns amount of dark space in an LEB which
469 * of free and dirty space. Returns the calculations result. 469 * has @spc bytes of free and dirty space.
470 * 470 *
471 * Dark space is the space which is not always usable - it depends on which 471 * UBIFS is trying to account the space which might not be usable, and this
472 * nodes are written in which order. E.g., if an LEB has only 512 free bytes, 472 * space is called "dark space". For example, if an LEB has only %512 free
473 * it is dark space, because it cannot fit a large data node. So UBIFS cannot 473 * bytes, it is dark space, because it cannot fit a large data node.
474 * count on this LEB and treat these 512 bytes as usable because it is not true
475 * if, for example, only big chunks of uncompressible data will be written to
476 * the FS.
477 */ 474 */
478static int calc_dark(struct ubifs_info *c, int spc) 475int ubifs_calc_dark(const struct ubifs_info *c, int spc)
479{ 476{
480 ubifs_assert(!(spc & 7)); 477 ubifs_assert(!(spc & 7));
481 478
@@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
518 * @free: new free space amount 515 * @free: new free space amount
519 * @dirty: new dirty space amount 516 * @dirty: new dirty space amount
520 * @flags: new flags 517 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 518 * @idx_gc_cnt: change to the count of @idx_gc list
522 * 519 *
523 * This function changes LEB properties (@free, @dirty or @flag). However, the 520 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to 521 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
@@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
535{ 532{
536 /* 533 /*
537 * This is the only function that is allowed to change lprops, so we 534 * This is the only function that is allowed to change lprops, so we
538 * discard the const qualifier. 535 * discard the "const" qualifier.
539 */ 536 */
540 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; 537 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
541 538
@@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
575 if (old_spc < c->dead_wm) 572 if (old_spc < c->dead_wm)
576 c->lst.total_dead -= old_spc; 573 c->lst.total_dead -= old_spc;
577 else 574 else
578 c->lst.total_dark -= calc_dark(c, old_spc); 575 c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
579 576
580 c->lst.total_used -= c->leb_size - old_spc; 577 c->lst.total_used -= c->leb_size - old_spc;
581 } 578 }
@@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
616 if (new_spc < c->dead_wm) 613 if (new_spc < c->dead_wm)
617 c->lst.total_dead += new_spc; 614 c->lst.total_dead += new_spc;
618 else 615 else
619 c->lst.total_dark += calc_dark(c, new_spc); 616 c->lst.total_dark += ubifs_calc_dark(c, new_spc);
620 617
621 c->lst.total_used += c->leb_size - new_spc; 618 c->lst.total_used += c->leb_size - new_spc;
622 } 619 }
@@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c,
1096 } 1093 }
1097 } 1094 }
1098 1095
1099 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
1100 if (IS_ERR(sleb)) { 1097 if (IS_ERR(sleb)) {
1101 /* 1098 /*
1102 * After an unclean unmount, empty and freeable LEBs 1099 * After an unclean unmount, empty and freeable LEBs
@@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c,
1107 "- continuing checking"); 1104 "- continuing checking");
1108 lst->empty_lebs += 1; 1105 lst->empty_lebs += 1;
1109 lst->total_free += c->leb_size; 1106 lst->total_free += c->leb_size;
1110 lst->total_dark += calc_dark(c, c->leb_size); 1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1111 return LPT_SCAN_CONTINUE; 1108 return LPT_SCAN_CONTINUE;
1112 } 1109 }
1113 1110
@@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c,
1117 "- continuing checking"); 1114 "- continuing checking");
1118 lst->total_free += lp->free; 1115 lst->total_free += lp->free;
1119 lst->total_dirty += lp->dirty; 1116 lst->total_dirty += lp->dirty;
1120 lst->total_dark += calc_dark(c, c->leb_size); 1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1121 return LPT_SCAN_CONTINUE; 1118 return LPT_SCAN_CONTINUE;
1122 } 1119 }
1123 data->err = PTR_ERR(sleb); 1120 data->err = PTR_ERR(sleb);
@@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c,
1235 if (spc < c->dead_wm) 1232 if (spc < c->dead_wm)
1236 lst->total_dead += spc; 1233 lst->total_dead += spc;
1237 else 1234 else
1238 lst->total_dark += calc_dark(c, spc); 1235 lst->total_dark += ubifs_calc_dark(c, spc);
1239 } 1236 }
1240 1237
1241 ubifs_scan_destroy(sleb); 1238 ubifs_scan_destroy(sleb);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index a88f33801b98..28beaeedadc0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -29,7 +29,8 @@
29 * @c: UBIFS file-system description object 29 * @c: UBIFS file-system description object
30 * 30 *
31 * This function scans the master node LEBs and search for the latest master 31 * This function scans the master node LEBs and search for the latest master
32 * node. Returns zero in case of success and a negative error code in case of 32 * node. Returns zero in case of success, %-EUCLEAN if there master area is
33 * corrupted and requires recovery, and a negative error code in case of
33 * failure. 34 * failure.
34 */ 35 */
35static int scan_for_master(struct ubifs_info *c) 36static int scan_for_master(struct ubifs_info *c)
@@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c)
40 41
41 lnum = UBIFS_MST_LNUM; 42 lnum = UBIFS_MST_LNUM;
42 43
43 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 44 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
44 if (IS_ERR(sleb)) 45 if (IS_ERR(sleb))
45 return PTR_ERR(sleb); 46 return PTR_ERR(sleb);
46 nodes_cnt = sleb->nodes_cnt; 47 nodes_cnt = sleb->nodes_cnt;
@@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c)
48 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, 49 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
49 list); 50 list);
50 if (snod->type != UBIFS_MST_NODE) 51 if (snod->type != UBIFS_MST_NODE)
51 goto out; 52 goto out_dump;
52 memcpy(c->mst_node, snod->node, snod->len); 53 memcpy(c->mst_node, snod->node, snod->len);
53 offs = snod->offs; 54 offs = snod->offs;
54 } 55 }
@@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c)
56 57
57 lnum += 1; 58 lnum += 1;
58 59
59 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 60 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
60 if (IS_ERR(sleb)) 61 if (IS_ERR(sleb))
61 return PTR_ERR(sleb); 62 return PTR_ERR(sleb);
62 if (sleb->nodes_cnt != nodes_cnt) 63 if (sleb->nodes_cnt != nodes_cnt)
@@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c)
65 goto out; 66 goto out;
66 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); 67 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
67 if (snod->type != UBIFS_MST_NODE) 68 if (snod->type != UBIFS_MST_NODE)
68 goto out; 69 goto out_dump;
69 if (snod->offs != offs) 70 if (snod->offs != offs)
70 goto out; 71 goto out;
71 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, 72 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
@@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c)
78 79
79out: 80out:
80 ubifs_scan_destroy(sleb); 81 ubifs_scan_destroy(sleb);
82 return -EUCLEAN;
83
84out_dump:
85 ubifs_err("unexpected node type %d master LEB %d:%d",
86 snod->type, lnum, snod->offs);
87 ubifs_scan_destroy(sleb);
81 return -EINVAL; 88 return -EINVAL;
82} 89}
83 90
@@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c)
256 263
257 err = scan_for_master(c); 264 err = scan_for_master(c);
258 if (err) { 265 if (err) {
259 err = ubifs_recover_master_node(c); 266 if (err == -EUCLEAN)
267 err = ubifs_recover_master_node(c);
260 if (err) 268 if (err)
261 /* 269 /*
262 * Note, we do not free 'c->mst_node' here because the 270 * Note, we do not free 'c->mst_node' here because the
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 152a7b34a141..82009c74b6a3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c)
670 struct ubifs_scan_leb *sleb; 670 struct ubifs_scan_leb *sleb;
671 671
672 dbg_rcvry("LEB %d", lnum); 672 dbg_rcvry("LEB %d", lnum);
673 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
676 if (IS_ERR(sleb)) { 677 if (IS_ERR(sleb)) {
677 err = PTR_ERR(sleb); 678 err = PTR_ERR(sleb);
678 break; 679 break;
@@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 901 struct ubifs_scan_leb *sleb;
901 902
902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
903 if (IS_ERR(sleb)) { 904 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 905 err = PTR_ERR(sleb);
905 break; 906 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e5f6cf8a1155..f94ddf7efba0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
286 mst = mst2; 286 mst = mst2;
287 } 287 }
288 288
289 dbg_rcvry("recovered master node from LEB %d", 289 ubifs_msg("recovered master node from LEB %d",
290 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); 290 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
291 291
292 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 292 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
790 * We can only recover at the end of the log, so check that the 790 * We can only recover at the end of the log, so check that the
791 * next log LEB is empty or out of date. 791 * next log LEB is empty or out of date.
792 */ 792 */
793 sleb = ubifs_scan(c, next_lnum, 0, sbuf); 793 sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
794 if (IS_ERR(sleb)) 794 if (IS_ERR(sleb))
795 return sleb; 795 return sleb;
796 if (sleb->nodes_cnt) { 796 if (sleb->nodes_cnt) {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 2970500f32df..5c2d6d759a3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
506 if (c->need_recovery) 506 if (c->need_recovery)
507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); 507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
508 else 508 else
509 sleb = ubifs_scan(c, lnum, offs, c->sbuf); 509 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
510 if (IS_ERR(sleb)) 510 if (IS_ERR(sleb))
511 return PTR_ERR(sleb); 511 return PTR_ERR(sleb);
512 512
@@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
836 const struct ubifs_cs_node *node; 836 const struct ubifs_cs_node *node;
837 837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs); 838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf); 839 sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
840 if (IS_ERR(sleb) ) { 840 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 842 return PTR_ERR(sleb);
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 892ebfee4fe5..96c525384191 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
108 108
109 /* Make the node pads to 8-byte boundary */ 109 /* Make the node pads to 8-byte boundary */
110 if ((node_len + pad_len) & 7) { 110 if ((node_len + pad_len) & 7) {
111 if (!quiet) { 111 if (!quiet)
112 dbg_err("bad padding length %d - %d", 112 dbg_err("bad padding length %d - %d",
113 offs, offs + node_len + pad_len); 113 offs, offs + node_len + pad_len);
114 }
115 return SCANNED_A_BAD_PAD_NODE; 114 return SCANNED_A_BAD_PAD_NODE;
116 } 115 }
117 116
@@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
253 * @c: UBIFS file-system description object 252 * @c: UBIFS file-system description object
254 * @lnum: logical eraseblock number 253 * @lnum: logical eraseblock number
255 * @offs: offset to start at (usually zero) 254 * @offs: offset to start at (usually zero)
256 * @sbuf: scan buffer (must be c->leb_size) 255 * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
256 * @quiet: print no messages
257 * 257 *
258 * This function scans LEB number @lnum and returns complete information about 258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns the scaned information in case of success and, 259 * its contents. Returns the scaned information in case of success and,
260 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case 260 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
261 * of failure. 261 * of failure.
262 *
263 * If @quiet is non-zero, this function does not print large and scary
264 * error messages and flash dumps in case of errors.
262 */ 265 */
263struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 266struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
264 int offs, void *sbuf) 267 int offs, void *sbuf, int quiet)
265{ 268{
266 void *buf = sbuf + offs; 269 void *buf = sbuf + offs;
267 int err, len = c->leb_size - offs; 270 int err, len = c->leb_size - offs;
@@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
280 283
281 cond_resched(); 284 cond_resched();
282 285
283 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); 286 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
284 if (ret > 0) { 287 if (ret > 0) {
285 /* Padding bytes or a valid padding node */ 288 /* Padding bytes or a valid padding node */
286 offs += ret; 289 offs += ret;
@@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
320 } 323 }
321 324
322 if (offs % c->min_io_size) { 325 if (offs % c->min_io_size) {
323 ubifs_err("empty space starts at non-aligned offset %d", offs); 326 if (!quiet)
327 ubifs_err("empty space starts at non-aligned offset %d",
328 offs);
324 goto corrupted;; 329 goto corrupted;;
325 } 330 }
326 331
@@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
331 break; 336 break;
332 for (; len; offs++, buf++, len--) 337 for (; len; offs++, buf++, len--)
333 if (*(uint8_t *)buf != 0xff) { 338 if (*(uint8_t *)buf != 0xff) {
334 ubifs_err("corrupt empty space at LEB %d:%d", 339 if (!quiet)
335 lnum, offs); 340 ubifs_err("corrupt empty space at LEB %d:%d",
341 lnum, offs);
336 goto corrupted; 342 goto corrupted;
337 } 343 }
338 344
339 return sleb; 345 return sleb;
340 346
341corrupted: 347corrupted:
342 ubifs_scanned_corruption(c, lnum, offs, buf); 348 if (!quiet) {
349 ubifs_scanned_corruption(c, lnum, offs, buf);
350 ubifs_err("LEB %d scanning failed", lnum);
351 }
343 err = -EUCLEAN; 352 err = -EUCLEAN;
353 ubifs_scan_destroy(sleb);
354 return ERR_PTR(err);
355
344error: 356error:
345 ubifs_err("LEB %d scanning failed", lnum); 357 ubifs_err("LEB %d scanning failed, error %d", lnum, err);
346 ubifs_scan_destroy(sleb); 358 ubifs_scan_destroy(sleb);
347 return ERR_PTR(err); 359 return ERR_PTR(err);
348} 360}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c4af069df1ad..333e181ee987 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,7 +36,6 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h> 37#include <linux/math64.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/smp_lock.h>
40#include "ubifs.h" 39#include "ubifs.h"
41 40
42/* 41/*
@@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait)
318 if (err) 317 if (err)
319 ubifs_err("can't write inode %lu, error %d", 318 ubifs_err("can't write inode %lu, error %d",
320 inode->i_ino, err); 319 inode->i_ino, err);
320 else
321 err = dbg_check_inode_size(c, inode, ui->ui_size);
321 } 322 }
322 323
323 ui->dirty = 0; 324 ui->dirty = 0;
@@ -448,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
448 return 0; 449 return 0;
449 450
450 /* 451 /*
451 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
452 * pages, so synchronize them first, then commit the journal. Strictly
453 * speaking, it is not necessary to commit the journal here,
454 * synchronizing write-buffers would be enough. But committing makes
455 * UBIFS free space predictions much more accurate, so we want to let
456 * the user be able to get more accurate results of 'statfs()' after
457 * they synchronize the file system.
458 */
459 sync_inodes_sb(sb);
460
461 /*
462 * Synchronize write buffers, because 'ubifs_run_commit()' does not 452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
463 * do this if it waits for an already running commit. 453 * do this if it waits for an already running commit.
464 */ 454 */
@@ -468,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
468 return err; 458 return err;
469 } 459 }
470 460
461 /*
462 * Strictly speaking, it is not necessary to commit the journal here,
463 * synchronizing write-buffers would be enough. But committing makes
464 * UBIFS free space predictions much more accurate, so we want to let
465 * the user be able to get more accurate results of 'statfs()' after
466 * they synchronize the file system.
467 */
471 err = ubifs_run_commit(c); 468 err = ubifs_run_commit(c);
472 if (err) 469 if (err)
473 return err; 470 return err;
@@ -1720,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb)
1720 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, 1717 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1721 c->vi.vol_id); 1718 c->vi.vol_id);
1722 1719
1723 lock_kernel();
1724
1725 /* 1720 /*
1726 * The following asserts are only valid if there has not been a failure 1721 * The following asserts are only valid if there has not been a failure
1727 * of the media. For example, there will be dirty inodes if we failed 1722 * of the media. For example, there will be dirty inodes if we failed
@@ -1786,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb)
1786 ubi_close_volume(c->ubi); 1781 ubi_close_volume(c->ubi);
1787 mutex_unlock(&c->umount_mutex); 1782 mutex_unlock(&c->umount_mutex);
1788 kfree(c); 1783 kfree(c);
1789
1790 unlock_kernel();
1791} 1784}
1792 1785
1793static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) 1786static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1803,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1803 return err; 1796 return err;
1804 } 1797 }
1805 1798
1806 lock_kernel();
1807 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1799 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1808 if (c->ro_media) { 1800 if (c->ro_media) {
1809 ubifs_msg("cannot re-mount due to prior errors"); 1801 ubifs_msg("cannot re-mount due to prior errors");
1810 unlock_kernel();
1811 return -EROFS; 1802 return -EROFS;
1812 } 1803 }
1813 err = ubifs_remount_rw(c); 1804 err = ubifs_remount_rw(c);
1814 if (err) { 1805 if (err)
1815 unlock_kernel();
1816 return err; 1806 return err;
1817 }
1818 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1807 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1819 if (c->ro_media) { 1808 if (c->ro_media) {
1820 ubifs_msg("cannot re-mount due to prior errors"); 1809 ubifs_msg("cannot re-mount due to prior errors");
1821 unlock_kernel();
1822 return -EROFS; 1810 return -EROFS;
1823 } 1811 }
1824 ubifs_remount_ro(c); 1812 ubifs_remount_ro(c);
@@ -1833,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1833 } 1821 }
1834 1822
1835 ubifs_assert(c->lst.taken_empty_lebs > 0); 1823 ubifs_assert(c->lst.taken_empty_lebs > 0);
1836 unlock_kernel();
1837 return 0; 1824 return 0;
1838} 1825}
1839 1826
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f249f7b0d656..e5b1a7d00fa0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
1159 * o exact match, i.e. the found zero-level znode contains key @key, then %1 1159 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1160 * is returned and slot number of the matched branch is stored in @n; 1160 * is returned and slot number of the matched branch is stored in @n;
1161 * o not exact match, which means that zero-level znode does not contain 1161 * o not exact match, which means that zero-level znode does not contain
1162 * @key, then %0 is returned and slot number of the closed branch is stored 1162 * @key, then %0 is returned and slot number of the closest branch is stored
1163 * in @n; 1163 * in @n;
1164 * o @key is so small that it is even less than the lowest key of the 1164 * o @key is so small that it is even less than the lowest key of the
1165 * leftmost zero-level node, then %0 is returned and %0 is stored in @n. 1165 * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
1166 * 1166 *
@@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1433 * @lnum: LEB number is returned here 1433 * @lnum: LEB number is returned here
1434 * @offs: offset is returned here 1434 * @offs: offset is returned here
1435 * 1435 *
1436 * This function look up and reads node with key @key. The caller has to make 1436 * This function looks up and reads node with key @key. The caller has to make
1437 * sure the @node buffer is large enough to fit the node. Returns zero in case 1437 * sure the @node buffer is large enough to fit the node. Returns zero in case
1438 * of success, %-ENOENT if the node was not found, and a negative error code in 1438 * of success, %-ENOENT if the node was not found, and a negative error code in
1439 * case of failure. The node location can be returned in @lnum and @offs. 1439 * case of failure. The node location can be returned in @lnum and @offs.
@@ -3268,3 +3268,73 @@ out_unlock:
3268 mutex_unlock(&c->tnc_mutex); 3268 mutex_unlock(&c->tnc_mutex);
3269 return err; 3269 return err;
3270} 3270}
3271
3272#ifdef CONFIG_UBIFS_FS_DEBUG
3273
3274/**
3275 * dbg_check_inode_size - check if inode size is correct.
3276 * @c: UBIFS file-system description object
3277 * @inum: inode number
3278 * @size: inode size
3279 *
3280 * This function makes sure that the inode size (@size) is correct and it does
3281 * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL
3282 * if it has a data page beyond @size, and other negative error code in case of
3283 * other errors.
3284 */
3285int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3286 loff_t size)
3287{
3288 int err, n;
3289 union ubifs_key from_key, to_key, *key;
3290 struct ubifs_znode *znode;
3291 unsigned int block;
3292
3293 if (!S_ISREG(inode->i_mode))
3294 return 0;
3295 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
3296 return 0;
3297
3298 block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
3299 data_key_init(c, &from_key, inode->i_ino, block);
3300 highest_data_key(c, &to_key, inode->i_ino);
3301
3302 mutex_lock(&c->tnc_mutex);
3303 err = ubifs_lookup_level0(c, &from_key, &znode, &n);
3304 if (err < 0)
3305 goto out_unlock;
3306
3307 if (err) {
3308 err = -EINVAL;
3309 key = &from_key;
3310 goto out_dump;
3311 }
3312
3313 err = tnc_next(c, &znode, &n);
3314 if (err == -ENOENT) {
3315 err = 0;
3316 goto out_unlock;
3317 }
3318 if (err < 0)
3319 goto out_unlock;
3320
3321 ubifs_assert(err == 0);
3322 key = &znode->zbranch[n].key;
3323 if (!key_in_range(c, key, &from_key, &to_key))
3324 goto out_unlock;
3325
3326out_dump:
3327 block = key_block(c, key);
3328 ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
3329 "(data key %s)", (unsigned long)inode->i_ino, size,
3330 ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
3331 dbg_dump_inode(c, inode);
3332 dbg_dump_stack();
3333 err = -EINVAL;
3334
3335out_unlock:
3336 mutex_unlock(&c->tnc_mutex);
3337 return err;
3338}
3339
3340#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index fde8d127c768..53288e5d604e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
245 * it is more comprehensive and less efficient than is needed for this 245 * it is more comprehensive and less efficient than is needed for this
246 * purpose. 246 * purpose.
247 */ 247 */
248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); 248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
249 c->ileb_len = 0; 249 c->ileb_len = 0;
250 if (IS_ERR(sleb)) 250 if (IS_ERR(sleb))
251 return PTR_ERR(sleb); 251 return PTR_ERR(sleb);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 3eee07e0c495..191ca7863fe7 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -135,6 +135,13 @@
135/* The key is always at the same position in all keyed nodes */ 135/* The key is always at the same position in all keyed nodes */
136#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) 136#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
137 137
138/* Garbage collector journal head number */
139#define UBIFS_GC_HEAD 0
140/* Base journal head number */
141#define UBIFS_BASE_HEAD 1
142/* Data journal head number */
143#define UBIFS_DATA_HEAD 2
144
138/* 145/*
139 * LEB Properties Tree node types. 146 * LEB Properties Tree node types.
140 * 147 *
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a29349094422..b2d976366a46 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -105,12 +105,10 @@
105/* Number of non-data journal heads */ 105/* Number of non-data journal heads */
106#define NONDATA_JHEADS_CNT 2 106#define NONDATA_JHEADS_CNT 2
107 107
108/* Garbage collector head */ 108/* Shorter names for journal head numbers for internal usage */
109#define GCHD 0 109#define GCHD UBIFS_GC_HEAD
110/* Base journal head number */ 110#define BASEHD UBIFS_BASE_HEAD
111#define BASEHD 1 111#define DATAHD UBIFS_DATA_HEAD
112/* First "general purpose" journal head */
113#define DATAHD 2
114 112
115/* 'No change' value for 'ubifs_change_lp()' */ 113/* 'No change' value for 'ubifs_change_lp()' */
116#define LPROPS_NC 0x80000001 114#define LPROPS_NC 0x80000001
@@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
1451 1449
1452/* scan.c */ 1450/* scan.c */
1453struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 1451struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
1454 int offs, void *sbuf); 1452 int offs, void *sbuf, int quiet);
1455void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); 1453void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
1456int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, 1454int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
1457 int offs, int quiet); 1455 int offs, int quiet);
@@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
1676const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); 1674const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
1677const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); 1675const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
1678const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); 1676const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1677int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1678
1680/* file.c */ 1679/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1680int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index adafcf556531..195830f47569 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -78,9 +78,9 @@ enum {
78 SECURITY_XATTR, 78 SECURITY_XATTR,
79}; 79};
80 80
81static struct inode_operations none_inode_operations; 81static const struct inode_operations none_inode_operations;
82static struct address_space_operations none_address_operations; 82static const struct address_space_operations none_address_operations;
83static struct file_operations none_file_operations; 83static const struct file_operations none_file_operations;
84 84
85/** 85/**
86 * create_xattr - create an extended attribute. 86 * create_xattr - create an extended attribute.
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31db..381854461b28 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1635,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
1635 .direct_IO = xfs_vm_direct_IO, 1635 .direct_IO = xfs_vm_direct_IO,
1636 .migratepage = buffer_migrate_page, 1636 .migratepage = buffer_migrate_page,
1637 .is_partially_uptodate = block_is_partially_uptodate, 1637 .is_partially_uptodate = block_is_partially_uptodate,
1638 .error_remove_page = generic_error_remove_page,
1638}; 1639};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 988d8f87bc0f..629370974e57 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -42,7 +42,7 @@
42 42
43#include <linux/dcache.h> 43#include <linux/dcache.h>
44 44
45static struct vm_operations_struct xfs_file_vm_ops; 45static const struct vm_operations_struct xfs_file_vm_ops;
46 46
47STATIC ssize_t 47STATIC ssize_t
48xfs_file_aio_read( 48xfs_file_aio_read(
@@ -280,7 +280,7 @@ const struct file_operations xfs_dir_file_operations = {
280 .fsync = xfs_file_fsync, 280 .fsync = xfs_file_fsync,
281}; 281};
282 282
283static struct vm_operations_struct xfs_file_vm_ops = { 283static const struct vm_operations_struct xfs_file_vm_ops = {
284 .fault = filemap_fault, 284 .fault = filemap_fault,
285 .page_mkwrite = xfs_vm_page_mkwrite, 285 .page_mkwrite = xfs_vm_page_mkwrite,
286}; 286};
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..9e41f91aa269 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 151}
152 152
153struct quotactl_ops xfs_quotactl_operations = { 153const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync, 154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 155 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 156 .set_xstate = xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5d7c60ac77b4..bdd41c8c342f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -67,7 +67,7 @@
67#include <linux/freezer.h> 67#include <linux/freezer.h>
68#include <linux/parser.h> 68#include <linux/parser.h>
69 69
70static struct super_operations xfs_super_operations; 70static const struct super_operations xfs_super_operations;
71static kmem_zone_t *xfs_ioend_zone; 71static kmem_zone_t *xfs_ioend_zone;
72mempool_t *xfs_ioend_pool; 72mempool_t *xfs_ioend_pool;
73 73
@@ -1536,7 +1536,7 @@ xfs_fs_get_sb(
1536 mnt); 1536 mnt);
1537} 1537}
1538 1538
1539static struct super_operations xfs_super_operations = { 1539static const struct super_operations xfs_super_operations = {
1540 .alloc_inode = xfs_fs_alloc_inode, 1540 .alloc_inode = xfs_fs_alloc_inode,
1541 .destroy_inode = xfs_fs_destroy_inode, 1541 .destroy_inode = xfs_fs_destroy_inode,
1542 .write_inode = xfs_fs_write_inode, 1542 .write_inode = xfs_fs_write_inode,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
93 93
94extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
95extern struct xattr_handler *xfs_xattr_handlers[]; 95extern struct xattr_handler *xfs_xattr_handlers[];
96extern struct quotactl_ops xfs_quotactl_operations; 96extern const struct quotactl_ops xfs_quotactl_operations;
97 97
98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
99 99
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
26xfs_stats_clear_proc_handler( 26xfs_stats_clear_proc_handler(
27 ctl_table *ctl, 27 ctl_table *ctl,
28 int write, 28 int write,
29 struct file *filp,
30 void __user *buffer, 29 void __user *buffer,
31 size_t *lenp, 30 size_t *lenp,
32 loff_t *ppos) 31 loff_t *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
34 int c, ret, *valp = ctl->data; 33 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 34 __uint32_t vn_active;
36 35
37 ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); 36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 37
39 if (!ret && write && *valp) { 38 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 39 printk("XFS Clearing xfsstats\n");
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c4ea51b55dce..f52ac276277e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -117,7 +117,7 @@ struct getbmapx {
117#define BMV_IF_VALID \ 117#define BMV_IF_VALID \
118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119 119
120/* bmv_oflags values - returned for for each non-header segment */ 120/* bmv_oflags values - returned for each non-header segment */
121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
123#define BMV_OF_LAST 0x4 /* segment is the last in the file */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */